ff8fdbc21a455d0b8d1868e8ba18b6fffc2719c1
[linux-2.6.git] / kernel / power / swsusp.c
1 /*
2  * linux/kernel/power/swsusp.c
3  *
4  * This file implements an architecture-independent machine suspend
5  * feature, using almost exclusively high-level kernel routines.
6  *
7  * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
8  * Copyright (C) 1998,2001-2004 Pavel Machek <pavel@suse.cz>
9  *
10  * This file is released under the GPLv2.
11  *
12  * I'd like to thank the following people for their work:
13  * 
14  * Pavel Machek <pavel@ucw.cz>:
15  * Modifications, defectiveness pointing, being with me at the very beginning,
16  * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
17  *
18  * Steve Doddi <dirk@loth.demon.co.uk>: 
19  * Support the possibility of hardware state restoring.
20  *
21  * Raph <grey.havens@earthling.net>:
22  * Support for preserving states of network devices and virtual console
23  * (including X and svgatextmode)
24  *
25  * Kurt Garloff <garloff@suse.de>:
26  * Straightened the critical function in order to prevent compilers from
27  * playing tricks with local variables.
28  *
29  * Andreas Mohr <a.mohr@mailto.de>
30  *
31  * Alex Badea <vampire@go.ro>:
32  * Fixed runaway init
33  *
34  * More state savers are welcome. Especially for the scsi layer...
35  *
36  * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
37  */
38
39 #include <linux/module.h>
40 #include <linux/mm.h>
41 #include <linux/suspend.h>
42 #include <linux/smp_lock.h>
43 #include <linux/file.h>
44 #include <linux/utsname.h>
45 #include <linux/version.h>
46 #include <linux/delay.h>
47 #include <linux/reboot.h>
48 #include <linux/bitops.h>
49 #include <linux/vt_kern.h>
50 #include <linux/kbd_kern.h>
51 #include <linux/keyboard.h>
52 #include <linux/spinlock.h>
53 #include <linux/genhd.h>
54 #include <linux/kernel.h>
55 #include <linux/major.h>
56 #include <linux/swap.h>
57 #include <linux/pm.h>
58 #include <linux/device.h>
59 #include <linux/buffer_head.h>
60 #include <linux/swapops.h>
61 #include <linux/bootmem.h>
62 #include <linux/syscalls.h>
63 #include <linux/console.h>
64 #include <linux/highmem.h>
65
66 #include <asm/uaccess.h>
67 #include <asm/mmu_context.h>
68 #include <asm/pgtable.h>
69 #include <asm/io.h>
70
71 #include "power.h"
72
73 unsigned char software_suspend_enabled = 0;
74
75 #define NORESUME                1
76 #define RESUME_SPECIFIED        2
77
78 /* References to section boundaries */
79 extern char __nosave_begin, __nosave_end;
80
81 extern int is_head_of_free_region(struct page *);
82
83 /* Locks */
84 spinlock_t suspend_pagedir_lock __nosavedata = SPIN_LOCK_UNLOCKED;
85
86 /* Variables to be preserved over suspend */
87 static int pagedir_order_check;
88 static int nr_copy_pages_check;
89
90 static int resume_status;
91 static char resume_file[256] = "";                      /* For resume= kernel option */
92 static dev_t resume_device;
93 /* Local variables that should not be affected by save */
94 unsigned int nr_copy_pages __nosavedata = 0;
95
96 /* Suspend pagedir is allocated before final copy, therefore it
97    must be freed after resume 
98
99    Warning: this is evil. There are actually two pagedirs at time of
100    resume. One is "pagedir_save", which is empty frame allocated at
101    time of suspend, that must be freed. Second is "pagedir_nosave", 
102    allocated at time of resume, that travels through memory not to
103    collide with anything.
104
105    Warning: this is even more evil than it seems. Pagedirs this file
106    talks about are completely different from page directories used by
107    MMU hardware.
108  */
109 suspend_pagedir_t *pagedir_nosave __nosavedata = NULL;
110 static suspend_pagedir_t *pagedir_save;
111 static int pagedir_order __nosavedata = 0;
112
113 struct link {
114         char dummy[PAGE_SIZE - sizeof(swp_entry_t)];
115         swp_entry_t next;
116 };
117
118 union diskpage {
119         union swap_header swh;
120         struct link link;
121         struct suspend_header sh;
122 };
123
/*
 * XXX: We try to keep some extra pages free so that I/O operations
 * succeed without paging. Should this be larger?
 */
#define PAGES_FOR_IO	512

/* Prefixes for the messages printed while suspending / resuming */
static const char name_suspend[] = "Suspend Machine: ";
static const char name_resume[] = "Resume Machine: ";

/*
 * Debug switches
 */
#define DEBUG_DEFAULT
#undef  DEBUG_PROCESS
#undef  DEBUG_SLOW
#define TEST_SWSUSP 0		/* Set to 1 to reboot instead of halt machine after suspension */

/* PRINTK() prints only when DEBUG_DEFAULT is defined */
#ifdef DEBUG_DEFAULT
# define PRINTK(f, a...)	printk(f, ## a)
#else
# define PRINTK(f, a...)	do { } while(0)
#endif

/* MDELAY() delays only when DEBUG_SLOW is defined */
#ifdef DEBUG_SLOW
#define MDELAY(a) mdelay(a)
#else
#define MDELAY(a) do { } while(0)
#endif
152
153 /*
154  * Saving part...
155  */
156
157 static __inline__ int fill_suspend_header(struct suspend_header *sh)
158 {
159         memset((char *)sh, 0, sizeof(*sh));
160
161         sh->version_code = LINUX_VERSION_CODE;
162         sh->num_physpages = num_physpages;
163         strncpy(sh->machine, system_utsname.machine, 8);
164         strncpy(sh->version, system_utsname.version, 20);
165         /* FIXME: Is this bogus? --RR */
166         sh->num_cpus = num_online_cpus();
167         sh->page_size = PAGE_SIZE;
168         sh->suspend_pagedir = pagedir_nosave;
169         BUG_ON (pagedir_save != pagedir_nosave);
170         sh->num_pbes = nr_copy_pages;
171         /* TODO: needed? mounted fs' last mounted date comparison
172          * [so they haven't been mounted since last suspend.
173          * Maybe it isn't.] [we'd need to do this for _all_ fs-es]
174          */
175         return 0;
176 }
177
178 /* We memorize in swapfile_used what swap devices are used for suspension */
179 #define SWAPFILE_UNUSED    0
180 #define SWAPFILE_SUSPEND   1    /* This is the suspending device */
181 #define SWAPFILE_IGNORED   2    /* Those are other swap devices ignored for suspension */
182
183 static unsigned short swapfile_used[MAX_SWAPFILES];
184 static unsigned short root_swap;
185 #define MARK_SWAP_SUSPEND 0
186 #define MARK_SWAP_RESUME 2
187
188 static void mark_swapfiles(swp_entry_t prev, int mode)
189 {
190         swp_entry_t entry;
191         union diskpage *cur;
192         struct page *page;
193
194         if (root_swap == 0xFFFF)  /* ignored */
195                 return;
196
197         page = alloc_page(GFP_ATOMIC);
198         if (!page)
199                 panic("Out of memory in mark_swapfiles");
200         cur = page_address(page);
201         /* XXX: this is dirty hack to get first page of swap file */
202         entry = swp_entry(root_swap, 0);
203         rw_swap_page_sync(READ, entry, page);
204
205         if (mode == MARK_SWAP_RESUME) {
206                 if (!memcmp("S1",cur->swh.magic.magic,2))
207                         memcpy(cur->swh.magic.magic,"SWAP-SPACE",10);
208                 else if (!memcmp("S2",cur->swh.magic.magic,2))
209                         memcpy(cur->swh.magic.magic,"SWAPSPACE2",10);
210                 else printk("%sUnable to find suspended-data signature (%.10s - misspelled?\n", 
211                         name_resume, cur->swh.magic.magic);
212         } else {
213                 if ((!memcmp("SWAP-SPACE",cur->swh.magic.magic,10)))
214                         memcpy(cur->swh.magic.magic,"S1SUSP....",10);
215                 else if ((!memcmp("SWAPSPACE2",cur->swh.magic.magic,10)))
216                         memcpy(cur->swh.magic.magic,"S2SUSP....",10);
217                 else panic("\nSwapspace is not swapspace (%.10s)\n", cur->swh.magic.magic);
218                 cur->link.next = prev; /* prev is the first/last swap page of the resume area */
219                 /* link.next lies *no more* in last 4/8 bytes of magic */
220         }
221         rw_swap_page_sync(WRITE, entry, page);
222         __free_page(page);
223 }
224
225
226 /*
227  * Check whether the swap device is the specified resume
228  * device, irrespective of whether they are specified by
229  * identical names.
230  *
231  * (Thus, device inode aliasing is allowed.  You can say /dev/hda4
232  * instead of /dev/ide/host0/bus0/target0/lun0/part4 [if using devfs]
233  * and they'll be considered the same device.  This is *necessary* for
234  * devfs, since the resume code can only recognize the form /dev/hda4,
235  * but the suspend code would see the long name.)
236  */
237 static int is_resume_device(const struct swap_info_struct *swap_info)
238 {
239         struct file *file = swap_info->swap_file;
240         struct inode *inode = file->f_dentry->d_inode;
241
242         return S_ISBLK(inode->i_mode) &&
243                 resume_device == MKDEV(imajor(inode), iminor(inode));
244 }
245
246 static void read_swapfiles(void) /* This is called before saving image */
247 {
248         int i, len;
249         
250         len=strlen(resume_file);
251         root_swap = 0xFFFF;
252         
253         swap_list_lock();
254         for(i=0; i<MAX_SWAPFILES; i++) {
255                 if (swap_info[i].flags == 0) {
256                         swapfile_used[i]=SWAPFILE_UNUSED;
257                 } else {
258                         if(!len) {
259                                 printk(KERN_WARNING "resume= option should be used to set suspend device" );
260                                 if(root_swap == 0xFFFF) {
261                                         swapfile_used[i] = SWAPFILE_SUSPEND;
262                                         root_swap = i;
263                                 } else
264                                         swapfile_used[i] = SWAPFILE_IGNORED;                              
265                         } else {
266                                 /* we ignore all swap devices that are not the resume_file */
267                                 if (is_resume_device(&swap_info[i])) {
268                                         swapfile_used[i] = SWAPFILE_SUSPEND;
269                                         root_swap = i;
270                                 } else {
271                                         swapfile_used[i] = SWAPFILE_IGNORED;
272                                 }
273                         }
274                 }
275         }
276         swap_list_unlock();
277 }
278
279 static void lock_swapdevices(void) /* This is called after saving image so modification
280                                       will be lost after resume... and that's what we want. */
281 {
282         int i;
283
284         swap_list_lock();
285         for(i = 0; i< MAX_SWAPFILES; i++)
286                 if(swapfile_used[i] == SWAPFILE_IGNORED) {
287                         swap_info[i].flags ^= 0xFF; /* we make the device unusable. A new call to
288                                                        lock_swapdevices can unlock the devices. */
289                 }
290         swap_list_unlock();
291 }
292
293 /**
294  *    write_suspend_image - Write entire image to disk.
295  *
296  *    After writing suspend signature to the disk, suspend may no
297  *    longer fail: we have ready-to-run image in swap, and rollback
298  *    would happen on next reboot -- corrupting data.
299  *
300  *    Note: The buffer we allocate to use to write the suspend header is
301  *    not freed; its not needed since the system is going down anyway
302  *    (plus it causes an oops and I'm lazy^H^H^H^Htoo busy).
303  */
304 static int write_suspend_image(void)
305 {
306         int i;
307         swp_entry_t entry, prev = { 0 };
308         int nr_pgdir_pages = SUSPEND_PD_PAGES(nr_copy_pages);
309         union diskpage *cur,  *buffer = (union diskpage *)get_zeroed_page(GFP_ATOMIC);
310         unsigned long address;
311         struct page *page;
312
313         if (!buffer)
314                 return -ENOMEM;
315
316         printk( "Writing data to swap (%d pages): ", nr_copy_pages );
317         for (i=0; i<nr_copy_pages; i++) {
318                 if (!(i%100))
319                         printk( "." );
320                 if (!(entry = get_swap_page()).val)
321                         panic("\nNot enough swapspace when writing data" );
322                 
323                 if (swapfile_used[swp_type(entry)] != SWAPFILE_SUSPEND)
324                         panic("\nPage %d: not enough swapspace on suspend device", i );
325             
326                 address = (pagedir_nosave+i)->address;
327                 page = virt_to_page(address);
328                 rw_swap_page_sync(WRITE, entry, page);
329                 (pagedir_nosave+i)->swap_address = entry;
330         }
331         printk( "|\n" );
332         printk( "Writing pagedir (%d pages): ", nr_pgdir_pages);
333         for (i=0; i<nr_pgdir_pages; i++) {
334                 cur = (union diskpage *)((char *) pagedir_nosave)+i;
335                 BUG_ON ((char *) cur != (((char *) pagedir_nosave) + i*PAGE_SIZE));
336                 printk( "." );
337                 if (!(entry = get_swap_page()).val) {
338                         printk(KERN_CRIT "Not enough swapspace when writing pgdir\n" );
339                         panic("Don't know how to recover");
340                         free_page((unsigned long) buffer);
341                         return -ENOSPC;
342                 }
343
344                 if(swapfile_used[swp_type(entry)] != SWAPFILE_SUSPEND)
345                         panic("\nNot enough swapspace for pagedir on suspend device" );
346
347                 BUG_ON (sizeof(swp_entry_t) != sizeof(long));
348                 BUG_ON (PAGE_SIZE % sizeof(struct pbe));
349
350                 cur->link.next = prev;                          
351                 page = virt_to_page((unsigned long)cur);
352                 rw_swap_page_sync(WRITE, entry, page);
353                 prev = entry;
354         }
355         printk("H");
356         BUG_ON (sizeof(struct suspend_header) > PAGE_SIZE-sizeof(swp_entry_t));
357         BUG_ON (sizeof(union diskpage) != PAGE_SIZE);
358         BUG_ON (sizeof(struct link) != PAGE_SIZE);
359         if (!(entry = get_swap_page()).val)
360                 panic( "\nNot enough swapspace when writing header" );
361         if (swapfile_used[swp_type(entry)] != SWAPFILE_SUSPEND)
362                 panic("\nNot enough swapspace for header on suspend device" );
363
364         cur = (void *) buffer;
365         if (fill_suspend_header(&cur->sh))
366                 BUG();          /* Not a BUG_ON(): we want fill_suspend_header to be called, always */
367                 
368         cur->link.next = prev;
369
370         page = virt_to_page((unsigned long)cur);
371         rw_swap_page_sync(WRITE, entry, page);
372         prev = entry;
373
374         printk( "S" );
375         mark_swapfiles(prev, MARK_SWAP_SUSPEND);
376         printk( "|\n" );
377
378         MDELAY(1000);
379         return 0;
380 }
381
382 #ifdef CONFIG_HIGHMEM
383 struct highmem_page {
384         char *data;
385         struct page *page;
386         struct highmem_page *next;
387 };
388
389 struct highmem_page *highmem_copy = NULL;
390
391 static int save_highmem_zone(struct zone *zone)
392 {
393         unsigned long zone_pfn;
394         for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
395                 struct page *page;
396                 struct highmem_page *save;
397                 void *kaddr;
398                 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
399                 int chunk_size;
400
401                 if (!(pfn%1000))
402                         printk(".");
403                 if (!pfn_valid(pfn))
404                         continue;
405                 page = pfn_to_page(pfn);
406                 /*
407                  * This condition results from rvmalloc() sans vmalloc_32()
408                  * and architectural memory reservations. This should be
409                  * corrected eventually when the cases giving rise to this
410                  * are better understood.
411                  */
412                 if (PageReserved(page)) {
413                         printk("highmem reserved page?!\n");
414                         continue;
415                 }
416                 if ((chunk_size = is_head_of_free_region(page))) {
417                         pfn += chunk_size - 1;
418                         zone_pfn += chunk_size - 1;
419                         continue;
420                 }
421                 save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
422                 if (!save)
423                         return -ENOMEM;
424                 save->next = highmem_copy;
425                 save->page = page;
426                 save->data = (void *) get_zeroed_page(GFP_ATOMIC);
427                 if (!save->data) {
428                         kfree(save);
429                         return -ENOMEM;
430                 }
431                 kaddr = kmap_atomic(page, KM_USER0);
432                 memcpy(save->data, kaddr, PAGE_SIZE);
433                 kunmap_atomic(kaddr, KM_USER0);
434                 highmem_copy = save;
435         }
436         return 0;
437 }
438
439 static int save_highmem(void)
440 {
441         struct zone *zone;
442         int res = 0;
443         for_each_zone(zone) {
444                 if (is_highmem(zone))
445                         res = save_highmem_zone(zone);
446                 if (res)
447                         return res;
448         }
449         return 0;
450 }
451
452 static int restore_highmem(void)
453 {
454         while (highmem_copy) {
455                 struct highmem_page *save = highmem_copy;
456                 void *kaddr;
457                 highmem_copy = save->next;
458
459                 kaddr = kmap_atomic(save->page, KM_USER0);
460                 memcpy(kaddr, save->data, PAGE_SIZE);
461                 kunmap_atomic(kaddr, KM_USER0);
462                 free_page((long) save->data);
463                 kfree(save);
464         }
465         return 0;
466 }
467 #endif
468
469 static int pfn_is_nosave(unsigned long pfn)
470 {
471         unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
472         unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
473         return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
474 }
475
476 /* if *pagedir_p != NULL it also copies the counted pages */
477 static int count_and_copy_zone(struct zone *zone, struct pbe **pagedir_p)
478 {
479         unsigned long zone_pfn, chunk_size, nr_copy_pages = 0;
480         struct pbe *pbe = *pagedir_p;
481         for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
482                 struct page *page;
483                 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
484
485                 if (!(pfn%1000))
486                         printk(".");
487                 if (!pfn_valid(pfn))
488                         continue;
489                 page = pfn_to_page(pfn);
490                 BUG_ON(PageReserved(page) && PageNosave(page));
491                 if (PageNosave(page))
492                         continue;
493                 if (PageReserved(page) && pfn_is_nosave(pfn)) {
494                         PRINTK("[nosave pfn 0x%lx]", pfn);
495                         continue;
496                 }
497                 if ((chunk_size = is_head_of_free_region(page))) {
498                         pfn += chunk_size - 1;
499                         zone_pfn += chunk_size - 1;
500                         continue;
501                 }
502                 nr_copy_pages++;
503                 if (!pbe)
504                         continue;
505                 pbe->orig_address = (long) page_address(page);
506                 /* Copy page is dangerous: it likes to mess with
507                    preempt count on specific cpus. Wrong preempt count is then copied,
508                    oops. */
509                 copy_page((void *)pbe->address, (void *)pbe->orig_address);
510                 pbe++;
511         }
512         *pagedir_p = pbe;
513         return nr_copy_pages;
514 }
515
516 static int count_and_copy_data_pages(struct pbe *pagedir_p)
517 {
518         int nr_copy_pages = 0;
519         struct zone *zone;
520         for_each_zone(zone) {
521                 if (!is_highmem(zone))
522                         nr_copy_pages += count_and_copy_zone(zone, &pagedir_p);
523         }
524         return nr_copy_pages;
525 }
526
527 static void free_suspend_pagedir_zone(struct zone *zone, unsigned long pagedir)
528 {
529         unsigned long zone_pfn, pagedir_end, pagedir_pfn, pagedir_end_pfn;
530         pagedir_end = pagedir + (PAGE_SIZE << pagedir_order);
531         pagedir_pfn = __pa(pagedir) >> PAGE_SHIFT;
532         pagedir_end_pfn = __pa(pagedir_end) >> PAGE_SHIFT;
533         for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
534                 struct page *page;
535                 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
536                 if (!pfn_valid(pfn))
537                         continue;
538                 page = pfn_to_page(pfn);
539                 if (!TestClearPageNosave(page))
540                         continue;
541                 else if (pfn >= pagedir_pfn && pfn < pagedir_end_pfn)
542                         continue;
543                 __free_page(page);
544         }
545 }
546
547 static void free_suspend_pagedir(unsigned long this_pagedir)
548 {
549         struct zone *zone;
550         for_each_zone(zone) {
551                 if (!is_highmem(zone))
552                         free_suspend_pagedir_zone(zone, this_pagedir);
553         }
554         free_pages(this_pagedir, pagedir_order);
555 }
556
557 static suspend_pagedir_t *create_suspend_pagedir(int nr_copy_pages)
558 {
559         int i;
560         suspend_pagedir_t *pagedir;
561         struct pbe *p;
562         struct page *page;
563
564         pagedir_order = get_bitmask_order(SUSPEND_PD_PAGES(nr_copy_pages));
565
566         p = pagedir = (suspend_pagedir_t *)__get_free_pages(GFP_ATOMIC | __GFP_COLD, pagedir_order);
567         if (!pagedir)
568                 return NULL;
569
570         page = virt_to_page(pagedir);
571         for(i=0; i < 1<<pagedir_order; i++)
572                 SetPageNosave(page++);
573                 
574         while(nr_copy_pages--) {
575                 p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
576                 if (!p->address) {
577                         free_suspend_pagedir((unsigned long) pagedir);
578                         return NULL;
579                 }
580                 SetPageNosave(virt_to_page(p->address));
581                 p->orig_address = 0;
582                 p++;
583         }
584         return pagedir;
585 }
586
587 static int prepare_suspend_processes(void)
588 {
589         sys_sync();     /* Syncing needs pdflushd, so do it before stopping processes */
590         if (freeze_processes()) {
591                 printk( KERN_ERR "Suspend failed: Not all processes stopped!\n" );
592                 thaw_processes();
593                 return 1;
594         }
595         return 0;
596 }
597
/*
 * Free as much memory as possible, without OOM-killing anyone.
 *
 * Keeps calling shrink_all_memory() until it reports nothing more,
 * printing a dot per round.
 *
 * Notice: all userland should be stopped at this point, or livelock is
 * possible.
 */
static void free_some_memory(void)
{
	printk("Freeing memory: ");
	for (;;) {
		if (!shrink_all_memory(10000))
			break;
		printk(".");
	}
	printk("|\n");
}
610
611 static int suspend_prepare_image(void)
612 {
613         struct sysinfo i;
614         unsigned int nr_needed_pages = 0;
615
616         pagedir_nosave = NULL;
617         printk( "/critical section: ");
618 #ifdef CONFIG_HIGHMEM
619         printk( "handling highmem" );
620         if (save_highmem()) {
621                 printk(KERN_CRIT "%sNot enough free pages for highmem\n", name_suspend);
622                 return -ENOMEM;
623         }
624         printk(", ");
625 #endif
626
627         printk("counting pages to copy" );
628         drain_local_pages();
629         nr_copy_pages = count_and_copy_data_pages(NULL);
630         nr_needed_pages = nr_copy_pages + PAGES_FOR_IO;
631         
632         printk(" (pages needed: %d+%d=%d free: %d)\n",nr_copy_pages,PAGES_FOR_IO,nr_needed_pages,nr_free_pages());
633         if(nr_free_pages() < nr_needed_pages) {
634                 printk(KERN_CRIT "%sCouldn't get enough free pages, on %d pages short\n",
635                        name_suspend, nr_needed_pages-nr_free_pages());
636                 root_swap = 0xFFFF;
637                 return -ENOMEM;
638         }
639         si_swapinfo(&i);        /* FIXME: si_swapinfo(&i) returns all swap devices information.
640                                    We should only consider resume_device. */
641         if (i.freeswap < nr_needed_pages)  {
642                 printk(KERN_CRIT "%sThere's not enough swap space available, on %ld pages short\n",
643                        name_suspend, nr_needed_pages-i.freeswap);
644                 return -ENOSPC;
645         }
646
647         PRINTK( "Alloc pagedir\n" ); 
648         pagedir_save = pagedir_nosave = create_suspend_pagedir(nr_copy_pages);
649         if (!pagedir_nosave) {
650                 /* Pagedir is big, one-chunk allocation. It is easily possible for this allocation to fail */
651                 printk(KERN_CRIT "%sCouldn't allocate continuous pagedir\n", name_suspend);
652                 return -ENOMEM;
653         }
654         nr_copy_pages_check = nr_copy_pages;
655         pagedir_order_check = pagedir_order;
656
657         drain_local_pages();    /* During allocating of suspend pagedir, new cold pages may appear. Kill them */
658         if (nr_copy_pages != count_and_copy_data_pages(pagedir_nosave)) /* copy */
659                 BUG();
660
661         /*
662          * End of critical section. From now on, we can write to memory,
663          * but we should not touch disk. This specially means we must _not_
664          * touch swap space! Except we must write out our image of course.
665          */
666
667         printk( "critical section/: done (%d pages copied)\n", nr_copy_pages );
668         return 0;
669 }
670
/*
 * Resume the devices and write the image to swap. The ignored swap
 * devices are locked while the image is written and unlocked afterwards.
 *
 * NOTE(review): the return value of write_suspend_image() is discarded,
 * so a failure there is not reported to the caller.
 */
static void suspend_save_image(void)
{
	device_resume();

	lock_swapdevices();		/* make the ignored swap devices unusable */
	write_suspend_image();
	lock_swapdevices();		/* second call: unlock them again, writing is finished */

	/*
	 * It is important _NOT_ to umount filesystems at this point. We
	 * want them synced (in case something goes wrong) but we DO not
	 * want to mark filesystem clean: it is not. (And it does not
	 * matter, if we resume correctly, we'll mark system clean, anyway.)
	 */
}
685
686 static void suspend_power_down(void)
687 {
688         extern int C_A_D;
689         C_A_D = 0;
690         printk(KERN_EMERG "%s%s Trying to power down.\n", name_suspend, TEST_SWSUSP ? "Disable TEST_SWSUSP. NOT ": "");
691 #ifdef CONFIG_VT
692         PRINTK(KERN_EMERG "shift_state: %04x\n", shift_state);
693         mdelay(1000);
694         if (TEST_SWSUSP ^ (!!(shift_state & (1 << KG_CTRL))))
695                 machine_restart(NULL);
696         else
697 #endif
698         {
699                 device_shutdown();
700                 machine_power_off();
701         }
702
703         printk(KERN_EMERG "%sProbably not capable for powerdown. System halted.\n", name_suspend);
704         machine_halt();
705         while (1);
706         /* NOTREACHED */
707 }
708
709 /*
710  * Magic happens here
711  */
712
713 asmlinkage void do_magic_resume_1(void)
714 {
715         barrier();
716         mb();
717         spin_lock_irq(&suspend_pagedir_lock);   /* Done to disable interrupts */ 
718
719         device_power_down(4);
720         PRINTK( "Waiting for DMAs to settle down...\n");
721         mdelay(1000);   /* We do not want some readahead with DMA to corrupt our memory, right?
722                            Do it with disabled interrupts for best effect. That way, if some
723                            driver scheduled DMA, we have good chance for DMA to finish ;-). */
724 }
725
726 asmlinkage void do_magic_resume_2(void)
727 {
728         BUG_ON (nr_copy_pages_check != nr_copy_pages);
729         BUG_ON (pagedir_order_check != pagedir_order);
730
731         __flush_tlb_global();           /* Even mappings of "global" things (vmalloc) need to be fixed */
732
733         PRINTK( "Freeing prev allocated pagedir\n" );
734         free_suspend_pagedir((unsigned long) pagedir_save);
735
736 #ifdef CONFIG_HIGHMEM
737         printk( "Restoring highmem\n" );
738         restore_highmem();
739 #endif
740         printk("done, devices\n");
741
742         device_power_up();
743         spin_unlock_irq(&suspend_pagedir_lock);
744         device_resume();
745
746         /* Fixme: this is too late; we should do this ASAP to avoid "infinite reboots" problem */
747         PRINTK( "Fixing swap signatures... " );
748         mark_swapfiles(((swp_entry_t) {0}), MARK_SWAP_RESUME);
749         PRINTK( "ok\n" );
750
751 #ifdef SUSPEND_CONSOLE
752         acquire_console_sem();
753         update_screen(fg_console);
754         release_console_sem();
755 #endif
756 }
757
758 /* do_magic() is implemented in arch/?/kernel/suspend_asm.S, and basically does:
759
760         if (!resume) {
761                 do_magic_suspend_1();
762                 save_processor_state();
763                 SAVE_REGISTERS
764                 do_magic_suspend_2();
765                 return;
766         }
767         GO_TO_SWAPPER_PAGE_TABLES
768         do_magic_resume_1();
769         COPY_PAGES_BACK
770         RESTORE_REGISTERS
771         restore_processor_state();
772         do_magic_resume_2();
773
774  */
775
776 asmlinkage void do_magic_suspend_1(void)
777 {
778         mb();
779         barrier();
780         BUG_ON(in_atomic());
781         spin_lock_irq(&suspend_pagedir_lock);
782 }
783
784 asmlinkage void do_magic_suspend_2(void)
785 {
786         int is_problem;
787         read_swapfiles();
788         device_power_down(4);
789         is_problem = suspend_prepare_image();
790         device_power_up();
791         spin_unlock_irq(&suspend_pagedir_lock);
792         if (!is_problem) {
793                 kernel_fpu_end();       /* save_processor_state() does kernel_fpu_begin, and we need to revert it in order to pass in_atomic() checks */
794                 BUG_ON(in_atomic());
795                 suspend_save_image();
796                 suspend_power_down();   /* FIXME: if suspend_power_down is commented out, console is lost after few suspends ?! */
797         }
798
799         printk(KERN_EMERG "%sSuspend failed, trying to recover...\n", name_suspend);
800         MDELAY(1000); /* So user can wait and report us messages if armageddon comes :-) */
801
802         barrier();
803         mb();
804         spin_lock_irq(&suspend_pagedir_lock);   /* Done to disable interrupts */ 
805         mdelay(1000);
806
807         free_pages((unsigned long) pagedir_nosave, pagedir_order);
808         spin_unlock_irq(&suspend_pagedir_lock);
809
810         device_resume();
811         PRINTK( "Fixing swap signatures... " );
812         mark_swapfiles(((swp_entry_t) {0}), MARK_SWAP_RESUME);
813         PRINTK( "ok\n" );
814 }
815
816 /*
817  * This is main interface to the outside world. It needs to be
818  * called from process context.
819  */
820 int software_suspend(void)
821 {
822         int res;
823         if (!software_suspend_enabled)
824                 return -EAGAIN;
825
826         software_suspend_enabled = 0;
827         might_sleep();
828
829         if (arch_prepare_suspend()) {
830                 printk("%sArchitecture failed to prepare\n", name_suspend);
831                 return -EPERM;
832         }               
833         if (pm_prepare_console())
834                 printk( "%sCan't allocate a console... proceeding\n", name_suspend);
835         if (!prepare_suspend_processes()) {
836
837                 /* At this point, all user processes and "dangerous"
838                    kernel threads are stopped. Free some memory, as we
839                    need half of memory free. */
840
841                 free_some_memory();
842                 
843                 /* Save state of all device drivers, and stop them. */             
844                 if ((res = device_suspend(4))==0)
845                         /* If stopping device drivers worked, we proceed basically into
846                          * suspend_save_image.
847                          *
848                          * do_magic(0) returns after system is resumed.
849                          *
850                          * do_magic() copies all "used" memory to "free" memory, then
851                          * unsuspends all device drivers, and writes memory to disk
852                          * using normal kernel mechanism.
853                          */
854                         do_magic(0);
855                 thaw_processes();
856         } else
857                 res = -EBUSY;
858         software_suspend_enabled = 1;
859         MDELAY(1000);
860         pm_restore_console();
861         return res;
862 }
863
864 /* More restore stuff */
865
866 #define does_collide(addr) does_collide_order(pagedir_nosave, addr, 0)
867
868 /*
869  * Returns true if given address/order collides with any orig_address 
870  */
871 static int does_collide_order(suspend_pagedir_t *pagedir, unsigned long addr,
872                 int order)
873 {
874         int i;
875         unsigned long addre = addr + (PAGE_SIZE<<order);
876         
877         for(i=0; i < nr_copy_pages; i++)
878                 if((pagedir+i)->orig_address >= addr &&
879                         (pagedir+i)->orig_address < addre)
880                         return 1;
881
882         return 0;
883 }
884
885 /*
886  * We check here that pagedir & pages it points to won't collide with pages
887  * where we're going to restore from the loaded pages later
888  */
889 static int check_pagedir(void)
890 {
891         int i;
892
893         for(i=0; i < nr_copy_pages; i++) {
894                 unsigned long addr;
895
896                 do {
897                         addr = get_zeroed_page(GFP_ATOMIC);
898                         if(!addr)
899                                 return -ENOMEM;
900                 } while (does_collide(addr));
901
902                 (pagedir_nosave+i)->address = addr;
903         }
904         return 0;
905 }
906
907 static int relocate_pagedir(void)
908 {
909         /*
910          * We have to avoid recursion (not to overflow kernel stack),
911          * and that's why code looks pretty cryptic 
912          */
913         suspend_pagedir_t *old_pagedir = pagedir_nosave;
914         void **eaten_memory = NULL;
915         void **c = eaten_memory, *m, *f;
916         int ret = 0;
917
918         printk("Relocating pagedir ");
919
920         if(!does_collide_order(old_pagedir, (unsigned long)old_pagedir, pagedir_order)) {
921                 printk("not necessary\n");
922                 return 0;
923         }
924
925         while ((m = (void *) __get_free_pages(GFP_ATOMIC, pagedir_order))) {
926                 if (!does_collide_order(old_pagedir, (unsigned long)m, pagedir_order))
927                         break;
928                 eaten_memory = m;
929                 printk( "." ); 
930                 *eaten_memory = c;
931                 c = eaten_memory;
932         }
933
934         if (!m) {
935                 printk("out of memory\n");
936                 ret = -ENOMEM;
937         } else {
938                 pagedir_nosave =
939                         memcpy(m, old_pagedir, PAGE_SIZE << pagedir_order);
940         }
941
942         c = eaten_memory;
943         while (c) {
944                 printk(":");
945                 f = c;
946                 c = *c;
947                 free_pages((unsigned long)f, pagedir_order);
948         }
949         printk("|\n");
950         return ret;
951 }
952
953 /*
954  * Sanity check if this image makes sense with this kernel/swap context
955  * I really don't think that it's foolproof but more than nothing..
956  */
957
958 static int sanity_check_failed(char *reason)
959 {
960         printk(KERN_ERR "%s%s\n", name_resume, reason);
961         return -EPERM;
962 }
963
964 static int sanity_check(struct suspend_header *sh)
965 {
966         if (sh->version_code != LINUX_VERSION_CODE)
967                 return sanity_check_failed("Incorrect kernel version");
968         if (sh->num_physpages != num_physpages)
969                 return sanity_check_failed("Incorrect memory size");
970         if (strncmp(sh->machine, system_utsname.machine, 8))
971                 return sanity_check_failed("Incorrect machine type");
972         if (strncmp(sh->version, system_utsname.version, 20))
973                 return sanity_check_failed("Incorrect version");
974         if (sh->num_cpus != num_online_cpus())
975                 return sanity_check_failed("Incorrect number of cpus");
976         if (sh->page_size != PAGE_SIZE)
977                 return sanity_check_failed("Incorrect PAGE_SIZE");
978         return 0;
979 }
980
981 static int bdev_read_page(struct block_device *bdev, long pos, void *buf)
982 {
983         struct buffer_head *bh;
984         BUG_ON (pos%PAGE_SIZE);
985         bh = __bread(bdev, pos/PAGE_SIZE, PAGE_SIZE);
986         if (!bh || (!bh->b_data)) {
987                 return -1;
988         }
989         memcpy(buf, bh->b_data, PAGE_SIZE);     /* FIXME: may need kmap() */
990         BUG_ON(!buffer_uptodate(bh));
991         brelse(bh);
992         return 0;
993
994
/*
 * Write one page to a block device - NOT IMPLEMENTED.
 *
 * The compiled body only logs a warning and returns 0; nothing is
 * written.  The "#if 0" section is a disabled sketch kept for whoever
 * implements this.
 * NOTE(review): the sketch passes a buffer_head to
 * generic_make_request(); verify against the block-layer API before
 * enabling it.
 *
 * @bdev: target device (unused by the live code)
 * @pos:  byte offset on the device (unused by the live code)
 * @buf:  page to write (unused by the live code)
 *
 * Always returns 0.
 */
static int bdev_write_page(struct block_device *bdev, long pos, void *buf)
{
#if 0
        struct buffer_head *bh;
        BUG_ON (pos%PAGE_SIZE);
        bh = __bread(bdev, pos/PAGE_SIZE, PAGE_SIZE);
        if (!bh || (!bh->b_data)) {
                return -1;
        }
        memcpy(bh->b_data, buf, PAGE_SIZE);     /* FIXME: may need kmap() */
        BUG_ON(!buffer_uptodate(bh));
        generic_make_request(WRITE, bh);
        if (!buffer_uptodate(bh))
                printk(KERN_CRIT "%sWarning %s: Fixing swap signatures unsuccessful...\n", name_resume, resume_file);
        wait_on_buffer(bh);
        brelse(bh);
        return 0;
#endif
        /* Live code path: report that the signature fix-up is missing. */
        printk(KERN_CRIT "%sWarning %s: Fixing swap signatures unimplemented...\n", name_resume, resume_file);
        return 0;
}
1016
/* Defined outside this file; used by read_suspend_image() to turn a device name into a dev_t. */
extern dev_t __init name_to_dev_t(const char *line);

/*
 * Read a suspend image from an opened block device.
 *
 * @bdev:     device holding the image
 * @cur:      one-page scratch buffer used for the header pages
 * @noresume: non-zero when called for "noresume": the swap signature is
 *            to be put back rather than the system resumed
 *
 * Steps: read page 0 and check its signature, read the suspend header
 * and sanity-check it, allocate and fill the page directory
 * (pagedir_nosave), relocate it if needed, allocate non-colliding pages
 * for the data and finally read every data page.
 *
 * Returns 0 on success, -EIO on read errors, -EINVAL when the device
 * holds plain swap space (or no signature was found with noresume set),
 * -EPERM when the header does not match this kernel, -ENOMEM on
 * allocation failure.  Panics if no suspend signature is found and
 * noresume is not set.
 *
 * NOTE(review): with noresume set the code does not return after
 * bdev_write_page(); it falls through and reads the whole image -
 * confirm that this is intended.
 * NOTE(review): pagedir_nosave is not freed on the error returns that
 * follow its allocation.
 */
static int __init __read_suspend_image(struct block_device *bdev, union diskpage *cur, int noresume)
{
        swp_entry_t next;
        int i, nr_pgdir_pages;

/*
 * Fetch the link stored in the current page and turn its swap offset
 * into a byte position on the device (kept in next.val).
 */
#define PREPARENEXT \
        {       next = cur->link.next; \
                next.val = swp_offset(next) * PAGE_SIZE; \
        }

        if (bdev_read_page(bdev, 0, cur)) return -EIO;

        /* An ordinary swap signature means there is no image here. */
        if ((!memcmp("SWAP-SPACE",cur->swh.magic.magic,10)) ||
            (!memcmp("SWAPSPACE2",cur->swh.magic.magic,10))) {
                printk(KERN_ERR "%sThis is normal swap space\n", name_resume );
                return -EINVAL;
        }

        PREPARENEXT; /* We have to read next position before we overwrite it */

        /* "S1"/"S2" mark a suspended image; put the matching swap magic back in the buffer. */
        if (!memcmp("S1",cur->swh.magic.magic,2))
                memcpy(cur->swh.magic.magic,"SWAP-SPACE",10);
        else if (!memcmp("S2",cur->swh.magic.magic,2))
                memcpy(cur->swh.magic.magic,"SWAPSPACE2",10);
        else {
                if (noresume)
                        return -EINVAL;
                panic("%sUnable to find suspended-data signature (%.10s - misspelled?\n", 
                        name_resume, cur->swh.magic.magic);
        }
        if (noresume) {
                /* We don't do a sanity check here: we want to restore the swap
                   whatever version of kernel made the suspend image;
                   We need to write swap, but swap is *not* enabled so
                   we must write the device directly */
                printk("%s: Fixing swap signatures %s...\n", name_resume, resume_file);
                bdev_write_page(bdev, 0, cur);
        }

        printk( "%sSignature found, resuming\n", name_resume );
        MDELAY(1000);

        /* Next page in the chain is the suspend header. */
        if (bdev_read_page(bdev, next.val, cur)) return -EIO;
        if (sanity_check(&cur->sh))     /* Is this same machine? */     
                return -EPERM;
        PREPARENEXT;

        pagedir_save = cur->sh.suspend_pagedir;
        nr_copy_pages = cur->sh.num_pbes;
        nr_pgdir_pages = SUSPEND_PD_PAGES(nr_copy_pages);
        pagedir_order = get_bitmask_order(nr_pgdir_pages);

        pagedir_nosave = (suspend_pagedir_t *)__get_free_pages(GFP_ATOMIC, pagedir_order);
        if (!pagedir_nosave)
                return -ENOMEM;

        PRINTK( "%sReading pagedir, ", name_resume );

        /* We get pages in reverse order of saving! */
        for (i=nr_pgdir_pages-1; i>=0; i--) {
                BUG_ON (!next.val);
                /*
                 * The cast binds tighter than "+", so i advances in units
                 * of sizeof(union diskpage).
                 * NOTE(review): presumably that is exactly one page -
                 * confirm against the union's definition.
                 */
                cur = (union diskpage *)((char *) pagedir_nosave)+i;
                if (bdev_read_page(bdev, next.val, cur)) return -EIO;
                PREPARENEXT;
        }
        /* The chain must end exactly where the page directory does. */
        BUG_ON (next.val);

        if (relocate_pagedir())
                return -ENOMEM;
        if (check_pagedir())
                return -ENOMEM;

        printk( "Reading image data (%d pages): ", nr_copy_pages );
        for(i=0; i < nr_copy_pages; i++) {
                swp_entry_t swap_address = (pagedir_nosave+i)->swap_address;
                /* Progress indicator: one dot per 100 pages. */
                if (!(i%100))
                        printk( "." );
                /* You do not need to check for overlaps...
                   ... check_pagedir already did this work */
                if (bdev_read_page(bdev, swp_offset(swap_address) * PAGE_SIZE, (char *)((pagedir_nosave+i)->address)))
                        return -EIO;
        }
        printk( "|\n" );
        return 0;
}
1104
1105 static int __init read_suspend_image(const char * specialfile, int noresume)
1106 {
1107         union diskpage *cur;
1108         unsigned long scratch_page = 0;
1109         int error;
1110         char b[BDEVNAME_SIZE];
1111
1112         resume_device = name_to_dev_t(specialfile);
1113         scratch_page = get_zeroed_page(GFP_ATOMIC);
1114         cur = (void *) scratch_page;
1115         if (cur) {
1116                 struct block_device *bdev;
1117                 printk("Resuming from device %s\n",
1118                                 __bdevname(resume_device, b));
1119                 bdev = open_by_devnum(resume_device, FMODE_READ);
1120                 if (IS_ERR(bdev)) {
1121                         error = PTR_ERR(bdev);
1122                 } else {
1123                         set_blocksize(bdev, PAGE_SIZE);
1124                         error = __read_suspend_image(bdev, cur, noresume);
1125                         blkdev_put(bdev);
1126                 }
1127         } else error = -ENOMEM;
1128
1129         if (scratch_page)
1130                 free_page(scratch_page);
1131         switch (error) {
1132                 case 0:
1133                         PRINTK("Reading resume file was successful\n");
1134                         break;
1135                 case -EINVAL:
1136                         break;
1137                 case -EIO:
1138                         printk( "%sI/O error\n", name_resume);
1139                         break;
1140                 case -ENOENT:
1141                         printk( "%s%s: No such file or directory\n", name_resume, specialfile);
1142                         break;
1143                 case -ENOMEM:
1144                         printk( "%sNot enough memory\n", name_resume);
1145                         break;
1146                 default:
1147                         printk( "%sError %d resuming\n", name_resume, error );
1148         }
1149         MDELAY(1000);
1150         return error;
1151 }
1152
1153 /**
1154  *      software_resume - Resume from a saved image.
1155  *
1156  *      Called as a late_initcall (so all devices are discovered and 
1157  *      initialized), we call swsusp to see if we have a saved image or not.
1158  *      If so, we quiesce devices, then restore the saved image. We will 
1159  *      return above (in pm_suspend_disk() ) if everything goes well. 
1160  *      Otherwise, we fail gracefully and return to the normally 
1161  *      scheduled program.
1162  *
1163  */
1164 static int __init software_resume(void)
1165 {
1166         if (num_online_cpus() > 1) {
1167                 printk(KERN_WARNING "Software Suspend has malfunctioning SMP support. Disabled :(\n");  
1168                 return -EINVAL;
1169         }
1170         /* We enable the possibility of machine suspend */
1171         software_suspend_enabled = 1;
1172         if (!resume_status)
1173                 return 0;
1174
1175         printk( "%s", name_resume );
1176         if (resume_status == NORESUME) {
1177                 if(resume_file[0])
1178                         read_suspend_image(resume_file, 1);
1179                 printk( "disabled\n" );
1180                 return 0;
1181         }
1182         MDELAY(1000);
1183
1184         if (pm_prepare_console())
1185                 printk("swsusp: Can't allocate a console... proceeding\n");
1186
1187         if (!resume_file[0] && resume_status == RESUME_SPECIFIED) {
1188                 printk( "suspension device unspecified\n" );
1189                 return -EINVAL;
1190         }
1191
1192         printk( "resuming from %s\n", resume_file);
1193         if (read_suspend_image(resume_file, 0))
1194                 goto read_failure;
1195         device_suspend(4);
1196         do_magic(1);
1197         panic("This never returns");
1198
1199 read_failure:
1200         pm_restore_console();
1201         return 0;
1202 }
1203
1204 late_initcall(software_resume);
1205
1206 static int __init resume_setup(char *str)
1207 {
1208         if (resume_status == NORESUME)
1209                 return 1;
1210
1211         strncpy( resume_file, str, 255 );
1212         resume_status = RESUME_SPECIFIED;
1213
1214         return 1;
1215 }
1216
/*
 * Handler for the "noresume" boot parameter: records NORESUME in
 * resume_status.  @str is not used.  Always returns 1.
 */
static int __init noresume_setup(char *str)
{
        resume_status = NORESUME;
        return 1;
}
1222
/* Boot parameter hooks: "noresume" and "resume=". */
__setup("noresume", noresume_setup);
__setup("resume=", resume_setup);

/* Symbols exported from this file. */
EXPORT_SYMBOL(software_suspend);
EXPORT_SYMBOL(software_suspend_enabled);