ftp://ftp.kernel.org/pub/linux/kernel/v2.6/linux-2.6.6.tar.bz2
[linux-2.6.git] / kernel / power / pmdisk.c
1 /*
2  * kernel/power/pmdisk.c - Suspend-to-disk implementation
3  *
4  * This STD implementation is initially derived from swsusp (suspend-to-swap).
5  * The original copyright on that was: 
6  *
7  * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
8  * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz>
9  *
10  * The additional parts are: 
11  * 
12  * Copyright (C) 2003 Patrick Mochel
13  * Copyright (C) 2003 Open Source Development Lab
14  * 
15  * This file is released under the GPLv2. 
16  *
17  * For more information, please see the text files in Documentation/power/
18  *
19  */
20
21 #undef DEBUG
22
23 #include <linux/mm.h>
24 #include <linux/bio.h>
25 #include <linux/suspend.h>
26 #include <linux/version.h>
27 #include <linux/reboot.h>
28 #include <linux/device.h>
29 #include <linux/swapops.h>
30 #include <linux/bootmem.h>
31 #include <linux/utsname.h>
32
33 #include <asm/mmu_context.h>
34
35 #include "power.h"
36
37
38 extern asmlinkage int pmdisk_arch_suspend(int resume);
39
40 #define __ADDRESS(x)  ((unsigned long) phys_to_virt(x))
41 #define ADDRESS(x) __ADDRESS((x) << PAGE_SHIFT)
42 #define ADDRESS2(x) __ADDRESS(__pa(x))          /* Needed for x86-64 where some pages are in memory twice */
43
44 /* References to section boundaries */
45 extern char __nosave_begin, __nosave_end;
46
47 extern int is_head_of_free_region(struct page *);
48
49 /* Variables to be preserved over suspend */
50 static int pagedir_order_check;
51 static int nr_copy_pages_check;
52
53 /* For resume= kernel option */
54 static char resume_file[256] = CONFIG_PM_DISK_PARTITION;
55
56 static dev_t resume_device;
57 /* Local variables that should not be affected by save */
58 unsigned int pmdisk_pages __nosavedata = 0;
59
60 /* Suspend pagedir is allocated before final copy, therefore it
61    must be freed after resume 
62
63    Warning: this is evil. There are actually two pagedirs at time of
64    resume. One is "pagedir_save", which is empty frame allocated at
65    time of suspend, that must be freed. Second is "pagedir_nosave", 
66    allocated at time of resume, that travels through memory not to
67    collide with anything.
68  */
69 suspend_pagedir_t *pm_pagedir_nosave __nosavedata = NULL;
70 static suspend_pagedir_t *pagedir_save;
71 static int pagedir_order __nosavedata = 0;
72
73
/*
 * Image header written to swap by write_header() and read back by
 * check_header(). init_header() zeroes it and fills in the identifying
 * fields; sanity_check() compares them against the running kernel.
 */
74 struct pmdisk_info {
75         struct new_utsname      uts;            /* copy of system_utsname */
76         u32                     version_code;   /* LINUX_VERSION_CODE at suspend */
77         unsigned long           num_physpages;  /* num_physpages at suspend */
78         int                     cpus;           /* num_online_cpus() at suspend */
79         unsigned long           image_pages;    /* pmdisk_pages: entries in the pagedir */
80         unsigned long           pagedir_pages;  /* pages of pagedir; set by write_pagedir() */
81         swp_entry_t             pagedir[768];   /* swap entry of each pagedir page */
82 } __attribute__((aligned(PAGE_SIZE))) pmdisk_info;
83
84
85
86 #define PMDISK_SIG      "pmdisk-swap1"
87
/*
 * In-memory copy of page 0 of the suspend swap device. The field sizes add
 * up to PAGE_SIZE (reserved + swp_entry_t + 10 + 10), so the signature
 * occupies the last 10 bytes of the page.
 */
88 struct pmdisk_header {
89         char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; /* pads the struct to PAGE_SIZE */
90         swp_entry_t pmdisk_info;        /* swap entry of the pmdisk_info page (set by mark_swapfiles) */
91         char    orig_sig[10];           /* signature found on disk before it was replaced */
92         char    sig[10];                /* "SWAP-SPACE"/"SWAPSPACE2", or first 10 bytes of PMDISK_SIG */
93 } __attribute__((packed, aligned(PAGE_SIZE))) pmdisk_header;
94
95 /*
96  * XXX: We try to keep some more pages free so that I/O operations succeed
97  * without paging. Might this be more?
98  */
99 #define PAGES_FOR_IO    512
100
101
102 /*
103  * Saving part...
104  */
105
106
107 /* We memorize in swapfile_used what swap devices are used for suspension */
108 #define SWAPFILE_UNUSED    0
109 #define SWAPFILE_SUSPEND   1    /* This is the suspending device */
110 #define SWAPFILE_IGNORED   2    /* Those are other swap devices ignored for suspension */
111
112 static unsigned short swapfile_used[MAX_SWAPFILES];
113 static unsigned short root_swap;
114
115
116 static int mark_swapfiles(swp_entry_t prev)
117 {
118         int error;
119
120         rw_swap_page_sync(READ, 
121                           swp_entry(root_swap, 0),
122                           virt_to_page((unsigned long)&pmdisk_header));
123         if (!memcmp("SWAP-SPACE",pmdisk_header.sig,10) ||
124             !memcmp("SWAPSPACE2",pmdisk_header.sig,10)) {
125                 memcpy(pmdisk_header.orig_sig,pmdisk_header.sig,10);
126                 memcpy(pmdisk_header.sig,PMDISK_SIG,10);
127                 pmdisk_header.pmdisk_info = prev;
128                 error = rw_swap_page_sync(WRITE, 
129                                           swp_entry(root_swap, 0),
130                                           virt_to_page((unsigned long)
131                                                        &pmdisk_header));
132         } else {
133                 pr_debug("pmdisk: Partition is not swap space.\n");
134                 error = -ENODEV;
135         }
136         return error;
137 }
138
139 static int read_swapfiles(void) /* This is called before saving image */
140 {
141         int i, len;
142         
143         len=strlen(resume_file);
144         root_swap = 0xFFFF;
145         
146         swap_list_lock();
147         for(i=0; i<MAX_SWAPFILES; i++) {
148                 if (swap_info[i].flags == 0) {
149                         swapfile_used[i]=SWAPFILE_UNUSED;
150                 } else {
151                         if(!len) {
152                                 pr_debug("pmdisk: Default resume partition not set.\n");
153                                 if(root_swap == 0xFFFF) {
154                                         swapfile_used[i] = SWAPFILE_SUSPEND;
155                                         root_swap = i;
156                                 } else
157                                         swapfile_used[i] = SWAPFILE_IGNORED;                              
158                         } else {
159                                 /* we ignore all swap devices that are not the resume_file */
160                                 if (1) {
161 // FIXME                                if(resume_device == swap_info[i].swap_device) {
162                                         swapfile_used[i] = SWAPFILE_SUSPEND;
163                                         root_swap = i;
164                                 } else
165                                         swapfile_used[i] = SWAPFILE_IGNORED;
166                         }
167                 }
168         }
169         swap_list_unlock();
170         return (root_swap != 0xffff) ? 0 : -ENODEV;
171 }
172
173
174 /* This is called after saving image so modification
175    will be lost after resume... and that's what we want. */
176 static void lock_swapdevices(void)
177 {
178         int i;
179
180         swap_list_lock();
181         for(i = 0; i< MAX_SWAPFILES; i++)
182                 if(swapfile_used[i] == SWAPFILE_IGNORED) {
183                         swap_info[i].flags ^= 0xFF; /* we make the device unusable. A new call to
184                                                        lock_swapdevices can unlock the devices. */
185                 }
186         swap_list_unlock();
187 }
188
189
190
191 /**
192  *      write_swap_page - Write one page to a fresh swap location.
193  *      @addr:  Address we're writing.
194  *      @loc:   Place to store the entry we used.
195  *
196  *      Allocate a new swap entry and 'sync' it. Note we discard -EIO
197  *      errors. That is an artifact left over from swsusp. It did not 
198  *      check the return of rw_swap_page_sync() at all, since most pages
199  *      written back to swap would return -EIO.
200  *      This is a partial improvement, since we will at least return other
201  *      errors, though we need to eventually fix the damn code.
202  */
203
204 static int write_swap_page(unsigned long addr, swp_entry_t * loc)
205 {
206         swp_entry_t entry;
207         int error = 0;
208
209         entry = get_swap_page();
210         if (swp_offset(entry) && 
211             swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) {
212                 error = rw_swap_page_sync(WRITE, entry,
213                                           virt_to_page(addr));
214                 if (error == -EIO)
215                         error = 0;
216                 if (!error)
217                         *loc = entry;
218         } else
219                 error = -ENOSPC;
220         return error;
221 }
222
223
224 /**
225  *      free_data - Free the swap entries used by the saved image.
226  *
227  *      Walk the list of used swap entries and free each one. 
228  */
229
230 static void free_data(void)
231 {
232         swp_entry_t entry;
233         int i;
234
235         for (i = 0; i < pmdisk_pages; i++) {
236                 entry = (pm_pagedir_nosave + i)->swap_address;
237                 if (entry.val)
238                         swap_free(entry);
239                 else
240                         break;
241                 (pm_pagedir_nosave + i)->swap_address = (swp_entry_t){0};
242         }
243 }
244
245
246 /**
247  *      write_data - Write saved image to swap.
248  *
249  *      Walk the list of pages in the image and sync each one to swap.
250  */
251
252 static int write_data(void)
253 {
254         int error = 0;
255         int i;
256
257         printk( "Writing data to swap (%d pages): ", pmdisk_pages );
258         for (i = 0; i < pmdisk_pages && !error; i++) {
259                 if (!(i%100))
260                         printk( "." );
261                 error = write_swap_page((pm_pagedir_nosave+i)->address,
262                                         &((pm_pagedir_nosave+i)->swap_address));
263         }
264         printk(" %d Pages done.\n",i);
265         return error;
266 }
267
268
269 /**
270  *      free_pagedir - Free pages used by the page directory.
271  */
272
273 static void free_pagedir_entries(void)
274 {
275         int num = pmdisk_info.pagedir_pages;
276         int i;
277
278         for (i = 0; i < num; i++)
279                 swap_free(pmdisk_info.pagedir[i]);
280 }
281
282
283 /**
284  *      write_pagedir - Write the array of pages holding the page directory.
285  *      @last:  Last swap entry we write (needed for header).
286  */
287
288 static int write_pagedir(void)
289 {
290         unsigned long addr = (unsigned long)pm_pagedir_nosave;
291         int error = 0;
292         int n = SUSPEND_PD_PAGES(pmdisk_pages);
293         int i;
294
295         pmdisk_info.pagedir_pages = n;
296         printk( "Writing pagedir (%d pages)\n", n);
297         for (i = 0; i < n && !error; i++, addr += PAGE_SIZE)
298                 error = write_swap_page(addr,&pmdisk_info.pagedir[i]);
299         return error;
300 }
301
302
/**
 *	dump_pmdisk_info - Print the contents of pmdisk_info.
 *
 *	Prints every scalar field of the header when DEBUG is defined;
 *	compiles to an empty function otherwise.
 */
static void dump_pmdisk_info(void)
{
#ifdef DEBUG
	const struct new_utsname *uts = &pmdisk_info.uts;

	printk(" pmdisk: Version: %u\n", pmdisk_info.version_code);
	printk(" pmdisk: Num Pages: %ld\n", pmdisk_info.num_physpages);
	printk(" pmdisk: UTS Sys: %s\n", uts->sysname);
	printk(" pmdisk: UTS Node: %s\n", uts->nodename);
	printk(" pmdisk: UTS Release: %s\n", uts->release);
	printk(" pmdisk: UTS Version: %s\n", uts->version);
	printk(" pmdisk: UTS Machine: %s\n", uts->machine);
	printk(" pmdisk: UTS Domain: %s\n", uts->domainname);
	printk(" pmdisk: CPUs: %d\n", pmdisk_info.cpus);
	printk(" pmdisk: Image: %ld Pages\n", pmdisk_info.image_pages);
	printk(" pmdisk: Pagedir: %ld Pages\n", pmdisk_info.pagedir_pages);
#endif
}
324
325 static void init_header(void)
326 {
327         memset(&pmdisk_info,0,sizeof(pmdisk_info));
328         pmdisk_info.version_code = LINUX_VERSION_CODE;
329         pmdisk_info.num_physpages = num_physpages;
330         memcpy(&pmdisk_info.uts,&system_utsname,sizeof(system_utsname));
331
332         pmdisk_info.cpus = num_online_cpus();
333         pmdisk_info.image_pages = pmdisk_pages;
334 }
335
336 /**
337  *      write_header - Fill and write the suspend header.
338  *      @entry: Location of the last swap entry used.
339  *
340  *      Allocate a page, fill header, write header. 
341  *
342  *      @entry is the location of the last pagedir entry written on 
343  *      entrance. On exit, it contains the location of the header. 
344  */
345
346 static int write_header(swp_entry_t * entry)
347 {
348         dump_pmdisk_info();
349         return write_swap_page((unsigned long)&pmdisk_info,entry);
350 }
351
352
353
354 /**
355  *      write_suspend_image - Write entire image and metadata.
356  *
357  */
358
359 static int write_suspend_image(void)
360 {
361         int error;
362         swp_entry_t prev = { 0 };
363
364         init_header();
365
366         if ((error = write_data()))
367                 goto FreeData;
368
369         if ((error = write_pagedir()))
370                 goto FreePagedir;
371
372         if ((error = write_header(&prev)))
373                 goto FreePagedir;
374
375         error = mark_swapfiles(prev);
376  Done:
377         return error;
378  FreePagedir:
379         free_pagedir_entries();
380  FreeData:
381         free_data();
382         goto Done;
383 }
384
385
386
387 /**
388  *      saveable - Determine whether a page should be cloned or not.
389  *      @pfn:   The page
390  *
391  *      We save a page if it's Reserved, and not in the range of pages
392  *      statically defined as 'unsaveable', or if it isn't reserved, and
393  *      isn't part of a free chunk of pages.
394  *      If it is part of a free chunk, we update @pfn to point to the last 
395  *      page of the chunk.
396  */
397
398 static int saveable(unsigned long * pfn)
399 {
400         struct page * page = pfn_to_page(*pfn);
401
402         if (PageNosave(page))
403                 return 0;
404
405         if (!PageReserved(page)) {
406                 int chunk_size;
407
408                 if ((chunk_size = is_head_of_free_region(page))) {
409                         *pfn += chunk_size - 1;
410                         return 0;
411                 }
412         } else if (PageReserved(page)) {
413                 /* Just copy whole code segment. 
414                  * Hopefully it is not that big.
415                  */
416                 if ((ADDRESS(*pfn) >= (unsigned long) ADDRESS2(&__nosave_begin)) && 
417                     (ADDRESS(*pfn) <  (unsigned long) ADDRESS2(&__nosave_end))) {
418                         pr_debug("[nosave %lx]\n", ADDRESS(*pfn));
419                         return 0;
420                 }
421                 /* Hmm, perhaps copying all reserved pages is not 
422                  * too healthy as they may contain 
423                  * critical bios data? 
424                  */
425         }
426         return 1;
427 }
428
429
430
431 /**
432  *      count_pages - Determine size of page directory.
433  *      
434  *      Iterate over all the pages in the system and tally the number
435  *      we need to clone.
436  */
437
438 static void count_pages(void)
439 {
440         unsigned long pfn;
441         int n = 0;
442         
443         for (pfn = 0; pfn < max_pfn; pfn++) {
444                 if (saveable(&pfn))
445                         n++;
446         }
447         pmdisk_pages = n;
448 }
449
450
451 /**
452  *      copy_pages - Atomically snapshot memory.
453  *
454  *      Iterate over all the pages in the system and copy each one 
455  *      into its corresponding location in the pagedir.
456  *      We rely on the fact that the number of pages that we're snap-
457  *      shotting hasn't changed since we counted them. 
458  */
459
460 static void copy_pages(void)
461 {
462         struct pbe * p = pagedir_save;
463         unsigned long pfn;
464         int n = 0;
465
466         for (pfn = 0; pfn < max_pfn; pfn++) {
467                 if (saveable(&pfn)) {
468                         n++;
469                         p->orig_address = ADDRESS(pfn);
470                         copy_page((void *) p->address, 
471                                   (void *) p->orig_address);
472                         p++;
473                 }
474         }
475         BUG_ON(n != pmdisk_pages);
476 }
477
478
479 /**
480  *      free_image_pages - Free each page allocated for snapshot.
481  */
482
483 static void free_image_pages(void)
484 {
485         struct pbe * p;
486         int i;
487
488         for (i = 0, p = pagedir_save; i < pmdisk_pages; i++, p++) {
489                 ClearPageNosave(virt_to_page(p->address));
490                 free_page(p->address);
491         }
492 }
493
494
495 /**
496  *      free_pagedir - Free the page directory.
497  */
498
499 static void free_pagedir(void)
500 {
501         free_image_pages();
502         free_pages((unsigned long)pagedir_save, pagedir_order);
503 }
504
505
506 static void calc_order(void)
507 {
508         int diff;
509         int order;
510
511         order = get_bitmask_order(SUSPEND_PD_PAGES(pmdisk_pages));
512         pmdisk_pages += 1 << order;
513         do {
514                 diff = get_bitmask_order(SUSPEND_PD_PAGES(pmdisk_pages)) - order;
515                 if (diff) {
516                         order += diff;
517                         pmdisk_pages += 1 << diff;
518                 }
519         } while(diff);
520         pagedir_order = order;
521 }
522
523
524 /**
525  *      alloc_pagedir - Allocate the page directory.
526  *
527  *      First, determine exactly how many contiguous pages we need, 
528  *      allocate them, then mark each 'unsavable'.
529  */
530
531 static int alloc_pagedir(void)
532 {
533         calc_order();
534         pagedir_save = (suspend_pagedir_t *)__get_free_pages(GFP_ATOMIC | __GFP_COLD, 
535                                                              pagedir_order);
536         if(!pagedir_save)
537                 return -ENOMEM;
538         memset(pagedir_save,0,(1 << pagedir_order) * PAGE_SIZE);
539         pm_pagedir_nosave = pagedir_save;
540         return 0;
541 }
542
543
544 /**
545  *      alloc_image_pages - Allocate pages for the snapshot.
546  *
547  */
548
549 static int alloc_image_pages(void)
550 {
551         struct pbe * p;
552         int i;
553
554         for (i = 0, p = pagedir_save; i < pmdisk_pages; i++, p++) {
555                 p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
556                 if(!p->address)
557                         goto Error;
558                 SetPageNosave(virt_to_page(p->address));
559         }
560         return 0;
561  Error:
562         do { 
563                 if (p->address)
564                         free_page(p->address);
565                 p->address = 0;
566         } while (p-- > pagedir_save);
567         return -ENOMEM;
568 }
569
570
571 /**
572  *      enough_free_mem - Make sure we enough free memory to snapshot.
573  *
574  *      Returns TRUE or FALSE after checking the number of available 
575  *      free pages.
576  */
577
578 static int enough_free_mem(void)
579 {
580         if(nr_free_pages() < (pmdisk_pages + PAGES_FOR_IO)) {
581                 pr_debug("pmdisk: Not enough free pages: Have %d\n",
582                          nr_free_pages());
583                 return 0;
584         }
585         return 1;
586 }
587
588
589 /**
590  *      enough_swap - Make sure we have enough swap to save the image.
591  *
592  *      Returns TRUE or FALSE after checking the total amount of swap 
593  *      space avaiable.
594  *
595  *      FIXME: si_swapinfo(&i) returns all swap devices information.
596  *      We should only consider resume_device. 
597  */
598
599 static int enough_swap(void)
600 {
601         struct sysinfo i;
602
603         si_swapinfo(&i);
604         if (i.freeswap < (pmdisk_pages + PAGES_FOR_IO))  {
605                 pr_debug("pmdisk: Not enough swap. Need %ld\n",i.freeswap);
606                 return 0;
607         }
608         return 1;
609 }
610
611
612 /**
613  *      pmdisk_suspend - Atomically snapshot the system.
614  *
615  *      This must be called with interrupts disabled, to prevent the 
616  *      system changing at all from underneath us. 
617  *
618  *      To do this, we count the number of pages in the system that we 
619  *      need to save; make sure we have enough memory and swap to clone
620  *      the pages and save them in swap, allocate the space to hold them,
621  *      and then snapshot them all.
622  */
623
624 int pmdisk_suspend(void)
625 {
626         int error = 0;
627
628         if ((error = read_swapfiles()))
629                 return error;
630
631         drain_local_pages();
632
633         pm_pagedir_nosave = NULL;
634         pr_debug("pmdisk: Counting pages to copy.\n" );
635         count_pages();
636         
637         pr_debug("pmdisk: (pages needed: %d + %d free: %d)\n",
638                  pmdisk_pages,PAGES_FOR_IO,nr_free_pages());
639
640         if (!enough_free_mem())
641                 return -ENOMEM;
642
643         if (!enough_swap())
644                 return -ENOSPC;
645
646         if ((error = alloc_pagedir())) {
647                 pr_debug("pmdisk: Allocating pagedir failed.\n");
648                 return error;
649         }
650         if ((error = alloc_image_pages())) {
651                 pr_debug("pmdisk: Allocating image pages failed.\n");
652                 free_pagedir();
653                 return error;
654         }
655
656         nr_copy_pages_check = pmdisk_pages;
657         pagedir_order_check = pagedir_order;
658
659         /* During allocating of suspend pagedir, new cold pages may appear. 
660          * Kill them 
661          */
662         drain_local_pages();
663
664         /* copy */
665         copy_pages();
666
667         /*
668          * End of critical section. From now on, we can write to memory,
669          * but we should not touch disk. This specially means we must _not_
670          * touch swap space! Except we must write out our image of course.
671          */
672
673         pr_debug("pmdisk: %d pages copied\n", pmdisk_pages );
674         return 0;
675 }
676
677
/**
 *	suspend_save_image - Prepare and write saved image to swap.
 *
 *	Devices are resumed first so the swap device can be written. The
 *	ignored swap devices are made unusable around the write; the second
 *	lock_swapdevices() call toggles them back.
 *
 *	It is important _NOT_ to umount filesystems at this point. We want
 *	them synced (in case something goes wrong) but we DO not want to mark
 *	filesystem clean: it is not. (And it does not matter, if we resume
 *	correctly, we'll mark system clean, anyway.)
 *
 *	Returns the result of write_suspend_image().
 */
static int suspend_save_image(void)
{
	int result;

	device_resume();

	lock_swapdevices();
	result = write_suspend_image();
	lock_swapdevices();

	return result;
}
701
702 /*
703  * Magic happens here
704  */
705
706 int pmdisk_resume(void)
707 {
708         BUG_ON (nr_copy_pages_check != pmdisk_pages);
709         BUG_ON (pagedir_order_check != pagedir_order);
710         
711         /* Even mappings of "global" things (vmalloc) need to be fixed */
712         __flush_tlb_global();
713         return 0;
714 }
715
716 /* pmdisk_arch_suspend() is implemented in arch/?/power/pmdisk.S,
717    and basically does:
718
719         if (!resume) {
720                 save_processor_state();
721                 SAVE_REGISTERS
722                 return pmdisk_suspend();
723         }
724         GO_TO_SWAPPER_PAGE_TABLES
725         COPY_PAGES_BACK
726         RESTORE_REGISTERS
727         restore_processor_state();
728         return pmdisk_resume();
729
730  */
731
732
733 /* More restore stuff */
734
735 /* FIXME: Why not memcpy(to, from, 1<<pagedir_order*PAGE_SIZE)? */
736 static void __init copy_pagedir(suspend_pagedir_t *to, suspend_pagedir_t *from)
737 {
738         int i;
739         char *topointer=(char *)to, *frompointer=(char *)from;
740
741         for(i=0; i < 1 << pagedir_order; i++) {
742                 copy_page(topointer, frompointer);
743                 topointer += PAGE_SIZE;
744                 frompointer += PAGE_SIZE;
745         }
746 }
747
748 #define does_collide(addr) does_collide_order(pm_pagedir_nosave, addr, 0)
749
750 /*
751  * Returns true if given address/order collides with any orig_address 
752  */
753 static int __init does_collide_order(suspend_pagedir_t *pagedir, 
754                                      unsigned long addr, int order)
755 {
756         int i;
757         unsigned long addre = addr + (PAGE_SIZE<<order);
758         
759         for(i=0; i < pmdisk_pages; i++)
760                 if((pagedir+i)->orig_address >= addr &&
761                         (pagedir+i)->orig_address < addre)
762                         return 1;
763
764         return 0;
765 }
766
767 /*
768  * We check here that pagedir & pages it points to won't collide with pages
769  * where we're going to restore from the loaded pages later
770  */
771 static int __init check_pagedir(void)
772 {
773         int i;
774
775         for(i=0; i < pmdisk_pages; i++) {
776                 unsigned long addr;
777
778                 do {
779                         addr = get_zeroed_page(GFP_ATOMIC);
780                         if(!addr)
781                                 return -ENOMEM;
782                 } while (does_collide(addr));
783
784                 (pm_pagedir_nosave+i)->address = addr;
785         }
786         return 0;
787 }
788
789 static int __init relocate_pagedir(void)
790 {
791         /*
792          * We have to avoid recursion (not to overflow kernel stack),
793          * and that's why code looks pretty cryptic 
794          */
795         suspend_pagedir_t *new_pagedir, *old_pagedir = pm_pagedir_nosave;
796         void **eaten_memory = NULL;
797         void **c = eaten_memory, *m, *f;
798
799         pr_debug("pmdisk: Relocating pagedir\n");
800
801         if(!does_collide_order(old_pagedir, (unsigned long)old_pagedir, pagedir_order)) {
802                 pr_debug("pmdisk: Relocation not necessary\n");
803                 return 0;
804         }
805
806         while ((m = (void *) __get_free_pages(GFP_ATOMIC, pagedir_order))) {
807                 memset(m, 0, PAGE_SIZE);
808                 if (!does_collide_order(old_pagedir, (unsigned long)m, pagedir_order))
809                         break;
810                 eaten_memory = m;
811                 printk( "." ); 
812                 *eaten_memory = c;
813                 c = eaten_memory;
814         }
815
816         if (!m)
817                 return -ENOMEM;
818
819         pm_pagedir_nosave = new_pagedir = m;
820         copy_pagedir(new_pagedir, old_pagedir);
821
822         c = eaten_memory;
823         while(c) {
824                 printk(":");
825                 f = *c;
826                 c = *c;
827                 if (f)
828                         free_pages((unsigned long)f, pagedir_order);
829         }
830         printk("|\n");
831         return 0;
832 }
833
834
835 static struct block_device * resume_bdev;
836
837
838 /**
839  *      Using bio to read from swap.
840  *      This code requires a bit more work than just using buffer heads
841  *      but, it is the recommended way for 2.5/2.6.
842  *      The following are to signal the beginning and end of I/O. Bios
843  *      finish asynchronously, while we want them to happen synchronously.
844  *      A simple atomic_t, and a wait loop take care of this problem.
845  */
846
847 static atomic_t io_done = ATOMIC_INIT(0);
848
849 static void start_io(void)
850 {
851         atomic_set(&io_done,1);
852 }
853
854 static int end_io(struct bio * bio, unsigned int num, int err)
855 {
856         atomic_set(&io_done,0);
857         return 0;
858 }
859
860 static void wait_io(void)
861 {
862         while(atomic_read(&io_done))
863                 io_schedule();
864 }
865
866
867 /**
868  *      submit - submit BIO request.
869  *      @rw:    READ or WRITE.
870  *      @off    physical offset of page.
871  *      @page:  page we're reading or writing.
872  *
873  *      Straight from the textbook - allocate and initialize the bio.
874  *      If we're writing, make sure the page is marked as dirty.
875  *      Then submit it and wait.
876  */
877
878 static int submit(int rw, pgoff_t page_off, void * page)
879 {
880         int error = 0;
881         struct bio * bio;
882
883         bio = bio_alloc(GFP_ATOMIC,1);
884         if (!bio)
885                 return -ENOMEM;
886         bio->bi_sector = page_off * (PAGE_SIZE >> 9);
887         bio_get(bio);
888         bio->bi_bdev = resume_bdev;
889         bio->bi_end_io = end_io;
890
891         if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) {
892                 printk("pmdisk: ERROR: adding page to bio at %ld\n",page_off);
893                 error = -EFAULT;
894                 goto Done;
895         }
896
897         if (rw == WRITE)
898                 bio_set_pages_dirty(bio);
899         start_io();
900         submit_bio(rw | (1 << BIO_RW_SYNC), bio);
901         wait_io();
902  Done:
903         bio_put(bio);
904         return error;
905 }
906
907 static int
908 read_page(pgoff_t page_off, void * page)
909 {
910         return submit(READ,page_off,page);
911 }
912
913 static int
914 write_page(pgoff_t page_off, void * page)
915 {
916         return submit(WRITE,page_off,page);
917 }
918
919
920 extern dev_t __init name_to_dev_t(const char *line);
921
922
923 static int __init check_sig(void)
924 {
925         int error;
926
927         memset(&pmdisk_header,0,sizeof(pmdisk_header));
928         if ((error = read_page(0,&pmdisk_header)))
929                 return error;
930         if (!memcmp(PMDISK_SIG,pmdisk_header.sig,10)) {
931                 memcpy(pmdisk_header.sig,pmdisk_header.orig_sig,10);
932
933                 /*
934                  * Reset swap signature now.
935                  */
936                 error = write_page(0,&pmdisk_header);
937         } else { 
938                 pr_debug(KERN_ERR "pmdisk: Invalid partition type.\n");
939                 return -EINVAL;
940         }
941         if (!error)
942                 pr_debug("pmdisk: Signature found, resuming\n");
943         return error;
944 }
945
946
947 /*
948  * Sanity check if this image makes sense with this kernel/swap context
949  * I really don't think that it's foolproof but more than nothing..
950  */
951
952 static const char * __init sanity_check(void)
953 {
954         dump_pmdisk_info();
955         if(pmdisk_info.version_code != LINUX_VERSION_CODE)
956                 return "kernel version";
957         if(pmdisk_info.num_physpages != num_physpages)
958                 return "memory size";
959         if (strcmp(pmdisk_info.uts.sysname,system_utsname.sysname))
960                 return "system type";
961         if (strcmp(pmdisk_info.uts.release,system_utsname.release))
962                 return "kernel release";
963         if (strcmp(pmdisk_info.uts.version,system_utsname.version))
964                 return "version";
965         if (strcmp(pmdisk_info.uts.machine,system_utsname.machine))
966                 return "machine";
967         if(pmdisk_info.cpus != num_online_cpus())
968                 return "number of cpus";
969         return 0;
970 }
971
972
973 static int __init check_header(void)
974 {
975         const char * reason = NULL;
976         int error;
977
978         init_header();
979
980         if ((error = read_page(swp_offset(pmdisk_header.pmdisk_info), 
981                                &pmdisk_info)))
982                 return error;
983
984         /* Is this same machine? */
985         if ((reason = sanity_check())) {
986                 printk(KERN_ERR "pmdisk: Resume mismatch: %s\n",reason);
987                 return -EPERM;
988         }
989         pmdisk_pages = pmdisk_info.image_pages;
990         return error;
991 }
992
993
994 static int __init read_pagedir(void)
995 {
996         unsigned long addr;
997         int i, n = pmdisk_info.pagedir_pages;
998         int error = 0;
999
1000         pagedir_order = get_bitmask_order(n);
1001
1002         addr =__get_free_pages(GFP_ATOMIC, pagedir_order);
1003         if (!addr)
1004                 return -ENOMEM;
1005         pm_pagedir_nosave = (struct pbe *)addr;
1006
1007         pr_debug("pmdisk: Reading pagedir (%d Pages)\n",n);
1008
1009         for (i = 0; i < n && !error; i++, addr += PAGE_SIZE) {
1010                 unsigned long offset = swp_offset(pmdisk_info.pagedir[i]);
1011                 if (offset)
1012                         error = read_page(offset, (void *)addr);
1013                 else
1014                         error = -EFAULT;
1015         }
1016         if (error)
1017                 free_pages((unsigned long)pm_pagedir_nosave,pagedir_order);
1018         return error;
1019 }
1020
1021
1022 /**
1023  *      read_image_data - Read image pages from swap.
1024  *
1025  *      You do not need to check for overlaps, check_pagedir()
1026  *      already did that.
1027  */
1028
1029 static int __init read_image_data(void)
1030 {
1031         struct pbe * p;
1032         int error = 0;
1033         int i;
1034
1035         printk( "Reading image data (%d pages): ", pmdisk_pages );
1036         for(i = 0, p = pm_pagedir_nosave; i < pmdisk_pages && !error; i++, p++) {
1037                 if (!(i%100))
1038                         printk( "." );
1039                 error = read_page(swp_offset(p->swap_address),
1040                                   (void *)p->address);
1041         }
1042         printk(" %d done.\n",i);
1043         return error;
1044 }
1045
1046
1047 static int __init read_suspend_image(void)
1048 {
1049         int error = 0;
1050
1051         if ((error = check_sig()))
1052                 return error;
1053         if ((error = check_header()))
1054                 return error;
1055         if ((error = read_pagedir()))
1056                 return error;
1057         if ((error = relocate_pagedir()))
1058                 goto FreePagedir;
1059         if ((error = check_pagedir()))
1060                 goto FreePagedir;
1061         if ((error = read_image_data()))
1062                 goto FreePagedir;
1063  Done:
1064         return error;
1065  FreePagedir:
1066         free_pages((unsigned long)pm_pagedir_nosave,pagedir_order);
1067         goto Done;
1068 }
1069
/**
 *      pmdisk_save - Snapshot memory
 *
 *      When CONFIG_HIGHMEM or CONFIG_DISCONTIGMEM is defined the function
 *      only logs a debug message and returns -EPERM.
 *
 *      Otherwise it calls arch_prepare_suspend() and, if that succeeds,
 *      calls pmdisk_arch_suspend(0) with local interrupts disabled and the
 *      processor state saved; the state is restored and interrupts are
 *      re-enabled before returning.
 *
 *      Returns the arch_prepare_suspend() error, or the result of
 *      pmdisk_arch_suspend(0).
 */

int pmdisk_save(void) 
{
#if defined(CONFIG_HIGHMEM) || defined(CONFIG_DISCONTIGMEM)
        pr_debug("pmdisk: not supported with high- or discontig-mem.\n");
        return -EPERM;
#else
        int error = arch_prepare_suspend();

        if (error)
                return error;

        local_irq_disable();
        save_processor_state();
        error = pmdisk_arch_suspend(0);
        restore_processor_state();
        local_irq_enable();

        return error;
#endif
}
1091
1092
/**
 *      pmdisk_write - Write saved memory image to swap.
 *
 *      Thin wrapper: all the work is done by suspend_save_image(), whose
 *      return value is passed straight back to the caller.
 *
 *      Background carried over from the original note:
 *      pmdisk_arch_suspend(0) returns after the system is resumed.
 *      pmdisk_arch_suspend() copies all "used" memory to "free" memory,
 *      then device drivers are unsuspended and memory is written to disk
 *      using the normal kernel mechanism.
 */

int pmdisk_write(void)
{
        int error = suspend_save_image();

        return error;
}
1107
1108
1109 /**
1110  *      pmdisk_read - Read saved image from swap.
1111  */
1112
1113 int __init pmdisk_read(void)
1114 {
1115         int error;
1116
1117         if (!strlen(resume_file))
1118                 return -ENOENT;
1119
1120         resume_device = name_to_dev_t(resume_file);
1121         pr_debug("pmdisk: Resume From Partition: %s\n", resume_file);
1122
1123         resume_bdev = open_by_devnum(resume_device, FMODE_READ);
1124         if (!IS_ERR(resume_bdev)) {
1125                 set_blocksize(resume_bdev, PAGE_SIZE);
1126                 error = read_suspend_image();
1127                 blkdev_put(resume_bdev);
1128         } else
1129                 error = PTR_ERR(resume_bdev);
1130
1131         if (!error)
1132                 pr_debug("Reading resume file was successful\n");
1133         else
1134                 pr_debug("pmdisk: Error %d resuming\n", error);
1135         return error;
1136 }
1137
1138
1139 /**
1140  *      pmdisk_restore - Replace running kernel with saved image.
1141  */
1142
1143 int __init pmdisk_restore(void)
1144 {
1145         int error;
1146         local_irq_disable();
1147         save_processor_state();
1148         error = pmdisk_arch_suspend(1);
1149         restore_processor_state();
1150         local_irq_enable();
1151         return error;
1152 }
1153
1154
/**
 *      pmdisk_free - Free memory allocated to hold snapshot.
 *
 *      Logs a debug message and hands off to free_pagedir().
 *      Always returns 0.
 */

int pmdisk_free(void)
{
        pr_debug( "Freeing prev allocated pagedir\n" );

        free_pagedir();

        return 0;
}
1165
1166 static int __init pmdisk_setup(char *str)
1167 {
1168         if (strlen(str)) {
1169                 if (!strcmp(str,"off"))
1170                         resume_file[0] = '\0';
1171                 else
1172                         strncpy(resume_file, str, 255);
1173         } else
1174                 resume_file[0] = '\0';
1175         return 1;
1176 }
1177
1178 __setup("pmdisk=", pmdisk_setup);
1179