VServer 1.9.2 (patch-2.6.8.1-vs1.9.2.diff)
[linux-2.6.git] / kernel / power / pmdisk.c
1 /*
2  * kernel/power/pmdisk.c - Suspend-to-disk implementation
3  *
4  * This STD implementation is initially derived from swsusp (suspend-to-swap).
5  * The original copyright on that was: 
6  *
7  * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
8  * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz>
9  *
10  * The additional parts are: 
11  * 
12  * Copyright (C) 2003 Patrick Mochel
13  * Copyright (C) 2003 Open Source Development Lab
14  * 
15  * This file is released under the GPLv2. 
16  *
17  * For more information, please see the text files in Documentation/power/
18  *
19  */
20
21 #undef DEBUG
22
23 #include <linux/mm.h>
24 #include <linux/bio.h>
25 #include <linux/suspend.h>
26 #include <linux/version.h>
27 #include <linux/reboot.h>
28 #include <linux/device.h>
29 #include <linux/swapops.h>
30 #include <linux/bootmem.h>
31 #include <linux/utsname.h>
32
33 #include <asm/mmu_context.h>
34
35 #include "power.h"
36
37
38 extern asmlinkage int pmdisk_arch_suspend(int resume);
39
40 #define __ADDRESS(x)  ((unsigned long) phys_to_virt(x))
41 #define ADDRESS(x) __ADDRESS((x) << PAGE_SHIFT)
42 #define ADDRESS2(x) __ADDRESS(__pa(x))          /* Needed for x86-64 where some pages are in memory twice */
43
44 /* References to section boundaries */
45 extern char __nosave_begin, __nosave_end;
46
47 extern int is_head_of_free_region(struct page *);
48
49 /* Variables to be preserved over suspend */
50 static int pagedir_order_check;
51 static int nr_copy_pages_check;
52
53 /* For resume= kernel option */
54 static char resume_file[256] = CONFIG_PM_DISK_PARTITION;
55
56 static dev_t resume_device;
57 /* Local variables that should not be affected by save */
58 unsigned int pmdisk_pages __nosavedata = 0;
59
60 /* Suspend pagedir is allocated before final copy, therefore it
61    must be freed after resume 
62
63    Warning: this is evil. There are actually two pagedirs at time of
64    resume. One is "pagedir_save", which is empty frame allocated at
65    time of suspend, that must be freed. Second is "pagedir_nosave", 
66    allocated at time of resume, that travels through memory not to
67    collide with anything.
68  */
69 suspend_pagedir_t *pm_pagedir_nosave __nosavedata = NULL;
70 static suspend_pagedir_t *pagedir_save;
71 static int pagedir_order __nosavedata = 0;
72
73
74 struct pmdisk_info {
75         struct new_utsname      uts;
76         u32                     version_code;
77         unsigned long           num_physpages;
78         int                     cpus;
79         unsigned long           image_pages;
80         unsigned long           pagedir_pages;
81         swp_entry_t             pagedir[768];
82 } __attribute__((aligned(PAGE_SIZE))) pmdisk_info;
83
84
85
86 #define PMDISK_SIG      "pmdisk-swap1"
87
88 struct pmdisk_header {
89         char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
90         swp_entry_t pmdisk_info;
91         char    orig_sig[10];
92         char    sig[10];
93 } __attribute__((packed, aligned(PAGE_SIZE))) pmdisk_header;
94
95 /*
96  * XXX: We try to keep some extra pages free so that I/O operations can
97  * succeed without paging. Should this be larger?
98  */
99 #define PAGES_FOR_IO    512
100
101
102 /*
103  * Saving part...
104  */
105
106
107 /* We memorize in swapfile_used what swap devices are used for suspension */
108 #define SWAPFILE_UNUSED    0
109 #define SWAPFILE_SUSPEND   1    /* This is the suspending device */
110 #define SWAPFILE_IGNORED   2    /* Those are other swap devices ignored for suspension */
111
112 static unsigned short swapfile_used[MAX_SWAPFILES];
113 static unsigned short root_swap;
114
115
116 static int mark_swapfiles(swp_entry_t prev)
117 {
118         int error;
119
120         rw_swap_page_sync(READ, 
121                           swp_entry(root_swap, 0),
122                           virt_to_page((unsigned long)&pmdisk_header));
123         if (!memcmp("SWAP-SPACE",pmdisk_header.sig,10) ||
124             !memcmp("SWAPSPACE2",pmdisk_header.sig,10)) {
125                 memcpy(pmdisk_header.orig_sig,pmdisk_header.sig,10);
126                 memcpy(pmdisk_header.sig,PMDISK_SIG,10);
127                 pmdisk_header.pmdisk_info = prev;
128                 error = rw_swap_page_sync(WRITE, 
129                                           swp_entry(root_swap, 0),
130                                           virt_to_page((unsigned long)
131                                                        &pmdisk_header));
132         } else {
133                 pr_debug("pmdisk: Partition is not swap space.\n");
134                 error = -ENODEV;
135         }
136         return error;
137 }
138
139 static int read_swapfiles(void) /* This is called before saving image */
140 {
141         int i, len;
142         
143         len=strlen(resume_file);
144         root_swap = 0xFFFF;
145         
146         swap_list_lock();
147         for(i=0; i<MAX_SWAPFILES; i++) {
148                 if (swap_info[i].flags == 0) {
149                         swapfile_used[i]=SWAPFILE_UNUSED;
150                 } else {
151                         if(!len) {
152                                 pr_debug("pmdisk: Default resume partition not set.\n");
153                                 if(root_swap == 0xFFFF) {
154                                         swapfile_used[i] = SWAPFILE_SUSPEND;
155                                         root_swap = i;
156                                 } else
157                                         swapfile_used[i] = SWAPFILE_IGNORED;                              
158                         } else {
159                                 /* we ignore all swap devices that are not the resume_file */
160                                 if (1) {
161 // FIXME                                if(resume_device == swap_info[i].swap_device) {
162                                         swapfile_used[i] = SWAPFILE_SUSPEND;
163                                         root_swap = i;
164                                 } else
165                                         swapfile_used[i] = SWAPFILE_IGNORED;
166                         }
167                 }
168         }
169         swap_list_unlock();
170         return (root_swap != 0xffff) ? 0 : -ENODEV;
171 }
172
173
174 /* This is called after saving image so modification
175    will be lost after resume... and that's what we want. */
176 static void lock_swapdevices(void)
177 {
178         int i;
179
180         swap_list_lock();
181         for(i = 0; i< MAX_SWAPFILES; i++)
182                 if(swapfile_used[i] == SWAPFILE_IGNORED) {
183                         swap_info[i].flags ^= 0xFF; /* we make the device unusable. A new call to
184                                                        lock_swapdevices can unlock the devices. */
185                 }
186         swap_list_unlock();
187 }
188
189
190
191 /**
192  *      write_swap_page - Write one page to a fresh swap location.
193  *      @addr:  Address we're writing.
194  *      @loc:   Place to store the entry we used.
195  *
196  *      Allocate a new swap entry and 'sync' it. Note we discard -EIO
197  *      errors. That is an artifact left over from swsusp. It did not 
198  *      check the return of rw_swap_page_sync() at all, since most pages
199  *      written back to swap would return -EIO.
200  *      This is a partial improvement, since we will at least return other
201  *      errors, though we need to eventually fix the damn code.
202  */
203
204 static int write_swap_page(unsigned long addr, swp_entry_t * loc)
205 {
206         swp_entry_t entry;
207         int error = 0;
208
209         entry = get_swap_page();
210         if (swp_offset(entry) && 
211             swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) {
212                 error = rw_swap_page_sync(WRITE, entry,
213                                           virt_to_page(addr));
214                 if (error == -EIO)
215                         error = 0;
216                 if (!error)
217                         *loc = entry;
218         } else
219                 error = -ENOSPC;
220         return error;
221 }
222
223
224 /**
225  *      free_data - Free the swap entries used by the saved image.
226  *
227  *      Walk the list of used swap entries and free each one. 
228  */
229
230 static void free_data(void)
231 {
232         swp_entry_t entry;
233         int i;
234
235         for (i = 0; i < pmdisk_pages; i++) {
236                 entry = (pm_pagedir_nosave + i)->swap_address;
237                 if (entry.val)
238                         swap_free(entry);
239                 else
240                         break;
241                 (pm_pagedir_nosave + i)->swap_address = (swp_entry_t){0};
242         }
243 }
244
245
246 /**
247  *      write_data - Write saved image to swap.
248  *
249  *      Walk the list of pages in the image and sync each one to swap.
250  */
251
252 static int write_data(void)
253 {
254         int error = 0;
255         int i;
256
257         printk( "Writing data to swap (%d pages): ", pmdisk_pages );
258         for (i = 0; i < pmdisk_pages && !error; i++) {
259                 if (!(i%100))
260                         printk( "." );
261                 error = write_swap_page((pm_pagedir_nosave+i)->address,
262                                         &((pm_pagedir_nosave+i)->swap_address));
263         }
264         printk(" %d Pages done.\n",i);
265         return error;
266 }
267
268
269 /**
270  *      free_pagedir - Free pages used by the page directory.
271  */
272
273 static void free_pagedir_entries(void)
274 {
275         int num = pmdisk_info.pagedir_pages;
276         int i;
277
278         for (i = 0; i < num; i++)
279                 swap_free(pmdisk_info.pagedir[i]);
280 }
281
282
283 /**
284  *      write_pagedir - Write the array of pages holding the page directory.
285  *      @last:  Last swap entry we write (needed for header).
286  */
287
288 static int write_pagedir(void)
289 {
290         unsigned long addr = (unsigned long)pm_pagedir_nosave;
291         int error = 0;
292         int n = SUSPEND_PD_PAGES(pmdisk_pages);
293         int i;
294
295         pmdisk_info.pagedir_pages = n;
296         printk( "Writing pagedir (%d pages)\n", n);
297         for (i = 0; i < n && !error; i++, addr += PAGE_SIZE)
298                 error = write_swap_page(addr,&pmdisk_info.pagedir[i]);
299         return error;
300 }
301
302
#ifdef DEBUG
/*
 * dump_pmdisk_info - print the contents of pmdisk_info to the kernel log.
 * The unsigned long fields are printed with %lu to match their type.
 */
static void dump_pmdisk_info(void)
{
	printk(" pmdisk: Version: %u\n",pmdisk_info.version_code);
	printk(" pmdisk: Num Pages: %lu\n",pmdisk_info.num_physpages);
	printk(" pmdisk: UTS Sys: %s\n",pmdisk_info.uts.sysname);
	printk(" pmdisk: UTS Node: %s\n",pmdisk_info.uts.nodename);
	printk(" pmdisk: UTS Release: %s\n",pmdisk_info.uts.release);
	printk(" pmdisk: UTS Version: %s\n",pmdisk_info.uts.version);
	printk(" pmdisk: UTS Machine: %s\n",pmdisk_info.uts.machine);
	printk(" pmdisk: UTS Domain: %s\n",pmdisk_info.uts.domainname);
	printk(" pmdisk: CPUs: %d\n",pmdisk_info.cpus);
	printk(" pmdisk: Image: %lu Pages\n",pmdisk_info.image_pages);
	printk(" pmdisk: Pagedir: %lu Pages\n",pmdisk_info.pagedir_pages);
}
#else
/* Without DEBUG the dump is a no-op */
static void dump_pmdisk_info(void)
{

}
#endif
324
325 static void init_header(void)
326 {
327         memset(&pmdisk_info,0,sizeof(pmdisk_info));
328         pmdisk_info.version_code = LINUX_VERSION_CODE;
329         pmdisk_info.num_physpages = num_physpages;
330         memcpy(&pmdisk_info.uts,&system_utsname,sizeof(system_utsname));
331
332         pmdisk_info.cpus = num_online_cpus();
333         pmdisk_info.image_pages = pmdisk_pages;
334 }
335
336 /**
337  *      write_header - Fill and write the suspend header.
338  *      @entry: Location of the last swap entry used.
339  *
340  *      Allocate a page, fill header, write header. 
341  *
342  *      @entry is the location of the last pagedir entry written on 
343  *      entrance. On exit, it contains the location of the header. 
344  */
345
346 static int write_header(swp_entry_t * entry)
347 {
348         dump_pmdisk_info();
349         return write_swap_page((unsigned long)&pmdisk_info,entry);
350 }
351
352
353
354 /**
355  *      write_suspend_image - Write entire image and metadata.
356  *
357  */
358
359 static int write_suspend_image(void)
360 {
361         int error;
362         swp_entry_t prev = { 0 };
363
364         init_header();
365
366         if ((error = write_data()))
367                 goto FreeData;
368
369         if ((error = write_pagedir()))
370                 goto FreePagedir;
371
372         if ((error = write_header(&prev)))
373                 goto FreePagedir;
374
375         error = mark_swapfiles(prev);
376  Done:
377         return error;
378  FreePagedir:
379         free_pagedir_entries();
380  FreeData:
381         free_data();
382         goto Done;
383 }
384
385
386
387 /**
388  *      saveable - Determine whether a page should be cloned or not.
389  *      @pfn:   The page
390  *
391  *      We save a page if it's Reserved, and not in the range of pages
392  *      statically defined as 'unsaveable', or if it isn't reserved, and
393  *      isn't part of a free chunk of pages.
394  *      If it is part of a free chunk, we update @pfn to point to the last 
395  *      page of the chunk.
396  */
397
398 static int saveable(unsigned long * pfn)
399 {
400         struct page * page = pfn_to_page(*pfn);
401
402         if (PageNosave(page))
403                 return 0;
404
405         if (!PageReserved(page)) {
406                 int chunk_size;
407
408                 if ((chunk_size = is_head_of_free_region(page))) {
409                         *pfn += chunk_size - 1;
410                         return 0;
411                 }
412         } else if (PageReserved(page)) {
413                 /* Just copy whole code segment. 
414                  * Hopefully it is not that big.
415                  */
416                 if ((ADDRESS(*pfn) >= (unsigned long) ADDRESS2(&__nosave_begin)) && 
417                     (ADDRESS(*pfn) <  (unsigned long) ADDRESS2(&__nosave_end))) {
418                         pr_debug("[nosave %lx]\n", ADDRESS(*pfn));
419                         return 0;
420                 }
421                 /* Hmm, perhaps copying all reserved pages is not 
422                  * too healthy as they may contain 
423                  * critical bios data? 
424                  */
425         }
426         return 1;
427 }
428
429
430
431 /**
432  *      count_pages - Determine size of page directory.
433  *      
434  *      Iterate over all the pages in the system and tally the number
435  *      we need to clone.
436  */
437
438 static void count_pages(void)
439 {
440         unsigned long pfn;
441         int n = 0;
442         
443         for (pfn = 0; pfn < max_pfn; pfn++) {
444                 if (saveable(&pfn))
445                         n++;
446         }
447         pmdisk_pages = n;
448 }
449
450
451 /**
452  *      copy_pages - Atomically snapshot memory.
453  *
454  *      Iterate over all the pages in the system and copy each one 
455  *      into its corresponding location in the pagedir.
456  *      We rely on the fact that the number of pages that we're snap-
457  *      shotting hasn't changed since we counted them. 
458  */
459
460 static void copy_pages(void)
461 {
462         struct pbe * p = pagedir_save;
463         unsigned long pfn;
464         int n = 0;
465
466         for (pfn = 0; pfn < max_pfn; pfn++) {
467                 if (saveable(&pfn)) {
468                         n++;
469                         p->orig_address = ADDRESS(pfn);
470                         copy_page((void *) p->address, 
471                                   (void *) p->orig_address);
472                         p++;
473                 }
474         }
475         BUG_ON(n != pmdisk_pages);
476 }
477
478
479 /**
480  *      free_image_pages - Free each page allocated for snapshot.
481  */
482
483 static void free_image_pages(void)
484 {
485         struct pbe * p;
486         int i;
487
488         for (i = 0, p = pagedir_save; i < pmdisk_pages; i++, p++) {
489                 ClearPageNosave(virt_to_page(p->address));
490                 free_page(p->address);
491         }
492 }
493
494
495 /**
496  *      free_pagedir - Free the page directory.
497  */
498
499 static void free_pagedir(void)
500 {
501         free_image_pages();
502         free_pages((unsigned long)pagedir_save, pagedir_order);
503 }
504
505
506 static void calc_order(void)
507 {
508         int diff;
509         int order;
510
511         order = get_bitmask_order(SUSPEND_PD_PAGES(pmdisk_pages));
512         pmdisk_pages += 1 << order;
513         do {
514                 diff = get_bitmask_order(SUSPEND_PD_PAGES(pmdisk_pages)) - order;
515                 if (diff) {
516                         order += diff;
517                         pmdisk_pages += 1 << diff;
518                 }
519         } while(diff);
520         pagedir_order = order;
521 }
522
523
524 /**
525  *      alloc_pagedir - Allocate the page directory.
526  *
527  *      First, determine exactly how many contiguous pages we need, 
528  *      allocate them, then mark each 'unsavable'.
529  */
530
531 static int alloc_pagedir(void)
532 {
533         calc_order();
534         pagedir_save = (suspend_pagedir_t *)__get_free_pages(GFP_ATOMIC | __GFP_COLD, 
535                                                              pagedir_order);
536         if(!pagedir_save)
537                 return -ENOMEM;
538         memset(pagedir_save,0,(1 << pagedir_order) * PAGE_SIZE);
539         pm_pagedir_nosave = pagedir_save;
540         return 0;
541 }
542
543
544 /**
545  *      alloc_image_pages - Allocate pages for the snapshot.
546  *
547  */
548
549 static int alloc_image_pages(void)
550 {
551         struct pbe * p;
552         int i;
553
554         for (i = 0, p = pagedir_save; i < pmdisk_pages; i++, p++) {
555                 p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
556                 if(!p->address)
557                         goto Error;
558                 SetPageNosave(virt_to_page(p->address));
559         }
560         return 0;
561  Error:
562         do { 
563                 if (p->address)
564                         free_page(p->address);
565                 p->address = 0;
566         } while (p-- > pagedir_save);
567         return -ENOMEM;
568 }
569
570
571 /**
572  *      enough_free_mem - Make sure we enough free memory to snapshot.
573  *
574  *      Returns TRUE or FALSE after checking the number of available 
575  *      free pages.
576  */
577
578 static int enough_free_mem(void)
579 {
580         if(nr_free_pages() < (pmdisk_pages + PAGES_FOR_IO)) {
581                 pr_debug("pmdisk: Not enough free pages: Have %d\n",
582                          nr_free_pages());
583                 return 0;
584         }
585         return 1;
586 }
587
588
589 /**
590  *      enough_swap - Make sure we have enough swap to save the image.
591  *
592  *      Returns TRUE or FALSE after checking the total amount of swap 
593  *      space avaiable.
594  *
595  *      FIXME: si_swapinfo(&i) returns all swap devices information.
596  *      We should only consider resume_device. 
597  */
598
599 static int enough_swap(void)
600 {
601         struct sysinfo i;
602
603         si_swapinfo(&i);
604         if (i.freeswap < (pmdisk_pages + PAGES_FOR_IO))  {
605                 pr_debug("pmdisk: Not enough swap. Need %ld\n",i.freeswap);
606                 return 0;
607         }
608         return 1;
609 }
610
611
612 /**
613  *      pmdisk_suspend - Atomically snapshot the system.
614  *
615  *      This must be called with interrupts disabled, to prevent the 
616  *      system changing at all from underneath us. 
617  *
618  *      To do this, we count the number of pages in the system that we 
619  *      need to save; make sure we have enough memory and swap to clone
620  *      the pages and save them in swap, allocate the space to hold them,
621  *      and then snapshot them all.
622  */
623
624 int pmdisk_suspend(void)
625 {
626         int error = 0;
627
628         if ((error = read_swapfiles()))
629                 return error;
630
631         drain_local_pages();
632
633         pm_pagedir_nosave = NULL;
634         pr_debug("pmdisk: Counting pages to copy.\n" );
635         count_pages();
636         
637         pr_debug("pmdisk: (pages needed: %d + %d free: %d)\n",
638                  pmdisk_pages,PAGES_FOR_IO,nr_free_pages());
639
640         if (!enough_free_mem())
641                 return -ENOMEM;
642
643         if (!enough_swap())
644                 return -ENOSPC;
645
646         if ((error = alloc_pagedir())) {
647                 pr_debug("pmdisk: Allocating pagedir failed.\n");
648                 return error;
649         }
650         if ((error = alloc_image_pages())) {
651                 pr_debug("pmdisk: Allocating image pages failed.\n");
652                 free_pagedir();
653                 return error;
654         }
655
656         nr_copy_pages_check = pmdisk_pages;
657         pagedir_order_check = pagedir_order;
658
659         /* During allocating of suspend pagedir, new cold pages may appear. 
660          * Kill them 
661          */
662         drain_local_pages();
663
664         /* copy */
665         copy_pages();
666
667         /*
668          * End of critical section. From now on, we can write to memory,
669          * but we should not touch disk. This specially means we must _not_
670          * touch swap space! Except we must write out our image of course.
671          */
672
673         pr_debug("pmdisk: %d pages copied\n", pmdisk_pages );
674         return 0;
675 }
676
677
/**
 *	suspend_save_image - Prepare and write saved image to swap.
 *
 *	Devices are resumed first so the swap device can be written to.
 *	The ignored swap devices are locked around the write; the second
 *	lock_swapdevices() call unlocks them again since writing is
 *	finished.
 *	It is important _NOT_ to umount filesystems at this point. We want
 *	them synced (in case something goes wrong) but we DO not want to mark
 *	filesystem clean: it is not. (And it does not matter, if we resume
 *	correctly, we'll mark system clean, anyway.)
 *
 *	Returns the result of write_suspend_image().
 */

static int suspend_save_image(void)
{
	int status;

	device_resume();

	lock_swapdevices();
	status = write_suspend_image();
	lock_swapdevices();

	return status;
}
701
702 /*
703  * Magic happens here
704  */
705
706 int pmdisk_resume(void)
707 {
708         BUG_ON (nr_copy_pages_check != pmdisk_pages);
709         BUG_ON (pagedir_order_check != pagedir_order);
710         
711         /* Even mappings of "global" things (vmalloc) need to be fixed */
712         __flush_tlb_global();
713         return 0;
714 }
715
716 /* pmdisk_arch_suspend() is implemented in arch/?/power/pmdisk.S,
717    and basically does:
718
719         if (!resume) {
720                 save_processor_state();
721                 SAVE_REGISTERS
722                 return pmdisk_suspend();
723         }
724         GO_TO_SWAPPER_PAGE_TABLES
725         COPY_PAGES_BACK
726         RESTORE_REGISTERS
727         restore_processor_state();
728         return pmdisk_resume();
729
730  */
731
732
733 /* More restore stuff */
734
735 #define does_collide(addr) does_collide_order(pm_pagedir_nosave, addr, 0)
736
737 /*
738  * Returns true if given address/order collides with any orig_address 
739  */
740 static int __init does_collide_order(suspend_pagedir_t *pagedir, 
741                                      unsigned long addr, int order)
742 {
743         int i;
744         unsigned long addre = addr + (PAGE_SIZE<<order);
745         
746         for(i=0; i < pmdisk_pages; i++)
747                 if((pagedir+i)->orig_address >= addr &&
748                         (pagedir+i)->orig_address < addre)
749                         return 1;
750
751         return 0;
752 }
753
754 /*
755  * We check here that pagedir & pages it points to won't collide with pages
756  * where we're going to restore from the loaded pages later
757  */
758 static int __init check_pagedir(void)
759 {
760         int i;
761
762         for(i=0; i < pmdisk_pages; i++) {
763                 unsigned long addr;
764
765                 do {
766                         addr = get_zeroed_page(GFP_ATOMIC);
767                         if(!addr)
768                                 return -ENOMEM;
769                 } while (does_collide(addr));
770
771                 (pm_pagedir_nosave+i)->address = addr;
772         }
773         return 0;
774 }
775
776 static int __init relocate_pagedir(void)
777 {
778         /*
779          * We have to avoid recursion (not to overflow kernel stack),
780          * and that's why code looks pretty cryptic 
781          */
782         suspend_pagedir_t *old_pagedir = pm_pagedir_nosave;
783         void **eaten_memory = NULL;
784         void **c = eaten_memory, *m, *f;
785         int err;
786
787         pr_debug("pmdisk: Relocating pagedir\n");
788
789         if(!does_collide_order(old_pagedir, (unsigned long)old_pagedir, pagedir_order)) {
790                 pr_debug("pmdisk: Relocation not necessary\n");
791                 return 0;
792         }
793
794         err = -ENOMEM;
795         while ((m = (void *) __get_free_pages(GFP_ATOMIC, pagedir_order)) != NULL) {
796                 if (!does_collide_order(old_pagedir, (unsigned long)m,
797                                         pagedir_order)) {
798                         pm_pagedir_nosave =
799                                 memcpy(m, old_pagedir,
800                                        PAGE_SIZE << pagedir_order);
801                         err = 0;
802                         break;
803                 }
804                 eaten_memory = m;
805                 printk( "." ); 
806                 *eaten_memory = c;
807                 c = eaten_memory;
808         }
809
810         c = eaten_memory;
811         while(c) {
812                 printk(":");
813                 f = c;
814                 c = *c;
815                 free_pages((unsigned long)f, pagedir_order);
816         }
817         printk("|\n");
818         return err;
819 }
820
821
/* Block device that submit() directs its bios at */
static struct block_device *resume_bdev;
823
824
825 /**
826  *      Using bio to read from swap.
827  *      This code requires a bit more work than just using buffer heads
828  *      but, it is the recommended way for 2.5/2.6.
829  *      The following are to signal the beginning and end of I/O. Bios
830  *      finish asynchronously, while we want them to happen synchronously.
831  *      A simple atomic_t, and a wait loop take care of this problem.
832  */
833
834 static atomic_t io_done = ATOMIC_INIT(0);
835
836 static void start_io(void)
837 {
838         atomic_set(&io_done,1);
839 }
840
841 static int end_io(struct bio * bio, unsigned int num, int err)
842 {
843         atomic_set(&io_done,0);
844         return 0;
845 }
846
847 static void wait_io(void)
848 {
849         while(atomic_read(&io_done))
850                 io_schedule();
851 }
852
853
854 /**
855  *      submit - submit BIO request.
856  *      @rw:    READ or WRITE.
857  *      @off    physical offset of page.
858  *      @page:  page we're reading or writing.
859  *
860  *      Straight from the textbook - allocate and initialize the bio.
861  *      If we're writing, make sure the page is marked as dirty.
862  *      Then submit it and wait.
863  */
864
865 static int submit(int rw, pgoff_t page_off, void * page)
866 {
867         int error = 0;
868         struct bio * bio;
869
870         bio = bio_alloc(GFP_ATOMIC,1);
871         if (!bio)
872                 return -ENOMEM;
873         bio->bi_sector = page_off * (PAGE_SIZE >> 9);
874         bio_get(bio);
875         bio->bi_bdev = resume_bdev;
876         bio->bi_end_io = end_io;
877
878         if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) {
879                 printk("pmdisk: ERROR: adding page to bio at %ld\n",page_off);
880                 error = -EFAULT;
881                 goto Done;
882         }
883
884         if (rw == WRITE)
885                 bio_set_pages_dirty(bio);
886         start_io();
887         submit_bio(rw | (1 << BIO_RW_SYNC), bio);
888         wait_io();
889  Done:
890         bio_put(bio);
891         return error;
892 }
893
894 static int
895 read_page(pgoff_t page_off, void * page)
896 {
897         return submit(READ,page_off,page);
898 }
899
900 static int
901 write_page(pgoff_t page_off, void * page)
902 {
903         return submit(WRITE,page_off,page);
904 }
905
906
907 extern dev_t __init name_to_dev_t(const char *line);
908
909
910 static int __init check_sig(void)
911 {
912         int error;
913
914         memset(&pmdisk_header,0,sizeof(pmdisk_header));
915         if ((error = read_page(0,&pmdisk_header)))
916                 return error;
917         if (!memcmp(PMDISK_SIG,pmdisk_header.sig,10)) {
918                 memcpy(pmdisk_header.sig,pmdisk_header.orig_sig,10);
919
920                 /*
921                  * Reset swap signature now.
922                  */
923                 error = write_page(0,&pmdisk_header);
924         } else { 
925                 pr_debug(KERN_ERR "pmdisk: Invalid partition type.\n");
926                 return -EINVAL;
927         }
928         if (!error)
929                 pr_debug("pmdisk: Signature found, resuming\n");
930         return error;
931 }
932
933
934 /*
935  * Sanity check if this image makes sense with this kernel/swap context
936  * I really don't think that it's foolproof but more than nothing..
937  */
938
939 static const char * __init sanity_check(void)
940 {
941         dump_pmdisk_info();
942         if(pmdisk_info.version_code != LINUX_VERSION_CODE)
943                 return "kernel version";
944         if(pmdisk_info.num_physpages != num_physpages)
945                 return "memory size";
946         if (strcmp(pmdisk_info.uts.sysname,system_utsname.sysname))
947                 return "system type";
948         if (strcmp(pmdisk_info.uts.release,system_utsname.release))
949                 return "kernel release";
950         if (strcmp(pmdisk_info.uts.version,system_utsname.version))
951                 return "version";
952         if (strcmp(pmdisk_info.uts.machine,system_utsname.machine))
953                 return "machine";
954         if(pmdisk_info.cpus != num_online_cpus())
955                 return "number of cpus";
956         return NULL;
957 }
958
959
960 static int __init check_header(void)
961 {
962         const char * reason = NULL;
963         int error;
964
965         init_header();
966
967         if ((error = read_page(swp_offset(pmdisk_header.pmdisk_info), 
968                                &pmdisk_info)))
969                 return error;
970
971         /* Is this same machine? */
972         if ((reason = sanity_check())) {
973                 printk(KERN_ERR "pmdisk: Resume mismatch: %s\n",reason);
974                 return -EPERM;
975         }
976         pmdisk_pages = pmdisk_info.image_pages;
977         return error;
978 }
979
980
981 static int __init read_pagedir(void)
982 {
983         unsigned long addr;
984         int i, n = pmdisk_info.pagedir_pages;
985         int error = 0;
986
987         pagedir_order = get_bitmask_order(n);
988
989         addr =__get_free_pages(GFP_ATOMIC, pagedir_order);
990         if (!addr)
991                 return -ENOMEM;
992         pm_pagedir_nosave = (struct pbe *)addr;
993
994         pr_debug("pmdisk: Reading pagedir (%d Pages)\n",n);
995
996         for (i = 0; i < n && !error; i++, addr += PAGE_SIZE) {
997                 unsigned long offset = swp_offset(pmdisk_info.pagedir[i]);
998                 if (offset)
999                         error = read_page(offset, (void *)addr);
1000                 else
1001                         error = -EFAULT;
1002         }
1003         if (error)
1004                 free_pages((unsigned long)pm_pagedir_nosave,pagedir_order);
1005         return error;
1006 }
1007
1008
1009 /**
1010  *      read_image_data - Read image pages from swap.
1011  *
1012  *      You do not need to check for overlaps, check_pagedir()
1013  *      already did that.
1014  */
1015
1016 static int __init read_image_data(void)
1017 {
1018         struct pbe * p;
1019         int error = 0;
1020         int i;
1021
1022         printk( "Reading image data (%d pages): ", pmdisk_pages );
1023         for(i = 0, p = pm_pagedir_nosave; i < pmdisk_pages && !error; i++, p++) {
1024                 if (!(i%100))
1025                         printk( "." );
1026                 error = read_page(swp_offset(p->swap_address),
1027                                   (void *)p->address);
1028         }
1029         printk(" %d done.\n",i);
1030         return error;
1031 }
1032
1033
1034 static int __init read_suspend_image(void)
1035 {
1036         int error = 0;
1037
1038         if ((error = check_sig()))
1039                 return error;
1040         if ((error = check_header()))
1041                 return error;
1042         if ((error = read_pagedir()))
1043                 return error;
1044         if ((error = relocate_pagedir()))
1045                 goto FreePagedir;
1046         if ((error = check_pagedir()))
1047                 goto FreePagedir;
1048         if ((error = read_image_data()))
1049                 goto FreePagedir;
1050  Done:
1051         return error;
1052  FreePagedir:
1053         free_pages((unsigned long)pm_pagedir_nosave,pagedir_order);
1054         goto Done;
1055 }
1056
/**
 *      pmdisk_save - Snapshot memory
 *
 *      With CONFIG_HIGHMEM or CONFIG_DISCONTIGMEM set this always fails
 *      with -EPERM.  Otherwise it runs arch_prepare_suspend() and then,
 *      with local interrupts disabled and the processor state saved,
 *      calls pmdisk_arch_suspend(0).
 *
 *      Returns 0 on success or the error from arch_prepare_suspend() /
 *      pmdisk_arch_suspend().
 */

int pmdisk_save(void)
{
#if defined (CONFIG_HIGHMEM) || defined (CONFIG_DISCONTIGMEM)
        pr_debug("pmdisk: not supported with high- or discontig-mem.\n");
        return -EPERM;
#else
        int error;

        error = arch_prepare_suspend();
        if (error)
                return error;

        local_irq_disable();
        save_processor_state();

        error = pmdisk_arch_suspend(0);

        restore_processor_state();
        local_irq_enable();

        return error;
#endif
}
1078
1079
/**
 *      pmdisk_write - Write saved memory image to swap.
 *
 *      Thin wrapper around suspend_save_image(); its result is returned
 *      unchanged.
 *
 *      Background, carried over from the original notes:
 *      pmdisk_arch_suspend(0) returns after the system is resumed.
 *      pmdisk_arch_suspend() copies all "used" memory to "free" memory,
 *      then unsuspends all device drivers, and the memory is written to
 *      disk using the normal kernel mechanism.
 */

int pmdisk_write(void)
{
        int error = suspend_save_image();

        return error;
}
1094
1095
1096 /**
1097  *      pmdisk_read - Read saved image from swap.
1098  */
1099
1100 int __init pmdisk_read(void)
1101 {
1102         int error;
1103
1104         if (!strlen(resume_file))
1105                 return -ENOENT;
1106
1107         resume_device = name_to_dev_t(resume_file);
1108         pr_debug("pmdisk: Resume From Partition: %s\n", resume_file);
1109
1110         resume_bdev = open_by_devnum(resume_device, FMODE_READ);
1111         if (!IS_ERR(resume_bdev)) {
1112                 set_blocksize(resume_bdev, PAGE_SIZE);
1113                 error = read_suspend_image();
1114                 blkdev_put(resume_bdev);
1115         } else
1116                 error = PTR_ERR(resume_bdev);
1117
1118         if (!error)
1119                 pr_debug("Reading resume file was successful\n");
1120         else
1121                 pr_debug("pmdisk: Error %d resuming\n", error);
1122         return error;
1123 }
1124
1125
1126 /**
1127  *      pmdisk_restore - Replace running kernel with saved image.
1128  */
1129
1130 int __init pmdisk_restore(void)
1131 {
1132         int error;
1133         local_irq_disable();
1134         save_processor_state();
1135         error = pmdisk_arch_suspend(1);
1136         restore_processor_state();
1137         local_irq_enable();
1138         return error;
1139 }
1140
1141
/**
 *      pmdisk_free - Free memory allocated to hold snapshot.
 *
 *      Delegates to free_pagedir().  Always returns 0.
 */

int pmdisk_free(void)
{
        pr_debug("Freeing prev allocated pagedir\n");

        free_pagedir();

        return 0;
}
1152
1153 static int __init pmdisk_setup(char *str)
1154 {
1155         if (strlen(str)) {
1156                 if (!strcmp(str,"off"))
1157                         resume_file[0] = '\0';
1158                 else
1159                         strncpy(resume_file, str, 255);
1160         } else
1161                 resume_file[0] = '\0';
1162         return 1;
1163 }
1164
1165 __setup("pmdisk=", pmdisk_setup);
1166