This commit was manufactured by cvs2svn to create branch 'vserver'.
[linux-2.6.git] / arch / i386 / kernel / setup-xen.c
1 /*
2  *  linux/arch/i386/kernel/setup.c
3  *
4  *  Copyright (C) 1995  Linus Torvalds
5  *
6  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
7  *
8  *  Memory region support
9  *      David Parsons <orc@pell.chi.il.us>, July-August 1999
10  *
11  *  Added E820 sanitization routine (removes overlapping memory regions);
12  *  Brian Moyle <bmoyle@mvista.com>, February 2001
13  *
14  * Moved CPU detection code to cpu/${cpu}.c
15  *    Patrick Mochel <mochel@osdl.org>, March 2002
16  *
17  *  Provisions for empty E820 memory regions (reported by certain BIOSes).
18  *  Alex Achenbach <xela@slit.de>, December 2002.
19  *
20  */
21
22 /*
23  * This file handles the architecture-dependent parts of initialization
24  */
25
26 #include <linux/config.h>
27 #include <linux/sched.h>
28 #include <linux/mm.h>
29 #include <linux/mmzone.h>
30 #include <linux/tty.h>
31 #include <linux/ioport.h>
32 #include <linux/acpi.h>
33 #include <linux/apm_bios.h>
34 #include <linux/initrd.h>
35 #include <linux/bootmem.h>
36 #include <linux/seq_file.h>
37 #include <linux/platform_device.h>
38 #include <linux/console.h>
39 #include <linux/mca.h>
40 #include <linux/root_dev.h>
41 #include <linux/highmem.h>
42 #include <linux/module.h>
43 #include <linux/efi.h>
44 #include <linux/init.h>
45 #include <linux/edd.h>
46 #include <linux/nodemask.h>
47 #include <linux/kernel.h>
48 #include <linux/percpu.h>
49 #include <linux/notifier.h>
50 #include <linux/kexec.h>
51 #include <linux/crash_dump.h>
52 #include <linux/dmi.h>
53 #include <linux/pfn.h>
54
55 #include <video/edid.h>
56
57 #include <asm/apic.h>
58 #include <asm/e820.h>
59 #include <asm/mpspec.h>
60 #include <asm/setup.h>
61 #include <asm/arch_hooks.h>
62 #include <asm/sections.h>
63 #include <asm/io_apic.h>
64 #include <asm/ist.h>
65 #include <asm/io.h>
66 #include <asm/hypervisor.h>
67 #include <xen/interface/physdev.h>
68 #include <xen/interface/memory.h>
69 #include <xen/features.h>
70 #include "setup_arch_pre.h"
71 #include <bios_ebda.h>
72
73 /* Forward Declaration. */
74 void __init find_max_pfn(void);
75
76 static int xen_panic_event(struct notifier_block *, unsigned long, void *);
77 static struct notifier_block xen_panic_block = {
78         xen_panic_event, NULL, 0 /* try to go last */
79 };
80
81 extern char hypercall_page[PAGE_SIZE];
82 EXPORT_SYMBOL(hypercall_page);
83
84 int disable_pse __devinitdata = 0;
85
86 /*
87  * Machine setup..
88  */
89
90 #ifdef CONFIG_EFI
91 int efi_enabled = 0;
92 EXPORT_SYMBOL(efi_enabled);
93 #endif
94
95 /* cpu data as detected by the assembly code in head.S */
96 struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
97 /* common cpu data for all cpus */
98 struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
99 EXPORT_SYMBOL(boot_cpu_data);
100
101 unsigned long mmu_cr4_features;
102
103 #ifdef  CONFIG_ACPI
104         int acpi_disabled = 0;
105 #else
106         int acpi_disabled = 1;
107 #endif
108 EXPORT_SYMBOL(acpi_disabled);
109
110 #ifdef  CONFIG_ACPI
111 int __initdata acpi_force = 0;
112 extern acpi_interrupt_flags     acpi_sci_flags;
113 #endif
114
115 /* for MCA, but anyone else can use it if they want */
116 unsigned int machine_id;
117 #ifdef CONFIG_MCA
118 EXPORT_SYMBOL(machine_id);
119 #endif
120 unsigned int machine_submodel_id;
121 unsigned int BIOS_revision;
122 unsigned int mca_pentium_flag;
123
124 /* For PCI or other memory-mapped resources */
125 unsigned long pci_mem_start = 0x10000000;
126 #ifdef CONFIG_PCI
127 EXPORT_SYMBOL(pci_mem_start);
128 #endif
129
130 /* Boot loader ID as an integer, for the benefit of proc_dointvec */
131 int bootloader_type;
132
133 /* user-defined highmem size */
134 static unsigned int highmem_pages = -1;
135
136 /*
137  * Setup options
138  */
139 struct drive_info_struct { char dummy[32]; } drive_info;
140 #if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || \
141     defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE)
142 EXPORT_SYMBOL(drive_info);
143 #endif
144 struct screen_info screen_info;
145 EXPORT_SYMBOL(screen_info);
146 struct apm_info apm_info;
147 EXPORT_SYMBOL(apm_info);
148 struct sys_desc_table_struct {
149         unsigned short length;
150         unsigned char table[0];
151 };
152 struct edid_info edid_info;
153 EXPORT_SYMBOL_GPL(edid_info);
154 struct ist_info ist_info;
155 #if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
156         defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
157 EXPORT_SYMBOL(ist_info);
158 #endif
159 struct e820map e820;
160
161 extern void early_cpu_init(void);
162 extern void generic_apic_probe(char *);
163 extern int root_mountflags;
164
165 unsigned long saved_videomode;
166
167 #define RAMDISK_IMAGE_START_MASK        0x07FF
168 #define RAMDISK_PROMPT_FLAG             0x8000
169 #define RAMDISK_LOAD_FLAG               0x4000  
170
171 static char command_line[COMMAND_LINE_SIZE];
172
173 unsigned char __initdata boot_params[PARAM_SIZE];
174
175 static struct resource data_resource = {
176         .name   = "Kernel data",
177         .start  = 0,
178         .end    = 0,
179         .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
180 };
181
182 static struct resource code_resource = {
183         .name   = "Kernel code",
184         .start  = 0,
185         .end    = 0,
186         .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
187 };
188
#ifdef CONFIG_XEN_PRIVILEGED_GUEST
/* Every ROM window below is busy, read-only memory. */
#define ROM_RESOURCE_FLAGS \
        (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM)

static struct resource system_rom_resource = {
        .name   = "System ROM",
        .start  = 0xf0000,
        .end    = 0xfffff,
        .flags  = ROM_RESOURCE_FLAGS
};

static struct resource extension_rom_resource = {
        .name   = "Extension ROM",
        .start  = 0xe0000,
        .end    = 0xeffff,
        .flags  = ROM_RESOURCE_FLAGS
};

/*
 * Slots for adapter ROMs discovered by probe_roms().  Only the first
 * slot has a preset start address (0xc8000), which probe_roms() also
 * uses as the upper bound of its video ROM scan; all other addresses
 * start out as zero.
 */
static struct resource adapter_rom_resources[] = {
        { .name = "Adapter ROM", .start = 0xc8000, .flags = ROM_RESOURCE_FLAGS },
        { .name = "Adapter ROM", .flags = ROM_RESOURCE_FLAGS },
        { .name = "Adapter ROM", .flags = ROM_RESOURCE_FLAGS },
        { .name = "Adapter ROM", .flags = ROM_RESOURCE_FLAGS },
        { .name = "Adapter ROM", .flags = ROM_RESOURCE_FLAGS },
        { .name = "Adapter ROM", .flags = ROM_RESOURCE_FLAGS },
};

/* Number of adapter ROM slots in the table above. */
#define ADAPTER_ROM_RESOURCES \
        (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])

static struct resource video_rom_resource = {
        .name   = "Video ROM",
        .start  = 0xc0000,
        .end    = 0xc7fff,
        .flags  = ROM_RESOURCE_FLAGS
};
#endif
246
247 static struct resource video_ram_resource = {
248         .name   = "Video RAM area",
249         .start  = 0xa0000,
250         .end    = 0xbffff,
251         .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
252 };
253
254 static struct resource standard_io_resources[] = { {
255         .name   = "dma1",
256         .start  = 0x0000,
257         .end    = 0x001f,
258         .flags  = IORESOURCE_BUSY | IORESOURCE_IO
259 }, {
260         .name   = "pic1",
261         .start  = 0x0020,
262         .end    = 0x0021,
263         .flags  = IORESOURCE_BUSY | IORESOURCE_IO
264 }, {
265         .name   = "timer0",
266         .start  = 0x0040,
267         .end    = 0x0043,
268         .flags  = IORESOURCE_BUSY | IORESOURCE_IO
269 }, {
270         .name   = "timer1",
271         .start  = 0x0050,
272         .end    = 0x0053,
273         .flags  = IORESOURCE_BUSY | IORESOURCE_IO
274 }, {
275         .name   = "keyboard",
276         .start  = 0x0060,
277         .end    = 0x006f,
278         .flags  = IORESOURCE_BUSY | IORESOURCE_IO
279 }, {
280         .name   = "dma page reg",
281         .start  = 0x0080,
282         .end    = 0x008f,
283         .flags  = IORESOURCE_BUSY | IORESOURCE_IO
284 }, {
285         .name   = "pic2",
286         .start  = 0x00a0,
287         .end    = 0x00a1,
288         .flags  = IORESOURCE_BUSY | IORESOURCE_IO
289 }, {
290         .name   = "dma2",
291         .start  = 0x00c0,
292         .end    = 0x00df,
293         .flags  = IORESOURCE_BUSY | IORESOURCE_IO
294 }, {
295         .name   = "fpu",
296         .start  = 0x00f0,
297         .end    = 0x00ff,
298         .flags  = IORESOURCE_BUSY | IORESOURCE_IO
299 } };
300
301 #define STANDARD_IO_RESOURCES \
302         (sizeof standard_io_resources / sizeof standard_io_resources[0])
303
304 #ifdef CONFIG_XEN_PRIVILEGED_GUEST
/* True when the 16-bit word at x is the option ROM signature 0xaa55. */
#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)

/*
 * Sum the first @length bytes of @rom modulo 256.
 *
 * Returns 1 when the sum is zero (which includes @length == 0),
 * otherwise 0.
 */
static int __init romchecksum(unsigned char *rom, unsigned long length)
{
        unsigned char total = 0;
        unsigned long off;

        for (off = 0; off < length; off++)
                total += rom[off];

        return total == 0;
}
315
316 static void __init probe_roms(void)
317 {
318         unsigned long start, length, upper;
319         unsigned char *rom;
320         int           i;
321
322         /* Nothing to do if not running in dom0. */
323         if (!(xen_start_info->flags & SIF_INITDOMAIN))
324                 return;
325
326         /* video rom */
327         upper = adapter_rom_resources[0].start;
328         for (start = video_rom_resource.start; start < upper; start += 2048) {
329                 rom = isa_bus_to_virt(start);
330                 if (!romsignature(rom))
331                         continue;
332
333                 video_rom_resource.start = start;
334
335                 /* 0 < length <= 0x7f * 512, historically */
336                 length = rom[2] * 512;
337
338                 /* if checksum okay, trust length byte */
339                 if (length && romchecksum(rom, length))
340                         video_rom_resource.end = start + length - 1;
341
342                 request_resource(&iomem_resource, &video_rom_resource);
343                 break;
344         }
345
346         start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
347         if (start < upper)
348                 start = upper;
349
350         /* system rom */
351         request_resource(&iomem_resource, &system_rom_resource);
352         upper = system_rom_resource.start;
353
354         /* check for extension rom (ignore length byte!) */
355         rom = isa_bus_to_virt(extension_rom_resource.start);
356         if (romsignature(rom)) {
357                 length = extension_rom_resource.end - extension_rom_resource.start + 1;
358                 if (romchecksum(rom, length)) {
359                         request_resource(&iomem_resource, &extension_rom_resource);
360                         upper = extension_rom_resource.start;
361                 }
362         }
363
364         /* check for adapter roms on 2k boundaries */
365         for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
366                 rom = isa_bus_to_virt(start);
367                 if (!romsignature(rom))
368                         continue;
369
370                 /* 0 < length <= 0x7f * 512, historically */
371                 length = rom[2] * 512;
372
373                 /* but accept any length that fits if checksum okay */
374                 if (!length || start + length > upper || !romchecksum(rom, length))
375                         continue;
376
377                 adapter_rom_resources[i].start = start;
378                 adapter_rom_resources[i].end = start + length - 1;
379                 request_resource(&iomem_resource, &adapter_rom_resources[i]);
380
381                 start = adapter_rom_resources[i++].end & ~2047UL;
382         }
383 }
384 #endif
385
386 /*
387  * Point at the empty zero page to start with. We map the real shared_info
388  * page as soon as fixmap is up and running.
389  */
390 shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
391 EXPORT_SYMBOL(HYPERVISOR_shared_info);
392
393 unsigned long *phys_to_machine_mapping;
394 unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[16];
395 EXPORT_SYMBOL(phys_to_machine_mapping);
396
397 /* Raw start-of-day parameters from the hypervisor. */
398 start_info_t *xen_start_info;
399 EXPORT_SYMBOL(xen_start_info);
400
401 static void __init add_memory_region(unsigned long long start,
402                                   unsigned long long size, int type)
403 {
404         int x;
405
406         if (!efi_enabled) {
407                 x = e820.nr_map;
408
409                 if (x == E820MAX) {
410                     printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
411                     return;
412                 }
413
414                 e820.map[x].addr = start;
415                 e820.map[x].size = size;
416                 e820.map[x].type = type;
417                 e820.nr_map++;
418         }
419 } /* add_memory_region */
420
421 static void __init limit_regions(unsigned long long size)
422 {
423         unsigned long long current_addr = 0;
424         int i;
425
426         if (efi_enabled) {
427                 efi_memory_desc_t *md;
428                 void *p;
429
430                 for (p = memmap.map, i = 0; p < memmap.map_end;
431                         p += memmap.desc_size, i++) {
432                         md = p;
433                         current_addr = md->phys_addr + (md->num_pages << 12);
434                         if (md->type == EFI_CONVENTIONAL_MEMORY) {
435                                 if (current_addr >= size) {
436                                         md->num_pages -=
437                                                 (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT);
438                                         memmap.nr_map = i + 1;
439                                         return;
440                                 }
441                         }
442                 }
443         }
444         for (i = 0; i < e820.nr_map; i++) {
445                 current_addr = e820.map[i].addr + e820.map[i].size;
446                 if (current_addr < size)
447                         continue;
448
449                 if (e820.map[i].type != E820_RAM)
450                         continue;
451
452                 if (e820.map[i].addr >= size) {
453                         /*
454                          * This region starts past the end of the
455                          * requested size, skip it completely.
456                          */
457                         e820.nr_map = i;
458                 } else {
459                         e820.nr_map = i + 1;
460                         e820.map[i].size -= current_addr - size;
461                 }
462                 return;
463         }
464 #ifdef CONFIG_XEN
465         if (i==e820.nr_map && current_addr < size) {
466                 /*
467                  * The e820 map finished before our requested size so
468                  * extend the final entry to the requested address.
469                  */
470                 --i;
471                 if (e820.map[i].type == E820_RAM)
472                         e820.map[i].size -= current_addr - size;
473                 else
474                         add_memory_region(current_addr, size - current_addr, E820_RAM);
475         }
476 #endif
477 }
478
479 #define E820_DEBUG      1
480
481 static void __init print_memory_map(char *who)
482 {
483         int i;
484
485         for (i = 0; i < e820.nr_map; i++) {
486                 printk(" %s: %016Lx - %016Lx ", who,
487                         e820.map[i].addr,
488                         e820.map[i].addr + e820.map[i].size);
489                 switch (e820.map[i].type) {
490                 case E820_RAM:  printk("(usable)\n");
491                                 break;
492                 case E820_RESERVED:
493                                 printk("(reserved)\n");
494                                 break;
495                 case E820_ACPI:
496                                 printk("(ACPI data)\n");
497                                 break;
498                 case E820_NVS:
499                                 printk("(ACPI NVS)\n");
500                                 break;
501                 default:        printk("type %lu\n", e820.map[i].type);
502                                 break;
503                 }
504         }
505 }
506
507 /*
508  * Sanitize the BIOS e820 map.
509  *
510  * Some e820 responses include overlapping entries.  The following 
511  * replaces the original e820 map with a new one, removing overlaps.
512  *
513  */
514 struct change_member {
515         struct e820entry *pbios; /* pointer to original bios entry */
516         unsigned long long addr; /* address for this change point */
517 };
518 static struct change_member change_point_list[2*E820MAX] __initdata;
519 static struct change_member *change_point[2*E820MAX] __initdata;
520 static struct e820entry *overlap_list[E820MAX] __initdata;
521 static struct e820entry new_bios[E820MAX] __initdata;
522
523 static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
524 {
525         struct change_member *change_tmp;
526         unsigned long current_type, last_type;
527         unsigned long long last_addr;
528         int chgidx, still_changing;
529         int overlap_entries;
530         int new_bios_entry;
531         int old_nr, new_nr, chg_nr;
532         int i;
533
534         /*
535                 Visually we're performing the following (1,2,3,4 = memory types)...
536
537                 Sample memory map (w/overlaps):
538                    ____22__________________
539                    ______________________4_
540                    ____1111________________
541                    _44_____________________
542                    11111111________________
543                    ____________________33__
544                    ___________44___________
545                    __________33333_________
546                    ______________22________
547                    ___________________2222_
548                    _________111111111______
549                    _____________________11_
550                    _________________4______
551
552                 Sanitized equivalent (no overlap):
553                    1_______________________
554                    _44_____________________
555                    ___1____________________
556                    ____22__________________
557                    ______11________________
558                    _________1______________
559                    __________3_____________
560                    ___________44___________
561                    _____________33_________
562                    _______________2________
563                    ________________1_______
564                    _________________4______
565                    ___________________2____
566                    ____________________33__
567                    ______________________4_
568         */
569
570         /* if there's only one memory region, don't bother */
571         if (*pnr_map < 2)
572                 return -1;
573
574         old_nr = *pnr_map;
575
576         /* bail out if we find any unreasonable addresses in bios map */
577         for (i=0; i<old_nr; i++)
578                 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
579                         return -1;
580
581         /* create pointers for initial change-point information (for sorting) */
582         for (i=0; i < 2*old_nr; i++)
583                 change_point[i] = &change_point_list[i];
584
585         /* record all known change-points (starting and ending addresses),
586            omitting those that are for empty memory regions */
587         chgidx = 0;
588         for (i=0; i < old_nr; i++)      {
589                 if (biosmap[i].size != 0) {
590                         change_point[chgidx]->addr = biosmap[i].addr;
591                         change_point[chgidx++]->pbios = &biosmap[i];
592                         change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
593                         change_point[chgidx++]->pbios = &biosmap[i];
594                 }
595         }
596         chg_nr = chgidx;        /* true number of change-points */
597
598         /* sort change-point list by memory addresses (low -> high) */
599         still_changing = 1;
600         while (still_changing)  {
601                 still_changing = 0;
602                 for (i=1; i < chg_nr; i++)  {
603                         /* if <current_addr> > <last_addr>, swap */
604                         /* or, if current=<start_addr> & last=<end_addr>, swap */
605                         if ((change_point[i]->addr < change_point[i-1]->addr) ||
606                                 ((change_point[i]->addr == change_point[i-1]->addr) &&
607                                  (change_point[i]->addr == change_point[i]->pbios->addr) &&
608                                  (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
609                            )
610                         {
611                                 change_tmp = change_point[i];
612                                 change_point[i] = change_point[i-1];
613                                 change_point[i-1] = change_tmp;
614                                 still_changing=1;
615                         }
616                 }
617         }
618
619         /* create a new bios memory map, removing overlaps */
620         overlap_entries=0;       /* number of entries in the overlap table */
621         new_bios_entry=0;        /* index for creating new bios map entries */
622         last_type = 0;           /* start with undefined memory type */
623         last_addr = 0;           /* start with 0 as last starting address */
624         /* loop through change-points, determining affect on the new bios map */
625         for (chgidx=0; chgidx < chg_nr; chgidx++)
626         {
627                 /* keep track of all overlapping bios entries */
628                 if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
629                 {
630                         /* add map entry to overlap list (> 1 entry implies an overlap) */
631                         overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
632                 }
633                 else
634                 {
635                         /* remove entry from list (order independent, so swap with last) */
636                         for (i=0; i<overlap_entries; i++)
637                         {
638                                 if (overlap_list[i] == change_point[chgidx]->pbios)
639                                         overlap_list[i] = overlap_list[overlap_entries-1];
640                         }
641                         overlap_entries--;
642                 }
643                 /* if there are overlapping entries, decide which "type" to use */
644                 /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
645                 current_type = 0;
646                 for (i=0; i<overlap_entries; i++)
647                         if (overlap_list[i]->type > current_type)
648                                 current_type = overlap_list[i]->type;
649                 /* continue building up new bios map based on this information */
650                 if (current_type != last_type)  {
651                         if (last_type != 0)      {
652                                 new_bios[new_bios_entry].size =
653                                         change_point[chgidx]->addr - last_addr;
654                                 /* move forward only if the new size was non-zero */
655                                 if (new_bios[new_bios_entry].size != 0)
656                                         if (++new_bios_entry >= E820MAX)
657                                                 break;  /* no more space left for new bios entries */
658                         }
659                         if (current_type != 0)  {
660                                 new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
661                                 new_bios[new_bios_entry].type = current_type;
662                                 last_addr=change_point[chgidx]->addr;
663                         }
664                         last_type = current_type;
665                 }
666         }
667         new_nr = new_bios_entry;   /* retain count for new bios entries */
668
669         /* copy new bios mapping into original location */
670         memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
671         *pnr_map = new_nr;
672
673         return 0;
674 }
675
676 /*
677  * Copy the BIOS e820 map into a safe place.
678  *
679  * Sanity-check it while we're at it..
680  *
681  * If we're lucky and live on a modern system, the setup code
682  * will have given us a memory map that we can use to properly
683  * set up memory.  If we aren't, we'll fake a memory map.
684  *
685  * We check to see that the memory map contains at least 2 elements
686  * before we'll use it, because the detection code in setup.S may
687  * not be perfect and most every PC known to man has two memory
688  * regions: one from 0 to 640k, and one from 1mb up.  (The IBM
689  * thinkpad 560x, for example, does not cooperate with the memory
690  * detection code.)
691  */
692 static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
693 {
694 #ifndef CONFIG_XEN
695         /* Only one memory region (or negative)? Ignore it */
696         if (nr_map < 2)
697                 return -1;
698 #else
699         BUG_ON(nr_map < 1);
700 #endif
701
702         do {
703                 unsigned long long start = biosmap->addr;
704                 unsigned long long size = biosmap->size;
705                 unsigned long long end = start + size;
706                 unsigned long type = biosmap->type;
707
708                 /* Overflow in 64 bits? Ignore the memory map. */
709                 if (start > end)
710                         return -1;
711
712 #ifndef CONFIG_XEN
713                 /*
714                  * Some BIOSes claim RAM in the 640k - 1M region.
715                  * Not right. Fix it up.
716                  */
717                 if (type == E820_RAM) {
718                         if (start < 0x100000ULL && end > 0xA0000ULL) {
719                                 if (start < 0xA0000ULL)
720                                         add_memory_region(start, 0xA0000ULL-start, type);
721                                 if (end <= 0x100000ULL)
722                                         continue;
723                                 start = 0x100000ULL;
724                                 size = end - start;
725                         }
726                 }
727 #endif
728                 add_memory_region(start, size, type);
729         } while (biosmap++,--nr_map);
730         return 0;
731 }
732
#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
struct edd edd;
#ifdef CONFIG_EDD_MODULE
EXPORT_SYMBOL(edd);
#endif
/**
 * copy_edd() - Save the BIOS EDD data.
 *
 * Copies the MBR signatures, the EDD info records and both entry
 * counts into the global edd structure.
 */
static inline void copy_edd(void)
{
     edd.mbr_signature_nr = EDD_MBR_SIG_NR;
     edd.edd_info_nr = EDD_NR;
     memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
     memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
}
#else
/* EDD support is not configured: nothing to copy. */
static inline void copy_edd(void)
{
}
#endif
755
/*
 * Size of low memory, fixed at 0x9f000 bytes.
 *
 * Do NOT EVER look at the BIOS memory size location instead:
 * it does not work on many machines.
 */
#define LOWMEMSIZE()    (0x9f000)
761
762 static void __init parse_cmdline_early (char ** cmdline_p)
763 {
764         char c = ' ', *to = command_line, *from = saved_command_line;
765         int len = 0, max_cmdline;
766         int userdef = 0;
767
768         if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
769                 max_cmdline = COMMAND_LINE_SIZE;
770         memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
771         /* Save unparsed command line copy for /proc/cmdline */
772         saved_command_line[max_cmdline-1] = '\0';
773
774         for (;;) {
775                 if (c != ' ')
776                         goto next_char;
777                 /*
778                  * "mem=nopentium" disables the 4MB page tables.
779                  * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
780                  * to <mem>, overriding the bios size.
781                  * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
782                  * <start> to <start>+<mem>, overriding the bios size.
783                  *
784                  * HPA tells me bootloaders need to parse mem=, so no new
785                  * option should be mem=  [also see Documentation/i386/boot.txt]
786                  */
787                 if (!memcmp(from, "mem=", 4)) {
788                         if (to != command_line)
789                                 to--;
790                         if (!memcmp(from+4, "nopentium", 9)) {
791                                 from += 9+4;
792                                 clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
793                                 disable_pse = 1;
794                         } else {
795                                 /* If the user specifies memory size, we
796                                  * limit the BIOS-provided memory map to
797                                  * that size. exactmap can be used to specify
798                                  * the exact map. mem=number can be used to
799                                  * trim the existing memory map.
800                                  */
801                                 unsigned long long mem_size;
802  
803                                 mem_size = memparse(from+4, &from);
804                                 limit_regions(mem_size);
805                                 userdef=1;
806                         }
807                 }
808
809                 else if (!memcmp(from, "memmap=", 7)) {
810                         if (to != command_line)
811                                 to--;
812                         if (!memcmp(from+7, "exactmap", 8)) {
813 #ifdef CONFIG_CRASH_DUMP
814                                 /* If we are doing a crash dump, we
815                                  * still need to know the real mem
816                                  * size before original memory map is
817                                  * reset.
818                                  */
819                                 find_max_pfn();
820                                 saved_max_pfn = max_pfn;
821 #endif
822                                 from += 8+7;
823                                 e820.nr_map = 0;
824                                 userdef = 1;
825                         } else {
826                                 /* If the user specifies memory size, we
827                                  * limit the BIOS-provided memory map to
828                                  * that size. exactmap can be used to specify
829                                  * the exact map. mem=number can be used to
830                                  * trim the existing memory map.
831                                  */
832                                 unsigned long long start_at, mem_size;
833  
834                                 mem_size = memparse(from+7, &from);
835                                 if (*from == '@') {
836                                         start_at = memparse(from+1, &from);
837                                         add_memory_region(start_at, mem_size, E820_RAM);
838                                 } else if (*from == '#') {
839                                         start_at = memparse(from+1, &from);
840                                         add_memory_region(start_at, mem_size, E820_ACPI);
841                                 } else if (*from == '$') {
842                                         start_at = memparse(from+1, &from);
843                                         add_memory_region(start_at, mem_size, E820_RESERVED);
844                                 } else {
845                                         limit_regions(mem_size);
846                                         userdef=1;
847                                 }
848                         }
849                 }
850
851                 else if (!memcmp(from, "noexec=", 7))
852                         noexec_setup(from + 7);
853
854
855 #ifdef  CONFIG_X86_MPPARSE
856                 /*
857                  * If the BIOS enumerates physical processors before logical,
858                  * maxcpus=N at enumeration-time can be used to disable HT.
859                  */
860                 else if (!memcmp(from, "maxcpus=", 8)) {
861                         extern unsigned int maxcpus;
862
863                         maxcpus = simple_strtoul(from + 8, NULL, 0);
864                 }
865 #endif
866
867 #ifdef CONFIG_ACPI
868                 /* "acpi=off" disables both ACPI table parsing and interpreter */
869                 else if (!memcmp(from, "acpi=off", 8)) {
870                         disable_acpi();
871                 }
872
873                 /* acpi=force to over-ride black-list */
874                 else if (!memcmp(from, "acpi=force", 10)) {
875                         acpi_force = 1;
876                         acpi_ht = 1;
877                         acpi_disabled = 0;
878                 }
879
880                 /* acpi=strict disables out-of-spec workarounds */
881                 else if (!memcmp(from, "acpi=strict", 11)) {
882                         acpi_strict = 1;
883                 }
884
885                 /* Limit ACPI just to boot-time to enable HT */
886                 else if (!memcmp(from, "acpi=ht", 7)) {
887                         if (!acpi_force)
888                                 disable_acpi();
889                         acpi_ht = 1;
890                 }
891                 
892                 /* "pci=noacpi" disable ACPI IRQ routing and PCI scan */
893                 else if (!memcmp(from, "pci=noacpi", 10)) {
894                         acpi_disable_pci();
895                 }
896                 /* "acpi=noirq" disables ACPI interrupt routing */
897                 else if (!memcmp(from, "acpi=noirq", 10)) {
898                         acpi_noirq_set();
899                 }
900
901                 else if (!memcmp(from, "acpi_sci=edge", 13))
902                         acpi_sci_flags.trigger =  1;
903
904                 else if (!memcmp(from, "acpi_sci=level", 14))
905                         acpi_sci_flags.trigger = 3;
906
907                 else if (!memcmp(from, "acpi_sci=high", 13))
908                         acpi_sci_flags.polarity = 1;
909
910                 else if (!memcmp(from, "acpi_sci=low", 12))
911                         acpi_sci_flags.polarity = 3;
912
913 #ifdef CONFIG_X86_IO_APIC
914                 else if (!memcmp(from, "acpi_skip_timer_override", 24))
915                         acpi_skip_timer_override = 1;
916
917                 if (!memcmp(from, "disable_timer_pin_1", 19))
918                         disable_timer_pin_1 = 1;
919                 if (!memcmp(from, "enable_timer_pin_1", 18))
920                         disable_timer_pin_1 = -1;
921
922                 /* disable IO-APIC */
923                 else if (!memcmp(from, "noapic", 6))
924                         disable_ioapic_setup();
925 #endif /* CONFIG_X86_IO_APIC */
926 #endif /* CONFIG_ACPI */
927
928 #ifdef CONFIG_X86_LOCAL_APIC
929                 /* enable local APIC */
930                 else if (!memcmp(from, "lapic", 5))
931                         lapic_enable();
932
933                 /* disable local APIC */
934                 else if (!memcmp(from, "nolapic", 6))
935                         lapic_disable();
936 #endif /* CONFIG_X86_LOCAL_APIC */
937
938 #ifdef CONFIG_KEXEC
939                 /* crashkernel=size@addr specifies the location to reserve for
940                  * a crash kernel.  By reserving this memory we guarantee
941                  * that linux never set's it up as a DMA target.
942                  * Useful for holding code to do something appropriate
943                  * after a kernel panic.
944                  */
945                 else if (!memcmp(from, "crashkernel=", 12)) {
946                         unsigned long size, base;
947                         size = memparse(from+12, &from);
948                         if (*from == '@') {
949                                 base = memparse(from+1, &from);
950                                 /* FIXME: Do I want a sanity check
951                                  * to validate the memory range?
952                                  */
953                                 crashk_res.start = base;
954                                 crashk_res.end   = base + size - 1;
955                         }
956                 }
957 #endif
958 #ifdef CONFIG_PROC_VMCORE
959                 /* elfcorehdr= specifies the location of elf core header
960                  * stored by the crashed kernel.
961                  */
962                 else if (!memcmp(from, "elfcorehdr=", 11))
963                         elfcorehdr_addr = memparse(from+11, &from);
964 #endif
965
966                 /*
967                  * highmem=size forces highmem to be exactly 'size' bytes.
968                  * This works even on boxes that have no highmem otherwise.
969                  * This also works to reduce highmem size on bigger boxes.
970                  */
971                 else if (!memcmp(from, "highmem=", 8))
972                         highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT;
973         
974                 /*
975                  * vmalloc=size forces the vmalloc area to be exactly 'size'
976                  * bytes. This can be used to increase (or decrease) the
977                  * vmalloc area - the default is 128m.
978                  */
979                 else if (!memcmp(from, "vmalloc=", 8))
980                         __VMALLOC_RESERVE = memparse(from+8, &from);
981
982         next_char:
983                 c = *(from++);
984                 if (!c)
985                         break;
986                 if (COMMAND_LINE_SIZE <= ++len)
987                         break;
988                 *(to++) = c;
989         }
990         *to = '\0';
991         *cmdline_p = command_line;
992         if (userdef) {
993                 printk(KERN_INFO "user-defined physical RAM map:\n");
994                 print_memory_map("user");
995         }
996 }
997
998 /*
999  * Callback for efi_memory_walk.
1000  */
1001 static int __init
1002 efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
1003 {
1004         unsigned long *max_pfn = arg, pfn;
1005
1006         if (start < end) {
1007                 pfn = PFN_UP(end -1);
1008                 if (pfn > *max_pfn)
1009                         *max_pfn = pfn;
1010         }
1011         return 0;
1012 }
1013
1014 static int __init
1015 efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
1016 {
1017         memory_present(0, start, end);
1018         return 0;
1019 }
1020
1021  /*
1022   * This function checks if the entire range <start,end> is mapped with type.
1023   *
1024   * Note: this function only works correct if the e820 table is sorted and
1025   * not-overlapping, which is the case
1026   */
1027 int __init
1028 e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
1029 {
1030         u64 start = s;
1031         u64 end = e;
1032         int i;
1033         for (i = 0; i < e820.nr_map; i++) {
1034                 struct e820entry *ei = &e820.map[i];
1035                 if (type && ei->type != type)
1036                         continue;
1037                 /* is the region (part) in overlap with the current region ?*/
1038                 if (ei->addr >= end || ei->addr + ei->size <= start)
1039                         continue;
1040                 /* if the region is at the beginning of <start,end> we move
1041                  * start to the end of the region since it's ok until there
1042                  */
1043                 if (ei->addr <= start)
1044                         start = ei->addr + ei->size;
1045                 /* if start is now at or beyond end, we're done, full
1046                  * coverage */
1047                 if (start >= end)
1048                         return 1; /* we're done */
1049         }
1050         return 0;
1051 }
1052
1053 /*
1054  * Find the highest page frame number we have available
1055  */
1056 void __init find_max_pfn(void)
1057 {
1058         int i;
1059
1060         max_pfn = 0;
1061         if (efi_enabled) {
1062                 efi_memmap_walk(efi_find_max_pfn, &max_pfn);
1063                 efi_memmap_walk(efi_memory_present_wrapper, NULL);
1064                 return;
1065         }
1066
1067         for (i = 0; i < e820.nr_map; i++) {
1068                 unsigned long start, end;
1069                 /* RAM? */
1070                 if (e820.map[i].type != E820_RAM)
1071                         continue;
1072                 start = PFN_UP(e820.map[i].addr);
1073                 end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
1074                 if (start >= end)
1075                         continue;
1076                 if (end > max_pfn)
1077                         max_pfn = end;
1078                 memory_present(0, start, end);
1079         }
1080 }
1081
1082 /*
1083  * Determine low and high memory ranges:
1084  */
1085 unsigned long __init find_max_low_pfn(void)
1086 {
1087         unsigned long max_low_pfn;
1088
1089         max_low_pfn = max_pfn;
1090         if (max_low_pfn > MAXMEM_PFN) {
1091                 if (highmem_pages == -1)
1092                         highmem_pages = max_pfn - MAXMEM_PFN;
1093                 if (highmem_pages + MAXMEM_PFN < max_pfn)
1094                         max_pfn = MAXMEM_PFN + highmem_pages;
1095                 if (highmem_pages + MAXMEM_PFN > max_pfn) {
1096                         printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
1097                         highmem_pages = 0;
1098                 }
1099                 max_low_pfn = MAXMEM_PFN;
1100 #ifndef CONFIG_HIGHMEM
1101                 /* Maximum memory usable is what is directly addressable */
1102                 printk(KERN_WARNING "Warning only %ldMB will be used.\n",
1103                                         MAXMEM>>20);
1104                 if (max_pfn > MAX_NONPAE_PFN)
1105                         printk(KERN_WARNING "Use a PAE enabled kernel.\n");
1106                 else
1107                         printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
1108                 max_pfn = MAXMEM_PFN;
1109 #else /* !CONFIG_HIGHMEM */
1110 #ifndef CONFIG_X86_PAE
1111                 if (max_pfn > MAX_NONPAE_PFN) {
1112                         max_pfn = MAX_NONPAE_PFN;
1113                         printk(KERN_WARNING "Warning only 4GB will be used.\n");
1114                         printk(KERN_WARNING "Use a PAE enabled kernel.\n");
1115                 }
1116 #endif /* !CONFIG_X86_PAE */
1117 #endif /* !CONFIG_HIGHMEM */
1118         } else {
1119                 if (highmem_pages == -1)
1120                         highmem_pages = 0;
1121 #ifdef CONFIG_HIGHMEM
1122                 if (highmem_pages >= max_pfn) {
1123                         printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
1124                         highmem_pages = 0;
1125                 }
1126                 if (highmem_pages) {
1127                         if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
1128                                 printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
1129                                 highmem_pages = 0;
1130                         }
1131                         max_low_pfn -= highmem_pages;
1132                 }
1133 #else
1134                 if (highmem_pages)
1135                         printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
1136 #endif
1137         }
1138         return max_low_pfn;
1139 }
1140
1141 /*
1142  * Free all available memory for boot time allocation.  Used
1143  * as a callback function by efi_memory_walk()
1144  */
1145
1146 static int __init
1147 free_available_memory(unsigned long start, unsigned long end, void *arg)
1148 {
1149         /* check max_low_pfn */
1150         if (start >= ((max_low_pfn + 1) << PAGE_SHIFT))
1151                 return 0;
1152         if (end >= ((max_low_pfn + 1) << PAGE_SHIFT))
1153                 end = (max_low_pfn + 1) << PAGE_SHIFT;
1154         if (start < end)
1155                 free_bootmem(start, end - start);
1156
1157         return 0;
1158 }
1159 /*
1160  * Register fully available low RAM pages with the bootmem allocator.
1161  */
1162 static void __init register_bootmem_low_pages(unsigned long max_low_pfn)
1163 {
1164         int i;
1165
1166         if (efi_enabled) {
1167                 efi_memmap_walk(free_available_memory, NULL);
1168                 return;
1169         }
1170         for (i = 0; i < e820.nr_map; i++) {
1171                 unsigned long curr_pfn, last_pfn, size;
1172                 /*
1173                  * Reserve usable low memory
1174                  */
1175                 if (e820.map[i].type != E820_RAM)
1176                         continue;
1177                 /*
1178                  * We are rounding up the start address of usable memory:
1179                  */
1180                 curr_pfn = PFN_UP(e820.map[i].addr);
1181                 if (curr_pfn >= max_low_pfn)
1182                         continue;
1183                 /*
1184                  * ... and at the end of the usable range downwards:
1185                  */
1186                 last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
1187
1188 #ifdef CONFIG_XEN
1189                 /*
1190                  * Truncate to the number of actual pages currently
1191                  * present.
1192                  */
1193                 if (last_pfn > xen_start_info->nr_pages)
1194                         last_pfn = xen_start_info->nr_pages;
1195 #endif
1196
1197                 if (last_pfn > max_low_pfn)
1198                         last_pfn = max_low_pfn;
1199
1200                 /*
1201                  * .. finally, did all the rounding and playing
1202                  * around just make the area go away?
1203                  */
1204                 if (last_pfn <= curr_pfn)
1205                         continue;
1206
1207                 size = last_pfn - curr_pfn;
1208                 free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
1209         }
1210 }
1211
1212 #ifndef CONFIG_XEN
1213 /*
1214  * workaround for Dell systems that neglect to reserve EBDA
1215  */
1216 static void __init reserve_ebda_region(void)
1217 {
1218         unsigned int addr;
1219         addr = get_bios_ebda();
1220         if (addr)
1221                 reserve_bootmem(addr, PAGE_SIZE);       
1222 }
1223 #endif
1224
1225 #ifndef CONFIG_NEED_MULTIPLE_NODES
1226 void __init setup_bootmem_allocator(void);
1227 static unsigned long __init setup_memory(void)
1228 {
1229         /*
1230          * partially used pages are not usable - thus
1231          * we are rounding upwards:
1232          */
1233         min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
1234                 xen_start_info->nr_pt_frames;
1235
1236         find_max_pfn();
1237
1238         max_low_pfn = find_max_low_pfn();
1239
1240 #ifdef CONFIG_HIGHMEM
1241         highstart_pfn = highend_pfn = max_pfn;
1242         if (max_pfn > max_low_pfn) {
1243                 highstart_pfn = max_low_pfn;
1244         }
1245         printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
1246                 pages_to_mb(highend_pfn - highstart_pfn));
1247 #endif
1248         printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
1249                         pages_to_mb(max_low_pfn));
1250
1251         setup_bootmem_allocator();
1252
1253         return max_low_pfn;
1254 }
1255
1256 void __init zone_sizes_init(void)
1257 {
1258         unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
1259         unsigned int max_dma, low;
1260
1261         /*
1262          * XEN: Our notion of "DMA memory" is fake when running over Xen.
1263          * We simply put all RAM in the DMA zone so that those drivers which
1264          * needlessly specify GFP_DMA do not get starved of RAM unnecessarily.
1265          * Those drivers that *do* require lowmem are screwed anyway when
1266          * running over Xen!
1267          */
1268         max_dma = max_low_pfn;
1269         low = max_low_pfn;
1270
1271         if (low < max_dma)
1272                 zones_size[ZONE_DMA] = low;
1273         else {
1274                 zones_size[ZONE_DMA] = max_dma;
1275                 zones_size[ZONE_NORMAL] = low - max_dma;
1276 #ifdef CONFIG_HIGHMEM
1277                 zones_size[ZONE_HIGHMEM] = highend_pfn - low;
1278 #endif
1279         }
1280         free_area_init(zones_size);
1281 }
1282 #else
1283 extern unsigned long __init setup_memory(void);
1284 extern void zone_sizes_init(void);
1285 #endif /* !CONFIG_NEED_MULTIPLE_NODES */
1286
1287 void __init setup_bootmem_allocator(void)
1288 {
1289         unsigned long bootmap_size;
1290         /*
1291          * Initialize the boot-time allocator (with low memory only):
1292          */
1293         bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
1294
1295         register_bootmem_low_pages(max_low_pfn);
1296
1297         /*
1298          * Reserve the bootmem bitmap itself as well. We do this in two
1299          * steps (first step was init_bootmem()) because this catches
1300          * the (very unlikely) case of us accidentally initializing the
1301          * bootmem allocator with an invalid RAM area.
1302          */
1303         reserve_bootmem(__PHYSICAL_START, (PFN_PHYS(min_low_pfn) +
1304                          bootmap_size + PAGE_SIZE-1) - (__PHYSICAL_START));
1305
1306 #ifndef CONFIG_XEN
1307         /*
1308          * reserve physical page 0 - it's a special BIOS page on many boxes,
1309          * enabling clean reboots, SMP operation, laptop functions.
1310          */
1311         reserve_bootmem(0, PAGE_SIZE);
1312
1313         /* reserve EBDA region, it's a 4K region */
1314         reserve_ebda_region();
1315
1316     /* could be an AMD 768MPX chipset. Reserve a page  before VGA to prevent
1317        PCI prefetch into it (errata #56). Usually the page is reserved anyways,
1318        unless you have no PS/2 mouse plugged in. */
1319         if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
1320             boot_cpu_data.x86 == 6)
1321              reserve_bootmem(0xa0000 - 4096, 4096);
1322
1323 #ifdef CONFIG_SMP
1324         /*
1325          * But first pinch a few for the stack/trampoline stuff
1326          * FIXME: Don't need the extra page at 4K, but need to fix
1327          * trampoline before removing it. (see the GDT stuff)
1328          */
1329         reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
1330 #endif
1331 #ifdef CONFIG_ACPI_SLEEP
1332         /*
1333          * Reserve low memory region for sleep support.
1334          */
1335         acpi_reserve_bootmem();
1336 #endif
1337 #endif /* !CONFIG_XEN */
1338
1339 #ifdef CONFIG_BLK_DEV_INITRD
1340         if (xen_start_info->mod_start) {
1341                 if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
1342                         /*reserve_bootmem(INITRD_START, INITRD_SIZE);*/
1343                         initrd_start = INITRD_START + PAGE_OFFSET;
1344                         initrd_end = initrd_start+INITRD_SIZE;
1345                         initrd_below_start_ok = 1;
1346                 }
1347                 else {
1348                         printk(KERN_ERR "initrd extends beyond end of memory "
1349                             "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
1350                             INITRD_START + INITRD_SIZE,
1351                             max_low_pfn << PAGE_SHIFT);
1352                         initrd_start = 0;
1353                 }
1354         }
1355 #endif
1356 #ifdef CONFIG_KEXEC
1357         if (crashk_res.start != crashk_res.end)
1358                 reserve_bootmem(crashk_res.start,
1359                         crashk_res.end - crashk_res.start + 1);
1360 #endif
1361
1362         if (!xen_feature(XENFEAT_auto_translated_physmap))
1363                 phys_to_machine_mapping =
1364                         (unsigned long *)xen_start_info->mfn_list;
1365 }
1366
1367 /*
1368  * The node 0 pgdat is initialized before all of these because
1369  * it's needed for bootmem.  node>0 pgdats have their virtual
1370  * space allocated before the pagetables are in place to access
1371  * them, so they can't be cleared then.
1372  *
1373  * This should all compile down to nothing when NUMA is off.
1374  */
1375 void __init remapped_pgdat_init(void)
1376 {
1377         int nid;
1378
1379         for_each_online_node(nid) {
1380                 if (nid != 0)
1381                         memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
1382         }
1383 }
1384
1385 /*
1386  * Request address space for all standard RAM and ROM resources
1387  * and also for regions reported as reserved by the e820.
1388  */
1389 static void __init
1390 legacy_init_iomem_resources(struct e820entry *e820, int nr_map,
1391                             struct resource *code_resource,
1392                             struct resource *data_resource)
1393 {
1394         int i;
1395
1396 #if defined(CONFIG_XEN_PRIVILEGED_GUEST) || !defined(CONFIG_XEN)
1397         probe_roms();
1398 #endif
1399
1400         for (i = 0; i < nr_map; i++) {
1401                 struct resource *res;
1402                 if (e820[i].addr + e820[i].size > 0x100000000ULL)
1403                         continue;
1404                 res = alloc_bootmem_low(sizeof(struct resource));
1405                 switch (e820[i].type) {
1406                 case E820_RAM:  res->name = "System RAM"; break;
1407                 case E820_ACPI: res->name = "ACPI Tables"; break;
1408                 case E820_NVS:  res->name = "ACPI Non-volatile Storage"; break;
1409                 default:        res->name = "reserved";
1410                 }
1411                 res->start = e820[i].addr;
1412                 res->end = res->start + e820[i].size - 1;
1413                 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
1414                 request_resource(&iomem_resource, res);
1415                 if (e820[i].type == E820_RAM) {
1416                         /*
1417                          *  We don't know which RAM region contains kernel data,
1418                          *  so we try it repeatedly and let the resource manager
1419                          *  test it.
1420                          */
1421                         request_resource(res, code_resource);
1422                         request_resource(res, data_resource);
1423 #ifdef CONFIG_KEXEC
1424                         request_resource(res, &crashk_res);
1425 #endif
1426                 }
1427         }
1428 }
1429
1430 /*
1431  * Locate a unused range of the physical address space below 4G which
1432  * can be used for PCI mappings.
1433  */
1434 static void __init
1435 e820_setup_gap(struct e820entry *e820, int nr_map)
1436 {
1437         unsigned long gapstart, gapsize, round;
1438         unsigned long long last;
1439         int i;
1440
1441         /*
1442          * Search for the bigest gap in the low 32 bits of the e820
1443          * memory space.
1444          */
1445         last = 0x100000000ull;
1446         gapstart = 0x10000000;
1447         gapsize = 0x400000;
1448         i = nr_map;
1449         while (--i >= 0) {
1450                 unsigned long long start = e820[i].addr;
1451                 unsigned long long end = start + e820[i].size;
1452
1453                 /*
1454                  * Since "last" is at most 4GB, we know we'll
1455                  * fit in 32 bits if this condition is true
1456                  */
1457                 if (last > end) {
1458                         unsigned long gap = last - end;
1459
1460                         if (gap > gapsize) {
1461                                 gapsize = gap;
1462                                 gapstart = end;
1463                         }
1464                 }
1465                 if (start < last)
1466                         last = start;
1467         }
1468
1469         /*
1470          * See how much we want to round up: start off with
1471          * rounding to the next 1MB area.
1472          */
1473         round = 0x100000;
1474         while ((gapsize >> 4) > round)
1475                 round += round;
1476         /* Fun with two's complement */
1477         pci_mem_start = (gapstart + round) & -round;
1478
1479         printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
1480                 pci_mem_start, gapstart, gapsize);
1481 }
1482
1483 /*
1484  * Request address space for all standard resources
1485  */
1486 static void __init register_memory(void)
1487 {
1488 #ifdef CONFIG_XEN
1489         struct e820entry *machine_e820;
1490         struct xen_memory_map memmap;
1491 #endif
1492         int           i;
1493
1494         /* Nothing to do if not running in dom0. */
1495         if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
1496                 legacy_init_iomem_resources(e820.map, e820.nr_map,
1497                                             &code_resource, &data_resource);
1498                 return;
1499         }
1500
1501 #ifdef CONFIG_XEN
1502         machine_e820 = alloc_bootmem_low_pages(PAGE_SIZE);
1503
1504         memmap.nr_entries = E820MAX;
1505         set_xen_guest_handle(memmap.buffer, machine_e820);
1506
1507         BUG_ON(HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap));
1508
1509         legacy_init_iomem_resources(machine_e820, memmap.nr_entries,
1510                                     &code_resource, &data_resource);
1511 #else
1512         if (efi_enabled)
1513                 efi_initialize_iomem_resources(&code_resource, &data_resource);
1514         else
1515                 legacy_init_iomem_resources(e820.map, e820.nr_map,
1516                                             &code_resource, &data_resource);
1517 #endif
1518
1519         /* EFI systems may still have VGA */
1520         request_resource(&iomem_resource, &video_ram_resource);
1521
1522         /* request I/O space for devices used on all i[345]86 PCs */
1523         for (i = 0; i < STANDARD_IO_RESOURCES; i++)
1524                 request_resource(&ioport_resource, &standard_io_resources[i]);
1525
1526 #ifdef CONFIG_XEN
1527         e820_setup_gap(machine_e820, memmap.nr_entries);
1528         free_bootmem(__pa(machine_e820), PAGE_SIZE);
1529 #else
1530         e820_setup_gap(e820.map, e820.nr_map);
1531 #endif
1532 }
1533
1534 static char * __init machine_specific_memory_setup(void);
1535
/*
 * Record the MCA bus flag in MCA_bus; does nothing when the kernel is
 * built without CONFIG_MCA.
 */
static void set_mca_bus(int x)
{
#ifdef CONFIG_MCA
        MCA_bus = x;
#endif
}
1544
1545 /*
1546  * Determine if we were loaded by an EFI loader.  If so, then we have also been
1547  * passed the efi memmap, systab, etc., so we should use these data structures
1548  * for initialization.  Note, the efi init code path is determined by the
1549  * global efi_enabled. This allows the same kernel image to be used on existing
1550  * systems (with a traditional BIOS) as well as on EFI systems.
1551  */
1552 void __init setup_arch(char **cmdline_p)
1553 {
1554         int i, j, k, fpp;
1555         struct physdev_set_iopl set_iopl;
1556         unsigned long max_low_pfn;
1557
1558         /* Force a quick death if the kernel panics (not domain 0). */
1559         extern int panic_timeout;
1560         if (!panic_timeout && !(xen_start_info->flags & SIF_INITDOMAIN))
1561                 panic_timeout = 1;
1562
1563         /* Register a call for panic conditions. */
1564         atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
1565
1566         HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
1567         HYPERVISOR_vm_assist(VMASST_CMD_enable,
1568                              VMASST_TYPE_writable_pagetables);
1569
1570         memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
1571         pre_setup_arch_hook();
1572         early_cpu_init();
1573
1574         /*
1575          * FIXME: This isn't an official loader_type right
1576          * now but does currently work with elilo.
1577          * If we were configured as an EFI kernel, check to make
1578          * sure that we were loaded correctly from elilo and that
1579          * the system table is valid.  If not, then initialize normally.
1580          */
1581 #ifdef CONFIG_EFI
1582         if ((LOADER_TYPE == 0x50) && EFI_SYSTAB)
1583                 efi_enabled = 1;
1584 #endif
1585
1586         /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
1587            properly.  Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
1588         */
1589         ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
1590         drive_info = DRIVE_INFO;
1591         screen_info = SCREEN_INFO;
1592         edid_info = EDID_INFO;
1593         apm_info.bios = APM_BIOS_INFO;
1594         ist_info = IST_INFO;
1595         saved_videomode = VIDEO_MODE;
1596         if( SYS_DESC_TABLE.length != 0 ) {
1597                 set_mca_bus(SYS_DESC_TABLE.table[3] & 0x2);
1598                 machine_id = SYS_DESC_TABLE.table[0];
1599                 machine_submodel_id = SYS_DESC_TABLE.table[1];
1600                 BIOS_revision = SYS_DESC_TABLE.table[2];
1601         }
1602         bootloader_type = LOADER_TYPE;
1603
1604         if (xen_start_info->flags & SIF_INITDOMAIN) {
1605                 /* This is drawn from a dump from vgacon:startup in
1606                  * standard Linux. */
1607                 screen_info.orig_video_mode = 3; 
1608                 screen_info.orig_video_isVGA = 1;
1609                 screen_info.orig_video_lines = 25;
1610                 screen_info.orig_video_cols = 80;
1611                 screen_info.orig_video_ega_bx = 3;
1612                 screen_info.orig_video_points = 16;
1613         } else
1614                 screen_info.orig_video_isVGA = 0;
1615
1616 #ifdef CONFIG_BLK_DEV_RAM
1617         rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
1618         rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
1619         rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
1620 #endif
1621
1622         setup_xen_features();
1623
1624         ARCH_SETUP
1625         if (efi_enabled)
1626                 efi_init();
1627         else {
1628                 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
1629                 print_memory_map(machine_specific_memory_setup());
1630         }
1631
1632         copy_edd();
1633
1634         if (!MOUNT_ROOT_RDONLY)
1635                 root_mountflags &= ~MS_RDONLY;
1636         init_mm.start_code = (unsigned long) _text;
1637         init_mm.end_code = (unsigned long) _etext;
1638         init_mm.end_data = (unsigned long) _edata;
1639         init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
1640                        xen_start_info->nr_pt_frames) << PAGE_SHIFT;
1641
1642         code_resource.start = virt_to_phys(_text);
1643         code_resource.end = virt_to_phys(_etext)-1;
1644         data_resource.start = virt_to_phys(_etext);
1645         data_resource.end = virt_to_phys(_edata)-1;
1646
1647         parse_cmdline_early(cmdline_p);
1648
1649 #ifdef CONFIG_EARLY_PRINTK
1650         {
1651                 char *s = strstr(*cmdline_p, "earlyprintk=");
1652                 if (s) {
1653                         setup_early_printk(strchr(s, '=') + 1);
1654                         printk("early console enabled\n");
1655                 }
1656         }
1657 #endif
1658
1659         max_low_pfn = setup_memory();
1660
1661         /*
1662          * NOTE: before this point _nobody_ is allowed to allocate
1663          * any memory using the bootmem allocator.  Although the
1664          * allocator is now initialised, only the first 8Mb of the kernel
1665          * virtual address space has been mapped.  All allocations before
1666          * paging_init() has completed must use the alloc_bootmem_low_pages()
1667          * variant (which allocates DMA'able memory) and care must be taken
1668          * not to exceed the 8Mb limit.
1669          */
1670
1671 #ifdef CONFIG_SMP
1672         smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
1673 #endif
1674         paging_init();
1675         remapped_pgdat_init();
1676         sparse_init();
1677         zone_sizes_init();
1678
1679 #ifdef CONFIG_X86_FIND_SMP_CONFIG
1680         /*
1681          * Find and reserve possible boot-time SMP configuration:
1682          */
1683         find_smp_config();
1684 #endif
1685
1686         /* Make sure we have a correctly sized P->M table. */
1687         if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1688                 phys_to_machine_mapping = alloc_bootmem_low_pages(
1689                      max_pfn * sizeof(unsigned long));
1690                 memset(phys_to_machine_mapping, ~0,
1691                        max_pfn * sizeof(unsigned long));
1692                 memcpy(phys_to_machine_mapping,
1693                        (unsigned long *)xen_start_info->mfn_list,
1694                        xen_start_info->nr_pages * sizeof(unsigned long));
1695                 free_bootmem(
1696                      __pa(xen_start_info->mfn_list),
1697                      PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
1698                                      sizeof(unsigned long))));
1699
1700                 /*
1701                  * Initialise the list of the frames that specify the list of
1702                  * frames that make up the p2m table. Used by save/restore
1703                  */
1704                 pfn_to_mfn_frame_list_list = alloc_bootmem_low_pages(PAGE_SIZE);
1705                 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
1706                      virt_to_mfn(pfn_to_mfn_frame_list_list);
1707
1708                 fpp = PAGE_SIZE/sizeof(unsigned long);
1709                 for (i=0, j=0, k=-1; i< max_pfn; i+=fpp, j++) {
1710                         if ((j % fpp) == 0) {
1711                                 k++;
1712                                 BUG_ON(k>=16);
1713                                 pfn_to_mfn_frame_list[k] =
1714                                         alloc_bootmem_low_pages(PAGE_SIZE);
1715                                 pfn_to_mfn_frame_list_list[k] =
1716                                         virt_to_mfn(pfn_to_mfn_frame_list[k]);
1717                                 j=0;
1718                         }
1719                         pfn_to_mfn_frame_list[k][j] =
1720                                 virt_to_mfn(&phys_to_machine_mapping[i]);
1721                 }
1722                 HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
1723         }
1724
1725         /*
1726          * NOTE: at this point the bootmem allocator is fully available.
1727          */
1728
1729         if (xen_start_info->flags & SIF_INITDOMAIN)
1730                 dmi_scan_machine();
1731
1732 #ifdef CONFIG_X86_GENERICARCH
1733         generic_apic_probe(*cmdline_p);
1734 #endif  
1735         if (efi_enabled)
1736                 efi_map_memmap();
1737
1738         set_iopl.iopl = 1;
1739         HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
1740
1741 #ifdef CONFIG_X86_IO_APIC
1742         check_acpi_pci();       /* Checks more than just ACPI actually */
1743 #endif
1744
1745 #ifdef CONFIG_ACPI
1746         if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
1747                 printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
1748                 acpi_disabled = 1;
1749                 acpi_ht = 0;
1750         }
1751
1752         /*
1753          * Parse the ACPI tables for possible boot-time SMP configuration.
1754          */
1755         acpi_boot_table_init();
1756         acpi_boot_init();
1757
1758 #if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
1759         if (def_to_bigsmp)
1760                 printk(KERN_WARNING "More than 8 CPUs detected and "
1761                         "CONFIG_X86_PC cannot handle it.\nUse "
1762                         "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
1763 #endif
1764 #endif
1765 #ifdef CONFIG_X86_LOCAL_APIC
1766         if (smp_found_config)
1767                 get_smp_config();
1768 #endif
1769 #if defined(CONFIG_XEN) && defined(CONFIG_SMP)
1770         prefill_possible_map();
1771 #endif
1772
1773         register_memory();
1774
1775         if (xen_start_info->flags & SIF_INITDOMAIN) {
1776                 if (!(xen_start_info->flags & SIF_PRIVILEGED))
1777                         panic("Xen granted us console access "
1778                               "but not privileged status");
1779
1780 #ifdef CONFIG_VT
1781 #if defined(CONFIG_VGA_CONSOLE)
1782                 if (!efi_enabled ||
1783                     (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
1784                         conswitchp = &vga_con;
1785 #elif defined(CONFIG_DUMMY_CONSOLE)
1786                 conswitchp = &dummy_con;
1787 #endif
1788 #endif
1789         } else {
1790                 extern int console_use_vt;
1791                 console_use_vt = 0;
1792         }
1793 }
1794
1795 static int
1796 xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
1797 {
1798         HYPERVISOR_shutdown(SHUTDOWN_crash);
1799         /* we're never actually going to get here... */
1800         return NOTIFY_DONE;
1801 }
1802
1803 static __init int add_pcspkr(void)
1804 {
1805         struct platform_device *pd;
1806         int ret;
1807
1808         pd = platform_device_alloc("pcspkr", -1);
1809         if (!pd)
1810                 return -ENOMEM;
1811
1812         ret = platform_device_add(pd);
1813         if (ret)
1814                 platform_device_put(pd);
1815
1816         return ret;
1817 }
1818 device_initcall(add_pcspkr);
1819
1820 #include "setup_arch_post.h"
1821 /*
1822  * Local Variables:
1823  * mode:c
1824  * c-file-style:"k&r"
1825  * c-basic-offset:8
1826  * End:
1827  */