linux 2.6.16.38 w/ vs2.0.3-rc1
[linux-2.6.git] / arch / i386 / kernel / setup.c
index 55ce1b1..0fc23c0 100644 (file)
  * This file handles the architecture-dependent parts of initialization
  */
 
+#include <linux/config.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
+#include <linux/mmzone.h>
 #include <linux/tty.h>
 #include <linux/ioport.h>
 #include <linux/acpi.h>
 #include <linux/efi.h>
 #include <linux/init.h>
 #include <linux/edd.h>
+#include <linux/nodemask.h>
+#include <linux/kexec.h>
+#include <linux/crash_dump.h>
+#include <linux/dmi.h>
+
 #include <video/edid.h>
+
+#include <asm/apic.h>
 #include <asm/e820.h>
 #include <asm/mpspec.h>
 #include <asm/setup.h>
 #include "setup_arch_pre.h"
 #include <bios_ebda.h>
 
+/* Forward Declaration. */
+void __init find_max_pfn(void);
+
 /* This value is set up by the early boot code to point to the value
    immediately after the boot time page tables.  It contains a *physical*
    address, and must not be in the .bss segment! */
 unsigned long init_pg_tables_end __initdata = ~0UL;
 
-int disable_pse __initdata = 0;
+int disable_pse __devinitdata = 0;
 
 /*
  * Machine setup..
@@ -71,31 +83,37 @@ EXPORT_SYMBOL(efi_enabled);
 /* cpu data as detected by the assembly code in head.S */
 struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
 /* common cpu data for all cpus */
-struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
+struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
+EXPORT_SYMBOL(boot_cpu_data);
 
 unsigned long mmu_cr4_features;
-EXPORT_SYMBOL_GPL(mmu_cr4_features);
 
-#ifdef CONFIG_ACPI_INTERPRETER
+#ifdef CONFIG_ACPI
        int acpi_disabled = 0;
 #else
        int acpi_disabled = 1;
 #endif
 EXPORT_SYMBOL(acpi_disabled);
 
-#ifdef CONFIG_ACPI_BOOT
+#ifdef CONFIG_ACPI
 int __initdata acpi_force = 0;
 extern acpi_interrupt_flags    acpi_sci_flags;
 #endif
 
 /* for MCA, but anyone else can use it if they want */
 unsigned int machine_id;
+#ifdef CONFIG_MCA
+EXPORT_SYMBOL(machine_id);
+#endif
 unsigned int machine_submodel_id;
 unsigned int BIOS_revision;
 unsigned int mca_pentium_flag;
 
 /* For PCI or other memory-mapped resources */
 unsigned long pci_mem_start = 0x10000000;
+#ifdef CONFIG_PCI
+EXPORT_SYMBOL(pci_mem_start);
+#endif
 
 /* Boot loader ID as an integer, for the benefit of proc_dointvec */
 int bootloader_type;
@@ -107,20 +125,28 @@ static unsigned int highmem_pages = -1;
  * Setup options
  */
 struct drive_info_struct { char dummy[32]; } drive_info;
+#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || \
+    defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE)
+EXPORT_SYMBOL(drive_info);
+#endif
 struct screen_info screen_info;
+EXPORT_SYMBOL(screen_info);
 struct apm_info apm_info;
+EXPORT_SYMBOL(apm_info);
 struct sys_desc_table_struct {
        unsigned short length;
        unsigned char table[0];
 };
 struct edid_info edid_info;
+EXPORT_SYMBOL_GPL(edid_info);
 struct ist_info ist_info;
+#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
+       defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
+EXPORT_SYMBOL(ist_info);
+#endif
 struct e820map e820;
 
-unsigned char aux_device_present;
-
 extern void early_cpu_init(void);
-extern void dmi_scan_machine(void);
 extern void generic_apic_probe(char *);
 extern int root_mountflags;
 
@@ -343,12 +369,16 @@ static void __init limit_regions(unsigned long long size)
        int i;
 
        if (efi_enabled) {
-               for (i = 0; i < memmap.nr_map; i++) {
-                       current_addr = memmap.map[i].phys_addr +
-                                      (memmap.map[i].num_pages << 12);
-                       if (memmap.map[i].type == EFI_CONVENTIONAL_MEMORY) {
+               efi_memory_desc_t *md;
+               void *p;
+
+               for (p = memmap.map, i = 0; p < memmap.map_end;
+                       p += memmap.desc_size, i++) {
+                       md = p;
+                       current_addr = md->phys_addr + (md->num_pages << 12);
+                       if (md->type == EFI_CONVENTIONAL_MEMORY) {
                                if (current_addr >= size) {
-                                       memmap.map[i].num_pages -=
+                                       md->num_pages -=
                                                (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT);
                                        memmap.nr_map = i + 1;
                                        return;
@@ -357,14 +387,24 @@ static void __init limit_regions(unsigned long long size)
                }
        }
        for (i = 0; i < e820.nr_map; i++) {
-               if (e820.map[i].type == E820_RAM) {
-                       current_addr = e820.map[i].addr + e820.map[i].size;
-                       if (current_addr >= size) {
-                               e820.map[i].size -= current_addr-size;
-                               e820.nr_map = i + 1;
-                               return;
-                       }
+               current_addr = e820.map[i].addr + e820.map[i].size;
+               if (current_addr < size)
+                       continue;
+
+               if (e820.map[i].type != E820_RAM)
+                       continue;
+
+               if (e820.map[i].addr >= size) {
+                       /*
+                        * This region starts past the end of the
+                        * requested size, skip it completely.
+                        */
+                       e820.nr_map = i;
+               } else {
+                       e820.nr_map = i + 1;
+                       e820.map[i].size -= current_addr - size;
                }
+               return;
        }
 }
 
@@ -427,10 +467,10 @@ struct change_member {
        struct e820entry *pbios; /* pointer to original bios entry */
        unsigned long long addr; /* address for this change point */
 };
-struct change_member change_point_list[2*E820MAX] __initdata;
-struct change_member *change_point[2*E820MAX] __initdata;
-struct e820entry *overlap_list[E820MAX] __initdata;
-struct e820entry new_bios[E820MAX] __initdata;
+static struct change_member change_point_list[2*E820MAX] __initdata;
+static struct change_member *change_point[2*E820MAX] __initdata;
+static struct e820entry *overlap_list[E820MAX] __initdata;
+static struct e820entry new_bios[E820MAX] __initdata;
 
 static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
 {
@@ -713,6 +753,15 @@ static void __init parse_cmdline_early (char ** cmdline_p)
                        if (to != command_line)
                                to--;
                        if (!memcmp(from+7, "exactmap", 8)) {
+#ifdef CONFIG_CRASH_DUMP
+                               /* If we are doing a crash dump, we
+                                * still need to know the real mem
+                                * size before original memory map is
+                                * reset.
+                                */
+                               find_max_pfn();
+                               saved_max_pfn = max_pfn;
+#endif
                                from += 8+7;
                                e820.nr_map = 0;
                                userdef = 1;
@@ -758,7 +807,7 @@ static void __init parse_cmdline_early (char ** cmdline_p)
                }
 #endif
 
-#ifdef CONFIG_ACPI_BOOT
+#ifdef CONFIG_ACPI
                /* "acpi=off" disables both ACPI table parsing and interpreter */
                else if (!memcmp(from, "acpi=off", 8)) {
                        disable_acpi();
@@ -807,14 +856,55 @@ static void __init parse_cmdline_early (char ** cmdline_p)
 #ifdef CONFIG_X86_IO_APIC
                else if (!memcmp(from, "acpi_skip_timer_override", 24))
                        acpi_skip_timer_override = 1;
-#endif
 
-#ifdef CONFIG_X86_LOCAL_APIC
+               else if (!memcmp(from, "disable_timer_pin_1", 19))     /* else-if keeps the option chain intact */
+                       disable_timer_pin_1 = 1;
+               else if (!memcmp(from, "enable_timer_pin_1", 18))      /* -1 (not 0) records an explicit enable request */
+                       disable_timer_pin_1 = -1;
+
                /* disable IO-APIC */
                else if (!memcmp(from, "noapic", 6))
                        disable_ioapic_setup();
+#endif /* CONFIG_X86_IO_APIC */
+#endif /* CONFIG_ACPI */
+
+#ifdef CONFIG_X86_LOCAL_APIC
+               /* "lapic": enable local APIC */
+               else if (!memcmp(from, "lapic", 5))
+                       lapic_enable();
+
+               /* "nolapic": disable local APIC (compare all 7 characters) */
+               else if (!memcmp(from, "nolapic", 7))
+                       lapic_disable();
 #endif /* CONFIG_X86_LOCAL_APIC */
-#endif /* CONFIG_ACPI_BOOT */
+
+#ifdef CONFIG_KEXEC
+               /* crashkernel=size@addr specifies the location to reserve for
+                * a crash kernel.  By reserving this memory we guarantee
+                * that linux never sets it up as a DMA target.
+                * Useful for holding code to do something appropriate
+                * after a kernel panic.
+                */
+               else if (!memcmp(from, "crashkernel=", 12)) {
+                       unsigned long size, base;
+                       size = memparse(from+12, &from);
+                       if (*from == '@') {
+                               base = memparse(from+1, &from);
+                               /* FIXME: Do I want a sanity check
+                                * to validate the memory range?
+                                */
+                               crashk_res.start = base;
+                               crashk_res.end   = base + size - 1;
+                       }
+               }
+#endif
+#ifdef CONFIG_PROC_VMCORE
+               /* elfcorehdr= specifies the location of elf core header
+                * stored by the crashed kernel.
+                */
+               else if (!memcmp(from, "elfcorehdr=", 11))
+                       elfcorehdr_addr = memparse(from+11, &from);
+#endif
 
                /*
                 * highmem=size forces highmem to be exactly 'size' bytes.
@@ -864,6 +954,12 @@ efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
        return 0;
 }
 
+static int __init      /* walker callback: find_max_pfn() hands this to efi_memmap_walk() */
+efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
+{
+       memory_present(0, start, end);  /* first argument is always 0 (presumably the node id -- confirm); arg is not used */
+       return 0;       /* unconditional; how the walker interprets the value is not visible here */
+}
 
 /*
  * Find the highest page frame number we have available
@@ -875,6 +971,7 @@ void __init find_max_pfn(void)
        max_pfn = 0;
        if (efi_enabled) {
                efi_memmap_walk(efi_find_max_pfn, &max_pfn);
+               efi_memmap_walk(efi_memory_present_wrapper, NULL);
                return;
        }
 
@@ -889,6 +986,7 @@ void __init find_max_pfn(void)
                        continue;
                if (end > max_pfn)
                        max_pfn = end;
+               memory_present(0, start, end);
        }
 }
 
@@ -951,8 +1049,6 @@ unsigned long __init find_max_low_pfn(void)
        return max_low_pfn;
 }
 
-#ifndef CONFIG_DISCONTIGMEM
-
 /*
  * Free all available memory for boot time allocation.  Used
  * as a callback function by efi_memory_walk()
@@ -1026,15 +1122,15 @@ static void __init reserve_ebda_region(void)
                reserve_bootmem(addr, PAGE_SIZE);       
 }
 
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+void __init setup_bootmem_allocator(void);
 static unsigned long __init setup_memory(void)
 {
-       unsigned long bootmap_size, start_pfn, max_low_pfn;
-
        /*
         * partially used pages are not usable - thus
         * we are rounding upwards:
         */
-       start_pfn = PFN_UP(init_pg_tables_end);
+       min_low_pfn = PFN_UP(init_pg_tables_end);
 
        find_max_pfn();
 
@@ -1050,10 +1146,43 @@ static unsigned long __init setup_memory(void)
 #endif
        printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
                        pages_to_mb(max_low_pfn));
+
+       setup_bootmem_allocator();
+
+       return max_low_pfn;
+}
+
+void __init zone_sizes_init(void)      /* fill in per-zone page counts and pass them to free_area_init() */
+{
+       unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};     /* every zone starts at 0 pages */
+       unsigned int max_dma, low;
+
+       max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;  /* physical address of MAX_DMA_ADDRESS as a page frame number */
+       low = max_low_pfn;      /* local copy of max_low_pfn */
+
+       if (low < max_dma)      /* low memory ends below the DMA limit: all of it goes to ZONE_DMA */
+               zones_size[ZONE_DMA] = low;
+       else {
+               zones_size[ZONE_DMA] = max_dma;
+               zones_size[ZONE_NORMAL] = low - max_dma;        /* rest of low memory above the DMA limit */
+#ifdef CONFIG_HIGHMEM
+               zones_size[ZONE_HIGHMEM] = highend_pfn - low;   /* NOTE: only assigned in this else branch; stays 0 when low < max_dma */
+#endif
+       }
+       free_area_init(zones_size);
+}
+#else
+extern unsigned long __init setup_memory(void);
+extern void zone_sizes_init(void);
+#endif /* !CONFIG_NEED_MULTIPLE_NODES */
+
+void __init setup_bootmem_allocator(void)
+{
+       unsigned long bootmap_size;
        /*
         * Initialize the boot-time allocator (with low memory only):
         */
-       bootmap_size = init_bootmem(start_pfn, max_low_pfn);
+       bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
 
        register_bootmem_low_pages(max_low_pfn);
 
@@ -1063,8 +1192,8 @@ static unsigned long __init setup_memory(void)
         * the (very unlikely) case of us accidentally initializing the
         * bootmem allocator with an invalid RAM area.
         */
-       reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(start_pfn) +
-                        bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY));
+       reserve_bootmem(PHYSICAL_START, (PFN_PHYS(min_low_pfn) +
+                        bootmap_size + PAGE_SIZE-1) - (PHYSICAL_START));
 
        /*
         * reserve physical page 0 - it's a special BIOS page on many boxes,
@@ -1120,11 +1249,30 @@ static unsigned long __init setup_memory(void)
                }
        }
 #endif
-       return max_low_pfn;
+#ifdef CONFIG_KEXEC
+       if (crashk_res.start != crashk_res.end)
+               reserve_bootmem(crashk_res.start,
+                       crashk_res.end - crashk_res.start + 1);
+#endif
+}
+
+/*
+ * The node 0 pgdat is initialized before all of these because
+ * it's needed for bootmem.  node>0 pgdats have their virtual
+ * space allocated before the pagetables are in place to access
+ * them, so they can't be cleared then.
+ *
+ * This should all compile down to nothing when NUMA is off.
+ */
+void __init remapped_pgdat_init(void)
+{
+       int nid;
+
+       for_each_online_node(nid) {     /* visit every online node id */
+               if (nid != 0)   /* node 0 is deliberately left untouched */
+                       memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));  /* zero the whole pglist_data of this node */
+       }
 }
-#else
-extern unsigned long setup_memory(void);
-#endif /* !CONFIG_DISCONTIGMEM */
 
 /*
  * Request address space for all standard RAM and ROM resources
@@ -1159,6 +1307,9 @@ legacy_init_iomem_resources(struct resource *code_resource, struct resource *dat
                         */
                        request_resource(res, code_resource);
                        request_resource(res, data_resource);
+#ifdef CONFIG_KEXEC
+                       request_resource(res, &crashk_res);
+#endif
                }
        }
 }
@@ -1168,7 +1319,7 @@ legacy_init_iomem_resources(struct resource *code_resource, struct resource *dat
  */
 static void __init register_memory(void)
 {
-       unsigned long gapstart, gapsize;
+       unsigned long gapstart, gapsize, round;
        unsigned long long last;
        int           i;
 
@@ -1213,14 +1364,14 @@ static void __init register_memory(void)
        }
 
        /*
-        * Start allocating dynamic PCI memory a bit into the gap,
-        * aligned up to the nearest megabyte.
-        *
-        * Question: should we try to pad it up a bit (do something
-        * like " + (gapsize >> 3)" in there too?). We now have the
-        * technology.
+        * See how much we want to round up: start off with
+        * rounding to the next 1MB area.
         */
-       pci_mem_start = (gapstart + 0xfffff) & ~0xfffff;
+       round = 0x100000;
+       while ((gapsize >> 4) > round)
+               round += round;
+       /* Fun with two's complement */
+       pci_mem_start = (gapstart + round) & -round;
 
        printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
                pci_mem_start, gapstart, gapsize);
@@ -1287,7 +1438,7 @@ static struct nop {
    This runs before SMP is initialized to avoid SMP problems with
    self modifying code. This implies that assymetric systems where
    APs have less capabilities than the boot processor are not handled. 
-   In this case boot with "noreplacement". */ 
+   Tough. Make sure you disable such features by hand. */ 
 void apply_alternatives(void *start, void *end) 
 { 
        struct alt_instr *a; 
@@ -1315,24 +1466,12 @@ void apply_alternatives(void *start, void *end)
        }
 } 
 
-static int no_replacement __initdata = 0; 
 void __init alternative_instructions(void)
 {
        extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
-       if (no_replacement) 
-               return;
        apply_alternatives(__alt_instructions, __alt_instructions_end);
 }
 
-static int __init noreplacement_setup(char *s)
-{ 
-     no_replacement = 1; 
-     return 0; 
-} 
-
-__setup("noreplacement", noreplacement_setup); 
-
 static char * __init machine_specific_memory_setup(void);
 
 #ifdef CONFIG_MCA
@@ -1384,7 +1523,6 @@ void __init setup_arch(char **cmdline_p)
                machine_submodel_id = SYS_DESC_TABLE.table[1];
                BIOS_revision = SYS_DESC_TABLE.table[2];
        }
-       aux_device_present = AUX_DEVICE_INFO;
        bootloader_type = LOADER_TYPE;
 
 #ifdef CONFIG_BLK_DEV_RAM
@@ -1432,6 +1570,9 @@ void __init setup_arch(char **cmdline_p)
        smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
 #endif
        paging_init();
+       remapped_pgdat_init();
+       sparse_init();
+       zone_sizes_init();
 
        /*
         * NOTE: at this point the bootmem allocator is fully available.
@@ -1443,7 +1584,7 @@ void __init setup_arch(char **cmdline_p)
                if (s) {
                        extern void setup_early_printk(char *);
 
-                       setup_early_printk(s);
+                       setup_early_printk(strchr(s, '=') + 1);
                        printk("early console enabled\n");
                }
        }
@@ -1458,12 +1599,24 @@ void __init setup_arch(char **cmdline_p)
        if (efi_enabled)
                efi_map_memmap();
 
+#ifdef CONFIG_X86_IO_APIC
+       check_acpi_pci();       /* Checks more than just ACPI actually */
+#endif
+
+#ifdef CONFIG_ACPI
        /*
         * Parse the ACPI tables for possible boot-time SMP configuration.
         */
        acpi_boot_table_init();
        acpi_boot_init();
 
+#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
+       if (def_to_bigsmp)
+               printk(KERN_WARNING "More than 8 CPUs detected and "
+                       "CONFIG_X86_PC cannot handle it.\nUse "
+                       "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
+#endif
+#endif
 #ifdef CONFIG_X86_LOCAL_APIC
        if (smp_found_config)
                get_smp_config();