fedora core 6 1.2949 + vserver 2.2.0

[linux-2.6.git] / arch / i386 / kernel / smpboot.c
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c

index 45e8fba..8c6c8c5 100644 (file)
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -17,7 +17,7 @@
   *     Fixes
   *             Felix Koop      :       NR_CPUS used properly
   *             Jose Renau      :       Handle single CPU case.
- *             Alan Cox        :       By repeated request 8) - Total BogoMIP report.
+ *             Alan Cox        :       By repeated request 8) - Total BogoMIPS report.
   *             Greg Wright     :       Fix for kernel stacks panic.
   *             Erich Boleyn    :       MP v1.4 and additional changes.
   *     Matthias Sattler        :       Changes for 2.1 kernel map.
@@ -33,8 +33,12 @@
   *             Dave Jones      :       Report invalid combinations of Athlon CPUs.
  *              Rusty Russell   :       Hacked into shape for new "hotplug" boot process. */
  
+
+/* SMP boot always wants to use real time delay to allow sufficient time for
+ * the APs to come online */
+#define USE_REAL_TIME_DELAY
+
  #include <linux/module.h>
-#include <linux/config.h>
  #include <linux/init.h>
  #include <linux/kernel.h>
  
@@ -42,39 +46,68 @@
  #include <linux/sched.h>
  #include <linux/kernel_stat.h>
  #include <linux/smp_lock.h>
-#include <linux/irq.h>
  #include <linux/bootmem.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/percpu.h>
  
  #include <linux/delay.h>
  #include <linux/mc146818rtc.h>
-#include <asm/pgalloc.h>
  #include <asm/tlbflush.h>
  #include <asm/desc.h>
  #include <asm/arch_hooks.h>
+#include <asm/nmi.h>
+#include <asm/pda.h>
+#include <asm/genapic.h>
  
  #include <mach_apic.h>
  #include <mach_wakecpu.h>
  #include <smpboot_hooks.h>
  
  /* Set if we find a B stepping CPU */
-static int __initdata smp_b_stepping;
+static int __devinitdata smp_b_stepping;
  
  /* Number of siblings per CPU package */
  int smp_num_siblings = 1;
-int phys_proc_id[NR_CPUS]; /* Package ID of each logical CPU */
+EXPORT_SYMBOL(smp_num_siblings);
+
+/* Last level cache ID of each logical CPU */
+int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
+
+/* representing HT siblings of each logical CPU */
+cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(cpu_sibling_map);
+
+/* representing HT and core siblings of each logical CPU */
+cpumask_t cpu_core_map[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(cpu_core_map);
  
  /* bitmap of online cpus */
-cpumask_t cpu_online_map;
+cpumask_t cpu_online_map __read_mostly;
+EXPORT_SYMBOL(cpu_online_map);
  
-static cpumask_t cpu_callin_map;
+cpumask_t cpu_callin_map;
  cpumask_t cpu_callout_map;
+EXPORT_SYMBOL(cpu_callout_map);
+cpumask_t cpu_possible_map;
+EXPORT_SYMBOL(cpu_possible_map);
  static cpumask_t smp_commenced_mask;
  
+/* TSC's upper 32 bits can't be written in earlier CPUs (before prescott), there
+ * is no way to resync one AP against BP. TBD: for prescott and above, we
+ * should use IA64's algorithm
+ */
+static int __devinitdata tsc_sync_disabled;
+
  /* Per CPU bogomips and other parameters */
  struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
+EXPORT_SYMBOL(cpu_data);
+
+u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly =
+                       { [0 ... NR_CPUS-1] = 0xff };
+EXPORT_SYMBOL(x86_cpu_to_apicid);
  
-/* Set when the idlers are all forked */
-int smp_threads_ready;
+u8 apicid_2_node[MAX_APICID];
  
  /*
   * Trampoline 80x86 program as an array.
@@ -83,6 +116,12 @@ int smp_threads_ready;
  extern unsigned char trampoline_data [];
  extern unsigned char trampoline_end  [];
  static unsigned char *trampoline_base;
+static int trampoline_exec;
+
+static void map_cpu_to_logical_apicid(void);
+
+/* State of each CPU. */
+DEFINE_PER_CPU(int, cpu_state) = { 0 };
  
  /*
   * Currently trivial. Write the real->protected mode
@@ -90,7 +129,7 @@ static unsigned char *trampoline_base;
   * has made sure it's suitably aligned.
   */
  
-static unsigned long __init setup_trampoline(void)
+static unsigned long __devinit setup_trampoline(void)
  {
         memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data);
         return virt_to_phys(trampoline_base);
@@ -109,6 +148,10 @@ void __init smp_alloc_memory(void)
          */
         if (__pa(trampoline_base) >= 0x9F000)
                 BUG();
+       /*
+        * Make the SMP trampoline executable:
+        */
+       trampoline_exec = set_kernel_exec((unsigned long)trampoline_base, 1);
  }
  
  /*
@@ -116,7 +159,7 @@ void __init smp_alloc_memory(void)
   * a given CPU
   */
  
-static void __init smp_store_cpu_info(int id)
+static void __cpuinit smp_store_cpu_info(int id)
  {
         struct cpuinfo_x86 *c = cpu_data + id;
  
@@ -141,6 +184,9 @@ static void __init smp_store_cpu_info(int id)
          */
         if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
  
+               if (num_possible_cpus() == 1)
+                       goto valid_k7;
+
                 /* Athlon 660/661 is valid. */  
                 if ((c->x86_model==6) && ((c->x86_mask==0) || (c->x86_mask==1)))
                         goto valid_k7;
@@ -162,7 +208,7 @@ static void __init smp_store_cpu_info(int id)
                                 goto valid_k7;
  
                 /* If we get here, it's not a certified SMP capable AMD system. */
-               tainted |= TAINT_UNSAFE_SMP;
+               add_taint(TAINT_UNSAFE_SMP);
         }
  
  valid_k7:
@@ -176,56 +222,34 @@ valid_k7:
   * then we print a warning if not, and always resync.
   */
  
-static atomic_t tsc_start_flag = ATOMIC_INIT(0);
-static atomic_t tsc_count_start = ATOMIC_INIT(0);
-static atomic_t tsc_count_stop = ATOMIC_INIT(0);
-static unsigned long long tsc_values[NR_CPUS];
+static struct {
+       atomic_t start_flag;
+       atomic_t count_start;
+       atomic_t count_stop;
+       unsigned long long values[NR_CPUS];
+} tsc __cpuinitdata = {
+       .start_flag = ATOMIC_INIT(0),
+       .count_start = ATOMIC_INIT(0),
+       .count_stop = ATOMIC_INIT(0),
+};
  
  #define NR_LOOPS 5
  
-/*
- * accurate 64-bit/32-bit division, expanded to 32-bit divisions and 64-bit
- * multiplication. Not terribly optimized but we need it at boot time only
- * anyway.
- *
- * result == a / b
- *     == (a1 + a2*(2^32)) / b
- *     == a1/b + a2*(2^32/b)
- *     == a1/b + a2*((2^32-1)/b) + a2/b + (a2*((2^32-1) % b))/b
- *                 ^---- (this multiplication can overflow)
- */
-
-static unsigned long long __init div64 (unsigned long long a, unsigned long b0)
-{
-       unsigned int a1, a2;
-       unsigned long long res;
-
-       a1 = ((unsigned int*)&a)[0];
-       a2 = ((unsigned int*)&a)[1];
-
-       res = a1/b0 +
-               (unsigned long long)a2 * (unsigned long long)(0xffffffff/b0) +
-               a2 / b0 +
-               (a2 * (0xffffffff % b0)) / b0;
-
-       return res;
-}
-
-static void __init synchronize_tsc_bp (void)
+static void __init synchronize_tsc_bp(void)
  {
         int i;
         unsigned long long t0;
         unsigned long long sum, avg;
         long long delta;
-       unsigned long one_usec;
+       unsigned int one_usec;
         int buggy = 0;
  
-       printk("checking TSC synchronization across %u CPUs: ", num_booting_cpus());
+       printk(KERN_INFO "checking TSC synchronization across %u CPUs: ", num_booting_cpus());
  
         /* convert from kcyc/sec to cyc/usec */
         one_usec = cpu_khz / 1000;
  
-       atomic_set(&tsc_start_flag, 1);
+       atomic_set(&tsc.start_flag, 1);
         wmb();
  
         /*
@@ -242,16 +266,16 @@ static void __init synchronize_tsc_bp (void)
                 /*
                  * all APs synchronize but they loop on '== num_cpus'
                  */
-               while (atomic_read(&tsc_count_start) != num_booting_cpus()-1)
-                       mb();
-               atomic_set(&tsc_count_stop, 0);
+               while (atomic_read(&tsc.count_start) != num_booting_cpus()-1)
+                       cpu_relax();
+               atomic_set(&tsc.count_stop, 0);
                 wmb();
                 /*
                  * this lets the APs save their current TSC:
                  */
-               atomic_inc(&tsc_count_start);
+               atomic_inc(&tsc.count_start);
  
-               rdtscll(tsc_values[smp_processor_id()]);
+               rdtscll(tsc.values[smp_processor_id()]);
                 /*
                  * We clear the TSC in the last loop:
                  */
@@ -261,53 +285,54 @@ static void __init synchronize_tsc_bp (void)
                 /*
                  * Wait for all APs to leave the synchronization point:
                  */
-               while (atomic_read(&tsc_count_stop) != num_booting_cpus()-1)
-                       mb();
-               atomic_set(&tsc_count_start, 0);
+               while (atomic_read(&tsc.count_stop) != num_booting_cpus()-1)
+                       cpu_relax();
+               atomic_set(&tsc.count_start, 0);
                 wmb();
-               atomic_inc(&tsc_count_stop);
+               atomic_inc(&tsc.count_stop);
         }
  
         sum = 0;
         for (i = 0; i < NR_CPUS; i++) {
                 if (cpu_isset(i, cpu_callout_map)) {
-                       t0 = tsc_values[i];
+                       t0 = tsc.values[i];
                         sum += t0;
                 }
         }
-       avg = div64(sum, num_booting_cpus());
+       avg = sum;
+       do_div(avg, num_booting_cpus());
  
-       sum = 0;
         for (i = 0; i < NR_CPUS; i++) {
                 if (!cpu_isset(i, cpu_callout_map))
                         continue;
-               delta = tsc_values[i] - avg;
+               delta = tsc.values[i] - avg;
                 if (delta < 0)
                         delta = -delta;
                 /*
                  * We report bigger than 2 microseconds clock differences.
                  */
                 if (delta > 2*one_usec) {
-                       long realdelta;
+                       long long realdelta;
+
                         if (!buggy) {
                                 buggy = 1;
                                 printk("\n");
                         }
-                       realdelta = div64(delta, one_usec);
-                       if (tsc_values[i] < avg)
+                       realdelta = delta;
+                       do_div(realdelta, one_usec);
+                       if (tsc.values[i] < avg)
                                 realdelta = -realdelta;
  
-                       printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", i, realdelta);
+                       if (realdelta)
+                               printk(KERN_INFO "CPU#%d had %Ld usecs TSC "
+                                       "skew, fixed it up.\n", i, realdelta);
                 }
-
-               sum += delta;
         }
         if (!buggy)
                 printk("passed.\n");
-               ;
  }
  
-static void __init synchronize_tsc_ap (void)
+static void __cpuinit synchronize_tsc_ap(void)
  {
         int i;
  
@@ -316,19 +341,21 @@ static void __init synchronize_tsc_ap (void)
          * this gets called, so we first wait for the BP to
          * finish SMP initialization:
          */
-       while (!atomic_read(&tsc_start_flag)) mb();
+       while (!atomic_read(&tsc.start_flag))
+               cpu_relax();
  
         for (i = 0; i < NR_LOOPS; i++) {
-               atomic_inc(&tsc_count_start);
-               while (atomic_read(&tsc_count_start) != num_booting_cpus())
-                       mb();
+               atomic_inc(&tsc.count_start);
+               while (atomic_read(&tsc.count_start) != num_booting_cpus())
+                       cpu_relax();
  
-               rdtscll(tsc_values[smp_processor_id()]);
+               rdtscll(tsc.values[smp_processor_id()]);
                 if (i == NR_LOOPS-1)
                         write_tsc(0, 0);
  
-               atomic_inc(&tsc_count_stop);
-               while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb();
+               atomic_inc(&tsc.count_stop);
+               while (atomic_read(&tsc.count_stop) != num_booting_cpus())
+                       cpu_relax();
         }
  }
  #undef NR_LOOPS
@@ -337,7 +364,7 @@ extern void calibrate_delay(void);
  
  static atomic_t init_deasserted;
  
-void __init smp_callin(void)
+static void __cpuinit smp_callin(void)
  {
         int cpuid, phys_id;
         unsigned long timeout;
@@ -401,8 +428,6 @@ void __init smp_callin(void)
         setup_local_APIC();
         map_cpu_to_logical_apicid();
  
-       local_irq_enable();
-
         /*
          * Get our bogomips.
          */
@@ -415,7 +440,7 @@ void __init smp_callin(void)
         smp_store_cpu_info(cpuid);
  
         disable_APIC_timer();
-       local_irq_disable();
+
         /*
          * Allow the master to continue.
          */
@@ -424,25 +449,104 @@ void __init smp_callin(void)
         /*
          *      Synchronize the TSC with the BP
          */
-       if (cpu_has_tsc && cpu_khz)
+       if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled)
                 synchronize_tsc_ap();
  }
  
-int cpucount;
+static int cpucount;
+
+/* maps the cpu to the sched domain representing multi-core */
+cpumask_t cpu_coregroup_map(int cpu)
+{
+       struct cpuinfo_x86 *c = cpu_data + cpu;
+       /*
+        * For perf, we return last level cache shared map.
+        * And for power savings, we return cpu_core_map
+        */
+       if (sched_mc_power_savings || sched_smt_power_savings)
+               return cpu_core_map[cpu];
+       else
+               return c->llc_shared_map;
+}
+
+/* representing cpus for which sibling maps can be computed */
+static cpumask_t cpu_sibling_setup_map;
+
+static inline void
+set_cpu_sibling_map(int cpu)
+{
+       int i;
+       struct cpuinfo_x86 *c = cpu_data;
+
+       cpu_set(cpu, cpu_sibling_setup_map);
+
+       if (smp_num_siblings > 1) {
+               for_each_cpu_mask(i, cpu_sibling_setup_map) {
+                       if (c[cpu].phys_proc_id == c[i].phys_proc_id &&
+                           c[cpu].cpu_core_id == c[i].cpu_core_id) {
+                               cpu_set(i, cpu_sibling_map[cpu]);
+                               cpu_set(cpu, cpu_sibling_map[i]);
+                               cpu_set(i, cpu_core_map[cpu]);
+                               cpu_set(cpu, cpu_core_map[i]);
+                               cpu_set(i, c[cpu].llc_shared_map);
+                               cpu_set(cpu, c[i].llc_shared_map);
+                       }
+               }
+       } else {
+               cpu_set(cpu, cpu_sibling_map[cpu]);
+       }
  
-extern int cpu_idle(void);
+       cpu_set(cpu, c[cpu].llc_shared_map);
+
+       if (current_cpu_data.x86_max_cores == 1) {
+               cpu_core_map[cpu] = cpu_sibling_map[cpu];
+               c[cpu].booted_cores = 1;
+               return;
+       }
+
+       for_each_cpu_mask(i, cpu_sibling_setup_map) {
+               if (cpu_llc_id[cpu] != BAD_APICID &&
+                   cpu_llc_id[cpu] == cpu_llc_id[i]) {
+                       cpu_set(i, c[cpu].llc_shared_map);
+                       cpu_set(cpu, c[i].llc_shared_map);
+               }
+               if (c[cpu].phys_proc_id == c[i].phys_proc_id) {
+                       cpu_set(i, cpu_core_map[cpu]);
+                       cpu_set(cpu, cpu_core_map[i]);
+                       /*
+                        *  Does this new cpu bringup a new core?
+                        */
+                       if (cpus_weight(cpu_sibling_map[cpu]) == 1) {
+                               /*
+                                * for each core in package, increment
+                                * the booted_cores for this new cpu
+                                */
+                               if (first_cpu(cpu_sibling_map[i]) == i)
+                                       c[cpu].booted_cores++;
+                               /*
+                                * increment the core count for all
+                                * the other cpus in this package
+                                */
+                               if (i != cpu)
+                                       c[i].booted_cores++;
+                       } else if (i != cpu && !c[cpu].booted_cores)
+                               c[cpu].booted_cores = c[i].booted_cores;
+               }
+       }
+}
  
  /*
   * Activate a secondary processor.
   */
-int __init start_secondary(void *unused)
+static void __cpuinit start_secondary(void *unused)
  {
         /*
-        * Dont put anything before smp_callin(), SMP
+        * Don't put *anything* before secondary_cpu_init(), SMP
          * booting is too fragile that we want to limit the
          * things done here to the most necessary things.
          */
-       cpu_init();
+       secondary_cpu_init();
+       preempt_disable();
         smp_callin();
         while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
                 rep_nop();
@@ -458,9 +562,29 @@ int __init start_secondary(void *unused)
          * the local TLBs too.
          */
         local_flush_tlb();
+
+       /* This must be done before setting cpu_online_map */
+       set_cpu_sibling_map(raw_smp_processor_id());
+       wmb();
+
+       /*
+        * We need to hold call_lock, so there is no inconsistency
+        * between the time smp_call_function() determines number of
+        * IPI recipients, and the time when the determination is made
+        * for which cpus receive the IPI. Holding this
+        * lock helps us to not include this cpu in a currently in progress
+        * smp_call_function().
+        */
+       lock_ipi_call_lock();
         cpu_set(smp_processor_id(), cpu_online_map);
+       unlock_ipi_call_lock();
+       per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+
+       /* We can take interrupts now: we're officially "up". */
+       local_irq_enable();
+
         wmb();
-       return cpu_idle();
+       cpu_idle();
  }
  
  /*
@@ -469,8 +593,14 @@ int __init start_secondary(void *unused)
   * from the task structure
   * This function must not return.
   */
-void __init initialize_secondary(void)
+void __devinit initialize_secondary(void)
  {
+       /*
+        * switch to the per CPU GDT we already set up
+        * in do_boot_cpu()
+        */
+       cpu_set_gdt(current_thread_info()->cpu);
+
         /*
          * We don't actually need to load the full TSS,
          * basically just the stack pointer and the eip.
@@ -480,31 +610,25 @@ void __init initialize_secondary(void)
                 "movl %0,%%esp\n\t"
                 "jmp *%1"
                 :
-               :"r" (current->thread.esp),"r" (current->thread.eip));
+               :"m" (current->thread.esp),"m" (current->thread.eip));
  }
  
+/* Static state in head.S used to set up a CPU */
  extern struct {
         void * esp;
         unsigned short ss;
  } stack_start;
-
-static struct task_struct * __init fork_by_hand(void)
-{
-       struct pt_regs regs;
-       /*
-        * don't care about the eip and regs settings since
-        * we'll never reschedule the forked task.
-        */
-       return copy_process(CLONE_VM|CLONE_IDLETASK, 0, &regs, 0, NULL, NULL);
-}
+extern struct i386_pda *start_pda;
+extern struct Xgt_desc_struct cpu_gdt_descr;
  
  #ifdef CONFIG_NUMA
  
  /* which logical CPUs are on which nodes */
-cpumask_t node_2_cpu_mask[MAX_NUMNODES] =
+cpumask_t node_2_cpu_mask[MAX_NUMNODES] __read_mostly =
                                 { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE };
+EXPORT_SYMBOL(node_2_cpu_mask);
  /* which node each logical CPU is on */
-int cpu_2_node[NR_CPUS] = { [0 ... NR_CPUS-1] = 0 };
+int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
  EXPORT_SYMBOL(cpu_2_node);
  
  /* set up a mapping between cpu and node. */
@@ -532,18 +656,22 @@ static inline void unmap_cpu_to_node(int cpu)
  
  #endif /* CONFIG_NUMA */
  
-u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
+u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
  
-void map_cpu_to_logical_apicid(void)
+static void map_cpu_to_logical_apicid(void)
  {
         int cpu = smp_processor_id();
         int apicid = logical_smp_processor_id();
+       int node = apicid_to_node(apicid);
+
+       if (!node_online(node))
+               node = first_online_node;
  
         cpu_2_logical_apicid[cpu] = apicid;
-       map_cpu_to_node(cpu, apicid_to_node(apicid));
+       map_cpu_to_node(cpu, node);
  }
  
-void unmap_cpu_to_logical_apicid(int cpu)
+static void unmap_cpu_to_logical_apicid(int cpu)
  {
         cpu_2_logical_apicid[cpu] = BAD_APICID;
         unmap_cpu_to_node(cpu);
@@ -558,7 +686,7 @@ static inline void __inquire_remote_apic(int apicid)
  
         printk("Inquiring remote APIC #%d...\n", apicid);
  
-       for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) {
+       for (i = 0; i < ARRAY_SIZE(regs); i++) {
                 printk("... APIC #%d %s: ", apicid, names[i]);
  
                 /*
@@ -593,7 +721,7 @@ static inline void __inquire_remote_apic(int apicid)
   * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
   * won't ... remember to clear down the APIC, etc later.
   */
-static int __init
+static int __devinit
  wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
  {
         unsigned long send_status = 0, accept_status = 0;
@@ -639,7 +767,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
  #endif /* WAKE_SECONDARY_VIA_NMI */
  
  #ifdef WAKE_SECONDARY_VIA_INIT
-static int __init
+static int __devinit
  wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
  {
         unsigned long send_status = 0, accept_status = 0;
@@ -774,8 +902,42 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
  #endif /* WAKE_SECONDARY_VIA_INIT */
  
  extern cpumask_t cpu_initialized;
+static inline int alloc_cpu_id(void)
+{
+       cpumask_t       tmp_map;
+       int cpu;
+       cpus_complement(tmp_map, cpu_present_map);
+       cpu = first_cpu(tmp_map);
+       if (cpu >= NR_CPUS)
+               return -ENODEV;
+       return cpu;
+}
  
-static int __init do_boot_cpu(int apicid)
+#ifdef CONFIG_HOTPLUG_CPU
+static struct task_struct * __devinitdata cpu_idle_tasks[NR_CPUS];
+static inline struct task_struct * alloc_idle_task(int cpu)
+{
+       struct task_struct *idle;
+
+       if ((idle = cpu_idle_tasks[cpu]) != NULL) {
+               /* initialize thread_struct.  we really want to avoid destroying
+                * the idle thread
+                */
+               idle->thread.esp = (unsigned long)task_pt_regs(idle);
+               init_idle(idle, cpu);
+               return idle;
+       }
+       idle = fork_idle(cpu);
+
+       if (!IS_ERR(idle))
+               cpu_idle_tasks[cpu] = idle;
+       return idle;
+}
+#else
+#define alloc_idle_task(cpu) fork_idle(cpu)
+#endif
+
+static int __cpuinit do_boot_cpu(int apicid, int cpu)
  /*
   * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
   * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
@@ -784,33 +946,33 @@ static int __init do_boot_cpu(int apicid)
  {
         struct task_struct *idle;
         unsigned long boot_error;
-       int timeout, cpu;
+       int timeout;
         unsigned long start_eip;
         unsigned short nmi_high = 0, nmi_low = 0;
  
-       cpu = ++cpucount;
         /*
          * We can't use kernel_thread since we must avoid to
          * reschedule the child.
          */
-       idle = fork_by_hand();
+       idle = alloc_idle_task(cpu);
         if (IS_ERR(idle))
                 panic("failed fork for CPU %d", cpu);
-       wake_up_forked_process(idle);
  
-       /*
-        * We remove it from the pidhash and the runqueue
-        * once we got the process:
-        */
-       init_idle(idle, cpu);
+       /* Pre-allocate and initialize the CPU's GDT and PDA so it
+          doesn't have to do any memory allocation during the
+          delicate CPU-bringup phase. */
+       if (!init_gdt(cpu, idle)) {
+               printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu);
+               return -1;      /* ? */
+       }
  
         idle->thread.eip = (unsigned long) start_secondary;
-
-       unhash_process(idle);
-
         /* start_eip had better be page-aligned! */
         start_eip = setup_trampoline();
  
+       ++cpucount;
+       alternatives_smp_switch(1);
+
         /* So we see what's up   */
         printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
         /* Stack for startup_32 can be just as for start_secondary onwards */
@@ -818,6 +980,7 @@ static int __init do_boot_cpu(int apicid)
  
         irq_ctx_init(cpu);
  
+       x86_cpu_to_apicid[cpu] = apicid;
         /*
          * This grunge runs the startup process for
          * the targeted processor.
@@ -871,12 +1034,16 @@ static int __init do_boot_cpu(int apicid)
                         inquire_remote_apic(apicid);
                 }
         }
+
         if (boot_error) {
                 /* Try to put things back the way they were before ... */
                 unmap_cpu_to_logical_apicid(cpu);
                 cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
                 cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
                 cpucount--;
+       } else {
+               x86_cpu_to_apicid[cpu] = apicid;
+               cpu_set(cpu, cpu_present_map);
         }
  
         /* mark "stuck" area as not stuck */
@@ -885,48 +1052,96 @@ static int __init do_boot_cpu(int apicid)
         return boot_error;
  }
  
-cycles_t cacheflush_time;
-unsigned long cache_decay_ticks;
+#ifdef CONFIG_HOTPLUG_CPU
+void cpu_exit_clear(void)
+{
+       int cpu = raw_smp_processor_id();
+
+       idle_task_exit();
+
+       cpucount --;
+       cpu_uninit();
+       irq_ctx_exit(cpu);
  
-static void smp_tune_scheduling (void)
+       cpu_clear(cpu, cpu_callout_map);
+       cpu_clear(cpu, cpu_callin_map);
+
+       cpu_clear(cpu, smp_commenced_mask);
+       unmap_cpu_to_logical_apicid(cpu);
+}
+
+struct warm_boot_cpu_info {
+       struct completion *complete;
+       struct work_struct task;
+       int apicid;
+       int cpu;
+};
+
+static void __cpuinit do_warm_boot_cpu(struct work_struct *work)
  {
-       unsigned long cachesize;       /* kB   */
-       unsigned long bandwidth = 350; /* MB/s */
+       struct warm_boot_cpu_info *info =
+               container_of(work, struct warm_boot_cpu_info, task);
+       do_boot_cpu(info->apicid, info->cpu);
+       complete(info->complete);
+}
+
+static int __cpuinit __smp_prepare_cpu(int cpu)
+{
+       DECLARE_COMPLETION_ONSTACK(done);
+       struct warm_boot_cpu_info info;
+       int     apicid, ret;
+       struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
+
+       apicid = x86_cpu_to_apicid[cpu];
+       if (apicid == BAD_APICID) {
+               ret = -ENODEV;
+               goto exit;
+       }
+
         /*
-        * Rough estimation for SMP scheduling, this is the number of
-        * cycles it takes for a fully memory-limited process to flush
-        * the SMP-local cache.
-        *
-        * (For a P5 this pretty much means we will choose another idle
-        *  CPU almost always at wakeup time (this is due to the small
-        *  L1 cache), on PIIs it's around 50-100 usecs, depending on
-        *  the cache size)
+        * the CPU isn't initialized at boot time, allocate gdt table here
+        * (fail with -ENOMEM only if that allocation fails). cpu_init will initialize it
          */
+       if (!cpu_gdt_descr->address)
+               cpu_gdt_descr->address = get_zeroed_page(GFP_KERNEL);
+       if (!cpu_gdt_descr->address) {
+               printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
+               ret = -ENOMEM;
+               goto exit;
+       }
  
-       if (!cpu_khz) {
-               /*
-                * this basically disables processor-affinity
-                * scheduling on SMP without a TSC.
-                */
-               cacheflush_time = 0;
-               return;
-       } else {
-               cachesize = boot_cpu_data.x86_cache_size;
-               if (cachesize == -1) {
-                       cachesize = 16; /* Pentiums, 2x8kB cache */
-                       bandwidth = 100;
-               }
+       info.complete = &done;
+       info.apicid = apicid;
+       info.cpu = cpu;
+       INIT_WORK(&info.task, do_warm_boot_cpu);
  
-               cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth;
-       }
+       tsc_sync_disabled = 1;
+
+       /* init low mem mapping */
+       clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
+                       min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS));
+       flush_tlb_all();
+       schedule_work(&info.task);
+       wait_for_completion(&done);
+
+       tsc_sync_disabled = 0;
+       zap_low_mappings();
+       ret = 0;
+exit:
+       return ret;
+}
+#endif
+
+static void smp_tune_scheduling(void)
+{
+       unsigned long cachesize;       /* kB   */
  
-       cache_decay_ticks = (long)cacheflush_time/cpu_khz + 1;
+       if (cpu_khz) {
+               cachesize = boot_cpu_data.x86_cache_size;
  
-       printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n",
-               (long)cacheflush_time/(cpu_khz/1000),
-               ((long)cacheflush_time*100/(cpu_khz/1000)) % 100);
-       printk("task migration cache decay timeout: %ld msecs.\n",
-               cache_decay_ticks);
+               if (cachesize > 0)
+                       max_cache_size = cachesize * 1024;
+       }
  }
  
  /*
@@ -936,8 +1151,9 @@ static void smp_tune_scheduling (void)
  static int boot_cpu_logical_apicid;
  /* Where the IO area was mapped on multiquad, always 0 otherwise */
  void *xquad_portio;
-
-cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
+#ifdef CONFIG_X86_NUMAQ
+EXPORT_SYMBOL(xquad_portio);
+#endif
  
  static void __init smp_boot_cpus(unsigned int max_cpus)
  {
@@ -953,11 +1169,12 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
  
         boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
         boot_cpu_logical_apicid = logical_smp_processor_id();
+       x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
  
         current_thread_info()->cpu = 0;
         smp_tune_scheduling();
-       cpus_clear(cpu_sibling_map[0]);
-       cpu_set(0, cpu_sibling_map[0]);
+
+       set_cpu_sibling_map(0);
  
         /*
          * If we couldn't find an SMP configuration at boot time,
@@ -971,6 +1188,8 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
                         printk(KERN_NOTICE "Local APIC not detected."
                                            " Using dummy APIC emulation.\n");
                 map_cpu_to_logical_apicid();
+               cpu_set(0, cpu_sibling_map[0]);
+               cpu_set(0, cpu_core_map[0]);
                 return;
         }
  
@@ -994,6 +1213,8 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
                 printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
                 smpboot_clear_io_apic_irqs();
                 phys_cpu_present_map = physid_mask_of_physid(0);
+               cpu_set(0, cpu_sibling_map[0]);
+               cpu_set(0, cpu_core_map[0]);
                 return;
         }
  
@@ -1007,6 +1228,8 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
                 printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
                 smpboot_clear_io_apic_irqs();
                 phys_cpu_present_map = physid_mask_of_physid(0);
+               cpu_set(0, cpu_sibling_map[0]);
+               cpu_set(0, cpu_core_map[0]);
                 return;
         }
  
@@ -1040,7 +1263,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
                 if (max_cpus <= cpucount+1)
                         continue;
  
-               if (do_boot_cpu(apicid))
+               if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu))
                         printk("CPU #%d not responding - cannot use it.\n",
                                                                 apicid);
                 else
@@ -1087,35 +1310,13 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
          * construct cpu_sibling_map[], so that we can tell sibling CPUs
          * efficiently.
          */
-       for (cpu = 0; cpu < NR_CPUS; cpu++)
-               cpus_clear(cpu_sibling_map[cpu]);
-
         for (cpu = 0; cpu < NR_CPUS; cpu++) {
-               int siblings = 0;
-               int i;
-               if (!cpu_isset(cpu, cpu_callout_map))
-                       continue;
-
-               if (smp_num_siblings > 1) {
-                       for (i = 0; i < NR_CPUS; i++) {
-                               if (!cpu_isset(i, cpu_callout_map))
-                                       continue;
-                               if (phys_proc_id[cpu] == phys_proc_id[i]) {
-                                       siblings++;
-                                       cpu_set(i, cpu_sibling_map[cpu]);
-                               }
-                       }
-               } else {
-                       siblings++;
-                       cpu_set(cpu, cpu_sibling_map[cpu]);
-               }
-
-               if (siblings != smp_num_siblings)
-                       printk(KERN_WARNING "WARNING: %d siblings found for CPU%d, should be %d\n", siblings, cpu, smp_num_siblings);
+               cpus_clear(cpu_sibling_map[cpu]);
+               cpus_clear(cpu_core_map[cpu]);
         }
  
-       if (nmi_watchdog == NMI_LOCAL_APIC)
-               check_nmi_watchdog();
+       cpu_set(0, cpu_sibling_map[0]);
+       cpu_set(0, cpu_core_map[0]);
  
         smpboot_setup_io_apic();
  
@@ -1128,241 +1329,149 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
                 synchronize_tsc_bp();
  }
  
-#ifdef CONFIG_SCHED_SMT
-#ifdef CONFIG_NUMA
-static struct sched_group sched_group_cpus[NR_CPUS];
-static struct sched_group sched_group_phys[NR_CPUS];
-static struct sched_group sched_group_nodes[MAX_NUMNODES];
-static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
-static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
-__init void arch_init_sched_domains(void)
+/* These are wrappers to interface to the new boot process.  Someone
+   who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
+void __init smp_prepare_cpus(unsigned int max_cpus)
  {
-       int i;
-       struct sched_group *first = NULL, *last = NULL;
-
-       /* Set up domains */
-       for_each_cpu(i) {
-               struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
-               struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
-               struct sched_domain *node_domain = &per_cpu(node_domains, i);
-               int node = cpu_to_node(i);
-               cpumask_t nodemask = node_to_cpumask(node);
-
-               *cpu_domain = SD_SIBLING_INIT;
-               cpu_domain->span = cpu_sibling_map[i];
-               cpu_domain->parent = phys_domain;
-               cpu_domain->groups = &sched_group_cpus[i];
-
-               *phys_domain = SD_CPU_INIT;
-               phys_domain->span = nodemask;
-               phys_domain->parent = node_domain;
-               phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)];
-
-               *node_domain = SD_NODE_INIT;
-               node_domain->span = cpu_possible_map;
-               node_domain->groups = &sched_group_nodes[cpu_to_node(i)];
-       }
-
-       /* Set up CPU (sibling) groups */
-       for_each_cpu(i) {
-               struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
-               int j;
-               first = last = NULL;
-
-               if (i != first_cpu(cpu_domain->span))
-                       continue;
-
-               for_each_cpu_mask(j, cpu_domain->span) {
-                       struct sched_group *cpu = &sched_group_cpus[j];
-
-                       cpu->cpumask = CPU_MASK_NONE;
-                       cpu_set(j, cpu->cpumask);
-                       cpu->cpu_power = SCHED_LOAD_SCALE;
-
-                       if (!first)
-                               first = cpu;
-                       if (last)
-                               last->next = cpu;
-                       last = cpu;
-               }
-               last->next = first;
-       }
-
-       for (i = 0; i < MAX_NUMNODES; i++) {
-               int j;
-               cpumask_t nodemask;
-               struct sched_group *node = &sched_group_nodes[i];
-               cpus_and(nodemask, node_to_cpumask(i), cpu_possible_map);
-
-               if (cpus_empty(nodemask))
-                       continue;
-
-               first = last = NULL;
-               /* Set up physical groups */
-               for_each_cpu_mask(j, nodemask) {
-                       struct sched_domain *cpu_domain = &per_cpu(cpu_domains, j);
-                       struct sched_group *cpu = &sched_group_phys[j];
-
-                       if (j != first_cpu(cpu_domain->span))
-                               continue;
-
-                       cpu->cpumask = cpu_domain->span;
-                       /*
-                        * Make each extra sibling increase power by 10% of
-                        * the basic CPU. This is very arbitrary.
-                        */
-                       cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10;
-                       node->cpu_power += cpu->cpu_power;
-
-                       if (!first)
-                               first = cpu;
-                       if (last)
-                               last->next = cpu;
-                       last = cpu;
-               }
-               last->next = first;
-       }
-
-       /* Set up nodes */
-       first = last = NULL;
-       for (i = 0; i < MAX_NUMNODES; i++) {
-               struct sched_group *cpu = &sched_group_nodes[i];
-               cpumask_t nodemask;
-               cpus_and(nodemask, node_to_cpumask(i), cpu_possible_map);
-
-               if (cpus_empty(nodemask))
-                       continue;
-
-               cpu->cpumask = nodemask;
-               /* ->cpu_power already setup */
-
-               if (!first)
-                       first = cpu;
-               if (last)
-                       last->next = cpu;
-               last = cpu;
-       }
-       last->next = first;
-
+       smp_commenced_mask = cpumask_of_cpu(0);
+       cpu_callin_map = cpumask_of_cpu(0);
         mb();
-       for_each_cpu(i) {
-               struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
-               cpu_attach_domain(cpu_domain, i);
-       }
+       smp_boot_cpus(max_cpus);
  }
-#else /* !CONFIG_NUMA */
-static struct sched_group sched_group_cpus[NR_CPUS];
-static struct sched_group sched_group_phys[NR_CPUS];
-static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
-static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-__init void arch_init_sched_domains(void)
-{
-       int i;
-       struct sched_group *first = NULL, *last = NULL;
  
-       /* Set up domains */
-       for_each_cpu(i) {
-               struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
-               struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
+void __devinit smp_prepare_boot_cpu(void)
+{      /* Register the CPU we are running on in each of the CPU maps below. */
+       cpu_set(smp_processor_id(), cpu_online_map);
+       cpu_set(smp_processor_id(), cpu_callout_map);
+       cpu_set(smp_processor_id(), cpu_present_map);
+       cpu_set(smp_processor_id(), cpu_possible_map);
+       per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; /* and record its per-CPU state as online */
+}
  
-               *cpu_domain = SD_SIBLING_INIT;
-               cpu_domain->span = cpu_sibling_map[i];
-               cpu_domain->parent = phys_domain;
-               cpu_domain->groups = &sched_group_cpus[i];
+#ifdef CONFIG_HOTPLUG_CPU
+static void
+remove_siblinginfo(int cpu)
+{
+       int sibling;
+       struct cpuinfo_x86 *c = cpu_data;
  
-               *phys_domain = SD_CPU_INIT;
-               phys_domain->span = cpu_possible_map;
-               phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)];
+       for_each_cpu_mask(sibling, cpu_core_map[cpu]) {
+               cpu_clear(cpu, cpu_core_map[sibling]);
+               /*
+                * last thread sibling in this cpu core going down
+                */
+               if (cpus_weight(cpu_sibling_map[cpu]) == 1)
+                       c[sibling].booted_cores--;
         }
+                       
+       for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
+               cpu_clear(cpu, cpu_sibling_map[sibling]);
+       cpus_clear(cpu_sibling_map[cpu]);
+       cpus_clear(cpu_core_map[cpu]);
+       c[cpu].phys_proc_id = 0;
+       c[cpu].cpu_core_id = 0;
+       cpu_clear(cpu, cpu_sibling_setup_map);
+}
  
-       /* Set up CPU (sibling) groups */
-       for_each_cpu(i) {
-               struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
-               int j;
-               first = last = NULL;
+int __cpu_disable(void)
+{
+       cpumask_t map = cpu_online_map;
+       int cpu = smp_processor_id();
  
-               if (i != first_cpu(cpu_domain->span))
-                       continue;
+       /*
+        * Perhaps use cpufreq to drop frequency, but that could go
+        * into generic code.
+        *
+        * We won't take down the boot processor on i386 due to some
+        * interrupts only being able to be serviced by the BSP.
+        * Especially so if we're not using an IOAPIC   -zwane
+        */
+       if (cpu == 0)
+               return -EBUSY;
+       if (nmi_watchdog == NMI_LOCAL_APIC)
+               stop_apic_nmi_watchdog(NULL);
+       clear_local_APIC();
+       /* Allow any queued timer interrupts to get serviced */
+       local_irq_enable();
+       mdelay(1);
+       local_irq_disable();
  
-               for_each_cpu_mask(j, cpu_domain->span) {
-                       struct sched_group *cpu = &sched_group_cpus[j];
+       remove_siblinginfo(cpu);
  
-                       cpus_clear(cpu->cpumask);
-                       cpu_set(j, cpu->cpumask);
-                       cpu->cpu_power = SCHED_LOAD_SCALE;
+       cpu_clear(cpu, map);
+       fixup_irqs(map);
+       /* It's now safe to remove this processor from the online map */
+       cpu_clear(cpu, cpu_online_map);
+       return 0;
+}
  
-                       if (!first)
-                               first = cpu;
-                       if (last)
-                               last->next = cpu;
-                       last = cpu;
+void __cpu_die(unsigned int cpu)
+{
+       /* Nothing to tear down here: the idle task fakes its own death; we just poll for its CPU_DEAD ack. */
+       unsigned int i;
+
+       for (i = 0; i < 10; i++) {
+               /* They ack this in play_dead by setting CPU_DEAD */
+               if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
+                       printk ("CPU %d is now offline\n", cpu);
+                       if (1 == num_online_cpus())
+                               alternatives_smp_switch(0);
+                       return;
                 }
-               last->next = first;
-       }
-
-       first = last = NULL;
-       /* Set up physical groups */
-       for_each_cpu(i) {
-               struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
-               struct sched_group *cpu = &sched_group_phys[i];
-
-               if (i != first_cpu(cpu_domain->span))
-                       continue;
-
-               cpu->cpumask = cpu_domain->span;
-               /* See SMT+NUMA setup for comment */
-               cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10;
-
-               if (!first)
-                       first = cpu;
-               if (last)
-                       last->next = cpu;
-               last = cpu;
-       }
-       last->next = first;
-
-       mb();
-       for_each_cpu(i) {
-               struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
-               cpu_attach_domain(cpu_domain, i);
+               msleep(100);
         }
+       printk(KERN_ERR "CPU %u didn't die...\n", cpu);
  }
-#endif /* CONFIG_NUMA */
-#endif /* CONFIG_SCHED_SMT */
-
-/* These are wrappers to interface to the new boot process.  Someone
-   who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
-void __init smp_prepare_cpus(unsigned int max_cpus)
+#else /* ... !CONFIG_HOTPLUG_CPU */
+int __cpu_disable(void)
  {
-       smp_boot_cpus(max_cpus);
+       return -ENOSYS;
  }
  
-void __devinit smp_prepare_boot_cpu(void)
+void __cpu_die(unsigned int cpu)
  {
-       cpu_set(smp_processor_id(), cpu_online_map);
-       cpu_set(smp_processor_id(), cpu_callout_map);
+       /* We said "no" in __cpu_disable */
+       BUG();
  }
+#endif /* CONFIG_HOTPLUG_CPU */
  
-int __devinit __cpu_up(unsigned int cpu)
+int __cpuinit __cpu_up(unsigned int cpu)
  {
-       /* This only works at boot for x86.  See "rewrite" above. */
-       if (cpu_isset(cpu, smp_commenced_mask)) {
-               local_irq_enable();
-               return -ENOSYS;
-       }
+#ifdef CONFIG_HOTPLUG_CPU
+       int ret=0;
+
+       /*
+        * We do a warm boot only on cpus that had booted earlier;
+        * otherwise cold boot is all handled from smp_boot_cpus().
+        * cpu_callin_map is set during the AP kickstart process. It is
+        * reset from cpu_exit_clear() when a cpu is taken offline.
+        */
+       if (!cpu_isset(cpu, cpu_callin_map))
+               ret = __smp_prepare_cpu(cpu);
+
+       if (ret)
+               return -EIO;
+#endif
  
         /* In case one didn't come up */
         if (!cpu_isset(cpu, cpu_callin_map)) {
+               printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu);
                 local_irq_enable();
                 return -EIO;
         }
  
         local_irq_enable();
+       per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
         /* Unleash the CPU! */
         cpu_set(cpu, smp_commenced_mask);
         while (!cpu_isset(cpu, cpu_online_map))
-               mb();
+               cpu_relax();
+
+#ifdef CONFIG_X86_GENERICARCH
+       if (num_online_cpus() > 8 && genapic == &apic_default)
+               panic("Default flat APIC routing can't be used with > 8 cpus\n");
+#endif
+
         return 0;
  }
  
@@ -1372,6 +1481,12 @@ void __init smp_cpus_done(unsigned int max_cpus)
         setup_ioapic_dest();
  #endif
         zap_low_mappings();
+#ifndef CONFIG_HOTPLUG_CPU
+       /*
+        * Disable executability of the SMP trampoline:
+        */
+       set_kernel_exec((unsigned long)trampoline_base, trampoline_exec);
+#endif
  }
  
  void __init smp_intr_init(void)
@@ -1394,3 +1509,16 @@ void __init smp_intr_init(void)
         /* IPI for generic function call */
         set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
  }
+
+/*
+ * Early "maxcpus=N" handler: stores N in maxcpus. If the BIOS enumerates physical
+ * processors before logical, maxcpus=N at enumeration-time can be used to disable HT.
+ */
+static int __init parse_maxcpus(char *arg)
+{      /* NOTE(review): arg is passed on unchecked - confirm it cannot be NULL for a bare "maxcpus". */
+       extern unsigned int maxcpus; /* declared here; the definition is not in this function */
+
+       maxcpus = simple_strtoul(arg, NULL, 0); /* base 0: the prefix selects hex/octal/decimal */
+       return 0; /* always reports success, whatever was parsed */
+}
+early_param("maxcpus", parse_maxcpus);