X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;ds=sidebyside;f=arch%2Fx86_64%2Fkernel%2Fsmpboot.c;h=28e351d002d8d5146774e79c640fa793c16b8e02;hb=43bc926fffd92024b46cafaf7350d669ba9ca884;hp=f1ec0f34594124f73feb76dcf0f270dc420dfca5;hpb=cee37fe97739d85991964371c1f3a745c00dd236;p=linux-2.6.git diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index f1ec0f345..28e351d00 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -34,6 +34,7 @@ * Andi Kleen : Converted to new state machine. * Various cleanups. * Probably mostly hotplug CPU ready now. + * Ashok Raj : CPU hotplug support */ @@ -43,7 +44,6 @@ #include #include #include -#include #include #include #include @@ -57,22 +57,22 @@ #include #include #include - -/* Change for real CPU hotplug. Note other files need to be fixed - first too. */ -#define __cpuinit __init -#define __cpuinitdata __initdata +#include +#include +#include /* Number of siblings per CPU package */ int smp_num_siblings = 1; /* Package ID of each logical CPU */ -u8 phys_proc_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; -u8 cpu_core_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; -EXPORT_SYMBOL(phys_proc_id); -EXPORT_SYMBOL(cpu_core_id); +u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; +/* core ID of each logical CPU */ +u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; + +/* Last level cache ID of each logical CPU */ +u8 cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID}; /* Bitmask of currently online CPUs */ -cpumask_t cpu_online_map; +cpumask_t cpu_online_map __read_mostly; EXPORT_SYMBOL(cpu_online_map); @@ -92,8 +92,11 @@ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; /* Set when the idlers are all forked */ int smp_threads_ready; -cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; -cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned; +/* representing HT siblings of each logical CPU */ +cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly; + +/* representing HT and core siblings of each logical CPU */ +cpumask_t cpu_core_map[NR_CPUS] __read_mostly; EXPORT_SYMBOL(cpu_core_map); /* @@ -103,6 +106,19 @@ EXPORT_SYMBOL(cpu_core_map); extern unsigned char trampoline_data[]; extern unsigned char trampoline_end[]; +/* State of each CPU */ +DEFINE_PER_CPU(int, cpu_state) = { 0 }; + +/* + * Store all idle threads, this can be reused instead of creating + * a new thread. Also avoids complicated thread destroy functionality + * for idle threads. + */ +struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; + +#define get_idle_for_cpu(x) (idle_thread_array[(x)]) +#define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p)) + /* * Currently trivial. Write the real->protected mode * bootstrap into the page concerned. The caller @@ -202,9 +218,6 @@ static __cpuinit void sync_master(void *arg) { unsigned long flags, i; - if (smp_processor_id() != boot_cpu_id) - return; - go[MASTER] = 0; local_irq_save(flags); @@ -253,12 +266,12 @@ get_delta(long *rt, long *master) return tcenter - best_tm; } -static __cpuinit void sync_tsc(void) +static __cpuinit void sync_tsc(unsigned int master) { int i, done = 0; long delta, adj, adjust_latency = 0; unsigned long flags, rt, master_time_stamp, bound; -#if DEBUG_TSC_SYNC +#ifdef DEBUG_TSC_SYNC static struct syncdebug { long rt; /* roundtrip time */ long master; /* master's timestamp */ @@ -267,9 +280,17 @@ static __cpuinit void sync_tsc(void) } t[NUM_ROUNDS] __cpuinitdata; #endif + printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", + smp_processor_id(), master); + go[MASTER] = 1; - smp_call_function(sync_master, NULL, 1, 0); + /* It is dangerous to broadcast IPI as cpus are coming up, + * as they may not be ready to accept them. So since + * we only need to send the ipi to the boot cpu direct + * the message, and avoid the race. + */ + smp_call_function_single(master, sync_master, NULL, 1, 0); while (go[MASTER]) /* wait for master to be ready */ no_cpu_relax(); @@ -294,7 +315,7 @@ static __cpuinit void sync_tsc(void) rdtscll(t); wrmsrl(MSR_IA32_TSC, t + adj); } -#if DEBUG_TSC_SYNC +#ifdef DEBUG_TSC_SYNC t[i].rt = rt; t[i].master = master_time_stamp; t[i].diff = delta; @@ -304,7 +325,7 @@ static __cpuinit void sync_tsc(void) } spin_unlock_irqrestore(&tsc_sync_lock, flags); -#if DEBUG_TSC_SYNC +#ifdef DEBUG_TSC_SYNC for (i = 0; i < NUM_ROUNDS; ++i) printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n", t[i].rt, t[i].master, t[i].diff, t[i].lat); @@ -313,22 +334,26 @@ static __cpuinit void sync_tsc(void) printk(KERN_INFO "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, " "maxerr %lu cycles)\n", - smp_processor_id(), boot_cpu_id, delta, rt); + smp_processor_id(), master, delta, rt); } static void __cpuinit tsc_sync_wait(void) { - if (notscsync || !cpu_has_tsc) + /* + * When the CPU has synchronized TSCs assume the BIOS + * or the hardware already synced. Otherwise we could + * mess up a possible perfect synchronization with a + * not-quite-perfect algorithm. + */ + if (notscsync || !cpu_has_tsc || !unsynchronized_tsc()) return; - printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", smp_processor_id(), - boot_cpu_id); - sync_tsc(); + sync_tsc(0); } static __init int notscsync_setup(char *s) { notscsync = 1; - return 0; + return 1; } __setup("notscsync", notscsync_setup); @@ -401,8 +426,13 @@ void __cpuinit smp_callin(void) /* * Get our bogomips. + * + * Need to enable IRQs because it can take longer and then + * the NMI watchdog might kill us. */ + local_irq_enable(); calibrate_delay(); + local_irq_disable(); Dprintk("Stack at about %p\n",&cpuid); disable_APIC_timer(); @@ -418,6 +448,83 @@ void __cpuinit smp_callin(void) cpu_set(cpuid, cpu_callin_map); } +/* maps the cpu to the sched domain representing multi-core */ +cpumask_t cpu_coregroup_map(int cpu) +{ + struct cpuinfo_x86 *c = cpu_data + cpu; + /* + * For perf, we return last level cache shared map. + * TBD: when power saving sched policy is added, we will return + * cpu_core_map when power saving policy is enabled + */ + return c->llc_shared_map; +} + +/* representing cpus for which sibling maps can be computed */ +static cpumask_t cpu_sibling_setup_map; + +static inline void set_cpu_sibling_map(int cpu) +{ + int i; + struct cpuinfo_x86 *c = cpu_data; + + cpu_set(cpu, cpu_sibling_setup_map); + + if (smp_num_siblings > 1) { + for_each_cpu_mask(i, cpu_sibling_setup_map) { + if (phys_proc_id[cpu] == phys_proc_id[i] && + cpu_core_id[cpu] == cpu_core_id[i]) { + cpu_set(i, cpu_sibling_map[cpu]); + cpu_set(cpu, cpu_sibling_map[i]); + cpu_set(i, cpu_core_map[cpu]); + cpu_set(cpu, cpu_core_map[i]); + cpu_set(i, c[cpu].llc_shared_map); + cpu_set(cpu, c[i].llc_shared_map); + } + } + } else { + cpu_set(cpu, cpu_sibling_map[cpu]); + } + + cpu_set(cpu, c[cpu].llc_shared_map); + + if (current_cpu_data.x86_max_cores == 1) { + cpu_core_map[cpu] = cpu_sibling_map[cpu]; + c[cpu].booted_cores = 1; + return; + } + + for_each_cpu_mask(i, cpu_sibling_setup_map) { + if (cpu_llc_id[cpu] != BAD_APICID && + cpu_llc_id[cpu] == cpu_llc_id[i]) { + cpu_set(i, c[cpu].llc_shared_map); + cpu_set(cpu, c[i].llc_shared_map); + } + if (phys_proc_id[cpu] == phys_proc_id[i]) { + cpu_set(i, cpu_core_map[cpu]); + cpu_set(cpu, cpu_core_map[i]); + /* + * Does this new cpu bringup a new core? + */ + if (cpus_weight(cpu_sibling_map[cpu]) == 1) { + /* + * for each core in package, increment + * the booted_cores for this new cpu + */ + if (first_cpu(cpu_sibling_map[i]) == i) + c[cpu].booted_cores++; + /* + * increment the core count for all + * the other cpus in this package + */ + if (i != cpu) + c[i].booted_cores++; + } else if (i != cpu && !c[cpu].booted_cores) + c[cpu].booted_cores = c[i].booted_cores; + } + } +} + /* * Setup code on secondary processor (after comming out of the trampoline) */ @@ -429,6 +536,7 @@ void __cpuinit start_secondary(void) * things done here to the most necessary things. */ cpu_init(); + preempt_disable(); smp_callin(); /* otherwise gcc will move up the smp_processor_id before the cpu_init */ @@ -447,16 +555,36 @@ void __cpuinit start_secondary(void) enable_APIC_timer(); + /* + * The sibling maps must be set before turing the online map on for + * this cpu + */ + set_cpu_sibling_map(smp_processor_id()); + + /* + * Wait for TSC sync to not schedule things before. + * We still process interrupts, which could see an inconsistent + * time in that window unfortunately. + * Do this here because TSC sync has global unprotected state. + */ + tsc_sync_wait(); + + /* + * We need to hold call_lock, so there is no inconsistency + * between the time smp_call_function() determines number of + * IPI receipients, and the time when the determination is made + * for which cpus receive the IPI in genapic_flat.c. Holding this + * lock helps us to not include this cpu in a currently in progress + * smp_call_function(). + */ + lock_ipi_call_lock(); + /* * Allow the master to continue. */ cpu_set(smp_processor_id(), cpu_online_map); - mb(); - - /* Wait for TSC sync to not schedule things before. - We still process interrupts, which could see an inconsistent - time in that window unfortunately. */ - tsc_sync_wait(); + per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; + unlock_ipi_call_lock(); cpu_idle(); } @@ -464,7 +592,7 @@ void __cpuinit start_secondary(void) extern volatile unsigned long init_rsp; extern void (*initial_code)(void); -#if APIC_DEBUG +#ifdef APIC_DEBUG static void inquire_remote_apic(int apicid) { unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; @@ -481,8 +609,8 @@ static void inquire_remote_apic(int apicid) */ apic_wait_icr_idle(); - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); - apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); + apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]); timeout = 0; do { @@ -515,12 +643,12 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta /* * Turn INIT on target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); /* * Send IPI */ - apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT + apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT); Dprintk("Waiting for send to finish...\n"); @@ -536,10 +664,10 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta Dprintk("Deasserting INIT.\n"); /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); /* Send IPI */ - apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); + apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); Dprintk("Waiting for send to finish...\n"); timeout = 0; @@ -549,18 +677,10 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; } while (send_status && (timeout++ < 1000)); + mb(); atomic_set(&init_deasserted, 1); - /* - * Should we send STARTUP IPIs ? - * - * Determine this based on the APIC version. - * If we don't have an integrated APIC, don't send the STARTUP IPIs. - */ - if (APIC_INTEGRATED(apic_version[phys_apicid])) - num_starts = 2; - else - num_starts = 0; + num_starts = 2; /* * Run STARTUP IPI loop. @@ -571,7 +691,6 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta for (j = 1; j <= num_starts; j++) { Dprintk("Sending STARTUP #%d.\n",j); - apic_read_around(APIC_SPIV); apic_write(APIC_ESR, 0); apic_read(APIC_ESR); Dprintk("After apic_write.\n"); @@ -581,12 +700,11 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta */ /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); /* Boot on the stack */ /* Kick the second */ - apic_write_around(APIC_ICR, APIC_DM_STARTUP - | (start_rip >> 12)); + apic_write(APIC_ICR, APIC_DM_STARTUP | (start_rip >> 12)); /* * Give the other CPU some time to accept the IPI. @@ -611,7 +729,6 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta * Due to the Pentium erratum 3AP. */ if (maxlvt > 3) { - apic_read_around(APIC_SPIV); apic_write(APIC_ESR, 0); } accept_status = (apic_read(APIC_ESR) & 0xEF); @@ -628,36 +745,105 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta return (send_status | accept_status); } +struct create_idle { + struct task_struct *idle; + struct completion done; + int cpu; +}; + +void do_fork_idle(void *_c_idle) +{ + struct create_idle *c_idle = _c_idle; + + c_idle->idle = fork_idle(c_idle->cpu); + complete(&c_idle->done); +} + /* * Boot one CPU. */ static int __cpuinit do_boot_cpu(int cpu, int apicid) { - struct task_struct *idle; unsigned long boot_error; int timeout; unsigned long start_rip; + struct create_idle c_idle = { + .cpu = cpu, + .done = COMPLETION_INITIALIZER(c_idle.done), + }; + DECLARE_WORK(work, do_fork_idle, &c_idle); + + /* allocate memory for gdts of secondary cpus. Hotplug is considered */ + if (!cpu_gdt_descr[cpu].address && + !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) { + printk(KERN_ERR "Failed to allocate GDT for CPU %d\n", cpu); + return -1; + } + + /* Allocate node local memory for AP pdas */ + if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) { + struct x8664_pda *newpda, *pda; + int node = cpu_to_node(cpu); + pda = cpu_pda(cpu); + newpda = kmalloc_node(sizeof (struct x8664_pda), GFP_ATOMIC, + node); + if (newpda) { + memcpy(newpda, pda, sizeof (struct x8664_pda)); + cpu_pda(cpu) = newpda; + } else + printk(KERN_ERR + "Could not allocate node local PDA for CPU %d on node %d\n", + cpu, node); + } + + + c_idle.idle = get_idle_for_cpu(cpu); + + if (c_idle.idle) { + c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *) + (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1); + init_idle(c_idle.idle, cpu); + goto do_rest; + } + /* - * We can't use kernel_thread since we must avoid to - * reschedule the child. + * During cold boot process, keventd thread is not spun up yet. + * When we do cpu hot-add, we create idle threads on the fly, we should + * not acquire any attributes from the calling context. Hence the clean + * way to create kernel_threads() is to do that from keventd(). + * We do the current_is_keventd() due to the fact that ACPI notifier + * was also queuing to keventd() and when the caller is already running + * in context of keventd(), we would end up with locking up the keventd + * thread. */ - idle = fork_idle(cpu); - if (IS_ERR(idle)) { + if (!keventd_up() || current_is_keventd()) + work.func(work.data); + else { + schedule_work(&work); + wait_for_completion(&c_idle.done); + } + + if (IS_ERR(c_idle.idle)) { printk("failed fork for CPU %d\n", cpu); - return PTR_ERR(idle); + return PTR_ERR(c_idle.idle); } - cpu_pda[cpu].pcurrent = idle; + set_idle_for_cpu(cpu, c_idle.idle); + +do_rest: + + cpu_pda(cpu)->pcurrent = c_idle.idle; start_rip = setup_trampoline(); - init_rsp = idle->thread.rsp; + init_rsp = c_idle.idle->thread.rsp; per_cpu(init_tss,cpu).rsp0 = init_rsp; initial_code = start_secondary; - clear_ti_thread_flag(idle->thread_info, TIF_FORK); + clear_tsk_thread_flag(c_idle.idle, TIF_FORK); - printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid, - start_rip, init_rsp); + printk(KERN_INFO "Booting processor %d/%d APIC 0x%x\n", cpu, + cpus_weight(cpu_present_map), + apicid); /* * This grunge runs the startup process for @@ -679,11 +865,8 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) /* * Be paranoid about clearing APIC errors. */ - if (APIC_INTEGRATED(apic_version[apicid])) { - apic_read_around(APIC_SPIV); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - } + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); /* * Status is now clean @@ -724,7 +907,7 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) else /* trampoline code not run */ printk("Not responding.\n"); -#if APIC_DEBUG +#ifdef APIC_DEBUG inquire_remote_apic(apicid); #endif } @@ -732,6 +915,7 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) if (boot_error) { cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ + clear_node_cpumask(cpu); /* was set by numa_add_cpu */ cpu_clear(cpu, cpu_present_map); cpu_clear(cpu, cpu_possible_map); x86_cpu_to_apicid[cpu] = BAD_APICID; @@ -745,51 +929,6 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) cycles_t cacheflush_time; unsigned long cache_decay_ticks; -/* - * Construct cpu_sibling_map[], so that we can tell the sibling CPU - * on SMT systems efficiently. - */ -static __cpuinit void detect_siblings(void) -{ - int cpu; - - for (cpu = 0; cpu < NR_CPUS; cpu++) { - cpus_clear(cpu_sibling_map[cpu]); - cpus_clear(cpu_core_map[cpu]); - } - - for_each_online_cpu (cpu) { - struct cpuinfo_x86 *c = cpu_data + cpu; - int siblings = 0; - int i; - if (smp_num_siblings > 1) { - for_each_online_cpu (i) { - if (cpu_core_id[cpu] == cpu_core_id[i]) { - siblings++; - cpu_set(i, cpu_sibling_map[cpu]); - } - } - } else { - siblings++; - cpu_set(cpu, cpu_sibling_map[cpu]); - } - - if (siblings != smp_num_siblings) { - printk(KERN_WARNING - "WARNING: %d siblings found for CPU%d, should be %d\n", - siblings, cpu, smp_num_siblings); - smp_num_siblings = siblings; - } - if (c->x86_num_cores > 1) { - for_each_online_cpu(i) { - if (phys_proc_id[cpu] == phys_proc_id[i]) - cpu_set(i, cpu_core_map[cpu]); - } - } else - cpu_core_map[cpu] = cpu_sibling_map[cpu]; - } -} - /* * Cleanup possible dangling ends... */ @@ -805,17 +944,6 @@ static __cpuinit void smp_cleanup_boot(void) * Reset trampoline flag */ *((volatile int *) phys_to_virt(0x467)) = 0; - -#ifndef CONFIG_HOTPLUG_CPU - /* - * Free pages reserved for SMP bootup. - * When you add hotplug CPU support later remove this - * Note there is more work to be done for later CPU bootup. - */ - - free_page((unsigned long) __va(PAGE_SIZE)); - free_page((unsigned long) __va(SMP_TRAMPOLINE_BASE)); -#endif } /* @@ -823,7 +951,7 @@ static __cpuinit void smp_cleanup_boot(void) * * RED-PEN audit/test this more. I bet there is more state messed up here. */ -static __cpuinit void disable_smp(void) +static __init void disable_smp(void) { cpu_present_map = cpumask_of_cpu(0); cpu_possible_map = cpumask_of_cpu(0); @@ -835,27 +963,60 @@ static __cpuinit void disable_smp(void) cpu_set(0, cpu_core_map[0]); } +#ifdef CONFIG_HOTPLUG_CPU + +int additional_cpus __initdata = -1; + /* - * Handle user cpus=... parameter. + * cpu_possible_map should be static, it cannot change as cpu's + * are onlined, or offlined. The reason is per-cpu data-structures + * are allocated by some modules at init time, and dont expect to + * do this dynamically on cpu arrival/departure. + * cpu_present_map on the other hand can change dynamically. + * In case when cpu_hotplug is not compiled, then we resort to current + * behaviour, which is cpu_possible == cpu_present. + * - Ashok Raj + * + * Three ways to find out the number of additional hotplug CPUs: + * - If the BIOS specified disabled CPUs in ACPI/mptables use that. + * - The user can overwrite it with additional_cpus=NUM + * - Otherwise don't reserve additional CPUs. + * We do this because additional CPUs waste a lot of memory. + * -AK */ -static __cpuinit void enforce_max_cpus(unsigned max_cpus) +__init void prefill_possible_map(void) { - int i, k; - k = 0; - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_possible(i)) - continue; - if (++k > max_cpus) { - cpu_clear(i, cpu_possible_map); - cpu_clear(i, cpu_present_map); - } + int i; + int possible; + + if (additional_cpus == -1) { + if (disabled_cpus > 0) + additional_cpus = disabled_cpus; + else + additional_cpus = 0; + } + possible = num_processors + additional_cpus; + if (possible > NR_CPUS) + possible = NR_CPUS; + + if (possible == 0) { /* Could be SMP kernel on UP hw with broken BIOS */ + possible = 1; + printk (KERN_DEBUG "BIOS never enumerated boot CPU, fixing.\n"); } + + printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", + possible, + max_t(int, possible - num_processors, 0)); + + for (i = 0; i < possible; i++) + cpu_set(i, cpu_possible_map); } +#endif /* * Various sanity checks. */ -static int __cpuinit smp_sanity_check(unsigned max_cpus) +static int __init smp_sanity_check(unsigned max_cpus) { if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { printk("weird, boot CPU (#%d) not listed by the BIOS.\n", @@ -889,7 +1050,7 @@ static int __cpuinit smp_sanity_check(unsigned max_cpus) /* * If we couldn't find a local APIC, then get out of here now! */ - if (APIC_INTEGRATED(apic_version[boot_cpu_id]) && !cpu_has_apic) { + if (!cpu_has_apic) { printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", boot_cpu_id); printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n"); @@ -913,28 +1074,12 @@ static int __cpuinit smp_sanity_check(unsigned max_cpus) * Prepare for SMP bootup. The MP table or ACPI has been read * earlier. Just do some sanity checking here and enable APIC mode. */ -void __cpuinit smp_prepare_cpus(unsigned int max_cpus) +void __init smp_prepare_cpus(unsigned int max_cpus) { - int i; - nmi_watchdog_default(); current_cpu_data = boot_cpu_data; current_thread_info()->cpu = 0; /* needed? */ - - enforce_max_cpus(max_cpus); - - /* - * Fill in cpu_present_mask - */ - for (i = 0; i < NR_CPUS; i++) { - int apicid = cpu_present_to_apicid(i); - if (physid_isset(apicid, phys_cpu_present_map)) { - cpu_set(i, cpu_present_map); - /* possible map would be different if we supported real - CPU hotplug. */ - cpu_set(i, cpu_possible_map); - } - } + set_cpu_sibling_map(0); if (smp_sanity_check(max_cpus) < 0) { printk(KERN_INFO "SMP disabled\n"); @@ -978,13 +1123,11 @@ void __init smp_prepare_boot_cpu(void) int me = smp_processor_id(); cpu_set(me, cpu_online_map); cpu_set(me, cpu_callout_map); + per_cpu(cpu_state, me) = CPU_ONLINE; } /* * Entry point to boot a CPU. - * - * This is all __cpuinit, not __devinit for now because we don't support - * CPU hotplug (yet). */ int __cpuinit __cpu_up(unsigned int cpu) { @@ -1001,6 +1144,15 @@ int __cpuinit __cpu_up(unsigned int cpu) return -EINVAL; } + /* + * Already booted CPU? + */ + if (cpu_isset(cpu, cpu_callin_map)) { + Dprintk("do_boot_cpu %d Already started\n", cpu); + return -ENOSYS; + } + + per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; /* Boot it! */ err = do_boot_cpu(cpu, apicid); if (err < 0) { @@ -1013,23 +1165,128 @@ int __cpuinit __cpu_up(unsigned int cpu) while (!cpu_isset(cpu, cpu_online_map)) cpu_relax(); - return 0; + err = 0; + + return err; } /* * Finish the SMP boot. */ -void __cpuinit smp_cpus_done(unsigned int max_cpus) +void __init smp_cpus_done(unsigned int max_cpus) { - zap_low_mappings(); smp_cleanup_boot(); #ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); #endif - detect_siblings(); - time_init_gtod(); - check_nmi_watchdog(); } + +#ifdef CONFIG_HOTPLUG_CPU + +static void remove_siblinginfo(int cpu) +{ + int sibling; + struct cpuinfo_x86 *c = cpu_data; + + for_each_cpu_mask(sibling, cpu_core_map[cpu]) { + cpu_clear(cpu, cpu_core_map[sibling]); + /* + * last thread sibling in this cpu core going down + */ + if (cpus_weight(cpu_sibling_map[cpu]) == 1) + c[sibling].booted_cores--; + } + + for_each_cpu_mask(sibling, cpu_sibling_map[cpu]) + cpu_clear(cpu, cpu_sibling_map[sibling]); + cpus_clear(cpu_sibling_map[cpu]); + cpus_clear(cpu_core_map[cpu]); + phys_proc_id[cpu] = BAD_APICID; + cpu_core_id[cpu] = BAD_APICID; + cpu_clear(cpu, cpu_sibling_setup_map); +} + +void remove_cpu_from_maps(void) +{ + int cpu = smp_processor_id(); + + cpu_clear(cpu, cpu_callout_map); + cpu_clear(cpu, cpu_callin_map); + clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ + clear_node_cpumask(cpu); +} + +int __cpu_disable(void) +{ + int cpu = smp_processor_id(); + + /* + * Perhaps use cpufreq to drop frequency, but that could go + * into generic code. + * + * We won't take down the boot processor on i386 due to some + * interrupts only being able to be serviced by the BSP. + * Especially so if we're not using an IOAPIC -zwane + */ + if (cpu == 0) + return -EBUSY; + + clear_local_APIC(); + + /* + * HACK: + * Allow any queued timer interrupts to get serviced + * This is only a temporary solution until we cleanup + * fixup_irqs as we do for IA64. + */ + local_irq_enable(); + mdelay(1); + + local_irq_disable(); + remove_siblinginfo(cpu); + + /* It's now safe to remove this processor from the online map */ + cpu_clear(cpu, cpu_online_map); + remove_cpu_from_maps(); + fixup_irqs(cpu_online_map); + return 0; +} + +void __cpu_die(unsigned int cpu) +{ + /* We don't do anything here: idle task is faking death itself. */ + unsigned int i; + + for (i = 0; i < 10; i++) { + /* They ack this in play_dead by setting CPU_DEAD */ + if (per_cpu(cpu_state, cpu) == CPU_DEAD) { + printk ("CPU %d is now offline\n", cpu); + return; + } + msleep(100); + } + printk(KERN_ERR "CPU %u didn't die...\n", cpu); +} + +__init int setup_additional_cpus(char *s) +{ + return get_option(&s, &additional_cpus); +} +__setup("additional_cpus=", setup_additional_cpus); + +#else /* ... !CONFIG_HOTPLUG_CPU */ + +int __cpu_disable(void) +{ + return -ENOSYS; +} + +void __cpu_die(unsigned int cpu) +{ + /* We said "no" in __cpu_disable */ + BUG(); +} +#endif /* CONFIG_HOTPLUG_CPU */