fedora core 6 1.2949 + vserver 2.2.0
[linux-2.6.git] drivers/xen/core/smpboot.c
/*
 *	Xen SMP booting functions
 *
 *	See arch/i386/kernel/smpboot.c for copyright and credits for derived
 *	portions of this file.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/smp_lock.h>
#include <linux/irq.h>
#include <linux/bootmem.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <asm/desc.h>
#include <asm/arch_hooks.h>
#include <asm/pgalloc.h>
#if defined(__i386__)
#include <asm/pda.h>
#endif
#include <xen/evtchn.h>
#include <xen/interface/vcpu.h>
#include <xen/cpu_hotplug.h>
#include <xen/xenbus.h>

extern irqreturn_t smp_reschedule_interrupt(int, void *);
extern irqreturn_t smp_call_function_interrupt(int, void *);

extern void local_setup_timer(unsigned int cpu);
extern void local_teardown_timer(unsigned int cpu);

extern void hypervisor_callback(void);
extern void failsafe_callback(void);
extern void system_call(void);
extern void smp_trap_init(trap_info_t *);

/* Number of siblings per CPU package */
int smp_num_siblings = 1;
EXPORT_SYMBOL(smp_num_siblings);
#if defined(__i386__)
int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
#elif defined(__x86_64__)
u8 cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
#endif
EXPORT_SYMBOL(cpu_llc_id);

cpumask_t cpu_online_map;
EXPORT_SYMBOL(cpu_online_map);
cpumask_t cpu_possible_map;
EXPORT_SYMBOL(cpu_possible_map);

struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
EXPORT_SYMBOL(cpu_data);

#ifdef CONFIG_HOTPLUG_CPU
DEFINE_PER_CPU(int, cpu_state) = { 0 };
#endif

static DEFINE_PER_CPU(int, resched_irq);
static DEFINE_PER_CPU(int, callfunc_irq);
static char resched_name[NR_CPUS][15];
static char callfunc_name[NR_CPUS][15];

u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };

void *xquad_portio;

cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
EXPORT_SYMBOL(cpu_sibling_map);
cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned;
EXPORT_SYMBOL(cpu_core_map);

#if defined(__i386__)
u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = 0xff };
EXPORT_SYMBOL(x86_cpu_to_apicid);
#endif

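/*
 * Mark every VCPU the hypervisor knows about as possible.  VCPUOP_is_up
 * returns >= 0 for any VCPU that exists (whether or not it is currently
 * running) and an error for VCPU numbers beyond the domain's allocation.
 */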
void __init prefill_possible_map(void)
{
        int i, rc;

        for (i = 0; i < NR_CPUS; i++) {
                rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
                if (rc >= 0)
                        cpu_set(i, cpu_possible_map);
        }
}

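/*
 * Nothing to allocate: unlike native SMP boot, no real-mode trampoline is
 * needed, since secondary VCPUs are started via hypercall.
 */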
void __init smp_alloc_memory(void)
{
}

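/*
 * Xen exposes no CPU topology to the guest, so present each VCPU as a
 * single-core, single-thread package of its own.
 */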
static inline void
set_cpu_sibling_map(int cpu)
{
        cpu_data[cpu].phys_proc_id = cpu;
        cpu_data[cpu].cpu_core_id = 0;

        cpu_sibling_map[cpu] = cpumask_of_cpu(cpu);
        cpu_core_map[cpu]    = cpumask_of_cpu(cpu);

        cpu_data[cpu].booted_cores = 1;
}

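/*
 * Bind the per-CPU IPI event channels.  Under Xen the reschedule and
 * call-function IPIs arrive as event-channel upcalls rather than APIC
 * interrupts, so each is bound to an ordinary irq handler here.
 */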
static void xen_smp_intr_init(unsigned int cpu)
{
        sprintf(resched_name[cpu], "resched%d", cpu);
        per_cpu(resched_irq, cpu) =
                bind_ipi_to_irqhandler(
                        RESCHEDULE_VECTOR,
                        cpu,
                        smp_reschedule_interrupt,
                        SA_INTERRUPT,
                        resched_name[cpu],
                        NULL);
        BUG_ON(per_cpu(resched_irq, cpu) < 0);

        sprintf(callfunc_name[cpu], "callfunc%d", cpu);
        per_cpu(callfunc_irq, cpu) =
                bind_ipi_to_irqhandler(
                        CALL_FUNCTION_VECTOR,
                        cpu,
                        smp_call_function_interrupt,
                        SA_INTERRUPT,
                        callfunc_name[cpu],
                        NULL);
        BUG_ON(per_cpu(callfunc_irq, cpu) < 0);

        if (cpu != 0)
                local_setup_timer(cpu);
}

#ifdef CONFIG_HOTPLUG_CPU
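/* Undo the timer setup and event-channel bindings made by xen_smp_intr_init(). */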
static void xen_smp_intr_exit(unsigned int cpu)
{
        if (cpu != 0)
                local_teardown_timer(cpu);

        unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
        unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
}
#endif

#ifdef __i386__
static inline void set_kernel_gs(void)
{
        /* Set %gs for this CPU's PDA.  The memory clobber creates a
           barrier with respect to any PDA operations, so the compiler
           doesn't reorder any to before this point. */
        asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory");
}
#endif

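/*
 * First code run by a secondary VCPU.  Finishes per-CPU initialisation and
 * enables interrupts; cpu_bringup_and_idle() below then drops into the
 * idle loop.
 */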
void cpu_bringup(void)
{
#ifdef __i386__
        set_kernel_gs();
        secondary_cpu_init();
#else
        cpu_init();
#endif
        touch_softlockup_watchdog();
        preempt_disable();
        local_irq_enable();
}

static void cpu_bringup_and_idle(void)
{
        cpu_bringup();
        cpu_idle();
}

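/*
 * Build the initial register and descriptor-table state for a secondary
 * VCPU and hand it to the hypervisor.  This replaces the native INIT/SIPI
 * sequence: once VCPUOP_up is issued from __cpu_up(), the VCPU begins
 * executing at cpu_bringup_and_idle() on the idle task's stack.
 */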
void cpu_initialize_context(unsigned int cpu)
{
        vcpu_guest_context_t ctxt;
        struct task_struct *idle = idle_task(cpu);
#ifdef __x86_64__
        struct desc_ptr *gdt_descr = &cpu_gdt_descr[cpu];
#else
        struct Xgt_desc_struct *gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
#endif

        if (cpu == 0)
                return;

        memset(&ctxt, 0, sizeof(ctxt));

        ctxt.flags = VGCF_IN_KERNEL;
        ctxt.user_regs.ds = __USER_DS;
        ctxt.user_regs.es = __USER_DS;
        ctxt.user_regs.fs = 0;
        ctxt.user_regs.gs = 0;
        ctxt.user_regs.ss = __KERNEL_DS;
        ctxt.user_regs.eip = (unsigned long)cpu_bringup_and_idle;
        ctxt.user_regs.eflags = X86_EFLAGS_IF | 0x1000; /* IOPL_RING1 */

        memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));

        smp_trap_init(ctxt.trap_ctxt);

        ctxt.ldt_ents = 0;

        ctxt.gdt_frames[0] = virt_to_mfn(gdt_descr->address);
        ctxt.gdt_ents      = gdt_descr->size / 8;

#ifdef __i386__
        ctxt.user_regs.cs = __KERNEL_CS;
        ctxt.user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);

        ctxt.kernel_ss = __KERNEL_DS;
        ctxt.kernel_sp = idle->thread.esp0;

        ctxt.event_callback_cs     = __KERNEL_CS;
        ctxt.event_callback_eip    = (unsigned long)hypervisor_callback;
        ctxt.failsafe_callback_cs  = __KERNEL_CS;
        ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;

        ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
#else /* __x86_64__ */
        ctxt.user_regs.cs = __KERNEL_CS;
        ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);

        ctxt.kernel_ss = __KERNEL_DS;
        ctxt.kernel_sp = idle->thread.rsp0;

        ctxt.event_callback_eip    = (unsigned long)hypervisor_callback;
        ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
        ctxt.syscall_callback_eip  = (unsigned long)system_call;

        ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));

        ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu));
#endif

        BUG_ON(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt));
}

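/*
 * Prepare all possible secondary CPUs: fake up identity APIC ids, allocate
 * and publish a per-CPU GDT (made read-only, as Xen requires unless the
 * writable_descriptor_tables feature is present), fork an idle task, and
 * register each VCPU's initial context with the hypervisor.
 */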
void __init smp_prepare_cpus(unsigned int max_cpus)
{
        int cpu;
        struct task_struct *idle;
#ifdef __x86_64__
        struct desc_ptr *gdt_descr;
#else
        struct Xgt_desc_struct *gdt_descr;
#endif

        boot_cpu_data.apicid = 0;
        cpu_data[0] = boot_cpu_data;

        cpu_2_logical_apicid[0] = 0;
        x86_cpu_to_apicid[0] = 0;

        current_thread_info()->cpu = 0;

        for (cpu = 0; cpu < NR_CPUS; cpu++) {
                cpus_clear(cpu_sibling_map[cpu]);
                cpus_clear(cpu_core_map[cpu]);
        }

        set_cpu_sibling_map(0);

        xen_smp_intr_init(0);

        /* Restrict the possible_map according to max_cpus. */
        while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
                for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--)
                        continue;
                cpu_clear(cpu, cpu_possible_map);
        }

        for_each_possible_cpu (cpu) {
#ifdef __i386__
                struct i386_pda *pda;
                struct desc_struct *gdt;
#endif

                if (cpu == 0)
                        continue;

#ifdef __x86_64__
                gdt_descr = &cpu_gdt_descr[cpu];
#else
                gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
#endif
                gdt_descr->address = get_zeroed_page(GFP_KERNEL);
                if (unlikely(!gdt_descr->address)) {
                        printk(KERN_CRIT "CPU%d failed to allocate GDT\n",
                               cpu);
                        continue;
                }
                gdt_descr->size = GDT_SIZE;
                memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE);
#ifdef __i386__
                gdt = (struct desc_struct *)gdt_descr->address;
                pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu));

                if (unlikely(!pda)) {
                        printk(KERN_CRIT "CPU%d failed to allocate PDA\n",
                               cpu);
                        continue;
                }
                cpu_pda(cpu) = pda;
                cpu_pda(cpu)->cpu_number = cpu;
                pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a,
                                (u32 *)&gdt[GDT_ENTRY_PDA].b,
                                (unsigned long)pda, sizeof(*pda) - 1,
                                0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */
#endif
                make_page_readonly(
                        (void *)gdt_descr->address,
                        XENFEAT_writable_descriptor_tables);

                cpu_data[cpu] = boot_cpu_data;
                cpu_data[cpu].apicid = cpu;

                cpu_2_logical_apicid[cpu] = cpu;
                x86_cpu_to_apicid[cpu] = cpu;

                idle = fork_idle(cpu);
                if (IS_ERR(idle))
                        panic("failed fork for CPU %d", cpu);

                cpu_pda(cpu)->pcurrent = idle;
#ifdef __x86_64__
                cpu_pda(cpu)->cpunumber = cpu;
                clear_ti_thread_flag(idle->thread_info, TIF_FORK);
#endif

                irq_ctx_init(cpu);

#ifdef CONFIG_HOTPLUG_CPU
                if (is_initial_xendomain())
                        cpu_set(cpu, cpu_present_map);
#else
                cpu_set(cpu, cpu_present_map);
#endif

                cpu_initialize_context(cpu);
        }

        init_xenbus_allowed_cpumask();

        /*
         * If the system provides an IO-APIC (and we have not been told
         * to skip it), set it up now:
         */
#ifdef CONFIG_X86_IO_APIC
        if (!skip_ioapic_setup && nr_ioapics)
                setup_IO_APIC();
#endif
}

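/* Nothing to do: the boot VCPU was fully set up during early boot. */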
void __init smp_prepare_boot_cpu(void)
{
}

#ifdef CONFIG_HOTPLUG_CPU

/*
 * Initialize cpu_present_map late to skip SMP boot code in init/main.c.
 * But do it early enough to catch critical for_each_present_cpu() loops
 * in i386-specific code.
 */
static int __init initialize_cpu_present_map(void)
{
        cpu_present_map = cpu_possible_map;
        return 0;
}
core_initcall(initialize_cpu_present_map);

static void
remove_siblinginfo(int cpu)
{
        cpu_data[cpu].phys_proc_id = BAD_APICID;
        cpu_data[cpu].cpu_core_id = BAD_APICID;

        cpus_clear(cpu_sibling_map[cpu]);
        cpus_clear(cpu_core_map[cpu]);

        cpu_data[cpu].booted_cores = 0;
}

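/*
 * Runs on the CPU being offlined: drop it from the sibling maps, reroute
 * its irqs to the surviving CPUs, and clear it from the online map.
 * CPU 0 is never offlined.
 */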
int __cpu_disable(void)
{
        cpumask_t map = cpu_online_map;
        int cpu = smp_processor_id();

        if (cpu == 0)
                return -EBUSY;

        remove_siblinginfo(cpu);

        cpu_clear(cpu, map);
        fixup_irqs(map);
        cpu_clear(cpu, cpu_online_map);

        return 0;
}

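/*
 * Runs on a surviving CPU: poll until the hypervisor reports the dying
 * VCPU as down, then release its IPI bindings.  Once only one CPU remains,
 * the SMP alternatives are patched back to their UP forms.
 */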
void __cpu_die(unsigned int cpu)
{
        while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
                current->state = TASK_UNINTERRUPTIBLE;
                schedule_timeout(HZ/10);
        }

        xen_smp_intr_exit(cpu);

        if (num_online_cpus() == 1)
                alternatives_smp_switch(0);
}

#else /* !CONFIG_HOTPLUG_CPU */

int __cpu_disable(void)
{
        return -ENOSYS;
}

void __cpu_die(unsigned int cpu)
{
        BUG();
}

#endif /* CONFIG_HOTPLUG_CPU */

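/*
 * Bring a secondary CPU online.  The heavy lifting (GDT, idle task, VCPU
 * context) was already done in smp_prepare_cpus(); all that remains is to
 * bind the IPI event channels, mark the CPU online and ask the hypervisor
 * to start the VCPU.
 */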
int __cpuinit __cpu_up(unsigned int cpu)
{
        int rc;

        rc = cpu_up_check(cpu);
        if (rc)
                return rc;

        if (num_online_cpus() == 1)
                alternatives_smp_switch(1);

        /* This must be done before setting cpu_online_map */
        set_cpu_sibling_map(cpu);
        wmb();

        xen_smp_intr_init(cpu);
        cpu_set(cpu, cpu_online_map);

        rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
        BUG_ON(rc);

        return 0;
}

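/* Nothing left to do once all CPUs are up. */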
void __init smp_cpus_done(unsigned int max_cpus)
{
}

#ifdef CONFIG_X86_MPPARSE
/*
 * If the BIOS enumerates physical processors before logical,
 * maxcpus=N at enumeration-time can be used to disable HT.
 */
static int __init parse_maxcpus(char *arg)
{
        extern unsigned int maxcpus;

        maxcpus = simple_strtoul(arg, NULL, 0);
        return 0;
}
early_param("maxcpus", parse_maxcpus);
#endif

#if defined(CONFIG_XEN_UNPRIVILEGED_GUEST) && defined(CONFIG_X86_32)
int setup_profiling_timer(unsigned int multiplier)
{
        return -EINVAL;
}
#endif