[linux-2.6.git] arch/i386/kernel/nmi.c  (fedora core 6 1.2949 + vserver 2.2.0)
1 /*
2  *  linux/arch/i386/nmi.c
3  *
4  *  NMI watchdog support on APIC systems
5  *
6  *  Started by Ingo Molnar <mingo@redhat.com>
7  *
8  *  Fixes:
9  *  Mikael Pettersson   : AMD K7 support for local APIC NMI watchdog.
10  *  Mikael Pettersson   : Power Management for local APIC NMI watchdog.
11  *  Mikael Pettersson   : Pentium 4 support for local APIC NMI watchdog.
12  *  Pavel Machek and
13  *  Mikael Pettersson   : PM converted to driver model. Disable/enable API.
14  */
15
16 #include <linux/delay.h>
17 #include <linux/interrupt.h>
18 #include <linux/module.h>
19 #include <linux/nmi.h>
20 #include <linux/sysdev.h>
21 #include <linux/sysctl.h>
22 #include <linux/percpu.h>
23 #include <linux/dmi.h>
24 #include <linux/kprobes.h>
25 #include <linux/cpumask.h>
26
27 #include <asm/smp.h>
28 #include <asm/nmi.h>
29 #include <asm/kdebug.h>
30 #include <asm/intel_arch_perfmon.h>
31
32 #include "mach_traps.h"
33
34 int unknown_nmi_panic;
35 int nmi_watchdog_enabled;
36
37 /* perfctr_nmi_owner tracks the ownership of the perfctr registers;
38  * evntsel_nmi_owner tracks the ownership of the event selection registers.
39  * Different performance counters / event selections may be reserved by
40  * different subsystems; this reservation system just tries to coordinate
41  * things a little.
42  */
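/*
 * Illustrative sketch of how a subsystem (such as oprofile) is expected to
 * use the reservation helpers defined below; error handling and the actual
 * MSR programming are elided:
 *
 *	if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0))
 *		return -EBUSY;
 *	if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0)) {
 *		release_perfctr_nmi(MSR_K7_PERFCTR0);
 *		return -EBUSY;
 *	}
 *	... program the event select and counter MSRs ...
 *	release_evntsel_nmi(MSR_K7_EVNTSEL0);
 *	release_perfctr_nmi(MSR_K7_PERFCTR0);
 */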
43
44 /* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and its
45  * offset from MSR_P4_BSU_ESCR0.  It is the maximum across all platforms (for now)
46  */
47 #define NMI_MAX_COUNTER_BITS 66
48 #define NMI_MAX_COUNTER_LONGS BITS_TO_LONGS(NMI_MAX_COUNTER_BITS)
49
50 static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner[NMI_MAX_COUNTER_LONGS]);
51 static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[NMI_MAX_COUNTER_LONGS]);
52
53 static cpumask_t backtrace_mask = CPU_MASK_NONE;
54 /* nmi_active:
55  * >0: the lapic NMI watchdog is active, but can be disabled
56  * <0: the lapic NMI watchdog has not been set up, and cannot
57  *     be enabled
58  *  0: the lapic NMI watchdog is disabled, but can be enabled
59  */
60 atomic_t nmi_active = ATOMIC_INIT(0);           /* oprofile uses this */
61
62 unsigned int nmi_watchdog = NMI_NONE;
63 static unsigned int nmi_hz = HZ;
64
65 struct nmi_watchdog_ctlblk {
66         int enabled;
67         u64 check_bit;             /* counter bit tested in the NMI handler; still set means no overflow yet */
68         unsigned int cccr_msr;
69         unsigned int perfctr_msr;  /* the MSR to reset in NMI handler */
70         unsigned int evntsel_msr;  /* the MSR to select the events to handle */
71 };
72 static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
73
74 /* local prototypes */
75 static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
76
77 extern void show_registers(struct pt_regs *regs);
78 extern int unknown_nmi_panic;
79
80 /* converts an msr to an appropriate reservation bit */
81 static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
82 {
83         /* returns the bit offset of the performance counter register */
84         switch (boot_cpu_data.x86_vendor) {
85         case X86_VENDOR_AMD:
86                 return (msr - MSR_K7_PERFCTR0);
87         case X86_VENDOR_INTEL:
88                 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
89                         return (msr - MSR_ARCH_PERFMON_PERFCTR0);
90
91                 switch (boot_cpu_data.x86) {
92                 case 6:
93                         return (msr - MSR_P6_PERFCTR0);
94                 case 15:
95                         return (msr - MSR_P4_BPU_PERFCTR0);
96                 }
97         }
98         return 0;
99 }
100
101 /* converts an msr to an appropriate reservation bit */
102 static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
103 {
104         /* returns the bit offset of the event selection register */
105         switch (boot_cpu_data.x86_vendor) {
106         case X86_VENDOR_AMD:
107                 return (msr - MSR_K7_EVNTSEL0);
108         case X86_VENDOR_INTEL:
109                 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
110                         return (msr - MSR_ARCH_PERFMON_EVENTSEL0);
111
112                 switch (boot_cpu_data.x86) {
113                 case 6:
114                         return (msr - MSR_P6_EVNTSEL0);
115                 case 15:
116                         return (msr - MSR_P4_BSU_ESCR0);
117                 }
118         }
119         return 0;
120 }
121
122 /* checks whether a counter bit is available (hack for oprofile) */
123 int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
124 {
125         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
126
127         return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
128 }
129
130 /* checks an msr for availability */
131 int avail_to_resrv_perfctr_nmi(unsigned int msr)
132 {
133         unsigned int counter;
134
135         counter = nmi_perfctr_msr_to_bit(msr);
136         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
137
138         return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
139 }
140
141 int reserve_perfctr_nmi(unsigned int msr)
142 {
143         unsigned int counter;
144
145         counter = nmi_perfctr_msr_to_bit(msr);
146         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
147
148         if (!test_and_set_bit(counter, &__get_cpu_var(perfctr_nmi_owner)))
149                 return 1;
150         return 0;
151 }
152
153 void release_perfctr_nmi(unsigned int msr)
154 {
155         unsigned int counter;
156
157         counter = nmi_perfctr_msr_to_bit(msr);
158         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
159
160         clear_bit(counter, &__get_cpu_var(perfctr_nmi_owner));
161 }
162
163 int reserve_evntsel_nmi(unsigned int msr)
164 {
165         unsigned int counter;
166
167         counter = nmi_evntsel_msr_to_bit(msr);
168         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
169
170         if (!test_and_set_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]))
171                 return 1;
172         return 0;
173 }
174
175 void release_evntsel_nmi(unsigned int msr)
176 {
177         unsigned int counter;
178
179         counter = nmi_evntsel_msr_to_bit(msr);
180         BUG_ON(counter > NMI_MAX_COUNTER_BITS);
181
182         clear_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]);
183 }
184
185 static __cpuinit inline int nmi_known_cpu(void)
186 {
187         switch (boot_cpu_data.x86_vendor) {
188         case X86_VENDOR_AMD:
189                 return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
190         case X86_VENDOR_INTEL:
191                 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
192                         return 1;
193                 else
194                         return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
195         }
196         return 0;
197 }
198
199 static int endflag __initdata = 0;
200
201 #ifdef CONFIG_SMP
202 /* The performance counters used by NMI_LOCAL_APIC don't trigger when
203  * the CPU is idle. To make sure the NMI watchdog really ticks on all
204  * CPUs during the test make them busy.
205  */
206 static __init void nmi_cpu_busy(void *data)
207 {
208         local_irq_enable_in_hardirq();
209         /* Intentionally don't use cpu_relax here. This is
210            to make sure that the performance counter really ticks,
211            even if there is a simulator or similar that catches the
212            pause instruction. On a real HT machine this is fine because
213            all other CPUs are busy with "useless" delay loops and don't
214            care if they get somewhat fewer cycles. */
215         while (endflag == 0)
216                 mb();
217 }
218 #endif
219
220 static int __init check_nmi_watchdog(void)
221 {
222         unsigned int *prev_nmi_count;
223         int cpu;
224
225         /* Enable the NMI watchdog on newer systems.
226            Probably safe on most older systems too, but let's be careful.
227            IBM ThinkPads use INT10 inside SMM, which allows an early NMI inside SMM
228            and hangs the system. Disable the watchdog for all ThinkPads. */
229         if (nmi_watchdog == NMI_DEFAULT && dmi_get_year(DMI_BIOS_DATE) >= 2004 &&
230                 !dmi_name_in_vendors("ThinkPad"))
231                 nmi_watchdog = NMI_LOCAL_APIC;
232
233         if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT))
234                 return 0;
235
236         if (!atomic_read(&nmi_active))
237                 return 0;
238
239         prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
240         if (!prev_nmi_count)
241                 return -1;
242
243         printk(KERN_INFO "Testing NMI watchdog ... ");
244
245         if (nmi_watchdog == NMI_LOCAL_APIC)
246                 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
247
248         for_each_possible_cpu(cpu)
249                 prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count;
250         local_irq_enable();
251         mdelay((10*1000)/nmi_hz); /* wait 10 ticks */
252
253         for_each_possible_cpu(cpu) {
254 #ifdef CONFIG_SMP
255                 /* Check cpu_callin_map here because that is set
256                    after the timer is started. */
257                 if (!cpu_isset(cpu, cpu_callin_map))
258                         continue;
259 #endif
260                 if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled)
261                         continue;
262                 if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
263                         printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
264                                 cpu,
265                                 prev_nmi_count[cpu],
266                                 nmi_count(cpu));
267                         per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0;
268                         atomic_dec(&nmi_active);
269                 }
270         }
271         if (!atomic_read(&nmi_active)) {
272                 kfree(prev_nmi_count);
273                 atomic_set(&nmi_active, -1);
274                 return -1;
275         }
276         endflag = 1;
277         printk("OK.\n");
278
279         /* now that we know it works we can reduce NMI frequency to
280            something more reasonable; makes a difference in some configs */
281         if (nmi_watchdog == NMI_LOCAL_APIC) {
282                 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
283
284                 nmi_hz = 1;
285                 /*
286                  * On Intel CPUs with ARCH_PERFMON only 32 bits of the counter
287                  * are writable, with the higher bits sign-extended from bit 31.
288                  * So we can only program the counter with a 31-bit magnitude:
289                  * bit 31 of the written value must be set so that bits 32..63
290                  * sign-extend to 1 as well.  Find the appropriate nmi_hz.
291                  */
292                 if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 &&
293                         ((u64)cpu_khz * 1000) > 0x7fffffffULL) {
294                         u64 count = (u64)cpu_khz * 1000;
295                         do_div(count, 0x7fffffffUL);
296                         nmi_hz = count + 1;
297                 }
298         }
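        /*
         * Worked example (illustrative): on a 3 GHz CPU, cpu_khz * 1000 is
         * 3,000,000,000, which is larger than 0x7fffffff (~2.1e9), so the
         * division above yields 1 and nmi_hz becomes 2; the counter is then
         * reloaded with roughly -1.5e9, which fits in 31 bits.
         */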
299
300         kfree(prev_nmi_count);
301         return 0;
302 }
303 /* This needs to happen later in boot so counters are working */
304 late_initcall(check_nmi_watchdog);
305
306 static int __init setup_nmi_watchdog(char *str)
307 {
308         int nmi;
309
310         get_option(&str, &nmi);
311
312         if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
313                 return 0;
314
315         nmi_watchdog = nmi;
316         return 1;
317 }
318
319 __setup("nmi_watchdog=", setup_nmi_watchdog);
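/*
 * Usage note (illustrative): the integer given on the command line maps onto
 * the NMI_* values from <asm/nmi.h>; historically "nmi_watchdog=1" selects
 * the IO-APIC watchdog (NMI_IO_APIC), "nmi_watchdog=2" the local APIC one
 * (NMI_LOCAL_APIC), and "nmi_watchdog=0" disables it.
 */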
320
321 static void disable_lapic_nmi_watchdog(void)
322 {
323         BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
324
325         if (atomic_read(&nmi_active) <= 0)
326                 return;
327
328         on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
329
330         BUG_ON(atomic_read(&nmi_active) != 0);
331 }
332
333 static void enable_lapic_nmi_watchdog(void)
334 {
335         BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
336
337         /* are we already enabled */
338         if (atomic_read(&nmi_active) != 0)
339                 return;
340
342         /* do we know how to program this CPU's local APIC watchdog? */
342         if (nmi_known_cpu() <= 0)
343                 return;
344
345         on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
346         touch_nmi_watchdog();
347 }
348
349 void disable_timer_nmi_watchdog(void)
350 {
351         BUG_ON(nmi_watchdog != NMI_IO_APIC);
352
353         if (atomic_read(&nmi_active) <= 0)
354                 return;
355
356         disable_irq(0);
357         on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
358
359         BUG_ON(atomic_read(&nmi_active) != 0);
360 }
361
362 void enable_timer_nmi_watchdog(void)
363 {
364         BUG_ON(nmi_watchdog != NMI_IO_APIC);
365
366         if (atomic_read(&nmi_active) == 0) {
367                 touch_nmi_watchdog();
368                 on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
369                 enable_irq(0);
370         }
371 }
372
373 #ifdef CONFIG_PM
374
375 static int nmi_pm_active; /* nmi_active before suspend */
376
377 static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
378 {
379         /* only CPU0 goes here, other CPUs should be offline */
380         nmi_pm_active = atomic_read(&nmi_active);
381         stop_apic_nmi_watchdog(NULL);
382         BUG_ON(atomic_read(&nmi_active) != 0);
383         return 0;
384 }
385
386 static int lapic_nmi_resume(struct sys_device *dev)
387 {
388         /* only CPU0 goes here, other CPUs should be offline */
389         if (nmi_pm_active > 0) {
390                 setup_apic_nmi_watchdog(NULL);
391                 touch_nmi_watchdog();
392         }
393         return 0;
394 }
395
396
397 static struct sysdev_class nmi_sysclass = {
398         set_kset_name("lapic_nmi"),
399         .resume         = lapic_nmi_resume,
400         .suspend        = lapic_nmi_suspend,
401 };
402
403 static struct sys_device device_lapic_nmi = {
404         .id     = 0,
405         .cls    = &nmi_sysclass,
406 };
407
408 static int __init init_lapic_nmi_sysfs(void)
409 {
410         int error;
411
412         /* should really be a BUG_ON, but because this is an
413          * init call it just doesn't work.  -dcz
414          */
415         if (nmi_watchdog != NMI_LOCAL_APIC)
416                 return 0;
417
418         if ( atomic_read(&nmi_active) < 0 )
419                 return 0;
420
421         error = sysdev_class_register(&nmi_sysclass);
422         if (!error)
423                 error = sysdev_register(&device_lapic_nmi);
424         return error;
425 }
426 /* must come after the local APIC's device_initcall() */
427 late_initcall(init_lapic_nmi_sysfs);
428
429 #endif  /* CONFIG_PM */
430
431 /*
432  * Activate the NMI watchdog via the local APIC.
433  * Original code written by Keith Owens.
434  */
435
436 static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr)
437 {
438         u64 count = (u64)cpu_khz * 1000;
439
440         do_div(count, nmi_hz);
441         if (descr)
442                 Dprintk("setting %s to -0x%08Lx\n", descr, count);
443         wrmsrl(perfctr_msr, 0 - count);
444 }
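/*
 * Illustrative note on the reload value above: the counter is programmed with
 * -(cpu_khz * 1000 / nmi_hz), i.e. that many unhalted CPU cycles must elapse
 * before the counter overflows and raises the NMI.  For example, a
 * (hypothetical) 2 GHz CPU with nmi_hz == 1 starts the counter at roughly
 * -2,000,000,000 and overflows about once per second of busy time.
 */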
445
446 /* Note that these events don't tick when the CPU idles. This means
447    the frequency varies with CPU load. */
448
449 #define K7_EVNTSEL_ENABLE       (1 << 22)
450 #define K7_EVNTSEL_INT          (1 << 20)
451 #define K7_EVNTSEL_OS           (1 << 17)
452 #define K7_EVNTSEL_USR          (1 << 16)
453 #define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING    0x76
454 #define K7_NMI_EVENT            K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
455
456 static int setup_k7_watchdog(void)
457 {
458         unsigned int perfctr_msr, evntsel_msr;
459         unsigned int evntsel;
460         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
461
462         perfctr_msr = MSR_K7_PERFCTR0;
463         evntsel_msr = MSR_K7_EVNTSEL0;
464         if (!reserve_perfctr_nmi(perfctr_msr))
465                 goto fail;
466
467         if (!reserve_evntsel_nmi(evntsel_msr))
468                 goto fail1;
469
470         wrmsrl(perfctr_msr, 0UL);
471
472         evntsel = K7_EVNTSEL_INT
473                 | K7_EVNTSEL_OS
474                 | K7_EVNTSEL_USR
475                 | K7_NMI_EVENT;
476
477         /* setup the timer */
478         wrmsr(evntsel_msr, evntsel, 0);
479         write_watchdog_counter(perfctr_msr, "K7_PERFCTR0");
480         apic_write(APIC_LVTPC, APIC_DM_NMI);
481         evntsel |= K7_EVNTSEL_ENABLE;
482         wrmsr(evntsel_msr, evntsel, 0);
483
484         wd->perfctr_msr = perfctr_msr;
485         wd->evntsel_msr = evntsel_msr;
486         wd->cccr_msr = 0;  /* unused */
487         wd->check_bit = 1ULL<<63;
488         return 1;
489 fail1:
490         release_perfctr_nmi(perfctr_msr);
491 fail:
492         return 0;
493 }
494
495 static void stop_k7_watchdog(void)
496 {
497         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
498
499         wrmsr(wd->evntsel_msr, 0, 0);
500
501         release_evntsel_nmi(wd->evntsel_msr);
502         release_perfctr_nmi(wd->perfctr_msr);
503 }
504
505 #define P6_EVNTSEL0_ENABLE      (1 << 22)
506 #define P6_EVNTSEL_INT          (1 << 20)
507 #define P6_EVNTSEL_OS           (1 << 17)
508 #define P6_EVNTSEL_USR          (1 << 16)
509 #define P6_EVENT_CPU_CLOCKS_NOT_HALTED  0x79
510 #define P6_NMI_EVENT            P6_EVENT_CPU_CLOCKS_NOT_HALTED
511
512 static int setup_p6_watchdog(void)
513 {
514         unsigned int perfctr_msr, evntsel_msr;
515         unsigned int evntsel;
516         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
517
518         perfctr_msr = MSR_P6_PERFCTR0;
519         evntsel_msr = MSR_P6_EVNTSEL0;
520         if (!reserve_perfctr_nmi(perfctr_msr))
521                 goto fail;
522
523         if (!reserve_evntsel_nmi(evntsel_msr))
524                 goto fail1;
525
526         wrmsrl(perfctr_msr, 0UL);
527
528         evntsel = P6_EVNTSEL_INT
529                 | P6_EVNTSEL_OS
530                 | P6_EVNTSEL_USR
531                 | P6_NMI_EVENT;
532
533         /* setup the timer */
534         wrmsr(evntsel_msr, evntsel, 0);
535         write_watchdog_counter(perfctr_msr, "P6_PERFCTR0");
536         apic_write(APIC_LVTPC, APIC_DM_NMI);
537         evntsel |= P6_EVNTSEL0_ENABLE;
538         wrmsr(evntsel_msr, evntsel, 0);
539
540         wd->perfctr_msr = perfctr_msr;
541         wd->evntsel_msr = evntsel_msr;
542         wd->cccr_msr = 0;  /* unused */
543         wd->check_bit = 1ULL<<39;
544         return 1;
545 fail1:
546         release_perfctr_nmi(perfctr_msr);
547 fail:
548         return 0;
549 }
550
551 static void stop_p6_watchdog(void)
552 {
553         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
554
555         wrmsr(wd->evntsel_msr, 0, 0);
556
557         release_evntsel_nmi(wd->evntsel_msr);
558         release_perfctr_nmi(wd->perfctr_msr);
559 }
560
561 /* Note that these events don't tick when the CPU idles. This means
562    the frequency varies with CPU load. */
563
564 #define MSR_P4_MISC_ENABLE_PERF_AVAIL   (1<<7)
565 #define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
566 #define P4_ESCR_OS              (1<<3)
567 #define P4_ESCR_USR             (1<<2)
568 #define P4_CCCR_OVF_PMI0        (1<<26)
569 #define P4_CCCR_OVF_PMI1        (1<<27)
570 #define P4_CCCR_THRESHOLD(N)    ((N)<<20)
571 #define P4_CCCR_COMPLEMENT      (1<<19)
572 #define P4_CCCR_COMPARE         (1<<18)
573 #define P4_CCCR_REQUIRED        (3<<16)
574 #define P4_CCCR_ESCR_SELECT(N)  ((N)<<13)
575 #define P4_CCCR_ENABLE          (1<<12)
576 #define P4_CCCR_OVF             (1<<31)
577 /* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
578    CRU_ESCR0 (with any non-null event selector) through a complemented
579    max threshold. [IA32-Vol3, Section 14.9.9] */
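/* Illustrative reading of the CCCR bits used below (an interpretation based
   on the SDM section cited above): with COMPARE and COMPLEMENT set and
   THRESHOLD at its maximum of 15, the "event count <= threshold" test holds
   on every cycle, so IQ_COUNTER0 increments each cycle and behaves as a
   clock regardless of which event CRU_ESCR0 actually selects. */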
580
581 static int setup_p4_watchdog(void)
582 {
583         unsigned int perfctr_msr, evntsel_msr, cccr_msr;
584         unsigned int evntsel, cccr_val;
585         unsigned int misc_enable, dummy;
586         unsigned int ht_num;
587         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
588
589         rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
590         if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
591                 return 0;
592
593 #ifdef CONFIG_SMP
594         /* detect which hyperthread we are on */
595         if (smp_num_siblings == 2) {
596                 unsigned int ebx, apicid;
597
598                 ebx = cpuid_ebx(1);
599                 apicid = (ebx >> 24) & 0xff;
600                 ht_num = apicid & 1;
601         } else
602 #endif
603                 ht_num = 0;
604
605         /* performance counters are shared resources
606          * assign each hyperthread its own set
607          * (re-use the ESCR0 register, seems safe
608          * and keeps the cccr_val the same)
609          */
610         if (!ht_num) {
611                 /* logical cpu 0 */
612                 perfctr_msr = MSR_P4_IQ_PERFCTR0;
613                 evntsel_msr = MSR_P4_CRU_ESCR0;
614                 cccr_msr = MSR_P4_IQ_CCCR0;
615                 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
616         } else {
617                 /* logical cpu 1 */
618                 perfctr_msr = MSR_P4_IQ_PERFCTR1;
619                 evntsel_msr = MSR_P4_CRU_ESCR0;
620                 cccr_msr = MSR_P4_IQ_CCCR1;
621                 cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
622         }
623
624         if (!reserve_perfctr_nmi(perfctr_msr))
625                 goto fail;
626
627         if (!reserve_evntsel_nmi(evntsel_msr))
628                 goto fail1;
629
630         evntsel = P4_ESCR_EVENT_SELECT(0x3F)
631                 | P4_ESCR_OS
632                 | P4_ESCR_USR;
633
634         cccr_val |= P4_CCCR_THRESHOLD(15)
635                  | P4_CCCR_COMPLEMENT
636                  | P4_CCCR_COMPARE
637                  | P4_CCCR_REQUIRED;
638
639         wrmsr(evntsel_msr, evntsel, 0);
640         wrmsr(cccr_msr, cccr_val, 0);
641         write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0");
642         apic_write(APIC_LVTPC, APIC_DM_NMI);
643         cccr_val |= P4_CCCR_ENABLE;
644         wrmsr(cccr_msr, cccr_val, 0);
645         wd->perfctr_msr = perfctr_msr;
646         wd->evntsel_msr = evntsel_msr;
647         wd->cccr_msr = cccr_msr;
648         wd->check_bit = 1ULL<<39;
649         return 1;
650 fail1:
651         release_perfctr_nmi(perfctr_msr);
652 fail:
653         return 0;
654 }
655
656 static void stop_p4_watchdog(void)
657 {
658         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
659
660         wrmsr(wd->cccr_msr, 0, 0);
661         wrmsr(wd->evntsel_msr, 0, 0);
662
663         release_evntsel_nmi(wd->evntsel_msr);
664         release_perfctr_nmi(wd->perfctr_msr);
665 }
666
667 #define ARCH_PERFMON_NMI_EVENT_SEL      ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
668 #define ARCH_PERFMON_NMI_EVENT_UMASK    ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
669
670 static int setup_intel_arch_watchdog(void)
671 {
672         unsigned int ebx;
673         union cpuid10_eax eax;
674         unsigned int unused;
675         unsigned int perfctr_msr, evntsel_msr;
676         unsigned int evntsel;
677         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
678
679         /*
680          * Check whether the Architectural PerfMon supports
681          * Unhalted Core Cycles Event or not.
682          * NOTE: Corresponding bit = 0 in ebx indicates event present.
683          */
684         cpuid(10, &(eax.full), &ebx, &unused, &unused);
685         if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
686             (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
687                 goto fail;
688
689         perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0;
690         evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0;
691
692         if (!reserve_perfctr_nmi(perfctr_msr))
693                 goto fail;
694
695         if (!reserve_evntsel_nmi(evntsel_msr))
696                 goto fail1;
697
698         wrmsrl(perfctr_msr, 0UL);
699
700         evntsel = ARCH_PERFMON_EVENTSEL_INT
701                 | ARCH_PERFMON_EVENTSEL_OS
702                 | ARCH_PERFMON_EVENTSEL_USR
703                 | ARCH_PERFMON_NMI_EVENT_SEL
704                 | ARCH_PERFMON_NMI_EVENT_UMASK;
705
706         /* setup the timer */
707         wrmsr(evntsel_msr, evntsel, 0);
708         write_watchdog_counter(perfctr_msr, "INTEL_ARCH_PERFCTR0");
709         apic_write(APIC_LVTPC, APIC_DM_NMI);
710         evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
711         wrmsr(evntsel_msr, evntsel, 0);
712
713         wd->perfctr_msr = perfctr_msr;
714         wd->evntsel_msr = evntsel_msr;
715         wd->cccr_msr = 0;  /* unused */
716         wd->check_bit = 1ULL << (eax.split.bit_width - 1);
717         return 1;
718 fail1:
719         release_perfctr_nmi(perfctr_msr);
720 fail:
721         return 0;
722 }
723
724 static void stop_intel_arch_watchdog(void)
725 {
726         unsigned int ebx;
727         union cpuid10_eax eax;
728         unsigned int unused;
729         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
730
731         /*
732          * Check whether the Architectural PerfMon supports
733          * Unhalted Core Cycles Event or not.
734          * NOTE: Corresponding bit = 0 in ebx indicates event present.
735          */
736         cpuid(10, &(eax.full), &ebx, &unused, &unused);
737         if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
738             (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
739                 return;
740
741         wrmsr(wd->evntsel_msr, 0, 0);
742         release_evntsel_nmi(wd->evntsel_msr);
743         release_perfctr_nmi(wd->perfctr_msr);
744 }
745
746 void setup_apic_nmi_watchdog (void *unused)
747 {
748         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
749
750         /* only support LOCAL and IO APICs for now */
751         if ((nmi_watchdog != NMI_LOCAL_APIC) &&
752             (nmi_watchdog != NMI_IO_APIC))
753                 return;
754
755         if (wd->enabled == 1)
756                 return;
757
758         /* cheap hack to support suspend/resume */
759         /* if cpu0 is not active, the other cpus should not be either */
760         if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
761                 return;
762
763         if (nmi_watchdog == NMI_LOCAL_APIC) {
764                 switch (boot_cpu_data.x86_vendor) {
765                 case X86_VENDOR_AMD:
766                         if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15)
767                                 return;
768                         if (!setup_k7_watchdog())
769                                 return;
770                         break;
771                 case X86_VENDOR_INTEL:
772                         if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
773                                 if (!setup_intel_arch_watchdog())
774                                         return;
775                                 break;
776                         }
777                         switch (boot_cpu_data.x86) {
778                         case 6:
779                                 if (boot_cpu_data.x86_model > 0xd)
780                                         return;
781
782                                 if (!setup_p6_watchdog())
783                                         return;
784                                 break;
785                         case 15:
786                                 if (boot_cpu_data.x86_model > 0x4)
787                                         return;
788
789                                 if (!setup_p4_watchdog())
790                                         return;
791                                 break;
792                         default:
793                                 return;
794                         }
795                         break;
796                 default:
797                         return;
798                 }
799         }
800         wd->enabled = 1;
801         atomic_inc(&nmi_active);
802 }
803
804 void stop_apic_nmi_watchdog(void *unused)
805 {
806         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
807
808         /* only support LOCAL and IO APICs for now */
809         if ((nmi_watchdog != NMI_LOCAL_APIC) &&
810             (nmi_watchdog != NMI_IO_APIC))
811                 return;
812
813         if (wd->enabled == 0)
814                 return;
815
816         if (nmi_watchdog == NMI_LOCAL_APIC) {
817                 switch (boot_cpu_data.x86_vendor) {
818                 case X86_VENDOR_AMD:
819                         stop_k7_watchdog();
820                         break;
821                 case X86_VENDOR_INTEL:
822                         if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
823                                 stop_intel_arch_watchdog();
824                                 break;
825                         }
826                         switch (boot_cpu_data.x86) {
827                         case 6:
828                                 if (boot_cpu_data.x86_model > 0xd)
829                                         break;
830                                 stop_p6_watchdog();
831                                 break;
832                         case 15:
833                                 if (boot_cpu_data.x86_model > 0x4)
834                                         break;
835                                 stop_p4_watchdog();
836                                 break;
837                         }
838                         break;
839                 default:
840                         return;
841                 }
842         }
843         wd->enabled = 0;
844         atomic_dec(&nmi_active);
845 }
846
847 /*
848  * the best way to detect whether a CPU has a 'hard lockup' problem
849  * is to check its local APIC timer IRQ counts. If they are not
850  * changing, then that CPU has some problem.
851  *
852  * as these watchdog NMI IRQs are generated on every CPU, we only
853  * have to check the current processor.
854  *
855  * since NMIs don't listen to _any_ locks, we have to be extremely
856  * careful not to rely on unsafe variables. The printk might lock
857  * up though, so we have to break up any console locks first ...
858  * [if more tty-related locks are added, break them up
859  *  here too!]
860  */
861
862 static unsigned int
863         last_irq_sums [NR_CPUS],
864         alert_counter [NR_CPUS];
865
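/*
 * Usage sketch (illustrative): code that legitimately keeps a CPU away from
 * timer interrupts for more than a few seconds should poke the watchdog from
 * inside its loop, so the 5*nmi_hz threshold in nmi_watchdog_tick() below is
 * never reached:
 *
 *	while (!device_ready())		// hypothetical helper
 *		touch_nmi_watchdog();
 */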
866 void touch_nmi_watchdog (void)
867 {
868         if (nmi_watchdog > 0) {
869                 unsigned cpu;
870
871                 /*
872                  * Just reset the alert counters, (other CPUs might be
873                  * spinning on locks we hold):
874                  */
875                 for_each_present_cpu (cpu)
876                         alert_counter[cpu] = 0;
877         }
878
879         /*
880          * Tickle the softlockup detector too:
881          */
882         touch_softlockup_watchdog();
883 }
884 EXPORT_SYMBOL(touch_nmi_watchdog);
885
886 extern void die_nmi(struct pt_regs *, const char *msg);
887
888 __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
889 {
890
891         /*
892          * Since current_thread_info() is always on the stack, and we
893          * always switch the stack NMI-atomically, it's safe to use
894          * smp_processor_id().
895          */
896         unsigned int sum;
897         int touched = 0;
898         int cpu = smp_processor_id();
899         struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
900         u64 dummy;
901         int rc=0;
902
903         /* check for other users first */
904         if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
905                         == NOTIFY_STOP) {
906                 rc = 1;
907                 touched = 1;
908         }
909
910         if (cpu_isset(cpu, backtrace_mask)) {
911                 static DEFINE_SPINLOCK(lock);   /* Serialise the printks */
912
913                 spin_lock(&lock);
914                 printk("NMI backtrace for cpu %d\n", cpu);
915                 dump_stack();
916                 spin_unlock(&lock);
917                 cpu_clear(cpu, backtrace_mask);
918         }
919
920         sum = per_cpu(irq_stat, cpu).apic_timer_irqs;
921
922         /* if the apic timer isn't firing, this cpu isn't doing much */
923         if (!touched && last_irq_sums[cpu] == sum) {
924                 /*
925                  * Ayiee, looks like this CPU is stuck ...
926                  * wait a few IRQs (5 seconds) before doing the oops ...
927                  */
928                 alert_counter[cpu]++;
929                 if (alert_counter[cpu] == 5*nmi_hz)
930                         /*
931                          * die_nmi will return ONLY if NOTIFY_STOP happens..
932                          */
933                         die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP");
934         } else {
935                 last_irq_sums[cpu] = sum;
936                 alert_counter[cpu] = 0;
937         }
938         /* see if the nmi watchdog went off */
939         if (wd->enabled) {
940                 if (nmi_watchdog == NMI_LOCAL_APIC) {
941                         rdmsrl(wd->perfctr_msr, dummy);
942                         if (dummy & wd->check_bit){
943                                 /* this wasn't a watchdog timer interrupt */
944                                 goto done;
945                         }
946
947                         /* only Intel P4 uses the cccr msr */
948                         if (wd->cccr_msr != 0) {
949                                 /*
950                                  * P4 quirks:
951                                  * - An overflown perfctr will assert its interrupt
952                                  *   until the OVF flag in its CCCR is cleared.
953                                  * - LVTPC is masked on interrupt and must be
954                                  *   unmasked by the LVTPC handler.
955                                  */
956                                 rdmsrl(wd->cccr_msr, dummy);
957                                 dummy &= ~P4_CCCR_OVF;
958                                 wrmsrl(wd->cccr_msr, dummy);
959                                 apic_write(APIC_LVTPC, APIC_DM_NMI);
960                         }
961                         else if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
962                                  wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
963                                 /* P6-based Pentium M needs to re-unmask
964                                  * the apic vector, but doing so doesn't hurt
965                                  * other P6 variants.
966                                  * ArchPerfmon/Core Duo also needs this */
967                                 apic_write(APIC_LVTPC, APIC_DM_NMI);
968                         }
969                         /* start the cycle over again */
970                         write_watchdog_counter(wd->perfctr_msr, NULL);
971                         rc = 1;
972                 } else if (nmi_watchdog == NMI_IO_APIC) {
973                         /* We don't know how to accurately check for this,
974                          * so just assume it was a watchdog timer interrupt.
975                          * This matches the old behaviour.
976                          */
977                         rc = 1;
978                 }
979         }
980 done:
981         return rc;
982 }
983
984 int do_nmi_callback(struct pt_regs * regs, int cpu)
985 {
986 #ifdef CONFIG_SYSCTL
987         if (unknown_nmi_panic)
988                 return unknown_nmi_panic_callback(regs, cpu);
989 #endif
990         return 0;
991 }
992
993 #ifdef CONFIG_SYSCTL
994
995 static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
996 {
997         unsigned char reason = get_nmi_reason();
998         char buf[64];
999
1000         sprintf(buf, "NMI received for unknown reason %02x\n", reason);
1001         die_nmi(regs, buf);
1002         return 0;
1003 }
1004
1005 /*
1006  * proc handler for /proc/sys/kernel/nmi
1007  */
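/*
 * Illustrative usage, assuming the sysctl table elsewhere in the kernel wires
 * this handler up under the name suggested by the comment above:
 *
 *	echo 1 > /proc/sys/kernel/nmi		enable the lapic NMI watchdog
 *	echo 0 > /proc/sys/kernel/nmi		disable it again
 */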
1008 int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
1009                         void __user *buffer, size_t *length, loff_t *ppos)
1010 {
1011         int old_state;
1012
1013         nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
1014         old_state = nmi_watchdog_enabled;
1015         proc_dointvec(table, write, file, buffer, length, ppos);
1016         if (!!old_state == !!nmi_watchdog_enabled)
1017                 return 0;
1018
1019         if (atomic_read(&nmi_active) < 0) {
1020                 printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
1021                 return -EIO;
1022         }
1023
1024         if (nmi_watchdog == NMI_DEFAULT) {
1025                 if (nmi_known_cpu() > 0)
1026                         nmi_watchdog = NMI_LOCAL_APIC;
1027                 else
1028                         nmi_watchdog = NMI_IO_APIC;
1029         }
1030
1031         if (nmi_watchdog == NMI_LOCAL_APIC) {
1032                 if (nmi_watchdog_enabled)
1033                         enable_lapic_nmi_watchdog();
1034                 else
1035                         disable_lapic_nmi_watchdog();
1036         } else {
1037                 printk( KERN_WARNING
1038                         "NMI watchdog doesn't know what hardware to touch\n");
1039                 return -EIO;
1040         }
1041         return 0;
1042 }
1043
1044 #endif
1045
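/*
 * Ask every online CPU to dump its stack: set all online CPUs in
 * backtrace_mask; each CPU's next watchdog NMI (see nmi_watchdog_tick above)
 * notices its bit, prints a backtrace and clears the bit.  We then wait up
 * to 10 seconds for the mask to drain.
 */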
1046 void __trigger_all_cpu_backtrace(void)
1047 {
1048         int i;
1049
1050         backtrace_mask = cpu_online_map;
1051         /* Wait for up to 10 seconds for all CPUs to do the backtrace */
1052         for (i = 0; i < 10 * 1000; i++) {
1053                 if (cpus_empty(backtrace_mask))
1054                         break;
1055                 mdelay(1);
1056         }
1057 }
1058
1059 EXPORT_SYMBOL(nmi_active);
1060 EXPORT_SYMBOL(nmi_watchdog);
1061 EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
1062 EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
1063 EXPORT_SYMBOL(reserve_perfctr_nmi);
1064 EXPORT_SYMBOL(release_perfctr_nmi);
1065 EXPORT_SYMBOL(reserve_evntsel_nmi);
1066 EXPORT_SYMBOL(release_evntsel_nmi);
1067 EXPORT_SYMBOL(disable_timer_nmi_watchdog);
1068 EXPORT_SYMBOL(enable_timer_nmi_watchdog);