Fedora kernel-2.6.17-1.2142_FC4 patched with stable patch-2.6.17.4-vs2.0.2-rc26.diff
[linux-2.6.git] / arch / i386 / kernel / nmi.c
index 16d91e3..d43b498 100644 (file)
@@ -15,7 +15,6 @@
 
 #include <linux/config.h>
 #include <linux/mm.h>
-#include <linux/irq.h>
 #include <linux/delay.h>
 #include <linux/bootmem.h>
 #include <linux/smp_lock.h>
 #include <linux/module.h>
 #include <linux/nmi.h>
 #include <linux/sysdev.h>
+#include <linux/sysctl.h>
 
 #include <asm/smp.h>
-#include <asm/mtrr.h>
-#include <asm/mpspec.h>
+#include <asm/div64.h>
 #include <asm/nmi.h>
 
+#include "mach_traps.h"
+
 unsigned int nmi_watchdog = NMI_NONE;
+extern int unknown_nmi_panic;
 static unsigned int nmi_hz = HZ;
 static unsigned int nmi_perfctr_msr;   /* the MSR to reset in NMI handler */
 static unsigned int nmi_p4_cccr_val;
@@ -46,7 +48,7 @@ extern void show_registers(struct pt_regs *regs);
  * This is maintained separately from nmi_active because the NMI
  * watchdog may also be driven from the I/O APIC timer.
  */
-static spinlock_t lapic_nmi_owner_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(lapic_nmi_owner_lock);
 static unsigned int lapic_nmi_owner;
 #define LAPIC_NMI_WATCHDOG     (1<<0)
 #define LAPIC_NMI_RESERVED     (1<<1)
@@ -98,30 +100,69 @@ int nmi_active;
        (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT|     \
         P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
 
-int __init check_nmi_watchdog (void)
+#ifdef CONFIG_SMP
+/* The performance counters used by NMI_LOCAL_APIC don't trigger when
+ * the CPU is idle. To make sure the NMI watchdog really ticks on all
+ * CPUs during the test make them busy.
+ */
+static __init void nmi_cpu_busy(void *data)
+{
+       volatile int *endflag = data;
+       local_irq_enable();
+       /* Intentionally don't use cpu_relax here. This is
+          to make sure that the performance counter really ticks,
+          even if there is a simulator or similar that catches the
+          pause instruction. On a real HT machine this is fine because
+          all other CPUs are busy with "useless" delay loops and don't
+          care if they get somewhat less cycles. */
+       while (*endflag == 0)
+               barrier();
+}
+#endif
+
+static int __init check_nmi_watchdog(void)
 {
-       unsigned int prev_nmi_count[NR_CPUS];
+       volatile int endflag = 0;
+       unsigned int *prev_nmi_count;
        int cpu;
 
-       printk(KERN_INFO "testing NMI watchdog ... ");
+       if (nmi_watchdog == NMI_NONE)
+               return 0;
+
+       prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
+       if (!prev_nmi_count)
+               return -1;
+
+       printk(KERN_INFO "Testing NMI watchdog ... ");
+
+       if (nmi_watchdog == NMI_LOCAL_APIC)
+               smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
 
-       for (cpu = 0; cpu < NR_CPUS; cpu++)
-               prev_nmi_count[cpu] = irq_stat[cpu].__nmi_count;
+       for_each_possible_cpu(cpu)
+               prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count;
        local_irq_enable();
        mdelay((10*1000)/nmi_hz); // wait 10 ticks
 
-       /* FIXME: Only boot CPU is online at this stage.  Check CPUs
-           as they come up. */
-       for (cpu = 0; cpu < NR_CPUS; cpu++) {
-               if (!cpu_online(cpu))
+       for_each_possible_cpu(cpu) {
+#ifdef CONFIG_SMP
+               /* Check cpu_callin_map here because that is set
+                  after the timer is started. */
+               if (!cpu_isset(cpu, cpu_callin_map))
                        continue;
+#endif
                if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
-                       printk("CPU#%d: NMI appears to be stuck!\n", cpu);
+                       endflag = 1;
+                       printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
+                               cpu,
+                               prev_nmi_count[cpu],
+                               nmi_count(cpu));
                        nmi_active = 0;
                        lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG;
+                       kfree(prev_nmi_count);
                        return -1;
                }
        }
+       endflag = 1;
        printk("OK.\n");
 
        /* now that we know it works we can reduce NMI frequency to
@@ -129,8 +170,11 @@ int __init check_nmi_watchdog (void)
        if (nmi_watchdog == NMI_LOCAL_APIC)
                nmi_hz = 1;
 
+       kfree(prev_nmi_count);
        return 0;
 }
+/* This needs to happen later in boot so counters are working */
+late_initcall(check_nmi_watchdog);
 
 static int __init setup_nmi_watchdog(char *str)
 {
@@ -185,7 +229,7 @@ static void disable_lapic_nmi_watchdog(void)
                        wrmsr(MSR_P6_EVNTSEL0, 0, 0);
                        break;
                case 15:
-                       if (boot_cpu_data.x86_model > 0x3)
+                       if (boot_cpu_data.x86_model > 0x4)
                                break;
 
                        wrmsr(MSR_P4_IQ_CCCR0, 0, 0);
@@ -257,7 +301,7 @@ void enable_timer_nmi_watchdog(void)
 
 static int nmi_pm_active; /* nmi_active before suspend */
 
-static int lapic_nmi_suspend(struct sys_device *dev, u32 state)
+static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
 {
        nmi_pm_active = nmi_active;
        disable_lapic_nmi_watchdog();
@@ -313,6 +357,16 @@ static void clear_msr_range(unsigned int base, unsigned int n)
                wrmsr(base+i, 0, 0);
 }
 
+static void write_watchdog_counter(const char *descr)
+{
+       u64 count = (u64)cpu_khz * 1000;
+
+       do_div(count, nmi_hz);
+       if(descr)
+               Dprintk("setting %s to -0x%08Lx\n", descr, count);
+       wrmsrl(nmi_perfctr_msr, 0 - count);
+}
+
 static void setup_k7_watchdog(void)
 {
        unsigned int evntsel;
@@ -328,8 +382,7 @@ static void setup_k7_watchdog(void)
                | K7_NMI_EVENT;
 
        wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
-       Dprintk("setting K7_PERFCTR0 to %08lx\n", -(cpu_khz/nmi_hz*1000));
-       wrmsr(MSR_K7_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1);
+       write_watchdog_counter("K7_PERFCTR0");
        apic_write(APIC_LVTPC, APIC_DM_NMI);
        evntsel |= K7_EVNTSEL_ENABLE;
        wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
@@ -350,8 +403,7 @@ static void setup_p6_watchdog(void)
                | P6_NMI_EVENT;
 
        wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);
-       Dprintk("setting P6_PERFCTR0 to %08lx\n", -(cpu_khz/nmi_hz*1000));
-       wrmsr(MSR_P6_PERFCTR0, -(cpu_khz/nmi_hz*1000), 0);
+       write_watchdog_counter("P6_PERFCTR0");
        apic_write(APIC_LVTPC, APIC_DM_NMI);
        evntsel |= P6_EVNTSEL0_ENABLE;
        wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);
@@ -376,7 +428,13 @@ static int setup_p4_watchdog(void)
                clear_msr_range(0x3F1, 2);
        /* MSR 0x3F0 seems to have a default value of 0xFC00, but current
           docs doesn't fully define it, so leave it alone for now. */
-       clear_msr_range(0x3A0, 31);
+       if (boot_cpu_data.x86_model >= 0x3) {
+               /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */
+               clear_msr_range(0x3A0, 26);
+               clear_msr_range(0x3BC, 3);
+       } else {
+               clear_msr_range(0x3A0, 31);
+       }
        clear_msr_range(0x3C0, 6);
        clear_msr_range(0x3C8, 6);
        clear_msr_range(0x3E0, 2);
@@ -385,8 +443,7 @@ static int setup_p4_watchdog(void)
 
        wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0);
        wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0);
-       Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz/nmi_hz*1000));
-       wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1);
+       write_watchdog_counter("P4_IQ_COUNTER0");
        apic_write(APIC_LVTPC, APIC_DM_NMI);
        wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
        return 1;
@@ -409,7 +466,7 @@ void setup_apic_nmi_watchdog (void)
                        setup_p6_watchdog();
                        break;
                case 15:
-                       if (boot_cpu_data.x86_model > 0x3)
+                       if (boot_cpu_data.x86_model > 0x4)
                                return;
 
                        if (!setup_p4_watchdog())
@@ -426,8 +483,6 @@ void setup_apic_nmi_watchdog (void)
        nmi_active = 1;
 }
 
-static spinlock_t nmi_print_lock = SPIN_LOCK_UNLOCKED;
-
 /*
  * the best way to detect whether a CPU has a 'hard lockup' problem
  * is to check it's local APIC timer IRQ counts. If they are not
@@ -455,10 +510,17 @@ void touch_nmi_watchdog (void)
         * Just reset the alert counters, (other CPUs might be
         * spinning on locks we hold):
         */
-       for (i = 0; i < NR_CPUS; i++)
+       for_each_possible_cpu(i)
                alert_counter[i] = 0;
+
+       /*
+        * Tickle the softlockup detector too:
+        */
+       touch_softlockup_watchdog();
 }
 
+extern void die_nmi(struct pt_regs *, const char *msg);
+
 void nmi_watchdog_tick (struct pt_regs * regs)
 {
 
@@ -467,9 +529,10 @@ void nmi_watchdog_tick (struct pt_regs * regs)
         * always switch the stack NMI-atomically, it's safe to use
         * smp_processor_id().
         */
-       int sum, cpu = smp_processor_id();
+       unsigned int sum;
+       int cpu = smp_processor_id();
 
-       sum = irq_stat[cpu].apic_timer_irqs;
+       sum = per_cpu(irq_stat, cpu).apic_timer_irqs;
 
        if (last_irq_sums[cpu] == sum) {
                /*
@@ -477,21 +540,11 @@ void nmi_watchdog_tick (struct pt_regs * regs)
                 * wait a few IRQs (5 seconds) before doing the oops ...
                 */
                alert_counter[cpu]++;
-               if (alert_counter[cpu] == 5*nmi_hz) {
-                       spin_lock(&nmi_print_lock);
+               if (alert_counter[cpu] == 5*nmi_hz)
                        /*
-                        * We are in trouble anyway, lets at least try
-                        * to get a message out.
+                        * die_nmi will return ONLY if NOTIFY_STOP happens..
                         */
-                       bust_spinlocks(1);
-                       printk("NMI Watchdog detected LOCKUP on CPU%d, eip %08lx, registers:\n", cpu, regs->eip);
-                       show_registers(regs);
-                       printk("console shuts up ...\n");
-                       console_silent();
-                       spin_unlock(&nmi_print_lock);
-                       bust_spinlocks(0);
-                       do_exit(SIGSEGV);
-               }
+                       die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP");
        } else {
                last_irq_sums[cpu] = sum;
                alert_counter[cpu] = 0;
@@ -514,10 +567,53 @@ void nmi_watchdog_tick (struct pt_regs * regs)
                         * other P6 variant */
                        apic_write(APIC_LVTPC, APIC_DM_NMI);
                }
-               wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1);
+               write_watchdog_counter(NULL);
+       }
+}
+
+#ifdef CONFIG_SYSCTL
+
+static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
+{
+       unsigned char reason = get_nmi_reason();
+       char buf[64];
+
+       if (!(reason & 0xc0)) {
+               sprintf(buf, "NMI received for unknown reason %02x\n", reason);
+               die_nmi(regs, buf);
+       }
+       return 0;
+}
+
+/*
+ * proc handler for /proc/sys/kernel/unknown_nmi_panic
+ */
+int proc_unknown_nmi_panic(ctl_table *table, int write, struct file *file,
+                       void __user *buffer, size_t *length, loff_t *ppos)
+{
+       int old_state;
+
+       old_state = unknown_nmi_panic;
+       proc_dointvec(table, write, file, buffer, length, ppos);
+       if (!!old_state == !!unknown_nmi_panic)
+               return 0;
+
+       if (unknown_nmi_panic) {
+               if (reserve_lapic_nmi() < 0) {
+                       unknown_nmi_panic = 0;
+                       return -EBUSY;
+               } else {
+                       set_nmi_callback(unknown_nmi_panic_callback);
+               }
+       } else {
+               release_lapic_nmi();
+               unset_nmi_callback();
        }
+       return 0;
 }
 
+#endif
+
 EXPORT_SYMBOL(nmi_active);
 EXPORT_SYMBOL(nmi_watchdog);
 EXPORT_SYMBOL(reserve_lapic_nmi);