fedora core 6 1.2949 + vserver 2.2.0
[linux-2.6.git] / arch / x86_64 / kernel / apic.c
index 6fb84e2..fdbf536 100644 (file)
  *     Mikael Pettersson       :       PM converted to driver model.
  */
 
-#include <linux/config.h>
 #include <linux/init.h>
 
 #include <linux/mm.h>
-#include <linux/irq.h>
 #include <linux/delay.h>
 #include <linux/bootmem.h>
 #include <linux/smp_lock.h>
 #include <linux/mc146818rtc.h>
 #include <linux/kernel_stat.h>
 #include <linux/sysdev.h>
+#include <linux/module.h>
+#include <linux/ioport.h>
 
 #include <asm/atomic.h>
 #include <asm/smp.h>
 #include <asm/mtrr.h>
 #include <asm/mpspec.h>
 #include <asm/pgalloc.h>
+#include <asm/mach_apic.h>
+#include <asm/nmi.h>
+#include <asm/idle.h>
+#include <asm/proto.h>
+#include <asm/timex.h>
+#include <asm/apic.h>
+
+int apic_mapped;
+int apic_verbosity;
+int apic_runs_main_timer;
+int apic_calibrate_pmtmr __initdata;
 
 int disable_apic_timer __initdata;
 
-/* Using APIC to generate smp_local_timer_interrupt? */
-int using_apic_timer = 0;
+static struct resource *ioapic_resources;
+static struct resource lapic_resource = {
+       .name = "Local APIC",
+       .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
+};
 
-static DEFINE_PER_CPU(int, prof_multiplier) = 1;
-static DEFINE_PER_CPU(int, prof_old_multiplier) = 1;
-static DEFINE_PER_CPU(int, prof_counter) = 1;
+/*
+ * cpu_mask that denotes the CPUs that need their timer interrupt delivered
+ * as IPIs in place of the local APIC timer
+ */
+static cpumask_t timer_interrupt_broadcast_ipi_mask;
+
+/* Using APIC to generate smp_local_timer_interrupt? */
+int using_apic_timer __read_mostly = 0;
 
 static void apic_pm_activate(void);
 
 void enable_NMI_through_LVT0 (void * dummy)
 {
-       unsigned int v, ver;
+       unsigned int v;
        
-       ver = apic_read(APIC_LVR);
-       ver = GET_APIC_VERSION(ver);
        v = APIC_DM_NMI;                        /* unmask and set to NMI */
-       apic_write_around(APIC_LVT0, v);
+       apic_write(APIC_LVT0, v);
 }
 
 int get_maxlvt(void)
 {
-       unsigned int v, ver, maxlvt;
+       unsigned int v, maxlvt;
 
        v = apic_read(APIC_LVR);
-       ver = GET_APIC_VERSION(v);
        maxlvt = GET_APIC_MAXLVT(v);
        return maxlvt;
 }
 
+/*
+ * 'what should we do if we get a hw irq event on an illegal vector'.
+ * each architecture has to answer this themselves.
+ */
+void ack_bad_irq(unsigned int irq)
+{
+       printk("unexpected IRQ trap at vector %02x\n", irq);
+       /*
+        * Currently unexpected vectors happen only on SMP and APIC.
+        * We _must_ ack these because every local APIC has only N
+        * irq slots per priority level, and a 'hanging, unacked' IRQ
+        * holds up an irq slot - in excessive cases (when multiple
+        * unexpected vectors occur) that might lock up the APIC
+        * completely.
+        * But don't ack when the APIC is disabled. -AK
+        */
+       if (!disable_apic)
+               ack_APIC_irq();
+}
+
 void clear_local_APIC(void)
 {
        int maxlvt;
@@ -72,76 +108,76 @@ void clear_local_APIC(void)
        maxlvt = get_maxlvt();
 
        /*
-        * Masking an LVT entry on a P6 can trigger a local APIC error
+        * Masking an LVT entry can trigger a local APIC error
         * if the vector is zero. Mask LVTERR first to prevent this.
         */
        if (maxlvt >= 3) {
                v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
-               apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED);
+               apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
        }
        /*
         * Careful: we have to set masks only first to deassert
         * any level-triggered sources.
         */
        v = apic_read(APIC_LVTT);
-       apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
+       apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
        v = apic_read(APIC_LVT0);
-       apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
+       apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
        v = apic_read(APIC_LVT1);
-       apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED);
+       apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
        if (maxlvt >= 4) {
                v = apic_read(APIC_LVTPC);
-               apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED);
+               apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
        }
 
        /*
         * Clean APIC state for other OSs:
         */
-       apic_write_around(APIC_LVTT, APIC_LVT_MASKED);
-       apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
-       apic_write_around(APIC_LVT1, APIC_LVT_MASKED);
+       apic_write(APIC_LVTT, APIC_LVT_MASKED);
+       apic_write(APIC_LVT0, APIC_LVT_MASKED);
+       apic_write(APIC_LVT1, APIC_LVT_MASKED);
        if (maxlvt >= 3)
-               apic_write_around(APIC_LVTERR, APIC_LVT_MASKED);
+               apic_write(APIC_LVTERR, APIC_LVT_MASKED);
        if (maxlvt >= 4)
-               apic_write_around(APIC_LVTPC, APIC_LVT_MASKED);
-       v = GET_APIC_VERSION(apic_read(APIC_LVR));
-       if (APIC_INTEGRATED(v)) {       /* !82489DX */
-               if (maxlvt > 3)         /* Due to Pentium errata 3AP and 11AP. */
-                       apic_write(APIC_ESR, 0);
-               apic_read(APIC_ESR);
-       }
+               apic_write(APIC_LVTPC, APIC_LVT_MASKED);
+       apic_write(APIC_ESR, 0);
+       apic_read(APIC_ESR);
 }
 
-void __init connect_bsp_APIC(void)
+void disconnect_bsp_APIC(int virt_wire_setup)
 {
-       if (pic_mode) {
-               /*
-                * Do not trust the local APIC being empty at bootup.
-                */
-               clear_local_APIC();
-               /*
-                * PIC mode, enable APIC mode in the IMCR, i.e.
-                * connect BSP's local APIC to INT and NMI lines.
-                */
-               printk(KERN_INFO "leaving PIC mode, enabling APIC mode.\n");
-               outb(0x70, 0x22);
-               outb(0x01, 0x23);
-       }
-}
+       /* Go back to Virtual Wire compatibility mode */
+       unsigned long value;
 
-void disconnect_bsp_APIC(void)
-{
-       if (pic_mode) {
-               /*
-                * Put the board back into PIC mode (has an effect
-                * only on certain older boards).  Note that APIC
-                * interrupts, including IPIs, won't work beyond
-                * this point!  The only exception are INIT IPIs.
-                */
-               printk(KERN_INFO "disabling APIC mode, entering PIC mode.\n");
-               outb(0x70, 0x22);
-               outb(0x00, 0x23);
+       /* For the spurious interrupt use vector F, and enable it */
+       value = apic_read(APIC_SPIV);
+       value &= ~APIC_VECTOR_MASK;
+       value |= APIC_SPIV_APIC_ENABLED;
+       value |= 0xf;
+       apic_write(APIC_SPIV, value);
+
+       if (!virt_wire_setup) {
+               /* For LVT0 make it edge triggered, active high, external and enabled */
+               value = apic_read(APIC_LVT0);
+               value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+                       APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+                       APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
+               value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+               value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
+               apic_write(APIC_LVT0, value);
+       } else {
+               /* Disable LVT0 */
+               apic_write(APIC_LVT0, APIC_LVT_MASKED);
        }
+
+       /* For LVT1 make it edge triggered, active high, nmi and enabled */
+       value = apic_read(APIC_LVT1);
+       value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+                       APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+                       APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+       value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+       value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
+       apic_write(APIC_LVT1, value);
 }
 
 void disable_local_APIC(void)
@@ -156,7 +192,7 @@ void disable_local_APIC(void)
         */
        value = apic_read(APIC_SPIV);
        value &= ~APIC_SPIV_APIC_ENABLED;
-       apic_write_around(APIC_SPIV, value);
+       apic_write(APIC_SPIV, value);
 }
 
 /*
@@ -172,10 +208,10 @@ int __init verify_local_APIC(void)
         * The version register is read-only in a real APIC.
         */
        reg0 = apic_read(APIC_LVR);
-       Dprintk("Getting VERSION: %x\n", reg0);
+       apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
        apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
        reg1 = apic_read(APIC_LVR);
-       Dprintk("Getting VERSION: %x\n", reg1);
+       apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
 
        /*
         * The two version reads above should print the same
@@ -199,10 +235,10 @@ int __init verify_local_APIC(void)
         * The ID register is read/write in a real APIC.
         */
        reg0 = apic_read(APIC_ID);
-       Dprintk("Getting ID: %x\n", reg0);
+       apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
        apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
        reg1 = apic_read(APIC_ID);
-       Dprintk("Getting ID: %x\n", reg1);
+       apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
        apic_write(APIC_ID, reg0);
        if (reg1 != (reg0 ^ APIC_ID_MASK))
                return 0;
@@ -213,33 +249,36 @@ int __init verify_local_APIC(void)
         * compatibility mode, but most boxes are anymore.
         */
        reg0 = apic_read(APIC_LVT0);
-       Dprintk("Getting LVT0: %x\n", reg0);
+       apic_printk(APIC_DEBUG,"Getting LVT0: %x\n", reg0);
        reg1 = apic_read(APIC_LVT1);
-       Dprintk("Getting LVT1: %x\n", reg1);
+       apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
 
        return 1;
 }
 
 void __init sync_Arb_IDs(void)
 {
+       /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */
+       unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
+       if (ver >= 0x14)        /* P4 or higher */
+               return;
+
        /*
         * Wait for idle.
         */
        apic_wait_icr_idle();
 
-       Dprintk("Synchronizing Arb IDs.\n");
-       apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
+       apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
+       apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
                                | APIC_DM_INIT);
 }
 
-extern void __error_in_apic_c (void);
-
 /*
  * An initial setup of the virtual wire mode.
  */
 void __init init_bsp_APIC(void)
 {
-       unsigned int value, ver;
+       unsigned int value;
 
        /*
         * Don't do the setup now if we have a SMP BIOS as the
@@ -249,7 +288,6 @@ void __init init_bsp_APIC(void)
                return;
 
        value = apic_read(APIC_LVR);
-       ver = GET_APIC_VERSION(value);
 
        /*
         * Do not trust the local APIC being empty at bootup.
@@ -264,42 +302,30 @@ void __init init_bsp_APIC(void)
        value |= APIC_SPIV_APIC_ENABLED;
        value |= APIC_SPIV_FOCUS_DISABLED;
        value |= SPURIOUS_APIC_VECTOR;
-       apic_write_around(APIC_SPIV, value);
+       apic_write(APIC_SPIV, value);
 
        /*
         * Set up the virtual wire mode.
         */
-       apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
+       apic_write(APIC_LVT0, APIC_DM_EXTINT);
        value = APIC_DM_NMI;
-       if (!APIC_INTEGRATED(ver))              /* 82489DX */
-               value |= APIC_LVT_LEVEL_TRIGGER;
-       apic_write_around(APIC_LVT1, value);
+       apic_write(APIC_LVT1, value);
 }
 
-void __init setup_local_APIC (void)
+void __cpuinit setup_local_APIC (void)
 {
-       unsigned int value, ver, maxlvt;
-
-       /* Pound the ESR really hard over the head with a big hammer - mbligh */
-       if (esr_disable) {
-               apic_write(APIC_ESR, 0);
-               apic_write(APIC_ESR, 0);
-               apic_write(APIC_ESR, 0);
-               apic_write(APIC_ESR, 0);
-       }
+       unsigned int value, maxlvt;
+       int i, j;
 
        value = apic_read(APIC_LVR);
-       ver = GET_APIC_VERSION(value);
 
-       if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f)
-               __error_in_apic_c();
+       BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f);
 
        /*
         * Double-check whether this APIC is really registered.
         * This is meaningless in clustered apic mode, so we skip it.
         */
-       if (!clustered_apic_mode &&
-               !physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map))
+       if (!apic_id_registered())
                BUG();
 
        /*
@@ -307,23 +333,7 @@ void __init setup_local_APIC (void)
         * an APIC.  See e.g. "AP-388 82489DX User's Manual" (Intel
         * document number 292116).  So here it goes...
         */
-
-       if (!clustered_apic_mode) {
-               /*
-                * In clustered apic mode, the firmware does this for us 
-                * Put the APIC into flat delivery mode.
-                * Must be "all ones" explicitly for 82489DX.
-                */
-               apic_write_around(APIC_DFR, 0xffffffff);
-
-               /*
-                * Set up the logical destination ID.
-                */
-               value = apic_read(APIC_LDR);
-               value &= ~APIC_LDR_MASK;
-               value |= (1<<(smp_processor_id()+24));
-               apic_write_around(APIC_LDR, value);
-       }
+       init_apic_ldr();
 
        /*
         * Set Task Priority to 'accept all'. We never change this
@@ -331,7 +341,26 @@ void __init setup_local_APIC (void)
         */
        value = apic_read(APIC_TASKPRI);
        value &= ~APIC_TPRI_MASK;
-       apic_write_around(APIC_TASKPRI, value);
+       apic_write(APIC_TASKPRI, value);
+
+       /*
+        * After a crash, we no longer service the interrupts and a pending
+        * interrupt from previous kernel might still have ISR bit set.
+        *
+        * Most probably by now CPU has serviced that pending interrupt and
+        * it might not have done the ack_APIC_irq() because it thought,
+        * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it
+        * does not clear the ISR bit and cpu thinks it has already serviced
+        * the interrupt. Hence a vector might get locked. It was noticed
+        * for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
+        */
+       for (i = APIC_ISR_NR - 1; i >= 0; i--) {
+               value = apic_read(APIC_ISR + i*0x10);
+               for (j = 31; j >= 0; j--) {
+                       if (value & (1<<j))
+                               ack_APIC_irq();
+               }
+       }
 
        /*
         * Now that we are all set up, enable the APIC
@@ -343,37 +372,13 @@ void __init setup_local_APIC (void)
         */
        value |= APIC_SPIV_APIC_ENABLED;
 
-       /*
-        * Some unknown Intel IO/APIC (or APIC) errata is biting us with
-        * certain networking cards. If high frequency interrupts are
-        * happening on a particular IOAPIC pin, plus the IOAPIC routing
-        * entry is masked/unmasked at a high rate as well then sooner or
-        * later IOAPIC line gets 'stuck', no more interrupts are received
-        * from the device. If focus CPU is disabled then the hang goes
-        * away, oh well :-(
-        *
-        * [ This bug can be reproduced easily with a level-triggered
-        *   PCI Ne2000 networking cards and PII/PIII processors, dual
-        *   BX chipset. ]
-        */
-       /*
-        * Actually disabling the focus CPU check just makes the hang less
-        * frequent as it makes the interrupt distributon model be more
-        * like LRU than MRU (the short-term load is more even across CPUs).
-        * See also the comment in end_level_ioapic_irq().  --macro
-        */
-#if 1
-       /* Enable focus processor (bit==0) */
-       value &= ~APIC_SPIV_FOCUS_DISABLED;
-#else
-       /* Disable focus processor (bit==1) */
-       value |= APIC_SPIV_FOCUS_DISABLED;
-#endif
+       /* We always use processor focus */
+
        /*
         * Set spurious IRQ vector
         */
        value |= SPURIOUS_APIC_VECTOR;
-       apic_write_around(APIC_SPIV, value);
+       apic_write(APIC_SPIV, value);
 
        /*
         * Set up LVT0, LVT1:
@@ -386,14 +391,14 @@ void __init setup_local_APIC (void)
         * TODO: set up through-local-APIC from through-I/O-APIC? --macro
         */
        value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
-       if (!smp_processor_id() && (pic_mode || !value)) {
+       if (!smp_processor_id() && !value) {
                value = APIC_DM_EXTINT;
-               Dprintk(KERN_INFO "enabled ExtINT on CPU#%d\n", smp_processor_id());
+               apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", smp_processor_id());
        } else {
                value = APIC_DM_EXTINT | APIC_LVT_MASKED;
-               Dprintk(KERN_INFO "masked ExtINT on CPU#%d\n", smp_processor_id());
+               apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", smp_processor_id());
        }
-       apic_write_around(APIC_LVT0, value);
+       apic_write(APIC_LVT0, value);
 
        /*
         * only the BP should see the LINT1 NMI signal, obviously.
@@ -402,42 +407,28 @@ void __init setup_local_APIC (void)
                value = APIC_DM_NMI;
        else
                value = APIC_DM_NMI | APIC_LVT_MASKED;
-       if (!APIC_INTEGRATED(ver))              /* 82489DX */
-               value |= APIC_LVT_LEVEL_TRIGGER;
-       apic_write_around(APIC_LVT1, value);
+       apic_write(APIC_LVT1, value);
 
-       if (APIC_INTEGRATED(ver) && !esr_disable) {             /* !82489DX */
+       {
+               unsigned oldvalue;
                maxlvt = get_maxlvt();
-               if (maxlvt > 3)         /* Due to the Pentium erratum 3AP. */
-                       apic_write(APIC_ESR, 0);
-               value = apic_read(APIC_ESR);
-               Dprintk("ESR value before enabling vector: %08x\n", value);
-
+               oldvalue = apic_read(APIC_ESR);
                value = ERROR_APIC_VECTOR;      // enables sending errors
-               apic_write_around(APIC_LVTERR, value);
+               apic_write(APIC_LVTERR, value);
                /*
                 * spec says clear errors after enabling vector.
                 */
                if (maxlvt > 3)
                        apic_write(APIC_ESR, 0);
                value = apic_read(APIC_ESR);
-               Dprintk("ESR value after enabling vector: %08x\n", value);
-       } else {
-               if (esr_disable)        
-                       /* 
-                        * Something untraceble is creating bad interrupts on 
-                        * secondary quads ... for the moment, just leave the
-                        * ESR disabled - we can't do anything useful with the
-                        * errors anyway - mbligh
-                        */
-                       printk("Leaving ESR disabled.\n");
-               else 
-                       printk("No ESR for 82489DX.\n");
+               if (value != oldvalue)
+                       apic_printk(APIC_VERBOSE,
+                       "ESR value after enabling vector: %08x, after %08x\n",
+                       oldvalue, value);
        }
 
        nmi_watchdog_default();
-       if (nmi_watchdog == NMI_LOCAL_APIC)
-               setup_apic_nmi_watchdog();
+       setup_apic_nmi_watchdog(NULL);
        apic_pm_activate();
 }
 
@@ -464,28 +455,34 @@ static struct {
        unsigned int apic_thmr;
 } apic_pm_state;
 
-static int lapic_suspend(struct sys_device *dev, u32 state)
+static int lapic_suspend(struct sys_device *dev, pm_message_t state)
 {
        unsigned long flags;
+       int maxlvt;
 
        if (!apic_pm_state.active)
                return 0;
 
+       maxlvt = get_maxlvt();
+
        apic_pm_state.apic_id = apic_read(APIC_ID);
        apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
        apic_pm_state.apic_ldr = apic_read(APIC_LDR);
        apic_pm_state.apic_dfr = apic_read(APIC_DFR);
        apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
        apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
-       apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
+       if (maxlvt >= 4)
+               apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
        apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
        apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
        apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
        apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
        apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
-       apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
-       local_save_flags(flags);
-       local_irq_disable();
+#ifdef CONFIG_X86_MCE_INTEL
+       if (maxlvt >= 5)
+               apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
+#endif
+       local_irq_save(flags);
        disable_local_APIC();
        local_irq_restore(flags);
        return 0;
@@ -495,17 +492,17 @@ static int lapic_resume(struct sys_device *dev)
 {
        unsigned int l, h;
        unsigned long flags;
+       int maxlvt;
 
        if (!apic_pm_state.active)
                return 0;
 
-       /* XXX: Pavel needs this for S3 resume, but can't explain why */
-       set_fixmap_nocache(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE);
+       maxlvt = get_maxlvt();
 
        local_irq_save(flags);
        rdmsr(MSR_IA32_APICBASE, l, h);
        l &= ~MSR_IA32_APICBASE_BASE;
-       l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
+       l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
        wrmsr(MSR_IA32_APICBASE, l, h);
        apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
        apic_write(APIC_ID, apic_pm_state.apic_id);
@@ -515,8 +512,12 @@ static int lapic_resume(struct sys_device *dev)
        apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
        apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
        apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
-       apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
-       apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
+#ifdef CONFIG_X86_MCE_INTEL
+       if (maxlvt >= 5)
+               apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
+#endif
+       if (maxlvt >= 4)
+               apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
        apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
        apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
        apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
@@ -540,7 +541,7 @@ static struct sys_device device_lapic = {
        .cls            = &lapic_sysclass,
 };
 
-static void __init apic_pm_activate(void)
+static void __cpuinit apic_pm_activate(void)
 {
        apic_pm_state.active = 1;
 }
@@ -564,6 +565,27 @@ static void apic_pm_activate(void) { }
 
 #endif /* CONFIG_PM */
 
+static int __init apic_set_verbosity(char *str)
+{
+       if (str == NULL)  {
+               skip_ioapic_setup = 0;
+               ioapic_force = 1;
+               return 0;
+       }
+       if (strcmp("debug", str) == 0)
+               apic_verbosity = APIC_DEBUG;
+       else if (strcmp("verbose", str) == 0)
+               apic_verbosity = APIC_VERBOSE;
+       else {
+               printk(KERN_WARNING "APIC Verbosity level %s not recognised"
+                               " use apic=verbose or apic=debug\n", str);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+early_param("apic", apic_set_verbosity);
+
 /*
  * Detect and enable local APICs on non-SMP boards.
  * Original code written by Keir Fraser.
@@ -583,6 +605,64 @@ static int __init detect_init_APIC (void)
        return 0;
 }
 
+#ifdef CONFIG_X86_IO_APIC
+static struct resource * __init ioapic_setup_resources(void)
+{
+#define IOAPIC_RESOURCE_NAME_SIZE 11
+       unsigned long n;
+       struct resource *res;
+       char *mem;
+       int i;
+
+       if (nr_ioapics <= 0)
+               return NULL;
+
+       n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
+       n *= nr_ioapics;
+
+       mem = alloc_bootmem(n);
+       res = (void *)mem;
+
+       if (mem != NULL) {
+               memset(mem, 0, n);
+               mem += sizeof(struct resource) * nr_ioapics;
+
+               for (i = 0; i < nr_ioapics; i++) {
+                       res[i].name = mem;
+                       res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+                       sprintf(mem,  "IOAPIC %u", i);
+                       mem += IOAPIC_RESOURCE_NAME_SIZE;
+               }
+       }
+
+       ioapic_resources = res;
+
+       return res;
+}
+
+static int __init ioapic_insert_resources(void)
+{
+       int i;
+       struct resource *r = ioapic_resources;
+
+       if (!r) {
+               printk("IO APIC resources could be not be allocated.\n");
+               return -1;
+       }
+
+       for (i = 0; i < nr_ioapics; i++) {
+               insert_resource(&iomem_resource, r);
+               r++;
+       }
+
+       return 0;
+}
+
+/* Insert the IO APIC resources after PCI initialization has occurred to handle
+ * IO APICs that are mapped in on a BAR in PCI space. */
+late_initcall(ioapic_insert_resources);
+#endif
+
 void __init init_apic_mappings(void)
 {
        unsigned long apic_phys;
@@ -599,20 +679,26 @@ void __init init_apic_mappings(void)
                apic_phys = mp_lapic_addr;
 
        set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
-       Dprintk("mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys);
+       apic_mapped = 1;
+       apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys);
+
+       /* Put local APIC into the resource map. */
+       lapic_resource.start = apic_phys;
+       lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
+       insert_resource(&iomem_resource, &lapic_resource);
 
        /*
         * Fetch the APIC ID of the BSP in case we have a
         * default configuration (or the MP table is broken).
         */
-       if (boot_cpu_id == -1U)
-               boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
+       boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
 
-#ifdef CONFIG_X86_IO_APIC
        {
                unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
                int i;
+               struct resource *ioapic_res;
 
+               ioapic_res = ioapic_setup_resources();
                for (i = 0; i < nr_ioapics; i++) {
                        if (smp_found_config) {
                                ioapic_phys = mp_ioapics[i].mpc_apicaddr;
@@ -621,12 +707,17 @@ void __init init_apic_mappings(void)
                                ioapic_phys = __pa(ioapic_phys);
                        }
                        set_fixmap_nocache(idx, ioapic_phys);
-                       Dprintk("mapped IOAPIC to %016lx (%016lx)\n",
+                       apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n",
                                        __fix_to_virt(idx), ioapic_phys);
                        idx++;
+
+                       if (ioapic_res != NULL) {
+                               ioapic_res->start = ioapic_phys;
+                               ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
+                               ioapic_res++;
+                       }
                }
        }
-#endif
 }
 
 /*
@@ -642,25 +733,27 @@ void __init init_apic_mappings(void)
 
 #define APIC_DIVISOR 16
 
-void __setup_APIC_LVTT(unsigned int clocks)
+static void __setup_APIC_LVTT(unsigned int clocks)
 {
-       unsigned int lvtt_value, tmp_value, ver;
+       unsigned int lvtt_value, tmp_value;
+       int cpu = smp_processor_id();
 
-       ver = GET_APIC_VERSION(apic_read(APIC_LVR));
        lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
-       if (!APIC_INTEGRATED(ver))
-               lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
-       apic_write_around(APIC_LVTT, lvtt_value);
+
+       if (cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask))
+               lvtt_value |= APIC_LVT_MASKED;
+
+       apic_write(APIC_LVTT, lvtt_value);
 
        /*
         * Divide PICLK by 16
         */
        tmp_value = apic_read(APIC_TDCR);
-       apic_write_around(APIC_TDCR, (tmp_value
+       apic_write(APIC_TDCR, (tmp_value
                                & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
                                | APIC_TDR_DIV_16);
 
-       apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
+       apic_write(APIC_TMICT, clocks/APIC_DIVISOR);
 }
 
 static void setup_APIC_timer(unsigned int clocks)
@@ -669,14 +762,8 @@ static void setup_APIC_timer(unsigned int clocks)
 
        local_irq_save(flags);
 
-       /* For some reasons this doesn't work on Simics, so fake it for now */ 
-       if (!strstr(boot_cpu_data.x86_model_id, "Screwdriver")) { 
-       __setup_APIC_LVTT(clocks);
-               return;
-       } 
-
        /* wait for irq slice */
-       if (vxtime.hpet_address) {
+       if (vxtime.hpet_address && hpet_use_timer) {
                int trigger = hpet_readl(HPET_T0_CMP);
                while (hpet_readl(HPET_COUNTER) >= trigger)
                        /* do nothing */ ;
@@ -687,16 +774,24 @@ static void setup_APIC_timer(unsigned int clocks)
                outb_p(0x00, 0x43);
                c2 = inb_p(0x40);
                c2 |= inb_p(0x40) << 8;
-       do {
+               do {
                        c1 = c2;
                        outb_p(0x00, 0x43);
                        c2 = inb_p(0x40);
                        c2 |= inb_p(0x40) << 8;
                } while (c2 - c1 < 300);
        }
-
        __setup_APIC_LVTT(clocks);
-
+       /* Turn off PIT interrupt if we use APIC timer as main timer.
+          Only works with the PM timer right now
+          TBD fix it for HPET too. */
+       if (vxtime.mode == VXTIME_PMTMR &&
+               smp_processor_id() == boot_cpu_id &&
+               apic_runs_main_timer == 1 &&
+               !cpu_isset(boot_cpu_id, timer_interrupt_broadcast_ipi_mask)) {
+               stop_timer_interrupt();
+               apic_runs_main_timer++;
+       }
        local_irq_restore(flags);
 }
 
@@ -715,7 +810,7 @@ static void setup_APIC_timer(unsigned int clocks)
 
 #define TICK_COUNT 100000000
 
-int __init calibrate_APIC_clock(void)
+static int __init calibrate_APIC_clock(void)
 {
        int apic, apic_start, tsc, tsc_start;
        int result;
@@ -727,14 +822,27 @@ int __init calibrate_APIC_clock(void)
        __setup_APIC_LVTT(1000000000);
 
        apic_start = apic_read(APIC_TMCCT);
-       rdtscl(tsc_start);
-
-       do {
+#ifdef CONFIG_X86_PM_TIMER
+       if (apic_calibrate_pmtmr && pmtmr_ioport) {
+               pmtimer_wait(5000);  /* 5ms wait */
                apic = apic_read(APIC_TMCCT);
-               rdtscl(tsc);
-       } while ((tsc - tsc_start) < TICK_COUNT && (apic - apic_start) < TICK_COUNT);
+               result = (apic_start - apic) * 1000L / 5;
+       } else
+#endif
+       {
+               rdtscl(tsc_start);
+
+               do {
+                       apic = apic_read(APIC_TMCCT);
+                       rdtscl(tsc);
+               } while ((tsc - tsc_start) < TICK_COUNT &&
+                               (apic - apic_start) < TICK_COUNT);
+
+               result = (apic_start - apic) * 1000L * cpu_khz /
+                                       (tsc - tsc_start);
+       }
+       printk("result %d\n", result);
 
-       result = (apic_start - apic) * 1000L * cpu_khz / (tsc - tsc_start);
 
        printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n",
                result / 1000 / 1000, result / 1000 % 1000);
@@ -765,59 +873,94 @@ void __init setup_boot_APIC_clock (void)
        local_irq_enable();
 }
 
-void __init setup_secondary_APIC_clock(void)
+void __cpuinit setup_secondary_APIC_clock(void)
 {
        local_irq_disable(); /* FIXME: Do we need this? --RR */
        setup_APIC_timer(calibration_result);
        local_irq_enable();
 }
 
-void __init disable_APIC_timer(void)
+void disable_APIC_timer(void)
 {
        if (using_apic_timer) {
                unsigned long v;
 
                v = apic_read(APIC_LVTT);
-               apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
+               /*
+                * When an illegal vector value (0-15) is written to an LVT
+                * entry and delivery mode is Fixed, the APIC may signal an
+                * illegal vector error, without regard to whether the mask
+                * bit is set or whether an interrupt is actually seen on input.
+                *
+                * The boot sequence might call this function when the LVTT
+                * has a '0' vector value, so make sure the vector field is
+                * set to a valid value.
+                */
+               v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+               apic_write(APIC_LVTT, v);
        }
 }
 
 void enable_APIC_timer(void)
 {
-       if (using_apic_timer) {
+       int cpu = smp_processor_id();
+
+       if (using_apic_timer &&
+           !cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) {
                unsigned long v;
 
                v = apic_read(APIC_LVTT);
-               apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED);
+               apic_write(APIC_LVTT, v & ~APIC_LVT_MASKED);
        }
 }
 
-/*
- * the frequency of the profiling timer can be changed
- * by writing a multiplier value into /proc/profile.
- */
-int setup_profiling_timer(unsigned int multiplier)
+void switch_APIC_timer_to_ipi(void *cpumask)
 {
-       int i;
+       cpumask_t mask = *(cpumask_t *)cpumask;
+       int cpu = smp_processor_id();
 
-       /*
-        * Sanity check. [at least 500 APIC cycles should be
-        * between APIC interrupts as a rule of thumb, to avoid
-        * irqs flooding us]
-        */
-       if ( (!multiplier) || (calibration_result/multiplier < 500))
-               return -EINVAL;
+       if (cpu_isset(cpu, mask) &&
+           !cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) {
+               disable_APIC_timer();
+               cpu_set(cpu, timer_interrupt_broadcast_ipi_mask);
+       }
+}
+EXPORT_SYMBOL(switch_APIC_timer_to_ipi);
 
-       /* 
-        * Set the new multiplier for each CPU. CPUs don't start using the
-        * new values until the next timer interrupt in which they do process
-        * accounting. At that time they also adjust their APIC timers
-        * accordingly.
-        */
-       for (i = 0; i < NR_CPUS; ++i)
-               per_cpu(prof_multiplier, i) = multiplier;
+void smp_send_timer_broadcast_ipi(void)
+{
+       cpumask_t mask;
 
-       return 0;
+       cpus_and(mask, cpu_online_map, timer_interrupt_broadcast_ipi_mask);
+       if (!cpus_empty(mask)) {
+               send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
+       }
+}
+
+void switch_ipi_to_APIC_timer(void *cpumask)
+{
+       cpumask_t mask = *(cpumask_t *)cpumask;
+       int cpu = smp_processor_id();
+
+       if (cpu_isset(cpu, mask) &&
+           cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) {
+               cpu_clear(cpu, timer_interrupt_broadcast_ipi_mask);
+               enable_APIC_timer();
+       }
+}
+EXPORT_SYMBOL(switch_ipi_to_APIC_timer);
+
+int setup_profiling_timer(unsigned int multiplier)
+{
+       return -EINVAL;
+}
+
+void setup_APIC_extened_lvt(unsigned char lvt_off, unsigned char vector,
+                           unsigned char msg_type, unsigned char mask)
+{
+       unsigned long reg = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE;
+       unsigned int  v   = (mask << 16) | (msg_type << 8) | vector;
+       apic_write(reg, v);
 }
 
 #undef APIC_DIVISOR
@@ -832,40 +975,19 @@ int setup_profiling_timer(unsigned int multiplier)
  * value into /proc/profile.
  */
 
-void smp_local_timer_interrupt(struct pt_regs *regs)
+void smp_local_timer_interrupt(void)
 {
-       int cpu = smp_processor_id();
-
-       x86_do_profile(regs);
-
-       if (--per_cpu(prof_counter, cpu) <= 0) {
-               /*
-                * The multiplier may have changed since the last time we got
-                * to this point as a result of the user writing to
-                * /proc/profile. In this case we need to adjust the APIC
-                * timer accordingly.
-                *
-                * Interrupts are already masked off at this point.
-                */
-               per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu);
-               if (per_cpu(prof_counter, cpu) != 
-                   per_cpu(prof_old_multiplier, cpu)) {
-                       __setup_APIC_LVTT(calibration_result/
-                                       per_cpu(prof_counter, cpu));
-                       per_cpu(prof_old_multiplier, cpu) =
-                               per_cpu(prof_counter, cpu);
-               }
-
+       profile_tick(CPU_PROFILING);
 #ifdef CONFIG_SMP
-               update_process_times(user_mode(regs));
+       update_process_times(user_mode(get_irq_regs()));
 #endif
-       }
-
+       if (apic_runs_main_timer > 1 && smp_processor_id() == boot_cpu_id)
+               main_timer_handler();
        /*
         * We take the 'long' return path, and there every subsystem
         * grabs the appropriate locks (kernel lock/ irq lock).
         *
-        * we might want to decouple profiling from the 'long path',
+        * We might want to decouple profiling from the 'long path',
         * and do the profiling totally in assembly.
         *
         * Currently this isn't too much of an issue (performance wise),
@@ -883,6 +1005,8 @@ void smp_local_timer_interrupt(struct pt_regs *regs)
  */
 void smp_apic_timer_interrupt(struct pt_regs *regs)
 {
+       struct pt_regs *old_regs = set_irq_regs(regs);
+
        /*
         * the NMI deadlock-detector uses this.
         */
@@ -898,9 +1022,58 @@ void smp_apic_timer_interrupt(struct pt_regs *regs)
         * Besides, if we don't timer interrupts ignore the global
         * interrupt lock, which is the WrongThing (tm) to do.
         */
+       exit_idle();
        irq_enter();
-       smp_local_timer_interrupt(regs);
+       smp_local_timer_interrupt();
        irq_exit();
+       set_irq_regs(old_regs);
+}
+
+/*
+ * apic_is_clustered_box() -- Check if we can expect good TSC
+ *
+ * Thus far, the major user of this is IBM's Summit2 series:
+ *
+ * Clustered boxes may have unsynced TSC problems if they are
+ * multi-chassis. Use available data to take a good guess.
+ * If in doubt, go HPET.
+ */
+__cpuinit int apic_is_clustered_box(void)
+{
+       int i, clusters, zeros;
+       unsigned id;
+       DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
+
+       bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
+
+       for (i = 0; i < NR_CPUS; i++) {
+               id = bios_cpu_apicid[i];
+               if (id != BAD_APICID)
+                       __set_bit(APIC_CLUSTERID(id), clustermap);
+       }
+
+       /* Problem:  Partially populated chassis may not have CPUs in some of
+        * the APIC clusters they have been allocated.  Only present CPUs have
+        * bios_cpu_apicid entries, thus causing zeroes in the bitmap.  Since
+        * clusters are allocated sequentially, count zeros only if they are
+        * bounded by ones.
+        */
+       clusters = 0;
+       zeros = 0;
+       for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
+               if (test_bit(i, clustermap)) {
+                       clusters += 1 + zeros;
+                       zeros = 0;
+               } else
+                       ++zeros;
+       }
+
+       /*
+        * If clusters > 2, then the box should be multi-chassis.
+        * May have to revisit this when multi-core + hyperthreaded CPUs come
+        * out, but AFAIK this will work even for them.
+        */
+       return (clusters > 2);
 }
 
 /*
@@ -909,6 +1082,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs)
 asmlinkage void smp_spurious_interrupt(void)
 {
        unsigned int v;
+       exit_idle();
        irq_enter();
        /*
         * Check if this really is a spurious interrupt and ACK it
@@ -944,6 +1118,7 @@ asmlinkage void smp_error_interrupt(void)
 {
        unsigned int v, v1;
 
+       exit_idle();
        irq_enter();
        /* First tickle the hardware, only then report what went on. -- REW */
        v = apic_read(APIC_ESR);
@@ -962,7 +1137,8 @@ asmlinkage void smp_error_interrupt(void)
           6: Received illegal vector
           7: Illegal register address
        */
-       printk (KERN_INFO "APIC error on CPU%d: %02x(%02x)\n",
+       if (num_online_cpus() > 1)
+               printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
                smp_processor_id(), v , v1);
        irq_exit();
 }
@@ -987,47 +1163,65 @@ int __init APIC_init_uniprocessor (void)
 
        verify_local_APIC();
 
-       connect_bsp_APIC();
-
-       phys_cpu_present_map = physid_mask_of_physid(0);
-       apic_write_around(APIC_ID, boot_cpu_id);
+       phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
+       apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id));
 
        setup_local_APIC();
 
-#ifdef CONFIG_X86_IO_APIC
        if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
-                       setup_IO_APIC();
+               setup_IO_APIC();
        else
                nr_ioapics = 0;
-#endif
        setup_boot_APIC_clock();
-
+       check_nmi_watchdog();
        return 0;
 }
 
 static __init int setup_disableapic(char *str) 
 { 
        disable_apic = 1;
+       clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
        return 0;
-} 
+}
+early_param("disableapic", setup_disableapic);
 
+/* same as disableapic, for compatibility */
 static __init int setup_nolapic(char *str) 
 { 
-       disable_apic = 1;
-       return 0;
+       return setup_disableapic(str);
 } 
+early_param("nolapic", setup_nolapic);
 
 static __init int setup_noapictimer(char *str) 
 { 
+       if (str[0] != ' ' && str[0] != 0)
+               return 0;
        disable_apic_timer = 1;
-       return 0;
+       return 1;
 } 
 
-/* dummy parsing: see setup.c */
+static __init int setup_apicmaintimer(char *str)
+{
+       apic_runs_main_timer = 1;
+       nohpet = 1;
+       return 1;
+}
+__setup("apicmaintimer", setup_apicmaintimer);
 
-__setup("disableapic", setup_disableapic); 
-__setup("nolapic", setup_nolapic);  /* same as disableapic, for compatibility */
+static __init int setup_noapicmaintimer(char *str)
+{
+       apic_runs_main_timer = -1;
+       return 1;
+}
+__setup("noapicmaintimer", setup_noapicmaintimer);
+
+static __init int setup_apicpmtimer(char *s)
+{
+       apic_calibrate_pmtmr = 1;
+       notsc_setup(NULL);
+       return setup_apicmaintimer(NULL);
+}
+__setup("apicpmtimer", setup_apicpmtimer);
 
 __setup("noapictimer", setup_noapictimer); 
 
-/* no "lapic" flag - we only use the lapic when the BIOS tells us so. */