fedora core 6 1.2949 + vserver 2.2.0
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index d47d3f8..5cc76d0 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -8,10 +8,9 @@
  *  Copyright (c) 1995  Markus Kuhn
  *  Copyright (c) 1996  Ingo Molnar
  *  Copyright (c) 1998  Andrea Arcangeli
- *  Copyright (c) 2002  Vojtech Pavlik
+ *  Copyright (c) 2002,2006  Vojtech Pavlik
  *  Copyright (c) 2003  Andi Kleen
  *  RTC support code taken from arch/i386/kernel/timers/time_hpet.c
- *
  */
 
 #include <linux/kernel.h>
 #include <linux/interrupt.h>
 #include <linux/init.h>
 #include <linux/mc146818rtc.h>
-#include <linux/irq.h>
 #include <linux/time.h>
 #include <linux/ioport.h>
 #include <linux/module.h>
 #include <linux/device.h>
 #include <linux/sysdev.h>
 #include <linux/bcd.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
 #include <linux/kallsyms.h>
+#include <linux/acpi.h>
+#ifdef CONFIG_ACPI
+#include <acpi/achware.h>      /* for PM timer frequency */
+#include <acpi/acpi_bus.h>
+#endif
 #include <asm/8253pit.h>
 #include <asm/pgtable.h>
 #include <asm/vsyscall.h>
 #include <asm/timex.h>
 #include <asm/proto.h>
 #include <asm/hpet.h>
+#include <asm/sections.h>
 #include <linux/cpufreq.h>
-#ifdef CONFIG_X86_LOCAL_APIC
+#include <linux/hpet.h>
 #include <asm/apic.h>
-#endif
-
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
 
+#ifdef CONFIG_CPU_FREQ
+static void cpufreq_delayed_get(void);
+#endif
+extern void i8254_timer_resume(void);
 extern int using_apic_timer;
 
-spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED;
-spinlock_t i8253_lock = SPIN_LOCK_UNLOCKED;
+static char *timename = NULL;
+
+DEFINE_SPINLOCK(rtc_lock);
+EXPORT_SYMBOL(rtc_lock);
+DEFINE_SPINLOCK(i8253_lock);
 
-static int nohpet __initdata = 0;
+int nohpet __initdata = 0;
+static int notsc __initdata = 0;
 
-#undef HPET_HACK_ENABLE_DANGEROUS
+#define USEC_PER_TICK (USEC_PER_SEC / HZ)
+#define NSEC_PER_TICK (NSEC_PER_SEC / HZ)
+#define FSEC_PER_TICK (FSEC_PER_SEC / HZ)
 
+#define NS_SCALE       10 /* 2^10, carefully chosen */
+#define US_SCALE       32 /* 2^32, arbitrarily chosen */
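
A note on these two constants (a sketch, not part of the patch): both implement fixed-point division, where a reciprocal is computed once at calibration time so the hot path needs only a multiply and a shift. A minimal user-space illustration of the US_SCALE case, with an assumed 2 GHz TSC:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t cpu_khz = 2000000;               /* assumed 2 GHz TSC */
            /* usec per cycle, scaled by 2^32, as in
             * vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz */
            uint64_t tsc_quot = (1000ULL << 32) / cpu_khz;
            uint64_t delta = 3000000;                 /* cycles since last tick */
            uint64_t usec = (delta * tsc_quot) >> 32; /* == delta*1000/cpu_khz */

            printf("%llu usec\n", (unsigned long long)usec); /* 1499, ~1.5 ms */
            return 0;
    }
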
 
 unsigned int cpu_khz;                                  /* TSC clocks / usec, not used here */
-unsigned long hpet_period;                             /* fsecs / HPET clock */
+EXPORT_SYMBOL(cpu_khz);
+static unsigned long hpet_period;                      /* fsecs / HPET clock */
 unsigned long hpet_tick;                               /* HPET clocks / interrupt */
+int hpet_use_timer;                            /* Use the HPET counter for timekeeping, otherwise the PIT */
 unsigned long vxtime_hz = PIT_TICK_RATE;
 int report_lost_ticks;                         /* command line option */
 unsigned long long monotonic_base;
@@ -62,18 +77,9 @@ unsigned long long monotonic_base;
 struct vxtime_data __vxtime __section_vxtime;  /* for vsyscalls */
 
 volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
-unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
 struct timespec __xtime __section_xtime;
 struct timezone __sys_tz __section_sys_tz;
 
-static inline void rdtscll_sync(unsigned long *tsc)
-{
-#ifdef CONFIG_SMP
-       sync_core();
-#endif
-       rdtscll(*tsc);
-}
-
 /*
  * do_gettimeoffset() returns microseconds since last timer interrupt was
  * triggered by hardware. A memory read of HPET is slower than a register read
@@ -82,21 +88,24 @@ static inline void rdtscll_sync(unsigned long *tsc)
  * timer interrupt has happened already, but vxtime.trigger wasn't updated yet.
  * This is not a problem, because jiffies hasn't updated either. They are bound
  * together by xtime_lock.
-       */
+ */
 
 static inline unsigned int do_gettimeoffset_tsc(void)
 {
        unsigned long t;
        unsigned long x;
-       rdtscll_sync(&t);
-       if (t < vxtime.last_tsc) t = vxtime.last_tsc; /* hack */
-       x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> 32;
+       t = get_cycles_sync();
+       if (t < vxtime.last_tsc) 
+               t = vxtime.last_tsc; /* hack */
+       x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> US_SCALE;
        return x;
 }
 
 static inline unsigned int do_gettimeoffset_hpet(void)
 {
-       return ((hpet_readl(HPET_COUNTER) - vxtime.last) * vxtime.quot) >> 32;
+       /* cap counter read to one tick to avoid inconsistencies */
+       unsigned long counter = hpet_readl(HPET_COUNTER) - vxtime.last;
+       return (min(counter,hpet_tick) * vxtime.quot) >> US_SCALE;
 }
 
 unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc;
@@ -109,32 +118,29 @@ unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc;
 
 void do_gettimeofday(struct timeval *tv)
 {
-       unsigned long seq, t;
+       unsigned long seq;
        unsigned int sec, usec;
 
        do {
                seq = read_seqbegin(&xtime_lock);
 
                sec = xtime.tv_sec;
-               usec = xtime.tv_nsec / 1000;
+               usec = xtime.tv_nsec / NSEC_PER_USEC;
 
                /* i386 does some correction here to keep the clock 
-                  monotonus even when ntpd is fixing drift.
+                  monotonic even when ntpd is fixing drift.
                   But they didn't work for me, there is a non monotonic
                   clock anyways with ntp.
                   I dropped all corrections now until a real solution can
                   be found. Note when you fix it here you need to do the same
                   in arch/x86_64/kernel/vsyscall.c and export all needed
                   variables in vmlinux.lds. -AK */ 
-
-               t = (jiffies - wall_jiffies) * (1000000L / HZ) +
-                       do_gettimeoffset();
-               usec += t;
+               usec += do_gettimeoffset();
 
        } while (read_seqretry(&xtime_lock, seq));
 
-       tv->tv_sec = sec + usec / 1000000;
-       tv->tv_usec = usec % 1000000;
+       tv->tv_sec = sec + usec / USEC_PER_SEC;
+       tv->tv_usec = usec % USEC_PER_SEC;
 }
 
 EXPORT_SYMBOL(do_gettimeofday);
@@ -155,8 +161,7 @@ int do_settimeofday(struct timespec *tv)
 
        write_seqlock_irq(&xtime_lock);
 
-       nsec -= do_gettimeoffset() * 1000 +
-               (jiffies - wall_jiffies) * (NSEC_PER_SEC/HZ);
+       nsec -= do_gettimeoffset() * NSEC_PER_USEC;
 
        wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
        wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
@@ -164,10 +169,7 @@ int do_settimeofday(struct timespec *tv)
        set_normalized_timespec(&xtime, sec, nsec);
        set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-       time_adjust = 0;                /* stop active adjtime() */
-       time_status |= STA_UNSYNC;
-       time_maxerror = NTP_PHASE_LIMIT;
-       time_esterror = NTP_PHASE_LIMIT;
+       ntp_clear();
 
        write_sequnlock_irq(&xtime_lock);
        clock_was_set();
@@ -176,6 +178,24 @@ int do_settimeofday(struct timespec *tv)
 
 EXPORT_SYMBOL(do_settimeofday);
 
+unsigned long profile_pc(struct pt_regs *regs)
+{
+       unsigned long pc = instruction_pointer(regs);
+
+       /* Assume the lock function has either no stack frame or a copy
+          of eflags from PUSHF
+          Eflags always has bits 22 and up cleared unlike kernel addresses. */
+       if (!user_mode(regs) && in_lock_functions(pc)) {
+               unsigned long *sp = (unsigned long *)regs->rsp;
+               if (sp[0] >> 22)
+                       return sp[0];
+               if (sp[1] >> 22)
+                       return sp[1];
+       }
+       return pc;
+}
+EXPORT_SYMBOL(profile_pc);
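
The bit-22 test above works because EFLAGS has no architecturally defined bits above bit 21 (ID), so a PUSHF image on the stack is always a small value below 0x400000, while x86-64 kernel text and stack addresses live in the sign-extended upper half of the address space and survive a right shift by 22. It is still a heuristic: a stack slot that merely happens to hold a kernel pointer passes the same test.
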
+
 /*
  * In order to set the CMOS clock precisely, set_rtc_mmss has to be called 500
  * ms after the second nowtime has started, because when nowtime is written
@@ -214,7 +234,7 @@ static void set_rtc_mmss(unsigned long nowtime)
  * overflow. This avoids messing with unknown time zones but requires your RTC
  * not to be off by more than 15 minutes. Since we're calling it only when
  * our clock is externally synchronized using NTP, this shouldn't be a problem.
-       */
+ */
 
        real_seconds = nowtime % 60;
        real_minutes = nowtime / 60;
@@ -222,19 +242,12 @@ static void set_rtc_mmss(unsigned long nowtime)
                real_minutes += 30;             /* correct for half hour time zone */
        real_minutes %= 60;
 
-#if 0
-       /* AMD 8111 is a really bad time keeper and hits this regularly. 
-          It probably was an attempt to avoid screwing up DST, but ignore
-          that for now. */        
        if (abs(real_minutes - cmos_minutes) >= 30) {
                printk(KERN_WARNING "time.c: can't update CMOS clock "
                       "from %d to %d\n", cmos_minutes, real_minutes);
-       } else
-#endif
-
-       {
-                       BIN_TO_BCD(real_seconds);
-                       BIN_TO_BCD(real_minutes);
+       } else {
+               BIN_TO_BCD(real_seconds);
+               BIN_TO_BCD(real_minutes);
                CMOS_WRITE(real_seconds, RTC_SECONDS);
                CMOS_WRITE(real_minutes, RTC_MINUTES);
        }
@@ -258,6 +271,7 @@ static void set_rtc_mmss(unsigned long nowtime)
  *             Note: This function is required to return accurate
  *             time even in the absence of multiple timer ticks.
  */
+static inline unsigned long long cycles_2_ns(unsigned long long cyc);
 unsigned long long monotonic_clock(void)
 {
        unsigned long seq;
@@ -270,49 +284,87 @@ unsigned long long monotonic_clock(void)
 
                        last_offset = vxtime.last;
                        base = monotonic_base;
-                       this_offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
-
+                       this_offset = hpet_readl(HPET_COUNTER);
                } while (read_seqretry(&xtime_lock, seq));
                offset = (this_offset - last_offset);
-               offset *=(NSEC_PER_SEC/HZ)/hpet_tick;
-               return base + offset;
-       }else{
+               offset *= NSEC_PER_TICK / hpet_tick;
+       } else {
                do {
                        seq = read_seqbegin(&xtime_lock);
 
                        last_offset = vxtime.last_tsc;
                        base = monotonic_base;
                } while (read_seqretry(&xtime_lock, seq));
-               sync_core();
-               rdtscll(this_offset);
-               offset = (this_offset - last_offset)*1000/cpu_khz; 
-               return base + offset;
+               this_offset = get_cycles_sync();
+               offset = cycles_2_ns(this_offset - last_offset);
        }
-
-
+       return base + offset;
 }
 EXPORT_SYMBOL(monotonic_clock);
 
+static noinline void handle_lost_ticks(int lost)
+{
+       static long lost_count;
+       static int warned;
+       if (report_lost_ticks) {
+               printk(KERN_WARNING "time.c: Lost %d timer tick(s)! ", lost);
+               print_symbol("rip %s\n", get_irq_regs()->rip);
+       }
+
+       if (lost_count == 1000 && !warned) {
+               printk(KERN_WARNING "warning: many lost ticks.\n"
+                      KERN_WARNING "Your time source seems to be unstable or "
+                               "some driver is hogging interrupts\n");
+               print_symbol("rip %s\n", get_irq_regs()->rip);
+               if (vxtime.mode == VXTIME_TSC && vxtime.hpet_address) {
+                       printk(KERN_WARNING "Falling back to HPET\n");
+                       if (hpet_use_timer)
+                               vxtime.last = hpet_readl(HPET_T0_CMP) - 
+                                                       hpet_tick;
+                       else
+                               vxtime.last = hpet_readl(HPET_COUNTER);
+                       vxtime.mode = VXTIME_HPET;
+                       do_gettimeoffset = do_gettimeoffset_hpet;
+               }
+               /* else should fall back to PIT, but code missing. */
+               warned = 1;
+       } else
+               lost_count++;
 
-static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+#ifdef CONFIG_CPU_FREQ
+       /* In some cases the CPU can change frequency without us noticing.
+          Give cpufreq a chance to catch up. */
+       if ((lost_count+1) % 25 == 0)
+               cpufreq_delayed_get();
+#endif
+}
+
+void main_timer_handler(void)
 {
        static unsigned long rtc_update = 0;
-       unsigned long tsc, lost = 0;
-       int delay, offset = 0;
+       unsigned long tsc;
+       int delay = 0, offset = 0, lost = 0;
 
 /*
  * Here we are in the timer irq handler. We have irqs locally disabled (so we
  * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running
  * on the other CPU, so we need a lock. We also need to lock the vsyscall
  * variables, because both do_timer() and us change them -arca+vojtech
-       */
+ */
 
        write_seqlock(&xtime_lock);
 
-       if (vxtime.hpet_address) {
+       if (vxtime.hpet_address)
+               offset = hpet_readl(HPET_COUNTER);
+
+       if (hpet_use_timer) {
+               /* if we're using the hpet timer functionality,
+                * we can more accurately know the counter value
+                * when the timer interrupt occurred.
+                */
                offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
                delay = hpet_readl(HPET_COUNTER) - offset;
-       } else {
+       } else if (!pmtmr_ioport) {
                spin_lock(&i8253_lock);
                outb_p(0x00, 0x43);
                delay = inb_p(0x40);
@@ -321,7 +373,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
                delay = LATCH - 1 - delay;
        }
 
-       rdtscll_sync(&tsc);
+       tsc = get_cycles_sync();
 
        if (vxtime.mode == VXTIME_HPET) {
                if (offset - vxtime.last > hpet_tick) {
@@ -329,45 +381,48 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
                }
 
                monotonic_base += 
-                       (offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick;
+                       (offset - vxtime.last) * NSEC_PER_TICK / hpet_tick;
 
                vxtime.last = offset;
+#ifdef CONFIG_X86_PM_TIMER
+       } else if (vxtime.mode == VXTIME_PMTMR) {
+               lost = pmtimer_mark_offset();
+#endif
        } else {
                offset = (((tsc - vxtime.last_tsc) *
-                          vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ);
+                          vxtime.tsc_quot) >> US_SCALE) - USEC_PER_TICK;
 
                if (offset < 0)
                        offset = 0;
 
-               if (offset > (USEC_PER_SEC / HZ)) {
-                       lost = offset / (USEC_PER_SEC / HZ);
-                       offset %= (USEC_PER_SEC / HZ);
+               if (offset > USEC_PER_TICK) {
+                       lost = offset / USEC_PER_TICK;
+                       offset %= USEC_PER_TICK;
                }
 
-               monotonic_base += (tsc - vxtime.last_tsc)*1000000/cpu_khz ;
+               monotonic_base += cycles_2_ns(tsc - vxtime.last_tsc);
 
                vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot;
 
                if ((((tsc - vxtime.last_tsc) *
-                     vxtime.tsc_quot) >> 32) < offset)
+                     vxtime.tsc_quot) >> US_SCALE) < offset)
                        vxtime.last_tsc = tsc -
-                               (((long) offset << 32) / vxtime.tsc_quot) - 1;
+                               (((long) offset << US_SCALE) / vxtime.tsc_quot) - 1;
        }
 
-       if (lost) {
-               if (report_lost_ticks) {
-                       printk(KERN_WARNING "time.c: Lost %ld timer "
-                              "tick(s)! ", lost);
-                       print_symbol("rip %s)\n", regs->rip);
-               }
-               jiffies += lost;
-       }
+       if (lost > 0)
+               handle_lost_ticks(lost);
+       else
+               lost = 0;
 
 /*
  * Do the timer stuff.
  */
 
-       do_timer(regs);
+       do_timer(lost + 1);
+#ifndef CONFIG_SMP
+       update_process_times(user_mode(get_irq_regs()));
+#endif
 
 /*
  * In the SMP case we use the local APIC timer interrupt to do the profiling,
@@ -375,12 +430,8 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
  * have to call the local interrupt handler.
  */
 
-#ifndef CONFIG_X86_LOCAL_APIC
-       x86_do_profile(regs);
-#else
        if (!using_apic_timer)
-               smp_local_timer_interrupt(regs);
-#endif
+               smp_local_timer_interrupt();
 
 /*
  * If we have an externally synchronized Linux clock, then update CMOS clock
@@ -390,28 +441,35 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
  * off) isn't likely to go away much sooner anyway.
  */
 
-       if ((~time_status & STA_UNSYNC) && xtime.tv_sec > rtc_update &&
+       if (ntp_synced() && xtime.tv_sec > rtc_update &&
                abs(xtime.tv_nsec - 500000000) <= tick_nsec / 2) {
                set_rtc_mmss(xtime.tv_sec);
                rtc_update = xtime.tv_sec + 660;
        }
  
        write_sequnlock(&xtime_lock);
+}
 
+static irqreturn_t timer_interrupt(int irq, void *dev_id)
+{
+       if (apic_runs_main_timer > 1)
+               return IRQ_HANDLED;
+       main_timer_handler();
+       if (using_apic_timer)
+               smp_send_timer_broadcast_ipi();
        return IRQ_HANDLED;
 }
 
-static unsigned int cyc2ns_scale;
-#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
+static unsigned int cyc2ns_scale __read_mostly;
 
-static inline void set_cyc2ns_scale(unsigned long cpu_mhz)
+static inline void set_cyc2ns_scale(unsigned long cpu_khz)
 {
-       cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz;
+       cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / cpu_khz;
 }
 
 static inline unsigned long long cycles_2_ns(unsigned long long cyc)
 {
-       return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
+       return (cyc * cyc2ns_scale) >> NS_SCALE;
 }
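
Why NS_SCALE is only 10 while US_SCALE can be a full 32 (a rough check under assumed numbers, not from the patch): cycles_2_ns() multiplies a raw cycle count before shifting, so the scale factor has to leave 64-bit headroom for the product:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t cpu_khz = 2000000;     /* assumed 2 GHz TSC */
            /* (NSEC_PER_MSEC << NS_SCALE) / cpu_khz */
            uint64_t scale = (1000000ULL << 10) / cpu_khz;  /* 512 */
            /* cycles until cyc * scale would overflow 64 bits */
            uint64_t horizon = UINT64_MAX / scale;

            printf("scale=%llu, overflow after ~%llu days\n",
                   (unsigned long long)scale,
                   (unsigned long long)(horizon / (cpu_khz * 1000) / 86400));
            return 0;
    }

With a 32-bit shift the same multiply would overflow after only a few seconds' worth of cycles, which is why the nanosecond path settles for 10 bits of fraction.
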
 
 unsigned long long sched_clock(void)
@@ -424,7 +482,7 @@ unsigned long long sched_clock(void)
            Disadvantage is a small drift between CPUs in some configurations,
           but that should be tolerable. */
        if (__vxtime.mode == VXTIME_HPET)
-               return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> 32;
+               return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> US_SCALE;
 #endif
 
        /* Could do CPU core sync here. Opteron can execute rdtsc speculatively,
@@ -436,63 +494,53 @@ unsigned long long sched_clock(void)
        return cycles_2_ns(a);
 }
 
-unsigned long get_cmos_time(void)
+static unsigned long get_cmos_time(void)
 {
-       unsigned int timeout, year, mon, day, hour, min, sec;
-       unsigned char last, this;
+       unsigned int year, mon, day, hour, min, sec;
        unsigned long flags;
-
-/*
- * The Linux interpretation of the CMOS clock register contents: When the
- * Update-In-Progress (UIP) flag goes from 1 to 0, the RTC registers show the
- * second which has precisely just started. Waiting for this can take up to 1
- * second, we timeout approximately after 2.4 seconds on a machine with
- * standard 8.3 MHz ISA bus.
- */
+       unsigned extyear = 0;
 
        spin_lock_irqsave(&rtc_lock, flags);
 
-       timeout = 1000000;
-       last = this = 0;
-
-       while (timeout && last && !this) {
-               last = this;
-               this = CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP;
-               timeout--;
-       }
-
-/*
- * Here we are safe to assume the registers won't change for a whole second, so
- * we just go ahead and read them.
-        */
-
+       do {
                sec = CMOS_READ(RTC_SECONDS);
                min = CMOS_READ(RTC_MINUTES);
                hour = CMOS_READ(RTC_HOURS);
                day = CMOS_READ(RTC_DAY_OF_MONTH);
                mon = CMOS_READ(RTC_MONTH);
                year = CMOS_READ(RTC_YEAR);
+#ifdef CONFIG_ACPI
+               if (acpi_fadt.revision >= FADT2_REVISION_ID &&
+                                       acpi_fadt.century)
+                       extyear = CMOS_READ(acpi_fadt.century);
+#endif
+       } while (sec != CMOS_READ(RTC_SECONDS));
 
        spin_unlock_irqrestore(&rtc_lock, flags);
 
-/*
- * We know that x86-64 always uses BCD format, no need to check the config
- * register.
- */
-
-           BCD_TO_BIN(sec);
-           BCD_TO_BIN(min);
-           BCD_TO_BIN(hour);
-           BCD_TO_BIN(day);
-           BCD_TO_BIN(mon);
-           BCD_TO_BIN(year);
-
-/*
- * This will work up to Dec 31, 2069.
- */
-
-       if ((year += 1900) < 1970)
-               year += 100;
+       /*
+        * We know that x86-64 always uses BCD format, no need to check the
+        * config register.
+        */
+
+       BCD_TO_BIN(sec);
+       BCD_TO_BIN(min);
+       BCD_TO_BIN(hour);
+       BCD_TO_BIN(day);
+       BCD_TO_BIN(mon);
+       BCD_TO_BIN(year);
+
+       if (extyear) {
+               BCD_TO_BIN(extyear);
+               year += extyear;
+               printk(KERN_INFO "Extended CMOS year: %d\n", extyear);
+       } else { 
+               /*
+                * x86-64 systems have only existed since 2002.
+                * This will work up to Dec 31, 2099.
+                */
+               year += 2000;
+       }
 
        return mktime(year, mon, day, hour, min, sec);
 }
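
The BCD_TO_BIN calls undo the RTC's two-decimal-digits-per-byte encoding (the macro lives in <linux/bcd.h>); the arithmetic is just nibble splitting, sketched here stand-alone:

    #include <stdio.h>

    /* same arithmetic as the kernel's BCD_TO_BIN: low nibble is the ones
     * digit, high nibble the tens digit */
    static unsigned int bcd_to_bin(unsigned int x)
    {
            return (x & 0x0f) + (x >> 4) * 10;
    }

    int main(void)
    {
            printf("%u\n", bcd_to_bin(0x59));       /* RTC_SECONDS 0x59 -> 59 */
            printf("%u\n", bcd_to_bin(0x12));       /* RTC_HOURS   0x12 -> 12 */
            return 0;
    }
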
@@ -509,6 +557,39 @@ unsigned long get_cmos_time(void)
    Should fix up last_tsc too. Currently gettimeofday in the
    first tick after the change will be slightly wrong. */
 
+#include <linux/workqueue.h>
+
+static unsigned int cpufreq_delayed_issched = 0;
+static unsigned int cpufreq_init = 0;
+static struct work_struct cpufreq_delayed_get_work;
+
+static void handle_cpufreq_delayed_get(struct work_struct *v)
+{
+       unsigned int cpu;
+       for_each_online_cpu(cpu) {
+               cpufreq_get(cpu);
+       }
+       cpufreq_delayed_issched = 0;
+}
+
+/* If we notice lost ticks, schedule a call to cpufreq_get(), which tries
+ * to verify that the CPU frequency the timing core thinks the CPU is
+ * running at is still correct.
+ */
+static void cpufreq_delayed_get(void)
+{
+       static int warned;
+       if (cpufreq_init && !cpufreq_delayed_issched) {
+               cpufreq_delayed_issched = 1;
+               if (!warned) {
+                       warned = 1;
+                       printk(KERN_DEBUG 
+       "Losing some ticks... checking if CPU frequency changed.\n");
+               }
+               schedule_work(&cpufreq_delayed_get_work);
+       }
+}
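
The indirection through schedule_work() is the point here: handle_lost_ticks() runs in hard-interrupt context with xtime_lock held, while cpufreq_get() can sleep, so the frequency query has to be bounced out to process context. The cpufreq_delayed_issched flag keeps at most one such work item in flight at a time.
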
+
 static unsigned int  ref_freq = 0;
 static unsigned long loops_per_jiffy_ref = 0;
 
@@ -518,12 +599,17 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
                                 void *data)
 {
         struct cpufreq_freqs *freq = data;
-       unsigned long *lpj;
+       unsigned long *lpj, dummy;
 
+       if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC))
+               return 0;
+
+       lpj = &dummy;
+       if (!(freq->flags & CPUFREQ_CONST_LOOPS))
 #ifdef CONFIG_SMP
-       lpj = &cpu_data[freq->cpu].loops_per_jiffy;
+               lpj = &cpu_data[freq->cpu].loops_per_jiffy;
 #else
-       lpj = &boot_cpu_data.loops_per_jiffy;
+               lpj = &boot_cpu_data.loops_per_jiffy;
 #endif
 
        if (!ref_freq) {
@@ -538,10 +624,11 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
                cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
 
                cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new);
-               vxtime.tsc_quot = (1000L << 32) / cpu_khz;
+               if (!(freq->flags & CPUFREQ_CONST_LOOPS))
+                       vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz;
        }
        
-       set_cyc2ns_scale(cpu_khz_ref / 1000);
+       set_cyc2ns_scale(cpu_khz_ref);
 
        return 0;
 }
@@ -549,6 +636,18 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
 static struct notifier_block time_cpufreq_notifier_block = {
          .notifier_call  = time_cpufreq_notifier
 };
+
+static int __init cpufreq_tsc(void)
+{
+       INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get);
+       if (!cpufreq_register_notifier(&time_cpufreq_notifier_block,
+                                      CPUFREQ_TRANSITION_NOTIFIER))
+               cpufreq_init = 1;
+       return 0;
+}
+
+core_initcall(cpufreq_tsc);
+
 #endif
 
 /*
@@ -557,6 +656,25 @@ static struct notifier_block time_cpufreq_notifier_block = {
  */
 
 #define TICK_COUNT 100000000
+#define TICK_MIN   5000
+
+/*
+ * Some platforms take periodic SMI interrupts with 5ms duration. Make sure none
+ * occurs between the reads of the hpet & TSC.
+ */
+static void __init read_hpet_tsc(int *hpet, int *tsc)
+{
+       int tsc1, tsc2, hpet1;
+
+       do {
+               tsc1 = get_cycles_sync();
+               hpet1 = hpet_readl(HPET_COUNTER);
+               tsc2 = get_cycles_sync();
+       } while (tsc2 - tsc1 > TICK_MIN);
+       *hpet = hpet1;
+       *tsc = tsc2;
+}
+
 
 static unsigned int __init hpet_calibrate_tsc(void)
 {
@@ -567,14 +685,11 @@ static unsigned int __init hpet_calibrate_tsc(void)
        local_irq_save(flags);
        local_irq_disable();
 
-       hpet_start = hpet_readl(HPET_COUNTER);
-       rdtscl(tsc_start);
+       read_hpet_tsc(&hpet_start, &tsc_start);
 
        do {
                local_irq_disable();
-               hpet_now = hpet_readl(HPET_COUNTER);
-               sync_core();
-               rdtscl(tsc_now);
+               read_hpet_tsc(&hpet_now, &tsc_now);
                local_irq_restore(flags);
        } while ((tsc_now - tsc_start) < TICK_COUNT &&
                 (hpet_now - hpet_start) < TICK_COUNT);
@@ -604,42 +719,67 @@ static unsigned int __init pit_calibrate_tsc(void)
        outb(0xb0, 0x43);
        outb((PIT_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
        outb((PIT_TICK_RATE / (1000 / 50)) >> 8, 0x42);
-       rdtscll(start);
-       sync_core();
+       start = get_cycles_sync();
        while ((inb(0x61) & 0x20) == 0);
-       sync_core();
-       rdtscll(end);
+       end = get_cycles_sync();
 
        spin_unlock_irqrestore(&i8253_lock, flags);
        
        return (end - start) / 50;
 }
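
Worked numbers for the gate above (0xb0 programs PIT channel 2, mode 0, binary; the channel-2 gate itself is driven via port 0x61 in code outside this hunk): the loaded count is PIT_TICK_RATE / (1000/50) = 1193182 / 20 = 59659 PIT ticks, which at 1.193182 MHz is exactly 50 ms. Dividing the elapsed TSC cycles by 50 therefore gives cycles per millisecond, i.e. cpu_khz.
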
 
-static int hpet_init(void)
+#ifdef CONFIG_HPET
+static __init int late_hpet_init(void)
 {
-       unsigned int cfg, id;
+       struct hpet_data        hd;
+       unsigned int            ntimer;
 
        if (!vxtime.hpet_address)
-               return -1;
-       set_fixmap_nocache(FIX_HPET_BASE, vxtime.hpet_address);
-       __set_fixmap(VSYSCALL_HPET, vxtime.hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
+               return 0;
 
-/*
- * Read the period, compute tick and quotient.
- */
+       memset(&hd, 0, sizeof (hd));
 
-       id = hpet_readl(HPET_ID);
+       ntimer = hpet_readl(HPET_ID);
+       ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;
+       ntimer++;
 
-       if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER) ||
-           !(id & HPET_ID_LEGSUP))
-               return -1;
+       /*
+        * Register with driver.
+        * Timer0 and Timer1 are used by the platform.
+        */
+       hd.hd_phys_address = vxtime.hpet_address;
+       hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE);
+       hd.hd_nirqs = ntimer;
+       hd.hd_flags = HPET_DATA_PLATFORM;
+       hpet_reserve_timer(&hd, 0);
+#ifdef CONFIG_HPET_EMULATE_RTC
+       hpet_reserve_timer(&hd, 1);
+#endif
+       hd.hd_irq[0] = HPET_LEGACY_8254;
+       hd.hd_irq[1] = HPET_LEGACY_RTC;
+       if (ntimer > 2) {
+               struct hpet             *hpet;
+               struct hpet_timer       *timer;
+               int                     i;
+
+               hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE);
+               timer = &hpet->hpet_timers[2];
+               for (i = 2; i < ntimer; timer++, i++)
+                       hd.hd_irq[i] = (timer->hpet_config &
+                                       Tn_INT_ROUTE_CNF_MASK) >>
+                               Tn_INT_ROUTE_CNF_SHIFT;
 
-       hpet_period = hpet_readl(HPET_PERIOD);
-       if (hpet_period < 100000 || hpet_period > 100000000)
-               return -1;
+       }
 
-       hpet_tick = (1000000000L * (USEC_PER_SEC / HZ) + hpet_period / 2) /
-               hpet_period;
+       hpet_alloc(&hd);
+       return 0;
+}
+fs_initcall(late_hpet_init);
+#endif
+
+static int hpet_timer_stop_set_go(unsigned long tick)
+{
+       unsigned int cfg;
 
 /*
  * Stop the timers and reset the main counter.
@@ -655,33 +795,94 @@ static int hpet_init(void)
  * Set up timer 0, as periodic with first interrupt to happen at hpet_tick,
  * and period also hpet_tick.
  */
-
-       hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
+       if (hpet_use_timer) {
+               hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
                    HPET_TN_32BIT, HPET_T0_CFG);
-       hpet_writel(hpet_tick, HPET_T0_CMP);
-       hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice? */
-
+               hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */
+               hpet_writel(hpet_tick, HPET_T0_CMP); /* period */
+               cfg |= HPET_CFG_LEGACY;
+       }
 /*
  * Go!
  */
 
-       cfg |= HPET_CFG_ENABLE | HPET_CFG_LEGACY;
+       cfg |= HPET_CFG_ENABLE;
        hpet_writel(cfg, HPET_CFG);
 
        return 0;
 }
 
-void __init pit_init(void)
+static int hpet_init(void)
+{
+       unsigned int id;
+
+       if (!vxtime.hpet_address)
+               return -1;
+       set_fixmap_nocache(FIX_HPET_BASE, vxtime.hpet_address);
+       __set_fixmap(VSYSCALL_HPET, vxtime.hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
+
+/*
+ * Read the period, compute tick and quotient.
+ */
+
+       id = hpet_readl(HPET_ID);
+
+       if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER))
+               return -1;
+
+       hpet_period = hpet_readl(HPET_PERIOD);
+       if (hpet_period < 100000 || hpet_period > 100000000)
+               return -1;
+
+       hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period;
+
+       hpet_use_timer = (id & HPET_ID_LEGSUP);
+
+       return hpet_timer_stop_set_go(hpet_tick);
+}
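
hpet_period is reported by hardware in femtoseconds per HPET tick, so the quotient above converts one kernel tick into HPET counts, rounding to nearest by pre-adding half the divisor. Worked numbers under an assumed 14.31818 MHz HPET at HZ=1000:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long fsec_per_tick = 1000000000000ULL; /* 10^15/HZ */
            unsigned long long hpet_period = 69841279;  /* fs per HPET tick */

            /* round to nearest: add half the divisor before dividing */
            unsigned long long hpet_tick =
                    (fsec_per_tick + hpet_period / 2) / hpet_period;

            printf("hpet_tick = %llu\n", hpet_tick);    /* 14318 counts/tick */
            return 0;
    }
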
+
+static int hpet_reenable(void)
+{
+       return hpet_timer_stop_set_go(hpet_tick);
+}
+
+#define PIT_MODE 0x43
+#define PIT_CH0  0x40
+
+static void __init __pit_init(int val, u8 mode)
 {
        unsigned long flags;
 
        spin_lock_irqsave(&i8253_lock, flags);
-       outb_p(0x34, 0x43);             /* binary, mode 2, LSB/MSB, ch 0 */
-       outb_p(LATCH & 0xff, 0x40);     /* LSB */
-       outb_p(LATCH >> 8, 0x40);       /* MSB */
+       outb_p(mode, PIT_MODE);
+       outb_p(val & 0xff, PIT_CH0);    /* LSB */
+       outb_p(val >> 8, PIT_CH0);      /* MSB */
        spin_unlock_irqrestore(&i8253_lock, flags);
 }
 
+void __init pit_init(void)
+{
+       __pit_init(LATCH, 0x34); /* binary, mode 2, LSB/MSB, ch 0 */
+}
+
+void __init pit_stop_interrupt(void)
+{
+       __pit_init(0, 0x30); /* mode 0 */
+}
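
For reference, the i8254 command byte decodes as bits 7-6 channel, bits 5-4 access mode, bits 3-1 operating mode, bit 0 BCD. So 0x34 is channel 0, LSB-then-MSB access, mode 2 (rate generator), binary, exactly as the comment says, while 0x30 selects mode 0 (interrupt on terminal count), which fires at most once and never reloads, effectively silencing the periodic tick.
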
+
+void __init stop_timer_interrupt(void)
+{
+       char *name;
+       if (vxtime.hpet_address) {
+               name = "HPET";
+               hpet_timer_stop_set_go(0);
+       } else {
+               name = "PIT";
+               pit_stop_interrupt();
+       }
+       printk(KERN_INFO "timer: %s interrupt stopped.\n", name);
+}
+
 int __init time_setup(char *str)
 {
        report_lost_ticks = 1;
@@ -689,27 +890,11 @@ int __init time_setup(char *str)
 }
 
 static struct irqaction irq0 = {
-       timer_interrupt, SA_INTERRUPT, 0, "timer", NULL, NULL
+       timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL
 };
 
-extern void __init config_acpi_tables(void);
-
 void __init time_init(void)
 {
-       char *timename;
-
-#ifdef HPET_HACK_ENABLE_DANGEROUS
-        if (!vxtime.hpet_address) {
-               printk(KERN_WARNING "time.c: WARNING: Enabling HPET base "
-                      "manually!\n");
-                outl(0x800038a0, 0xcf8);
-                outl(0xff000001, 0xcfc);
-                outl(0x800038a0, 0xcf8);
-                hpet_address = inl(0xcfc) & 0xfffffffe;
-               printk(KERN_WARNING "time.c: WARNING: Enabled HPET "
-                      "at %#lx.\n", hpet_address);
-        }
-#endif
        if (nohpet)
                vxtime.hpet_address = 0;
 
@@ -719,94 +904,196 @@ void __init time_init(void)
        set_normalized_timespec(&wall_to_monotonic,
                                -xtime.tv_sec, -xtime.tv_nsec);
 
-       if (!hpet_init()) {
-                vxtime_hz = (1000000000000000L + hpet_period / 2) /
-                       hpet_period;
+       if (!hpet_init())
+                vxtime_hz = (FSEC_PER_SEC + hpet_period / 2) / hpet_period;
+       else
+               vxtime.hpet_address = 0;
+
+       if (hpet_use_timer) {
+               /* set tick_nsec to use the proper rate for HPET */
+               tick_nsec = TICK_NSEC_HPET;
                cpu_khz = hpet_calibrate_tsc();
                timename = "HPET";
+#ifdef CONFIG_X86_PM_TIMER
+       } else if (pmtmr_ioport && !vxtime.hpet_address) {
+               vxtime_hz = PM_TIMER_FREQUENCY;
+               timename = "PM";
+               pit_init();
+               cpu_khz = pit_calibrate_tsc();
+#endif
        } else {
-       pit_init();
-       cpu_khz = pit_calibrate_tsc();
+               pit_init();
+               cpu_khz = pit_calibrate_tsc();
                timename = "PIT";
        }
 
-       printk(KERN_INFO "time.c: Using %ld.%06ld MHz %s timer.\n",
-              vxtime_hz / 1000000, vxtime_hz % 1000000, timename);
-       printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
-               cpu_khz / 1000, cpu_khz % 1000);
        vxtime.mode = VXTIME_TSC;
-       vxtime.quot = (1000000L << 32) / vxtime_hz;
-       vxtime.tsc_quot = (1000L << 32) / cpu_khz;
-       vxtime.hz = vxtime_hz;
-       rdtscll_sync(&vxtime.last_tsc);
+       vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz;
+       vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz;
+       vxtime.last_tsc = get_cycles_sync();
+       set_cyc2ns_scale(cpu_khz);
        setup_irq(0, &irq0);
 
-       set_cyc2ns_scale(cpu_khz / 1000);
+#ifndef CONFIG_SMP
+       time_init_gtod();
+#endif
+}
 
-#ifdef CONFIG_CPU_FREQ
-       cpufreq_register_notifier(&time_cpufreq_notifier_block, 
-                                 CPUFREQ_TRANSITION_NOTIFIER);
+/*
+ * Make an educated guess if the TSC is trustworthy and synchronized
+ * over all CPUs.
+ */
+__cpuinit int unsynchronized_tsc(void)
+{
+#ifdef CONFIG_SMP
+       if (apic_is_clustered_box())
+               return 1;
 #endif
+       /* Most intel systems have synchronized TSCs except for
+          multi node systems */
+       if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+#ifdef CONFIG_ACPI
+               /* But TSC doesn't tick in C3 so don't use it there */
+               if (acpi_fadt.length > 0 && acpi_fadt.plvl3_lat < 1000)
+                       return 1;
+#endif
+               return 0;
+       }
+
+       /* Assume multi socket systems are not synchronized */
+       return num_present_cpus() > 1;
 }
 
-void __init time_init_smp(void)
+/*
+ * Decide what mode gettimeofday should use.
+ */
+void time_init_gtod(void)
 {
        char *timetype;
 
-       if (vxtime.hpet_address) {
-               timetype = "HPET";
-               vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick;
+       if (unsynchronized_tsc())
+               notsc = 1;
+
+       if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
+               vgetcpu_mode = VGETCPU_RDTSCP;
+       else
+               vgetcpu_mode = VGETCPU_LSL;
+
+       if (vxtime.hpet_address && notsc) {
+               timetype = hpet_use_timer ? "HPET" : "PIT/HPET";
+               if (hpet_use_timer)
+                       vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick;
+               else
+                       vxtime.last = hpet_readl(HPET_COUNTER);
                vxtime.mode = VXTIME_HPET;
                do_gettimeoffset = do_gettimeoffset_hpet;
+#ifdef CONFIG_X86_PM_TIMER
+       /* Using PM for gettimeofday is quite slow, but we have no other
+          choice because the TSC is too unreliable on some systems. */
+       } else if (pmtmr_ioport && !vxtime.hpet_address && notsc) {
+               timetype = "PM";
+               do_gettimeoffset = do_gettimeoffset_pm;
+               vxtime.mode = VXTIME_PMTMR;
+               sysctl_vsyscall = 0;
+               printk(KERN_INFO "Disabling vsyscall due to use of PM timer\n");
+#endif
        } else {
-               timetype = "PIT/TSC";
+               timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC";
                vxtime.mode = VXTIME_TSC;
        }
-       printk(KERN_INFO "time.c: Using %s based timekeeping.\n", timetype);
+
+       printk(KERN_INFO "time.c: Using %ld.%06ld MHz WALL %s GTOD %s timer.\n",
+              vxtime_hz / 1000000, vxtime_hz % 1000000, timename, timetype);
+       printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
+               cpu_khz / 1000, cpu_khz % 1000);
+       vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz;
+       vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz;
+       vxtime.last_tsc = get_cycles_sync();
+
+       set_cyc2ns_scale(cpu_khz);
 }
 
 __setup("report_lost_ticks", time_setup);
 
 static long clock_cmos_diff;
+static unsigned long sleep_start;
+
+/*
+ * sysfs support for the timer.
+ */
 
-static int time_suspend(struct sys_device *dev, u32 state)
+static int timer_suspend(struct sys_device *dev, pm_message_t state)
 {
        /*
         * Estimate time zone so that set_time can update the clock
         */
-       clock_cmos_diff = -get_cmos_time();
+       long cmos_time =  get_cmos_time();
+
+       clock_cmos_diff = -cmos_time;
        clock_cmos_diff += get_seconds();
+       sleep_start = cmos_time;
        return 0;
 }
 
-static int time_resume(struct sys_device *dev)
+static int timer_resume(struct sys_device *dev)
 {
-       unsigned long sec = get_cmos_time() + clock_cmos_diff;
-       write_seqlock_irq(&xtime_lock);
+       unsigned long flags;
+       unsigned long sec;
+       unsigned long ctime = get_cmos_time();
+       long sleep_length = (ctime - sleep_start) * HZ;
+
+       if (sleep_length < 0) {
+               printk(KERN_WARNING "Time skew detected in timer resume!\n");
+               /* The time after the resume must not be earlier than the time
+                * before the suspend or some nasty things will happen
+                */
+               sleep_length = 0;
+               ctime = sleep_start;
+       }
+       if (vxtime.hpet_address)
+               hpet_reenable();
+       else
+               i8254_timer_resume();
+
+       sec = ctime + clock_cmos_diff;
+       write_seqlock_irqsave(&xtime_lock,flags);
        xtime.tv_sec = sec;
        xtime.tv_nsec = 0;
-       write_sequnlock_irq(&xtime_lock);
+       if (vxtime.mode == VXTIME_HPET) {
+               if (hpet_use_timer)
+                       vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick;
+               else
+                       vxtime.last = hpet_readl(HPET_COUNTER);
+#ifdef CONFIG_X86_PM_TIMER
+       } else if (vxtime.mode == VXTIME_PMTMR) {
+               pmtimer_resume();
+#endif
+       } else
+               vxtime.last_tsc = get_cycles_sync();
+       write_sequnlock_irqrestore(&xtime_lock,flags);
+       jiffies += sleep_length;
+       monotonic_base += sleep_length * (NSEC_PER_SEC/HZ);
+       touch_softlockup_watchdog();
        return 0;
 }
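
The resume-side bookkeeping keeps the three clocks consistent: sleep_length is whole seconds of suspend converted to jiffies, so an assumed 30 s suspend at HZ=1000 adds 30000 to jiffies and 30000 * (NSEC_PER_SEC/HZ) = 3 * 10^10 ns to monotonic_base, while xtime is rebuilt outright from the RTC plus the offset cached at suspend time.
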
 
-static struct sysdev_class pit_sysclass = {
-       .resume = time_resume,
-       .suspend = time_suspend,
-       set_kset_name("pit"),
+static struct sysdev_class timer_sysclass = {
+       .resume = timer_resume,
+       .suspend = timer_suspend,
+       set_kset_name("timer"),
 };
 
-
 /* XXX this driverfs stuff should probably go elsewhere later -john */
-static struct sys_device device_i8253 = {
+static struct sys_device device_timer = {
        .id     = 0,
-       .cls    = &pit_sysclass,
+       .cls    = &timer_sysclass,
 };
 
 static int time_init_device(void)
 {
-       int error = sysdev_class_register(&pit_sysclass);
+       int error = sysdev_class_register(&timer_sysclass);
        if (!error)
-               error = sysdev_register(&device_i8253);
+               error = sysdev_register(&device_timer);
        return error;
 }
 
@@ -827,11 +1114,8 @@ device_initcall(time_init_device);
  * For (3), we use interrupts at 64Hz or user specified periodic
  * frequency, whichever is higher.
  */
-#include <linux/mc146818rtc.h>
 #include <linux/rtc.h>
 
-extern irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs);
-
 #define DEFAULT_RTC_INT_FREQ   64
 #define RTC_NUM_INTS           1
 
@@ -846,6 +1130,7 @@ static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ;
 static unsigned long PIE_count;
 
 static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */
+static unsigned int hpet_t1_cmp; /* cached comparator register */
 
 int is_hpet_enabled(void)
 {
@@ -879,24 +1164,32 @@ int hpet_rtc_timer_init(void)
                hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
 
        local_irq_save(flags);
+
        cnt = hpet_readl(HPET_COUNTER);
        cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
        hpet_writel(cnt, HPET_T1_CMP);
-       local_irq_restore(flags);
+       hpet_t1_cmp = cnt;
 
        cfg = hpet_readl(HPET_T1_CFG);
-       cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT;
+       cfg &= ~HPET_TN_PERIODIC;
+       cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
        hpet_writel(cfg, HPET_T1_CFG);
 
+       local_irq_restore(flags);
+
        return 1;
 }
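
To put numbers on the comparator arithmetic (reusing the assumed 14.31818 MHz HPET from earlier, so hpet_tick is about 14318 at HZ=1000): the default 64 Hz emulation programs the comparator hpet_tick * HZ / hpet_rtc_int_freq = 14318 * 1000 / 64, about 223718 HPET counts ahead, i.e. one interrupt every 15.625 ms.
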
 
 static void hpet_rtc_timer_reinit(void)
 {
-       unsigned int cfg, cnt;
+       unsigned int cfg, cnt, ticks_per_int, lost_ints;
 
-       if (!(PIE_on | AIE_on | UIE_on))
+       if (unlikely(!(PIE_on | AIE_on | UIE_on))) {
+               cfg = hpet_readl(HPET_T1_CFG);
+               cfg &= ~HPET_TN_ENABLE;
+               hpet_writel(cfg, HPET_T1_CFG);
                return;
+       }
 
        if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
                hpet_rtc_int_freq = PIE_freq;
@@ -904,15 +1197,33 @@ static void hpet_rtc_timer_reinit(void)
                hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
 
        /* It is more accurate to use the comparator value than current count.*/
-       cnt = hpet_readl(HPET_T1_CMP);
-       cnt += hpet_tick*HZ/hpet_rtc_int_freq;
-       hpet_writel(cnt, HPET_T1_CMP);
+       ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq;
+       hpet_t1_cmp += ticks_per_int;
+       hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
 
-       cfg = hpet_readl(HPET_T1_CFG);
-       cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT;
-       hpet_writel(cfg, HPET_T1_CFG);
+       /*
+        * If the interrupt handler was delayed too long, the write above tries
+        * to schedule the next interrupt in the past and the hardware would
+        * not interrupt until the counter had wrapped around.
+        * So we have to check that the comparator wasn't set to a past time.
+        */
+       cnt = hpet_readl(HPET_COUNTER);
+       if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) {
+               lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1;
+               /* Make sure that, even with the time needed to execute
+                * this code, the next scheduled interrupt has been moved
+                * back to the future: */
+               lost_ints++;
 
-       return;
+               hpet_t1_cmp += lost_ints * ticks_per_int;
+               hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
+
+               if (PIE_on)
+                       PIE_count += lost_ints;
+
+               printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
+                      hpet_rtc_int_freq);
+       }
 }
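
The (int)(cnt - hpet_t1_cmp) > 0 test above is the standard wraparound-safe ordering check for a free-running 32-bit counter; a plain unsigned comparison would give the wrong answer right after the counter wraps. A stand-alone sketch:

    #include <stdio.h>

    int main(void)
    {
            unsigned int cmp = 0xfffffff0u; /* comparator set near the wrap */
            unsigned int cnt = 0x00000010u; /* counter has just wrapped */

            /* plain compare says the deadline is far away; the signed
             * difference correctly says it is 0x20 ticks in the past */
            printf("cnt > cmp: %d\n", cnt > cmp);                       /* 0 */
            printf("(int)(cnt - cmp) > 0: %d\n", (int)(cnt - cmp) > 0); /* 1 */
            return 0;
    }
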
 
 /*
@@ -1032,7 +1343,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs)
        }
        if (call_rtc_interrupt) {
                rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
-               rtc_interrupt(rtc_int_flag, dev_id, regs);
+               rtc_interrupt(rtc_int_flag, dev_id);
        }
        return IRQ_HANDLED;
 }
@@ -1041,7 +1352,15 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 static int __init nohpet_setup(char *s) 
 { 
        nohpet = 1;
-       return 0;
+       return 1;
 } 
 
 __setup("nohpet", nohpet_setup);
+
+int __init notsc_setup(char *s)
+{
+       notsc = 1;
+       return 1;
+}
+
+__setup("notsc", notsc_setup);