Merge to Fedora kernel-2.6.18-1.2224_FC5 patched with stable patch-2.6.18.1-vs2.0...
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index b0df27f..ffd1cb8 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -8,7 +8,7 @@
  *  Copyright (c) 1995  Markus Kuhn
  *  Copyright (c) 1996  Ingo Molnar
  *  Copyright (c) 1998  Andrea Arcangeli
- *  Copyright (c) 2002  Vojtech Pavlik
+ *  Copyright (c) 2002,2006  Vojtech Pavlik
  *  Copyright (c) 2003  Andi Kleen
  *  RTC support code taken from arch/i386/kernel/timers/time_hpet.c
  */
@@ -18,7 +18,6 @@
 #include <linux/interrupt.h>
 #include <linux/init.h>
 #include <linux/mc146818rtc.h>
-#include <linux/irq.h>
 #include <linux/time.h>
 #include <linux/ioport.h>
 #include <linux/module.h>
 #include <linux/sysdev.h>
 #include <linux/bcd.h>
 #include <linux/kallsyms.h>
+#include <linux/acpi.h>
+#ifdef CONFIG_ACPI
+#include <acpi/achware.h>      /* for PM timer frequency */
+#include <acpi/acpi_bus.h>
+#endif
 #include <asm/8253pit.h>
 #include <asm/pgtable.h>
 #include <asm/vsyscall.h>
 #include <asm/apic.h>
 #endif
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 #ifdef CONFIG_CPU_FREQ
 static void cpufreq_delayed_get(void);
 #endif
-
+extern void i8254_timer_resume(void);
 extern int using_apic_timer;
 
+static char *time_init_gtod(void);
+
 DEFINE_SPINLOCK(rtc_lock);
+EXPORT_SYMBOL(rtc_lock);
 DEFINE_SPINLOCK(i8253_lock);
 
-static int nohpet __initdata = 0;
+int nohpet __initdata = 0;
 static int notsc __initdata = 0;
 
-#undef HPET_HACK_ENABLE_DANGEROUS
+#define USEC_PER_TICK (USEC_PER_SEC / HZ)
+#define NSEC_PER_TICK (NSEC_PER_SEC / HZ)
+#define FSEC_PER_TICK (FSEC_PER_SEC / HZ)
+
+#define NS_SCALE       10 /* 2^10, carefully chosen */
+#define US_SCALE       32 /* 2^32, arbitrarily chosen */
 
 unsigned int cpu_khz;                                  /* TSC clocks / usec, not used here */
-unsigned long hpet_period;                             /* fsecs / HPET clock */
+EXPORT_SYMBOL(cpu_khz);
+static unsigned long hpet_period;                      /* fsecs / HPET clock */
 unsigned long hpet_tick;                               /* HPET clocks / interrupt */
+int hpet_use_timer;                            /* Use the HPET counter for timekeeping, otherwise the PIT */
 unsigned long vxtime_hz = PIT_TICK_RATE;
 int report_lost_ticks;                         /* command line option */
 unsigned long long monotonic_base;
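
The NS_SCALE/US_SCALE shifts name the fixed-point formats used throughout:
vxtime.tsc_quot, for instance, stores microseconds per TSC cycle as a 32.32
fixed-point value. A minimal userspace sketch of that arithmetic (the 2.4 GHz
cpu_khz below is invented for illustration):

    #include <stdio.h>
    #include <stdint.h>

    #define US_SCALE 32  /* same 2^32 shift as in the patch */

    int main(void)
    {
        /* Hypothetical 2.4 GHz CPU: cpu_khz = 2400000 TSC cycles per ms. */
        uint64_t cpu_khz = 2400000;

        /* tsc_quot = microseconds per cycle in 32.32 fixed point:
           (1000 us/ms << 32) / (cycles/ms), as time_init() computes it. */
        uint64_t tsc_quot = (1000ULL << US_SCALE) / cpu_khz;

        /* Convert a cycle delta to us, as do_gettimeoffset_tsc() does. */
        uint64_t delta = 1200000;  /* 0.5 ms worth of cycles */
        printf("%llu cycles ~= %llu us\n",
               (unsigned long long)delta,
               (unsigned long long)((delta * tsc_quot) >> US_SCALE));
        return 0;
    }
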
@@ -71,14 +81,6 @@ unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
 struct timespec __xtime __section_xtime;
 struct timezone __sys_tz __section_sys_tz;
 
-static inline void rdtscll_sync(unsigned long *tsc)
-{
-#ifdef CONFIG_SMP
-       sync_core();
-#endif
-       rdtscll(*tsc);
-}
-
 /*
  * do_gettimeoffset() returns microseconds since last timer interrupt was
  * triggered by hardware. A memory read of HPET is slower than a register read
@@ -93,15 +95,18 @@ static inline unsigned int do_gettimeoffset_tsc(void)
 {
        unsigned long t;
        unsigned long x;
-       rdtscll_sync(&t);
-       if (t < vxtime.last_tsc) t = vxtime.last_tsc; /* hack */
-       x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> 32;
+       t = get_cycles_sync();
+       if (t < vxtime.last_tsc) 
+               t = vxtime.last_tsc; /* hack */
+       x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> US_SCALE;
        return x;
 }
 
 static inline unsigned int do_gettimeoffset_hpet(void)
 {
-       return ((hpet_readl(HPET_COUNTER) - vxtime.last) * vxtime.quot) >> 32;
+       /* cap counter read to one tick to avoid inconsistencies */
+       unsigned long counter = hpet_readl(HPET_COUNTER) - vxtime.last;
+       return (min(counter, hpet_tick) * vxtime.quot) >> US_SCALE;
 }
 
 unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc;
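
The min() cap in do_gettimeoffset_hpet() bounds the interpolated offset to
one tick, presumably so a counter sampled just after the next interrupt
became due cannot push gettimeofday ahead of the jiffies-based part of the
sum. A toy model with invented register values standing in for hpet_readl():

    #include <stdio.h>
    #include <stdint.h>

    static uint32_t min_u32(uint32_t a, uint32_t b) { return a < b ? a : b; }

    int main(void)
    {
        uint32_t hpet_tick = 57273;      /* HPET clocks per timer interrupt */
        uint32_t last = 1000000;         /* vxtime.last at the last interrupt */
        uint32_t now = 1080000;          /* read raced past the next tick */

        uint32_t counter = now - last;   /* 80000 > hpet_tick */
        printf("capped to %u of %u clocks\n",
               min_u32(counter, hpet_tick), counter);
        return 0;
    }
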
@@ -121,7 +126,7 @@ void do_gettimeofday(struct timeval *tv)
                seq = read_seqbegin(&xtime_lock);
 
                sec = xtime.tv_sec;
-               usec = xtime.tv_nsec / 1000;
+               usec = xtime.tv_nsec / NSEC_PER_USEC;
 
                /* i386 does some correction here to keep the clock 
                   monotonous even when ntpd is fixing drift.
@@ -132,14 +137,14 @@ void do_gettimeofday(struct timeval *tv)
                   in arch/x86_64/kernel/vsyscall.c and export all needed
                   variables in vmlinux.lds. -AK */ 
 
-               t = (jiffies - wall_jiffies) * (1000000L / HZ) +
+               t = (jiffies - wall_jiffies) * USEC_PER_TICK +
                        do_gettimeoffset();
                usec += t;
 
        } while (read_seqretry(&xtime_lock, seq));
 
-       tv->tv_sec = sec + usec / 1000000;
-       tv->tv_usec = usec % 1000000;
+       tv->tv_sec = sec + usec / USEC_PER_SEC;
+       tv->tv_usec = usec % USEC_PER_SEC;
 }
 
 EXPORT_SYMBOL(do_gettimeofday);
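
do_gettimeofday() depends on the seqlock read pattern: sample the shared
values, then retry if a writer bumped the sequence count meanwhile. A
self-contained sketch of that retry idiom, ignoring the memory barriers the
real read_seqbegin()/read_seqretry() imply:

    #include <stdio.h>

    static volatile unsigned seq_count;       /* even = no writer active */
    static volatile long shared_sec, shared_usec;

    static unsigned seq_begin(void)
    {
        unsigned s;
        do { s = seq_count; } while (s & 1);  /* odd = write in progress */
        return s;
    }

    static int seq_retry(unsigned start)
    {
        return seq_count != start;            /* changed: torn read, retry */
    }

    int main(void)
    {
        unsigned seq;
        long sec, usec;
        do {
            seq = seq_begin();
            sec = shared_sec;
            usec = shared_usec;
        } while (seq_retry(seq));
        printf("%ld.%06ld\n", sec, usec);
        return 0;
    }
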
@@ -160,8 +165,8 @@ int do_settimeofday(struct timespec *tv)
 
        write_seqlock_irq(&xtime_lock);
 
-       nsec -= do_gettimeoffset() * 1000 +
-               (jiffies - wall_jiffies) * (NSEC_PER_SEC/HZ);
+       nsec -= do_gettimeoffset() * NSEC_PER_USEC +
+               (jiffies - wall_jiffies) * NSEC_PER_TICK;
 
        wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
        wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
@@ -169,10 +174,7 @@ int do_settimeofday(struct timespec *tv)
        set_normalized_timespec(&xtime, sec, nsec);
        set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-       time_adjust = 0;                /* stop active adjtime() */
-       time_status |= STA_UNSYNC;
-       time_maxerror = NTP_PHASE_LIMIT;
-       time_esterror = NTP_PHASE_LIMIT;
+       ntp_clear();
 
        write_sequnlock_irq(&xtime_lock);
        clock_was_set();
@@ -185,13 +187,14 @@ unsigned long profile_pc(struct pt_regs *regs)
 {
        unsigned long pc = instruction_pointer(regs);
 
-       /* Assume the lock function has either no stack frame or only a single word.
-          This checks if the address on the stack looks like a kernel text address.
+       /* Assume the lock function has either no stack frame or only a single 
+          word.  This checks if the address on the stack looks like a kernel 
+          text address.
           There is a small window for false hits, but in that case the tick
           is just accounted to the spinlock function.
           Better would be to write these functions in assembler again
           and check exactly. */
-       if (in_lock_functions(pc)) {
+       if (!user_mode(regs) && in_lock_functions(pc)) {
                char *v = *(char **)regs->rsp;
                if ((v >= _stext && v <= _etext) ||
                        (v >= _sinittext && v <= _einittext) ||
@@ -249,19 +252,12 @@ static void set_rtc_mmss(unsigned long nowtime)
                real_minutes += 30;             /* correct for half hour time zone */
        real_minutes %= 60;
 
-#if 0
-       /* AMD 8111 is a really bad time keeper and hits this regularly. 
-          It probably was an attempt to avoid screwing up DST, but ignore
-          that for now. */        
        if (abs(real_minutes - cmos_minutes) >= 30) {
                printk(KERN_WARNING "time.c: can't update CMOS clock "
                       "from %d to %d\n", cmos_minutes, real_minutes);
-       } else
-#endif
-
-       {
-                       BIN_TO_BCD(real_seconds);
-                       BIN_TO_BCD(real_minutes);
+       } else {
+               BIN_TO_BCD(real_seconds);
+               BIN_TO_BCD(real_minutes);
                CMOS_WRITE(real_seconds, RTC_SECONDS);
                CMOS_WRITE(real_minutes, RTC_MINUTES);
        }
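
For reference, the BCD conversions pack one decimal digit per nibble; this is
what the BIN_TO_BCD/BCD_TO_BIN macros expand to for the 0-99 values used here:

    #include <stdio.h>

    static unsigned bin_to_bcd(unsigned bin) { return ((bin / 10) << 4) | (bin % 10); }
    static unsigned bcd_to_bin(unsigned bcd) { return (bcd >> 4) * 10 + (bcd & 0x0f); }

    int main(void)
    {
        unsigned real_minutes = 59;
        unsigned bcd = bin_to_bcd(real_minutes);
        printf("59 -> 0x%02x -> %u\n", bcd, bcd_to_bin(bcd));  /* 0x59 */
        return 0;
    }
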
@@ -297,72 +293,67 @@ unsigned long long monotonic_clock(void)
 
                        last_offset = vxtime.last;
                        base = monotonic_base;
-                       this_offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
-
+                       this_offset = hpet_readl(HPET_COUNTER);
                } while (read_seqretry(&xtime_lock, seq));
                offset = (this_offset - last_offset);
-               offset *=(NSEC_PER_SEC/HZ)/hpet_tick;
-               return base + offset;
-       }else{
+               offset *= NSEC_PER_TICK / hpet_tick;
+       } else {
                do {
                        seq = read_seqbegin(&xtime_lock);
 
                        last_offset = vxtime.last_tsc;
                        base = monotonic_base;
                } while (read_seqretry(&xtime_lock, seq));
-               sync_core();
-               rdtscll(this_offset);
-               offset = (this_offset - last_offset)*1000/cpu_khz; 
-               return base + offset;
+               this_offset = get_cycles_sync();
+               /* FIXME: 1000 or 1000000? */
+               offset = (this_offset - last_offset)*1000 / cpu_khz;
        }
-
-
+       return base + offset;
 }
 EXPORT_SYMBOL(monotonic_clock);
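
On the FIXME above: cpu_khz is TSC cycles per millisecond, so
delta * 1000 / cpu_khz yields microseconds while delta * 1000000 / cpu_khz
yields nanoseconds. monotonic_base is accumulated in nanoseconds elsewhere
(the HPET branch here, and the timer handler below uses * 1000000), so the
* 1000 looks like a us/ns mixup, which is presumably what the FIXME is
asking. A check with made-up numbers:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t cpu_khz = 2000000;     /* hypothetical 2 GHz CPU */
        uint64_t delta = 2000000;       /* exactly 1 ms worth of cycles */

        printf("* 1000:    %llu us\n",
               (unsigned long long)(delta * 1000 / cpu_khz));     /* 1000 */
        printf("* 1000000: %llu ns\n",
               (unsigned long long)(delta * 1000000 / cpu_khz));  /* 1000000 */
        return 0;
    }
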
 
 static noinline void handle_lost_ticks(int lost, struct pt_regs *regs)
 {
-    static long lost_count;
-    static int warned;
-
-    if (report_lost_ticks) {
-           printk(KERN_WARNING "time.c: Lost %d timer "
-                  "tick(s)! ", lost);
-           print_symbol("rip %s)\n", regs->rip);
-    }
-
-    if (lost_count == 1000 && !warned) {
-           printk(KERN_WARNING
-                  "warning: many lost ticks.\n"
-                  KERN_WARNING "Your time source seems to be instable or "
+       static long lost_count;
+       static int warned;
+       if (report_lost_ticks) {
+               printk(KERN_WARNING "time.c: Lost %d timer tick(s)! ", lost);
+               print_symbol("rip %s)\n", regs->rip);
+       }
+
+       if (lost_count == 1000 && !warned) {
+               printk(KERN_WARNING "warning: many lost ticks.\n"
+                      KERN_WARNING "Your time source seems to be instable or "
                                "some driver is hogging interupts\n");
-           print_symbol("rip %s\n", regs->rip);
-           if (vxtime.mode == VXTIME_TSC && vxtime.hpet_address) {
-                   printk(KERN_WARNING "Falling back to HPET\n");
-                   vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick;
-                   vxtime.mode = VXTIME_HPET;
-                   do_gettimeoffset = do_gettimeoffset_hpet;
-           }
-           /* else should fall back to PIT, but code missing. */
-           warned = 1;
-    } else
-           lost_count++;
+               print_symbol("rip %s\n", regs->rip);
+               if (vxtime.mode == VXTIME_TSC && vxtime.hpet_address) {
+                       printk(KERN_WARNING "Falling back to HPET\n");
+                       if (hpet_use_timer)
+                               vxtime.last = hpet_readl(HPET_T0_CMP) - 
+                                                       hpet_tick;
+                       else
+                               vxtime.last = hpet_readl(HPET_COUNTER);
+                       vxtime.mode = VXTIME_HPET;
+                       do_gettimeoffset = do_gettimeoffset_hpet;
+               }
+               /* else should fall back to PIT, but code missing. */
+               warned = 1;
+       } else
+               lost_count++;
 
 #ifdef CONFIG_CPU_FREQ
-    /* In some cases the CPU can change frequency without us noticing
-       (like going into thermal throttle)
-       Give cpufreq a change to catch up. */
-    if ((lost_count+1) % 25 == 0) {
-           cpufreq_delayed_get();
-    }
+       /* In some cases the CPU can change frequency without us noticing.
+          Give cpufreq a chance to catch up. */
+       if ((lost_count+1) % 25 == 0)
+               cpufreq_delayed_get();
 #endif
 }
 
-static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+void main_timer_handler(struct pt_regs *regs)
 {
        static unsigned long rtc_update = 0;
        unsigned long tsc;
-       int delay, offset = 0, lost = 0;
+       int delay = 0, offset = 0, lost = 0;
 
 /*
  * Here we are in the timer irq handler. We have irqs locally disabled (so we
@@ -373,10 +364,17 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 
        write_seqlock(&xtime_lock);
 
-       if (vxtime.hpet_address) {
+       if (vxtime.hpet_address)
+               offset = hpet_readl(HPET_COUNTER);
+
+       if (hpet_use_timer) {
+               /* if we're using the hpet timer functionality,
+                * we can more accurately know the counter value
+                * when the timer interrupt occurred.
+                */
                offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
                delay = hpet_readl(HPET_COUNTER) - offset;
-       } else {
+       } else if (!pmtmr_ioport) {
                spin_lock(&i8253_lock);
                outb_p(0x00, 0x43);
                delay = inb_p(0x40);
@@ -385,7 +383,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
                delay = LATCH - 1 - delay;
        }
 
-       rdtscll_sync(&tsc);
+       tsc = get_cycles_sync();
 
        if (vxtime.mode == VXTIME_HPET) {
                if (offset - vxtime.last > hpet_tick) {
@@ -393,29 +391,34 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
                }
 
                monotonic_base += 
-                       (offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick;
+                       (offset - vxtime.last) * NSEC_PER_TICK / hpet_tick;
 
                vxtime.last = offset;
+#ifdef CONFIG_X86_PM_TIMER
+       } else if (vxtime.mode == VXTIME_PMTMR) {
+               lost = pmtimer_mark_offset();
+#endif
        } else {
                offset = (((tsc - vxtime.last_tsc) *
-                          vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ);
+                          vxtime.tsc_quot) >> US_SCALE) - USEC_PER_TICK;
 
                if (offset < 0)
                        offset = 0;
 
-               if (offset > (USEC_PER_SEC / HZ)) {
-                       lost = offset / (USEC_PER_SEC / HZ);
-                       offset %= (USEC_PER_SEC / HZ);
+               if (offset > USEC_PER_TICK) {
+                       lost = offset / USEC_PER_TICK;
+                       offset %= USEC_PER_TICK;
                }
 
-               monotonic_base += (tsc - vxtime.last_tsc)*1000000/cpu_khz ;
+               /* FIXME: 1000 or 1000000? */
+               monotonic_base += (tsc - vxtime.last_tsc) * 1000000 / cpu_khz;
 
                vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot;
 
                if ((((tsc - vxtime.last_tsc) *
-                     vxtime.tsc_quot) >> 32) < offset)
+                     vxtime.tsc_quot) >> US_SCALE) < offset)
                        vxtime.last_tsc = tsc -
-                               (((long) offset << 32) / vxtime.tsc_quot) - 1;
+                               (((long) offset << US_SCALE) / vxtime.tsc_quot) - 1;
        }
 
        if (lost > 0) {
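
A worked example of the lost-tick arithmetic in this hunk, assuming HZ = 1000
so that USEC_PER_TICK is 1000 (the real value depends on the kernel config):

    #include <stdio.h>

    int main(void)
    {
        const int USEC_PER_TICK = 1000;   /* assumes HZ = 1000 */
        int offset = 3500;  /* us since last tick, minus one tick */
        int lost = 0;

        if (offset > USEC_PER_TICK) {
            lost = offset / USEC_PER_TICK;    /* 3 whole ticks missed */
            offset %= USEC_PER_TICK;          /* 500 us carried forward */
        }
        printf("lost=%d, remaining offset=%d us\n", lost, offset);
        return 0;
    }
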
@@ -453,28 +456,37 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
  * off) isn't likely to go away much sooner anyway.
  */
 
-       if ((~time_status & STA_UNSYNC) && xtime.tv_sec > rtc_update &&
+       if (ntp_synced() && xtime.tv_sec > rtc_update &&
                abs(xtime.tv_nsec - 500000000) <= tick_nsec / 2) {
                set_rtc_mmss(xtime.tv_sec);
                rtc_update = xtime.tv_sec + 660;
        }
  
        write_sequnlock(&xtime_lock);
+}
 
+static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+{
+       if (apic_runs_main_timer > 1)
+               return IRQ_HANDLED;
+       main_timer_handler(regs);
+#ifdef CONFIG_X86_LOCAL_APIC
+       if (using_apic_timer)
+               smp_send_timer_broadcast_ipi();
+#endif
        return IRQ_HANDLED;
 }
 
-static unsigned int cyc2ns_scale;
-#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
+static unsigned int cyc2ns_scale __read_mostly;
 
-static inline void set_cyc2ns_scale(unsigned long cpu_mhz)
+static inline void set_cyc2ns_scale(unsigned long cpu_khz)
 {
-       cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz;
+       cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / cpu_khz;
 }
 
 static inline unsigned long long cycles_2_ns(unsigned long long cyc)
 {
-       return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
+       return (cyc * cyc2ns_scale) >> NS_SCALE;
 }
 
 unsigned long long sched_clock(void)
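
set_cyc2ns_scale() now takes cpu_khz instead of cpu_mhz, so the scale factor
no longer throws away sub-MHz precision. Worked numbers for a hypothetical
2 GHz CPU (NSEC_PER_MSEC is 1000000):

    #include <stdio.h>
    #include <stdint.h>

    #define NS_SCALE 10  /* same 2^10 shift as in the patch */

    int main(void)
    {
        uint64_t cpu_khz = 2000000;
        uint32_t cyc2ns_scale = (1000000ULL << NS_SCALE) / cpu_khz;  /* 512 */

        /* 0.5 ns per cycle: 10000 cycles -> 5000 ns. */
        uint64_t cyc = 10000;
        printf("scale=%u, %llu cycles = %llu ns\n", cyc2ns_scale,
               (unsigned long long)cyc,
               (unsigned long long)((cyc * cyc2ns_scale) >> NS_SCALE));
        return 0;
    }
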
@@ -487,7 +499,7 @@ unsigned long long sched_clock(void)
            Disadvantage is a small drift between CPUs in some configurations,
           but that should be tolerable. */
        if (__vxtime.mode == VXTIME_HPET)
-               return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> 32;
+               return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> US_SCALE;
 #endif
 
        /* Could do CPU core sync here. Opteron can execute rdtsc speculatively,
@@ -499,63 +511,53 @@ unsigned long long sched_clock(void)
        return cycles_2_ns(a);
 }
 
-unsigned long get_cmos_time(void)
+static unsigned long get_cmos_time(void)
 {
-       unsigned int timeout, year, mon, day, hour, min, sec;
-       unsigned char last, this;
+       unsigned int year, mon, day, hour, min, sec;
        unsigned long flags;
-
-/*
- * The Linux interpretation of the CMOS clock register contents: When the
- * Update-In-Progress (UIP) flag goes from 1 to 0, the RTC registers show the
- * second which has precisely just started. Waiting for this can take up to 1
- * second, we timeout approximately after 2.4 seconds on a machine with
- * standard 8.3 MHz ISA bus.
- */
+       unsigned extyear = 0;
 
        spin_lock_irqsave(&rtc_lock, flags);
 
-       timeout = 1000000;
-       last = this = 0;
-
-       while (timeout && last && !this) {
-               last = this;
-               this = CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP;
-               timeout--;
-       }
-
-/*
- * Here we are safe to assume the registers won't change for a whole second, so
- * we just go ahead and read them.
-        */
-
+       do {
                sec = CMOS_READ(RTC_SECONDS);
                min = CMOS_READ(RTC_MINUTES);
                hour = CMOS_READ(RTC_HOURS);
                day = CMOS_READ(RTC_DAY_OF_MONTH);
                mon = CMOS_READ(RTC_MONTH);
                year = CMOS_READ(RTC_YEAR);
+#ifdef CONFIG_ACPI
+               if (acpi_fadt.revision >= FADT2_REVISION_ID &&
+                                       acpi_fadt.century)
+                       extyear = CMOS_READ(acpi_fadt.century);
+#endif
+       } while (sec != CMOS_READ(RTC_SECONDS));
 
        spin_unlock_irqrestore(&rtc_lock, flags);
 
-/*
- * We know that x86-64 always uses BCD format, no need to check the config
- * register.
- */
-
-           BCD_TO_BIN(sec);
-           BCD_TO_BIN(min);
-           BCD_TO_BIN(hour);
-           BCD_TO_BIN(day);
-           BCD_TO_BIN(mon);
-           BCD_TO_BIN(year);
-
-/*
- * This will work up to Dec 31, 2069.
- */
-
-       if ((year += 1900) < 1970)
-               year += 100;
+       /*
+        * We know that x86-64 always uses BCD format, no need to check the
+        * config register.
+        */
+
+       BCD_TO_BIN(sec);
+       BCD_TO_BIN(min);
+       BCD_TO_BIN(hour);
+       BCD_TO_BIN(day);
+       BCD_TO_BIN(mon);
+       BCD_TO_BIN(year);
+
+       if (extyear) {
+               BCD_TO_BIN(extyear);
+               year += extyear * 100;
+               printk(KERN_INFO "Extended CMOS year: %d\n", extyear * 100);
+       } else { 
+               /*
+                * x86-64 systems have only existed since 2002.
+                * This will work up to Dec 31, 2100
+                */
+               year += 2000;
+       }
 
        return mktime(year, mon, day, hour, min, sec);
 }
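
The rewritten get_cmos_time() drops the UIP-flag polling in favour of a
simpler consistency loop: read everything, then retry if the seconds register
changed underneath the read. A self-contained sketch with a fake RTC that
rolls over mid-read (cmos_read() is a stand-in for CMOS_READ):

    #include <stdio.h>

    static unsigned fake_regs[2] = { 59, 30 };  /* seconds, minutes */
    static int reads;

    static unsigned cmos_read(int reg)
    {
        if (++reads == 3) {         /* simulate a rollover mid-read */
            fake_regs[0] = 0;
            fake_regs[1] = 31;
        }
        return fake_regs[reg];
    }

    int main(void)
    {
        unsigned sec, min;
        do {                            /* the idiom used above */
            sec = cmos_read(0);         /* RTC_SECONDS */
            min = cmos_read(1);         /* RTC_MINUTES */
        } while (sec != cmos_read(0));  /* seconds moved: re-read */
        printf("consistent read: %02u:%02u\n", min, sec);
        return 0;
    }
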
@@ -598,7 +600,8 @@ static void cpufreq_delayed_get(void)
                cpufreq_delayed_issched = 1;
                if (!warned) {
                        warned = 1;
-                       printk(KERN_DEBUG "Losing some ticks... checking if CPU frequency changed.\n");
+                       printk(KERN_DEBUG 
+       "Losing some ticks... checking if CPU frequency changed.\n");
                }
                schedule_work(&cpufreq_delayed_get_work);
        }
@@ -615,16 +618,17 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
         struct cpufreq_freqs *freq = data;
        unsigned long *lpj, dummy;
 
+       if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC))
+               return 0;
+
        lpj = &dummy;
        if (!(freq->flags & CPUFREQ_CONST_LOOPS))
 #ifdef CONFIG_SMP
-       lpj = &cpu_data[freq->cpu].loops_per_jiffy;
+               lpj = &cpu_data[freq->cpu].loops_per_jiffy;
 #else
-       lpj = &boot_cpu_data.loops_per_jiffy;
+               lpj = &boot_cpu_data.loops_per_jiffy;
 #endif
 
-
-
        if (!ref_freq) {
                ref_freq = freq->old;
                loops_per_jiffy_ref = *lpj;
@@ -638,10 +642,10 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
 
                cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new);
                if (!(freq->flags & CPUFREQ_CONST_LOOPS))
-                       vxtime.tsc_quot = (1000L << 32) / cpu_khz;
+                       vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz;
        }
        
-       set_cyc2ns_scale(cpu_khz_ref / 1000);
+       set_cyc2ns_scale(cpu_khz_ref);
 
        return 0;
 }
@@ -685,8 +689,7 @@ static unsigned int __init hpet_calibrate_tsc(void)
        do {
                local_irq_disable();
                hpet_now = hpet_readl(HPET_COUNTER);
-               sync_core();
-               rdtscl(tsc_now);
+               tsc_now = get_cycles_sync();
                local_irq_restore(flags);
        } while ((tsc_now - tsc_start) < TICK_COUNT &&
                 (hpet_now - hpet_start) < TICK_COUNT);
@@ -716,11 +719,9 @@ static unsigned int __init pit_calibrate_tsc(void)
        outb(0xb0, 0x43);
        outb((PIT_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
        outb((PIT_TICK_RATE / (1000 / 50)) >> 8, 0x42);
-       rdtscll(start);
-       sync_core();
+       start = get_cycles_sync();
        while ((inb(0x61) & 0x20) == 0);
-       sync_core();
-       rdtscll(end);
+       end = get_cycles_sync();
 
        spin_unlock_irqrestore(&i8253_lock, flags);
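
pit_calibrate_tsc() programs PIT channel 2 as a one-shot of
PIT_TICK_RATE / (1000 / 50) counts, i.e. a 50 ms window, and brackets it with
TSC reads; the function's return statement (outside this hunk) divides the
cycle delta by 50 to get cycles per millisecond, which is cpu_khz.
Illustrative arithmetic, assuming a 2 GHz TSC:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        const uint32_t pit_tick_rate = 1193182;           /* PIT input, Hz */
        printf("PIT counts for 50 ms: %u\n",
               pit_tick_rate / (1000 / 50));              /* 59659 */

        uint64_t start = 0, end = 100000000;              /* 1e8 cycles */
        printf("cpu_khz = %llu\n",
               (unsigned long long)((end - start) / 50)); /* 2000000 */
        return 0;
    }
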
        
@@ -734,7 +735,7 @@ static __init int late_hpet_init(void)
        unsigned int            ntimer;
 
        if (!vxtime.hpet_address)
-          return -1;
+               return 0;
 
        memset(&hd, 0, sizeof (hd));
 
@@ -747,7 +748,7 @@ static __init int late_hpet_init(void)
         * Timer0 and Timer1 is used by platform.
         */
        hd.hd_phys_address = vxtime.hpet_address;
-       hd.hd_address = (void *)fix_to_virt(FIX_HPET_BASE);
+       hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE);
        hd.hd_nirqs = ntimer;
        hd.hd_flags = HPET_DATA_PLATFORM;
        hpet_reserve_timer(&hd, 0);
@@ -762,9 +763,8 @@ static __init int late_hpet_init(void)
                int                     i;
 
                hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE);
-
-               for (i = 2, timer = &hpet->hpet_timers[2]; i < ntimer;
-                    timer++, i++)
+               timer = &hpet->hpet_timers[2];
+               for (i = 2; i < ntimer; timer++, i++)
                        hd.hd_irq[i] = (timer->hpet_config &
                                        Tn_INT_ROUTE_CNF_MASK) >>
                                Tn_INT_ROUTE_CNF_SHIFT;
@@ -795,17 +795,18 @@ static int hpet_timer_stop_set_go(unsigned long tick)
  * Set up timer 0, as periodic with first interrupt to happen at hpet_tick,
  * and period also hpet_tick.
  */
-
-       hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
+       if (hpet_use_timer) {
+               hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
                    HPET_TN_32BIT, HPET_T0_CFG);
-       hpet_writel(hpet_tick, HPET_T0_CMP);
-       hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice? */
-
+               hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */
+               hpet_writel(hpet_tick, HPET_T0_CMP); /* period */
+               cfg |= HPET_CFG_LEGACY;
+       }
 /*
  * Go!
  */
 
-       cfg |= HPET_CFG_ENABLE | HPET_CFG_LEGACY;
+       cfg |= HPET_CFG_ENABLE;
        hpet_writel(cfg, HPET_CFG);
 
        return 0;
@@ -826,16 +827,16 @@ static int hpet_init(void)
 
        id = hpet_readl(HPET_ID);
 
-       if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER) ||
-           !(id & HPET_ID_LEGSUP))
+       if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER))
                return -1;
 
        hpet_period = hpet_readl(HPET_PERIOD);
        if (hpet_period < 100000 || hpet_period > 100000000)
                return -1;
 
-       hpet_tick = (1000000000L * (USEC_PER_SEC / HZ) + hpet_period / 2) /
-               hpet_period;
+       hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period;
+
+       hpet_use_timer = (id & HPET_ID_LEGSUP);
 
        return hpet_timer_stop_set_go(hpet_tick);
 }
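
hpet_tick turns the HPET's self-reported period (femtoseconds per clock, read
from HPET_PERIOD) into clocks per timer interrupt, with rounding. Worked
numbers for the common 14.31818 MHz HPET, assuming HZ = 250:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        const uint64_t fsec_per_sec = 1000000000000000ULL;
        const uint64_t hz = 250;            /* assumed kernel HZ */
        uint64_t hpet_period = 69841279;    /* fs per clock at 14.31818 MHz */

        uint64_t fsec_per_tick = fsec_per_sec / hz;
        /* rounded division, as hpet_init() now computes it */
        uint64_t hpet_tick = (fsec_per_tick + hpet_period / 2) / hpet_period;
        printf("hpet_tick = %llu clocks per interrupt\n",   /* 57273 */
               (unsigned long long)hpet_tick);
        return 0;
    }
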
@@ -845,17 +846,43 @@ static int hpet_reenable(void)
        return hpet_timer_stop_set_go(hpet_tick);
 }
 
-void __init pit_init(void)
+#define PIT_MODE 0x43
+#define PIT_CH0  0x40
+
+static void __init __pit_init(int val, u8 mode)
 {
        unsigned long flags;
 
        spin_lock_irqsave(&i8253_lock, flags);
-       outb_p(0x34, 0x43);             /* binary, mode 2, LSB/MSB, ch 0 */
-       outb_p(LATCH & 0xff, 0x40);     /* LSB */
-       outb_p(LATCH >> 8, 0x40);       /* MSB */
+       outb_p(mode, PIT_MODE);
+       outb_p(val & 0xff, PIT_CH0);    /* LSB */
+       outb_p(val >> 8, PIT_CH0);      /* MSB */
        spin_unlock_irqrestore(&i8253_lock, flags);
 }
 
+void __init pit_init(void)
+{
+       __pit_init(LATCH, 0x34); /* binary, mode 2, LSB/MSB, ch 0 */
+}
+
+void __init pit_stop_interrupt(void)
+{
+       __pit_init(0, 0x30); /* mode 0 */
+}
+
+void __init stop_timer_interrupt(void)
+{
+       char *name;
+       if (vxtime.hpet_address) {
+               name = "HPET";
+               hpet_timer_stop_set_go(0);
+       } else {
+               name = "PIT";
+               pit_stop_interrupt();
+       }
+       printk(KERN_INFO "timer: %s interrupt stopped.\n", name);
+}
+
 int __init time_setup(char *str)
 {
        report_lost_ticks = 1;
@@ -863,27 +890,14 @@ int __init time_setup(char *str)
 }
 
 static struct irqaction irq0 = {
-       timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL
+       timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL
 };
 
-extern void __init config_acpi_tables(void);
-
 void __init time_init(void)
 {
        char *timename;
+       char *gtod;
 
-#ifdef HPET_HACK_ENABLE_DANGEROUS
-        if (!vxtime.hpet_address) {
-               printk(KERN_WARNING "time.c: WARNING: Enabling HPET base "
-                      "manually!\n");
-                outl(0x800038a0, 0xcf8);
-                outl(0xff000001, 0xcfc);
-                outl(0x800038a0, 0xcf8);
-                vxtime.hpet_address = inl(0xcfc) & 0xfffffffe;
-               printk(KERN_WARNING "time.c: WARNING: Enabled HPET "
-                      "at %#lx.\n", vxtime.hpet_address);
-        }
-#endif
        if (nohpet)
                vxtime.hpet_address = 0;
 
@@ -893,64 +907,101 @@ void __init time_init(void)
        set_normalized_timespec(&wall_to_monotonic,
                                -xtime.tv_sec, -xtime.tv_nsec);
 
-       if (!hpet_init()) {
-                vxtime_hz = (1000000000000000L + hpet_period / 2) /
-                       hpet_period;
+       if (!hpet_init())
+                vxtime_hz = (FSEC_PER_SEC + hpet_period / 2) / hpet_period;
+       else
+               vxtime.hpet_address = 0;
+
+       if (hpet_use_timer) {
+               /* set tick_nsec to use the proper rate for HPET */
+               tick_nsec = TICK_NSEC_HPET;
                cpu_khz = hpet_calibrate_tsc();
                timename = "HPET";
+#ifdef CONFIG_X86_PM_TIMER
+       } else if (pmtmr_ioport && !vxtime.hpet_address) {
+               vxtime_hz = PM_TIMER_FREQUENCY;
+               timename = "PM";
+               pit_init();
+               cpu_khz = pit_calibrate_tsc();
+#endif
        } else {
                pit_init();
                cpu_khz = pit_calibrate_tsc();
                timename = "PIT";
        }
 
-       printk(KERN_INFO "time.c: Using %ld.%06ld MHz %s timer.\n",
-              vxtime_hz / 1000000, vxtime_hz % 1000000, timename);
+       vxtime.mode = VXTIME_TSC;
+       gtod = time_init_gtod();
+
+       printk(KERN_INFO "time.c: Using %ld.%06ld MHz WALL %s GTOD %s timer.\n",
+              vxtime_hz / 1000000, vxtime_hz % 1000000, timename, gtod);
        printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
                cpu_khz / 1000, cpu_khz % 1000);
-       vxtime.mode = VXTIME_TSC;
-       vxtime.quot = (1000000L << 32) / vxtime_hz;
-       vxtime.tsc_quot = (1000L << 32) / cpu_khz;
-       vxtime.hz = vxtime_hz;
-       rdtscll_sync(&vxtime.last_tsc);
+       vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz;
+       vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz;
+       vxtime.last_tsc = get_cycles_sync();
        setup_irq(0, &irq0);
 
-       set_cyc2ns_scale(cpu_khz / 1000);
+       set_cyc2ns_scale(cpu_khz);
+}
+
+/*
+ * Make an educated guess if the TSC is trustworthy and synchronized
+ * over all CPUs.
+ */
+__cpuinit int unsynchronized_tsc(void)
+{
+#ifdef CONFIG_SMP
+       if (apic_is_clustered_box())
+               return 1;
+#endif
+       /* Most intel systems have synchronized TSCs except for
+          multi node systems */
+       if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+#ifdef CONFIG_ACPI
+               /* But TSC doesn't tick in C3 so don't use it there */
+               if (acpi_fadt.length > 0 && acpi_fadt.plvl3_lat < 1000)
+                       return 1;
+#endif
+               return 0;
+       }
+
+       /* Assume multi socket systems are not synchronized */
+       return num_present_cpus() > 1;
 }
 
-void __init time_init_smp(void)
+/*
+ * Decide what mode gettimeofday should use.
+ */
+__init static char *time_init_gtod(void)
 {
        char *timetype;
 
-       /*
-        * AMD systems with more than one CPU don't have fully synchronized
-        * TSCs. Always use HPET gettimeofday for these, although it is slower.
-        * Intel SMP systems usually have synchronized TSCs, so use always
-        * the TSC.
-        *
-        * Exceptions:
-        * IBM Summit2 checked by oem_force_hpet_timer().
-        * AMD dual core may also not need HPET. Check me.
-        *
-        * Can be turned off with "notsc".
-        */
-       if (num_online_cpus() > 1 &&
-           boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
-               notsc = 1;
-       /* Some systems will want to disable TSC and use HPET. */
-       if (oem_force_hpet_timer())
+       if (unsynchronized_tsc())
                notsc = 1;
        if (vxtime.hpet_address && notsc) {
-               timetype = "HPET";
-               vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick;
+               timetype = hpet_use_timer ? "HPET" : "PIT/HPET";
+               if (hpet_use_timer)
+                       vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick;
+               else
+                       vxtime.last = hpet_readl(HPET_COUNTER);
                vxtime.mode = VXTIME_HPET;
                do_gettimeoffset = do_gettimeoffset_hpet;
+#ifdef CONFIG_X86_PM_TIMER
+       /* Using PM for gettimeofday is quite slow, but we have no other
+          choice because the TSC is too unreliable on some systems. */
+       } else if (pmtmr_ioport && !vxtime.hpet_address && notsc) {
+               timetype = "PM";
+               do_gettimeoffset = do_gettimeoffset_pm;
+               vxtime.mode = VXTIME_PMTMR;
+               sysctl_vsyscall = 0;
+               printk(KERN_INFO "Disabling vsyscall due to use of PM timer\n");
+#endif
        } else {
-               timetype = vxtime.hpet_address ? "HPET/TSC" : "PIT/TSC";
+               timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC";
                vxtime.mode = VXTIME_TSC;
        }
-
-       printk(KERN_INFO "time.c: Using %s based timekeeping.\n", timetype);
+       return timetype;
 }
 
 __setup("report_lost_ticks", time_setup);
@@ -958,7 +1009,11 @@ __setup("report_lost_ticks", time_setup);
 static long clock_cmos_diff;
 static unsigned long sleep_start;
 
-static int timer_suspend(struct sys_device *dev, u32 state)
+/*
+ * sysfs support for the timer.
+ */
+
+static int timer_suspend(struct sys_device *dev, pm_message_t state)
 {
        /*
         * Estimate time zone so that set_time can update the clock
@@ -980,14 +1035,29 @@ static int timer_resume(struct sys_device *dev)
 
        if (vxtime.hpet_address)
                hpet_reenable();
+       else
+               i8254_timer_resume();
 
        sec = ctime + clock_cmos_diff;
        write_seqlock_irqsave(&xtime_lock,flags);
        xtime.tv_sec = sec;
        xtime.tv_nsec = 0;
+       if (vxtime.mode == VXTIME_HPET) {
+               if (hpet_use_timer)
+                       vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick;
+               else
+                       vxtime.last = hpet_readl(HPET_COUNTER);
+#ifdef CONFIG_X86_PM_TIMER
+       } else if (vxtime.mode == VXTIME_PMTMR) {
+               pmtimer_resume();
+#endif
+       } else
+               vxtime.last_tsc = get_cycles_sync();
        write_sequnlock_irqrestore(&xtime_lock,flags);
        jiffies += sleep_length;
        wall_jiffies += sleep_length;
+       monotonic_base += sleep_length * (NSEC_PER_SEC/HZ);
+       touch_softlockup_watchdog();
        return 0;
 }
 
@@ -997,7 +1067,6 @@ static struct sysdev_class timer_sysclass = {
        set_kset_name("timer"),
 };
 
-
 /* XXX this driverfs stuff should probably go elsewhere later -john */
 static struct sys_device device_timer = {
        .id     = 0,
@@ -1031,8 +1100,6 @@ device_initcall(time_init_device);
  */
 #include <linux/rtc.h>
 
-extern irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs);
-
 #define DEFAULT_RTC_INT_FREQ   64
 #define RTC_NUM_INTS           1
 
@@ -1047,6 +1114,7 @@ static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ;
 static unsigned long PIE_count;
 
 static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */
+static unsigned int hpet_t1_cmp; /* cached comparator register */
 
 int is_hpet_enabled(void)
 {
@@ -1083,10 +1151,12 @@ int hpet_rtc_timer_init(void)
        cnt = hpet_readl(HPET_COUNTER);
        cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
        hpet_writel(cnt, HPET_T1_CMP);
+       hpet_t1_cmp = cnt;
        local_irq_restore(flags);
 
        cfg = hpet_readl(HPET_T1_CFG);
-       cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT;
+       cfg &= ~HPET_TN_PERIODIC;
+       cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
        hpet_writel(cfg, HPET_T1_CFG);
 
        return 1;
@@ -1096,8 +1166,12 @@ static void hpet_rtc_timer_reinit(void)
 {
        unsigned int cfg, cnt;
 
-       if (!(PIE_on | AIE_on | UIE_on))
+       if (unlikely(!(PIE_on | AIE_on | UIE_on))) {
+               cfg = hpet_readl(HPET_T1_CFG);
+               cfg &= ~HPET_TN_ENABLE;
+               hpet_writel(cfg, HPET_T1_CFG);
                return;
+       }
 
        if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
                hpet_rtc_int_freq = PIE_freq;
@@ -1105,15 +1179,10 @@ static void hpet_rtc_timer_reinit(void)
                hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
 
        /* It is more accurate to use the comparator value than current count. */
-       cnt = hpet_readl(HPET_T1_CMP);
+       cnt = hpet_t1_cmp;
        cnt += hpet_tick*HZ/hpet_rtc_int_freq;
        hpet_writel(cnt, HPET_T1_CMP);
-
-       cfg = hpet_readl(HPET_T1_CFG);
-       cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT;
-       hpet_writel(cfg, HPET_T1_CFG);
-
-       return;
+       hpet_t1_cmp = cnt;
 }
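
The RTC emulation advances timer 1 by hpet_tick * HZ / hpet_rtc_int_freq HPET
clocks per interrupt; caching the comparator in hpet_t1_cmp avoids re-reading
HPET_T1_CMP every time. Worked numbers for a 14.31818 MHz HPET at the 64 Hz
default rate (hpet_tick * HZ is just the HPET clock frequency):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t hpet_clocks_per_sec = 14318180;  /* assumed HPET frequency */
        uint64_t hpet_rtc_int_freq = 64;          /* DEFAULT_RTC_INT_FREQ */

        uint64_t step = hpet_clocks_per_sec / hpet_rtc_int_freq;
        uint32_t hpet_t1_cmp = 1000000;           /* cached comparator */
        hpet_t1_cmp += (uint32_t)step;
        printf("step = %llu clocks, next cmp = %u\n",  /* step = 223721 */
               (unsigned long long)step, hpet_t1_cmp);
        return 0;
    }
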
 
 /*
@@ -1239,23 +1308,18 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 }
 #endif
 
-
-
 static int __init nohpet_setup(char *s) 
 { 
        nohpet = 1;
-       return 0;
+       return 1;
 } 
 
 __setup("nohpet", nohpet_setup);
 
-
-static int __init notsc_setup(char *s)
+int __init notsc_setup(char *s)
 {
        notsc = 1;
-       return 0;
+       return 1;
 }
 
 __setup("notsc", notsc_setup);
-
-