X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=arch%2Fx86_64%2Fkernel%2Ftime.c;h=5cc76d0d331f0d0fe2b1f7913d7705887112b073;hb=97bf2856c6014879bd04983a3e9dfcdac1e7fe85;hp=10c511f50d1dfdeacd3f7ed8eeae44e0b3c60c02;hpb=9bf4aaab3e101692164d49b7ca357651eb691cb6;p=linux-2.6.git diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 10c511f50..5cc76d0d3 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -8,10 +8,9 @@ * Copyright (c) 1995 Markus Kuhn * Copyright (c) 1996 Ingo Molnar * Copyright (c) 1998 Andrea Arcangeli - * Copyright (c) 2002 Vojtech Pavlik + * Copyright (c) 2002,2006 Vojtech Pavlik * Copyright (c) 2003 Andi Kleen * RTC support code taken from arch/i386/kernel/timers/time_hpet.c - * */ #include @@ -19,42 +18,58 @@ #include #include #include -#include #include #include #include #include #include #include +#include +#include #include +#include +#ifdef CONFIG_ACPI +#include /* for PM timer frequency */ +#include +#endif #include #include #include #include #include #include +#include #include -#ifdef CONFIG_X86_LOCAL_APIC +#include #include -#endif - -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); +#ifdef CONFIG_CPU_FREQ +static void cpufreq_delayed_get(void); +#endif +extern void i8254_timer_resume(void); extern int using_apic_timer; -spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED; -spinlock_t i8253_lock = SPIN_LOCK_UNLOCKED; +static char *timename = NULL; + +DEFINE_SPINLOCK(rtc_lock); +EXPORT_SYMBOL(rtc_lock); +DEFINE_SPINLOCK(i8253_lock); -static int nohpet __initdata = 0; +int nohpet __initdata = 0; +static int notsc __initdata = 0; -#undef HPET_HACK_ENABLE_DANGEROUS +#define USEC_PER_TICK (USEC_PER_SEC / HZ) +#define NSEC_PER_TICK (NSEC_PER_SEC / HZ) +#define FSEC_PER_TICK (FSEC_PER_SEC / HZ) +#define NS_SCALE 10 /* 2^10, carefully chosen */ +#define US_SCALE 32 /* 2^32, arbitralrily chosen */ unsigned int cpu_khz; /* TSC clocks / usec, not used here */ -unsigned long hpet_period; /* fsecs / HPET clock */ +EXPORT_SYMBOL(cpu_khz); +static unsigned long hpet_period; /* fsecs / HPET clock */ unsigned long hpet_tick; /* HPET clocks / interrupt */ +int hpet_use_timer; /* Use counter of hpet for time keeping, otherwise PIT */ unsigned long vxtime_hz = PIT_TICK_RATE; int report_lost_ticks; /* command line option */ unsigned long long monotonic_base; @@ -62,18 +77,9 @@ unsigned long long monotonic_base; struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */ volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; -unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES; struct timespec __xtime __section_xtime; struct timezone __sys_tz __section_sys_tz; -static inline void rdtscll_sync(unsigned long *tsc) -{ -#ifdef CONFIG_SMP - sync_core(); -#endif - rdtscll(*tsc); -} - /* * do_gettimeoffset() returns microseconds since last timer interrupt was * triggered by hardware. A memory read of HPET is slower than a register read @@ -82,21 +88,24 @@ static inline void rdtscll_sync(unsigned long *tsc) * timer interrupt has happened already, but vxtime.trigger wasn't updated yet. * This is not a problem, because jiffies hasn't updated either. They are bound * together by xtime_lock. 
- */ + */ static inline unsigned int do_gettimeoffset_tsc(void) { unsigned long t; unsigned long x; - rdtscll_sync(&t); - if (t < vxtime.last_tsc) t = vxtime.last_tsc; /* hack */ - x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> 32; + t = get_cycles_sync(); + if (t < vxtime.last_tsc) + t = vxtime.last_tsc; /* hack */ + x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> US_SCALE; return x; } static inline unsigned int do_gettimeoffset_hpet(void) { - return ((hpet_readl(HPET_COUNTER) - vxtime.last) * vxtime.quot) >> 32; + /* cap counter read to one tick to avoid inconsistencies */ + unsigned long counter = hpet_readl(HPET_COUNTER) - vxtime.last; + return (min(counter,hpet_tick) * vxtime.quot) >> US_SCALE; } unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc; @@ -109,32 +118,29 @@ unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc; void do_gettimeofday(struct timeval *tv) { - unsigned long seq, t; + unsigned long seq; unsigned int sec, usec; do { seq = read_seqbegin(&xtime_lock); sec = xtime.tv_sec; - usec = xtime.tv_nsec / 1000; + usec = xtime.tv_nsec / NSEC_PER_USEC; /* i386 does some correction here to keep the clock - monotonus even when ntpd is fixing drift. + monotonous even when ntpd is fixing drift. But they didn't work for me, there is a non monotonic clock anyways with ntp. I dropped all corrections now until a real solution can be found. Note when you fix it here you need to do the same in arch/x86_64/kernel/vsyscall.c and export all needed variables in vmlinux.lds. -AK */ - - t = (jiffies - wall_jiffies) * (1000000L / HZ) + - do_gettimeoffset(); - usec += t; + usec += do_gettimeoffset(); } while (read_seqretry(&xtime_lock, seq)); - tv->tv_sec = sec + usec / 1000000; - tv->tv_usec = usec % 1000000; + tv->tv_sec = sec + usec / USEC_PER_SEC; + tv->tv_usec = usec % USEC_PER_SEC; } EXPORT_SYMBOL(do_gettimeofday); @@ -155,8 +161,7 @@ int do_settimeofday(struct timespec *tv) write_seqlock_irq(&xtime_lock); - nsec -= do_gettimeoffset() * 1000 + - (jiffies - wall_jiffies) * (NSEC_PER_SEC/HZ); + nsec -= do_gettimeoffset() * NSEC_PER_USEC; wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); @@ -164,10 +169,7 @@ int do_settimeofday(struct timespec *tv) set_normalized_timespec(&xtime, sec, nsec); set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; + ntp_clear(); write_sequnlock_irq(&xtime_lock); clock_was_set(); @@ -176,6 +178,24 @@ int do_settimeofday(struct timespec *tv) EXPORT_SYMBOL(do_settimeofday); +unsigned long profile_pc(struct pt_regs *regs) +{ + unsigned long pc = instruction_pointer(regs); + + /* Assume the lock function has either no stack frame or a copy + of eflags from PUSHF + Eflags always has bits 22 and up cleared unlike kernel addresses. */ + if (!user_mode(regs) && in_lock_functions(pc)) { + unsigned long *sp = (unsigned long *)regs->rsp; + if (sp[0] >> 22) + return sp[0]; + if (sp[1] >> 22) + return sp[1]; + } + return pc; +} +EXPORT_SYMBOL(profile_pc); + /* * In order to set the CMOS clock precisely, set_rtc_mmss has to be called 500 * ms after the second nowtime has started, because when nowtime is written @@ -214,7 +234,7 @@ static void set_rtc_mmss(unsigned long nowtime) * overflow. This avoids messing with unknown time zones but requires your RTC * not to be off by more than 15 minutes. 
Since we're calling it only when * our clock is externally synchronized using NTP, this shouldn't be a problem. - */ + */ real_seconds = nowtime % 60; real_minutes = nowtime / 60; @@ -222,19 +242,12 @@ static void set_rtc_mmss(unsigned long nowtime) real_minutes += 30; /* correct for half hour time zone */ real_minutes %= 60; -#if 0 - /* AMD 8111 is a really bad time keeper and hits this regularly. - It probably was an attempt to avoid screwing up DST, but ignore - that for now. */ if (abs(real_minutes - cmos_minutes) >= 30) { printk(KERN_WARNING "time.c: can't update CMOS clock " "from %d to %d\n", cmos_minutes, real_minutes); - } else -#endif - - { - BIN_TO_BCD(real_seconds); - BIN_TO_BCD(real_minutes); + } else { + BIN_TO_BCD(real_seconds); + BIN_TO_BCD(real_minutes); CMOS_WRITE(real_seconds, RTC_SECONDS); CMOS_WRITE(real_minutes, RTC_MINUTES); } @@ -258,6 +271,7 @@ static void set_rtc_mmss(unsigned long nowtime) * Note: This function is required to return accurate * time even in the absence of multiple timer ticks. */ +static inline unsigned long long cycles_2_ns(unsigned long long cyc); unsigned long long monotonic_clock(void) { unsigned long seq; @@ -270,49 +284,87 @@ unsigned long long monotonic_clock(void) last_offset = vxtime.last; base = monotonic_base; - this_offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - + this_offset = hpet_readl(HPET_COUNTER); } while (read_seqretry(&xtime_lock, seq)); offset = (this_offset - last_offset); - offset *=(NSEC_PER_SEC/HZ)/hpet_tick; - return base + offset; - }else{ + offset *= NSEC_PER_TICK / hpet_tick; + } else { do { seq = read_seqbegin(&xtime_lock); last_offset = vxtime.last_tsc; base = monotonic_base; } while (read_seqretry(&xtime_lock, seq)); - sync_core(); - rdtscll(this_offset); - offset = (this_offset - last_offset)*1000/cpu_khz; - return base + offset; + this_offset = get_cycles_sync(); + offset = cycles_2_ns(this_offset - last_offset); } - - + return base + offset; } EXPORT_SYMBOL(monotonic_clock); +static noinline void handle_lost_ticks(int lost) +{ + static long lost_count; + static int warned; + if (report_lost_ticks) { + printk(KERN_WARNING "time.c: Lost %d timer tick(s)! ", lost); + print_symbol("rip %s)\n", get_irq_regs()->rip); + } + + if (lost_count == 1000 && !warned) { + printk(KERN_WARNING "warning: many lost ticks.\n" + KERN_WARNING "Your time source seems to be instable or " + "some driver is hogging interupts\n"); + print_symbol("rip %s\n", get_irq_regs()->rip); + if (vxtime.mode == VXTIME_TSC && vxtime.hpet_address) { + printk(KERN_WARNING "Falling back to HPET\n"); + if (hpet_use_timer) + vxtime.last = hpet_readl(HPET_T0_CMP) - + hpet_tick; + else + vxtime.last = hpet_readl(HPET_COUNTER); + vxtime.mode = VXTIME_HPET; + do_gettimeoffset = do_gettimeoffset_hpet; + } + /* else should fall back to PIT, but code missing. */ + warned = 1; + } else + lost_count++; -static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) +#ifdef CONFIG_CPU_FREQ + /* In some cases the CPU can change frequency without us noticing + Give cpufreq a change to catch up. */ + if ((lost_count+1) % 25 == 0) + cpufreq_delayed_get(); +#endif +} + +void main_timer_handler(void) { static unsigned long rtc_update = 0; - unsigned long tsc, lost = 0; - int delay, offset = 0; + unsigned long tsc; + int delay = 0, offset = 0, lost = 0; /* * Here we are in the timer irq handler. We have irqs locally disabled (so we * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running * on the other CPU, so we need a lock. 
We also need to lock the vsyscall * variables, because both do_timer() and us change them -arca+vojtech - */ + */ write_seqlock(&xtime_lock); - if (vxtime.hpet_address) { + if (vxtime.hpet_address) + offset = hpet_readl(HPET_COUNTER); + + if (hpet_use_timer) { + /* if we're using the hpet timer functionality, + * we can more accurately know the counter value + * when the timer interrupt occured. + */ offset = hpet_readl(HPET_T0_CMP) - hpet_tick; delay = hpet_readl(HPET_COUNTER) - offset; - } else { + } else if (!pmtmr_ioport) { spin_lock(&i8253_lock); outb_p(0x00, 0x43); delay = inb_p(0x40); @@ -321,7 +373,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) delay = LATCH - 1 - delay; } - rdtscll_sync(&tsc); + tsc = get_cycles_sync(); if (vxtime.mode == VXTIME_HPET) { if (offset - vxtime.last > hpet_tick) { @@ -329,45 +381,48 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) } monotonic_base += - (offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick; + (offset - vxtime.last) * NSEC_PER_TICK / hpet_tick; vxtime.last = offset; +#ifdef CONFIG_X86_PM_TIMER + } else if (vxtime.mode == VXTIME_PMTMR) { + lost = pmtimer_mark_offset(); +#endif } else { offset = (((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ); + vxtime.tsc_quot) >> US_SCALE) - USEC_PER_TICK; if (offset < 0) offset = 0; - if (offset > (USEC_PER_SEC / HZ)) { - lost = offset / (USEC_PER_SEC / HZ); - offset %= (USEC_PER_SEC / HZ); + if (offset > USEC_PER_TICK) { + lost = offset / USEC_PER_TICK; + offset %= USEC_PER_TICK; } - monotonic_base += (tsc - vxtime.last_tsc)*1000000/cpu_khz ; + monotonic_base += cycles_2_ns(tsc - vxtime.last_tsc); vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot; if ((((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> 32) < offset) + vxtime.tsc_quot) >> US_SCALE) < offset) vxtime.last_tsc = tsc - - (((long) offset << 32) / vxtime.tsc_quot) - 1; + (((long) offset << US_SCALE) / vxtime.tsc_quot) - 1; } - if (lost) { - if (report_lost_ticks) { - printk(KERN_WARNING "time.c: Lost %ld timer " - "tick(s)! ", lost); - print_symbol("rip %s)\n", regs->rip); - } - jiffies += lost; - } + if (lost > 0) + handle_lost_ticks(lost); + else + lost = 0; /* * Do the timer stuff. */ - do_timer(regs); + do_timer(lost + 1); +#ifndef CONFIG_SMP + update_process_times(user_mode(get_irq_regs())); +#endif /* * In the SMP case we use the local APIC timer interrupt to do the profiling, @@ -375,12 +430,8 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) * have to call the local interrupt handler. */ -#ifndef CONFIG_X86_LOCAL_APIC - x86_do_profile(regs); -#else if (!using_apic_timer) - smp_local_timer_interrupt(regs); -#endif + smp_local_timer_interrupt(); /* * If we have an externally synchronized Linux clock, then update CMOS clock @@ -390,28 +441,35 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) * off) isn't likely to go away much sooner anyway. 
*/ - if ((~time_status & STA_UNSYNC) && xtime.tv_sec > rtc_update && + if (ntp_synced() && xtime.tv_sec > rtc_update && abs(xtime.tv_nsec - 500000000) <= tick_nsec / 2) { set_rtc_mmss(xtime.tv_sec); rtc_update = xtime.tv_sec + 660; } write_sequnlock(&xtime_lock); +} +static irqreturn_t timer_interrupt(int irq, void *dev_id) +{ + if (apic_runs_main_timer > 1) + return IRQ_HANDLED; + main_timer_handler(); + if (using_apic_timer) + smp_send_timer_broadcast_ipi(); return IRQ_HANDLED; } -static unsigned int cyc2ns_scale; -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ +static unsigned int cyc2ns_scale __read_mostly; -static inline void set_cyc2ns_scale(unsigned long cpu_mhz) +static inline void set_cyc2ns_scale(unsigned long cpu_khz) { - cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz; + cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / cpu_khz; } static inline unsigned long long cycles_2_ns(unsigned long long cyc) { - return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; + return (cyc * cyc2ns_scale) >> NS_SCALE; } unsigned long long sched_clock(void) @@ -424,7 +482,7 @@ unsigned long long sched_clock(void) Disadvantage is a small drift between CPUs in some configurations, but that should be tolerable. */ if (__vxtime.mode == VXTIME_HPET) - return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> 32; + return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> US_SCALE; #endif /* Could do CPU core sync here. Opteron can execute rdtsc speculatively, @@ -436,63 +494,53 @@ unsigned long long sched_clock(void) return cycles_2_ns(a); } -unsigned long get_cmos_time(void) +static unsigned long get_cmos_time(void) { - unsigned int timeout, year, mon, day, hour, min, sec; - unsigned char last, this; + unsigned int year, mon, day, hour, min, sec; unsigned long flags; - -/* - * The Linux interpretation of the CMOS clock register contents: When the - * Update-In-Progress (UIP) flag goes from 1 to 0, the RTC registers show the - * second which has precisely just started. Waiting for this can take up to 1 - * second, we timeout approximately after 2.4 seconds on a machine with - * standard 8.3 MHz ISA bus. - */ + unsigned extyear = 0; spin_lock_irqsave(&rtc_lock, flags); - timeout = 1000000; - last = this = 0; - - while (timeout && last && !this) { - last = this; - this = CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP; - timeout--; - } - -/* - * Here we are safe to assume the registers won't change for a whole second, so - * we just go ahead and read them. - */ - + do { sec = CMOS_READ(RTC_SECONDS); min = CMOS_READ(RTC_MINUTES); hour = CMOS_READ(RTC_HOURS); day = CMOS_READ(RTC_DAY_OF_MONTH); mon = CMOS_READ(RTC_MONTH); year = CMOS_READ(RTC_YEAR); +#ifdef CONFIG_ACPI + if (acpi_fadt.revision >= FADT2_REVISION_ID && + acpi_fadt.century) + extyear = CMOS_READ(acpi_fadt.century); +#endif + } while (sec != CMOS_READ(RTC_SECONDS)); spin_unlock_irqrestore(&rtc_lock, flags); -/* - * We know that x86-64 always uses BCD format, no need to check the config - * register. - */ - - BCD_TO_BIN(sec); - BCD_TO_BIN(min); - BCD_TO_BIN(hour); - BCD_TO_BIN(day); - BCD_TO_BIN(mon); - BCD_TO_BIN(year); - -/* - * This will work up to Dec 31, 2069. - */ - - if ((year += 1900) < 1970) - year += 100; + /* + * We know that x86-64 always uses BCD format, no need to check the + * config register. 
+ */ + + BCD_TO_BIN(sec); + BCD_TO_BIN(min); + BCD_TO_BIN(hour); + BCD_TO_BIN(day); + BCD_TO_BIN(mon); + BCD_TO_BIN(year); + + if (extyear) { + BCD_TO_BIN(extyear); + year += extyear; + printk(KERN_INFO "Extended CMOS year: %d\n", extyear); + } else { + /* + * x86-64 systems only exists since 2002. + * This will work up to Dec 31, 2100 + */ + year += 2000; + } return mktime(year, mon, day, hour, min, sec); } @@ -509,6 +557,39 @@ unsigned long get_cmos_time(void) Should fix up last_tsc too. Currently gettimeofday in the first tick after the change will be slightly wrong. */ +#include + +static unsigned int cpufreq_delayed_issched = 0; +static unsigned int cpufreq_init = 0; +static struct work_struct cpufreq_delayed_get_work; + +static void handle_cpufreq_delayed_get(struct work_struct *v) +{ + unsigned int cpu; + for_each_online_cpu(cpu) { + cpufreq_get(cpu); + } + cpufreq_delayed_issched = 0; +} + +/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries + * to verify the CPU frequency the timing core thinks the CPU is running + * at is still correct. + */ +static void cpufreq_delayed_get(void) +{ + static int warned; + if (cpufreq_init && !cpufreq_delayed_issched) { + cpufreq_delayed_issched = 1; + if (!warned) { + warned = 1; + printk(KERN_DEBUG + "Losing some ticks... checking if CPU frequency changed.\n"); + } + schedule_work(&cpufreq_delayed_get_work); + } +} + static unsigned int ref_freq = 0; static unsigned long loops_per_jiffy_ref = 0; @@ -518,12 +599,17 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { struct cpufreq_freqs *freq = data; - unsigned long *lpj; + unsigned long *lpj, dummy; + if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC)) + return 0; + + lpj = &dummy; + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) #ifdef CONFIG_SMP - lpj = &cpu_data[freq->cpu].loops_per_jiffy; + lpj = &cpu_data[freq->cpu].loops_per_jiffy; #else - lpj = &boot_cpu_data.loops_per_jiffy; + lpj = &boot_cpu_data.loops_per_jiffy; #endif if (!ref_freq) { @@ -538,10 +624,11 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); - vxtime.tsc_quot = (1000L << 32) / cpu_khz; + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) + vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; } - set_cyc2ns_scale(cpu_khz_ref / 1000); + set_cyc2ns_scale(cpu_khz_ref); return 0; } @@ -549,6 +636,18 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, static struct notifier_block time_cpufreq_notifier_block = { .notifier_call = time_cpufreq_notifier }; + +static int __init cpufreq_tsc(void) +{ + INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get); + if (!cpufreq_register_notifier(&time_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER)) + cpufreq_init = 1; + return 0; +} + +core_initcall(cpufreq_tsc); + #endif /* @@ -557,6 +656,25 @@ static struct notifier_block time_cpufreq_notifier_block = { */ #define TICK_COUNT 100000000 +#define TICK_MIN 5000 + +/* + * Some platforms take periodic SMI interrupts with 5ms duration. Make sure none + * occurs between the reads of the hpet & TSC. 
+ */ +static void __init read_hpet_tsc(int *hpet, int *tsc) +{ + int tsc1, tsc2, hpet1; + + do { + tsc1 = get_cycles_sync(); + hpet1 = hpet_readl(HPET_COUNTER); + tsc2 = get_cycles_sync(); + } while (tsc2 - tsc1 > TICK_MIN); + *hpet = hpet1; + *tsc = tsc2; +} + static unsigned int __init hpet_calibrate_tsc(void) { @@ -567,14 +685,11 @@ static unsigned int __init hpet_calibrate_tsc(void) local_irq_save(flags); local_irq_disable(); - hpet_start = hpet_readl(HPET_COUNTER); - rdtscl(tsc_start); + read_hpet_tsc(&hpet_start, &tsc_start); do { local_irq_disable(); - hpet_now = hpet_readl(HPET_COUNTER); - sync_core(); - rdtscl(tsc_now); + read_hpet_tsc(&hpet_now, &tsc_now); local_irq_restore(flags); } while ((tsc_now - tsc_start) < TICK_COUNT && (hpet_now - hpet_start) < TICK_COUNT); @@ -604,42 +719,67 @@ static unsigned int __init pit_calibrate_tsc(void) outb(0xb0, 0x43); outb((PIT_TICK_RATE / (1000 / 50)) & 0xff, 0x42); outb((PIT_TICK_RATE / (1000 / 50)) >> 8, 0x42); - rdtscll(start); - sync_core(); + start = get_cycles_sync(); while ((inb(0x61) & 0x20) == 0); - sync_core(); - rdtscll(end); + end = get_cycles_sync(); spin_unlock_irqrestore(&i8253_lock, flags); return (end - start) / 50; } -static int hpet_init(void) +#ifdef CONFIG_HPET +static __init int late_hpet_init(void) { - unsigned int cfg, id; + struct hpet_data hd; + unsigned int ntimer; if (!vxtime.hpet_address) - return -1; - set_fixmap_nocache(FIX_HPET_BASE, vxtime.hpet_address); - __set_fixmap(VSYSCALL_HPET, vxtime.hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); + return 0; -/* - * Read the period, compute tick and quotient. - */ + memset(&hd, 0, sizeof (hd)); - id = hpet_readl(HPET_ID); + ntimer = hpet_readl(HPET_ID); + ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; + ntimer++; - if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER) || - !(id & HPET_ID_LEGSUP)) - return -1; + /* + * Register with driver. + * Timer0 and Timer1 is used by platform. + */ + hd.hd_phys_address = vxtime.hpet_address; + hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE); + hd.hd_nirqs = ntimer; + hd.hd_flags = HPET_DATA_PLATFORM; + hpet_reserve_timer(&hd, 0); +#ifdef CONFIG_HPET_EMULATE_RTC + hpet_reserve_timer(&hd, 1); +#endif + hd.hd_irq[0] = HPET_LEGACY_8254; + hd.hd_irq[1] = HPET_LEGACY_RTC; + if (ntimer > 2) { + struct hpet *hpet; + struct hpet_timer *timer; + int i; + + hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE); + timer = &hpet->hpet_timers[2]; + for (i = 2; i < ntimer; timer++, i++) + hd.hd_irq[i] = (timer->hpet_config & + Tn_INT_ROUTE_CNF_MASK) >> + Tn_INT_ROUTE_CNF_SHIFT; - hpet_period = hpet_readl(HPET_PERIOD); - if (hpet_period < 100000 || hpet_period > 100000000) - return -1; + } - hpet_tick = (1000000000L * (USEC_PER_SEC / HZ) + hpet_period / 2) / - hpet_period; + hpet_alloc(&hd); + return 0; +} +fs_initcall(late_hpet_init); +#endif + +static int hpet_timer_stop_set_go(unsigned long tick) +{ + unsigned int cfg; /* * Stop the timers and reset the main counter. @@ -655,33 +795,94 @@ static int hpet_init(void) * Set up timer 0, as periodic with first interrupt to happen at hpet_tick, * and period also hpet_tick. */ - - hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | + if (hpet_use_timer) { + hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | HPET_TN_32BIT, HPET_T0_CFG); - hpet_writel(hpet_tick, HPET_T0_CMP); - hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice? 
*/ - + hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */ + hpet_writel(hpet_tick, HPET_T0_CMP); /* period */ + cfg |= HPET_CFG_LEGACY; + } /* * Go! */ - cfg |= HPET_CFG_ENABLE | HPET_CFG_LEGACY; + cfg |= HPET_CFG_ENABLE; hpet_writel(cfg, HPET_CFG); return 0; } -void __init pit_init(void) +static int hpet_init(void) +{ + unsigned int id; + + if (!vxtime.hpet_address) + return -1; + set_fixmap_nocache(FIX_HPET_BASE, vxtime.hpet_address); + __set_fixmap(VSYSCALL_HPET, vxtime.hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); + +/* + * Read the period, compute tick and quotient. + */ + + id = hpet_readl(HPET_ID); + + if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER)) + return -1; + + hpet_period = hpet_readl(HPET_PERIOD); + if (hpet_period < 100000 || hpet_period > 100000000) + return -1; + + hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period; + + hpet_use_timer = (id & HPET_ID_LEGSUP); + + return hpet_timer_stop_set_go(hpet_tick); +} + +static int hpet_reenable(void) +{ + return hpet_timer_stop_set_go(hpet_tick); +} + +#define PIT_MODE 0x43 +#define PIT_CH0 0x40 + +static void __init __pit_init(int val, u8 mode) { unsigned long flags; spin_lock_irqsave(&i8253_lock, flags); - outb_p(0x34, 0x43); /* binary, mode 2, LSB/MSB, ch 0 */ - outb_p(LATCH & 0xff, 0x40); /* LSB */ - outb_p(LATCH >> 8, 0x40); /* MSB */ + outb_p(mode, PIT_MODE); + outb_p(val & 0xff, PIT_CH0); /* LSB */ + outb_p(val >> 8, PIT_CH0); /* MSB */ spin_unlock_irqrestore(&i8253_lock, flags); } +void __init pit_init(void) +{ + __pit_init(LATCH, 0x34); /* binary, mode 2, LSB/MSB, ch 0 */ +} + +void __init pit_stop_interrupt(void) +{ + __pit_init(0, 0x30); /* mode 0 */ +} + +void __init stop_timer_interrupt(void) +{ + char *name; + if (vxtime.hpet_address) { + name = "HPET"; + hpet_timer_stop_set_go(0); + } else { + name = "PIT"; + pit_stop_interrupt(); + } + printk(KERN_INFO "timer: %s interrupt stopped.\n", name); +} + int __init time_setup(char *str) { report_lost_ticks = 1; @@ -689,27 +890,11 @@ int __init time_setup(char *str) } static struct irqaction irq0 = { - timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL + timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL }; -extern void __init config_acpi_tables(void); - void __init time_init(void) { - char *timename; - -#ifdef HPET_HACK_ENABLE_DANGEROUS - if (!vxtime.hpet_address) { - printk(KERN_WARNING "time.c: WARNING: Enabling HPET base " - "manually!\n"); - outl(0x800038a0, 0xcf8); - outl(0xff000001, 0xcfc); - outl(0x800038a0, 0xcf8); - hpet_address = inl(0xcfc) & 0xfffffffe; - printk(KERN_WARNING "time.c: WARNING: Enabled HPET " - "at %#lx.\n", hpet_address); - } -#endif if (nohpet) vxtime.hpet_address = 0; @@ -719,94 +904,196 @@ void __init time_init(void) set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); - if (!hpet_init()) { - vxtime_hz = (1000000000000000L + hpet_period / 2) / - hpet_period; + if (!hpet_init()) + vxtime_hz = (FSEC_PER_SEC + hpet_period / 2) / hpet_period; + else + vxtime.hpet_address = 0; + + if (hpet_use_timer) { + /* set tick_nsec to use the proper rate for HPET */ + tick_nsec = TICK_NSEC_HPET; cpu_khz = hpet_calibrate_tsc(); timename = "HPET"; +#ifdef CONFIG_X86_PM_TIMER + } else if (pmtmr_ioport && !vxtime.hpet_address) { + vxtime_hz = PM_TIMER_FREQUENCY; + timename = "PM"; + pit_init(); + cpu_khz = pit_calibrate_tsc(); +#endif } else { - pit_init(); - cpu_khz = pit_calibrate_tsc(); + pit_init(); + cpu_khz = pit_calibrate_tsc(); timename = "PIT"; } - printk(KERN_INFO "time.c: Using 
%ld.%06ld MHz %s timer.\n", - vxtime_hz / 1000000, vxtime_hz % 1000000, timename); - printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); vxtime.mode = VXTIME_TSC; - vxtime.quot = (1000000L << 32) / vxtime_hz; - vxtime.tsc_quot = (1000L << 32) / cpu_khz; - vxtime.hz = vxtime_hz; - rdtscll_sync(&vxtime.last_tsc); + vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz; + vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; + vxtime.last_tsc = get_cycles_sync(); + set_cyc2ns_scale(cpu_khz); setup_irq(0, &irq0); - set_cyc2ns_scale(cpu_khz / 1000); +#ifndef CONFIG_SMP + time_init_gtod(); +#endif +} -#ifdef CONFIG_CPU_FREQ - cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); +/* + * Make an educated guess if the TSC is trustworthy and synchronized + * over all CPUs. + */ +__cpuinit int unsynchronized_tsc(void) +{ +#ifdef CONFIG_SMP + if (apic_is_clustered_box()) + return 1; #endif + /* Most intel systems have synchronized TSCs except for + multi node systems */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { +#ifdef CONFIG_ACPI + /* But TSC doesn't tick in C3 so don't use it there */ + if (acpi_fadt.length > 0 && acpi_fadt.plvl3_lat < 1000) + return 1; +#endif + return 0; + } + + /* Assume multi socket systems are not synchronized */ + return num_present_cpus() > 1; } -void __init time_init_smp(void) +/* + * Decide what mode gettimeofday should use. + */ +void time_init_gtod(void) { char *timetype; - if (vxtime.hpet_address) { - timetype = "HPET"; - vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; + if (unsynchronized_tsc()) + notsc = 1; + + if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) + vgetcpu_mode = VGETCPU_RDTSCP; + else + vgetcpu_mode = VGETCPU_LSL; + + if (vxtime.hpet_address && notsc) { + timetype = hpet_use_timer ? "HPET" : "PIT/HPET"; + if (hpet_use_timer) + vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; + else + vxtime.last = hpet_readl(HPET_COUNTER); vxtime.mode = VXTIME_HPET; do_gettimeoffset = do_gettimeoffset_hpet; +#ifdef CONFIG_X86_PM_TIMER + /* Using PM for gettimeofday is quite slow, but we have no other + choice because the TSC is too unreliable on some systems. */ + } else if (pmtmr_ioport && !vxtime.hpet_address && notsc) { + timetype = "PM"; + do_gettimeoffset = do_gettimeoffset_pm; + vxtime.mode = VXTIME_PMTMR; + sysctl_vsyscall = 0; + printk(KERN_INFO "Disabling vsyscall due to use of PM timer\n"); +#endif } else { - timetype = "PIT/TSC"; + timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC"; vxtime.mode = VXTIME_TSC; } - printk(KERN_INFO "time.c: Using %s based timekeeping.\n", timetype); + + printk(KERN_INFO "time.c: Using %ld.%06ld MHz WALL %s GTOD %s timer.\n", + vxtime_hz / 1000000, vxtime_hz % 1000000, timename, timetype); + printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", + cpu_khz / 1000, cpu_khz % 1000); + vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz; + vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; + vxtime.last_tsc = get_cycles_sync(); + + set_cyc2ns_scale(cpu_khz); } __setup("report_lost_ticks", time_setup); static long clock_cmos_diff; +static unsigned long sleep_start; + +/* + * sysfs support for the timer. 
+ */ -static int time_suspend(struct sys_device *dev, u32 state) +static int timer_suspend(struct sys_device *dev, pm_message_t state) { /* * Estimate time zone so that set_time can update the clock */ - clock_cmos_diff = -get_cmos_time(); + long cmos_time = get_cmos_time(); + + clock_cmos_diff = -cmos_time; clock_cmos_diff += get_seconds(); + sleep_start = cmos_time; return 0; } -static int time_resume(struct sys_device *dev) +static int timer_resume(struct sys_device *dev) { - unsigned long sec = get_cmos_time() + clock_cmos_diff; - write_seqlock_irq(&xtime_lock); + unsigned long flags; + unsigned long sec; + unsigned long ctime = get_cmos_time(); + long sleep_length = (ctime - sleep_start) * HZ; + + if (sleep_length < 0) { + printk(KERN_WARNING "Time skew detected in timer resume!\n"); + /* The time after the resume must not be earlier than the time + * before the suspend or some nasty things will happen + */ + sleep_length = 0; + ctime = sleep_start; + } + if (vxtime.hpet_address) + hpet_reenable(); + else + i8254_timer_resume(); + + sec = ctime + clock_cmos_diff; + write_seqlock_irqsave(&xtime_lock,flags); xtime.tv_sec = sec; xtime.tv_nsec = 0; - write_sequnlock_irq(&xtime_lock); + if (vxtime.mode == VXTIME_HPET) { + if (hpet_use_timer) + vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; + else + vxtime.last = hpet_readl(HPET_COUNTER); +#ifdef CONFIG_X86_PM_TIMER + } else if (vxtime.mode == VXTIME_PMTMR) { + pmtimer_resume(); +#endif + } else + vxtime.last_tsc = get_cycles_sync(); + write_sequnlock_irqrestore(&xtime_lock,flags); + jiffies += sleep_length; + monotonic_base += sleep_length * (NSEC_PER_SEC/HZ); + touch_softlockup_watchdog(); return 0; } -static struct sysdev_class pit_sysclass = { - .resume = time_resume, - .suspend = time_suspend, - set_kset_name("pit"), +static struct sysdev_class timer_sysclass = { + .resume = timer_resume, + .suspend = timer_suspend, + set_kset_name("timer"), }; - /* XXX this driverfs stuff should probably go elsewhere later -john */ -static struct sys_device device_i8253 = { +static struct sys_device device_timer = { .id = 0, - .cls = &pit_sysclass, + .cls = &timer_sysclass, }; static int time_init_device(void) { - int error = sysdev_class_register(&pit_sysclass); + int error = sysdev_class_register(&timer_sysclass); if (!error) - error = sysdev_register(&device_i8253); + error = sysdev_register(&device_timer); return error; } @@ -827,11 +1114,8 @@ device_initcall(time_init_device); * For (3), we use interrupts at 64Hz or user specified periodic * frequency, whichever is higher. 
*/ -#include #include -extern irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs); - #define DEFAULT_RTC_INT_FREQ 64 #define RTC_NUM_INTS 1 @@ -846,6 +1130,7 @@ static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ; static unsigned long PIE_count; static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */ +static unsigned int hpet_t1_cmp; /* cached comparator register */ int is_hpet_enabled(void) { @@ -879,24 +1164,32 @@ int hpet_rtc_timer_init(void) hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; local_irq_save(flags); + cnt = hpet_readl(HPET_COUNTER); cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq); hpet_writel(cnt, HPET_T1_CMP); - local_irq_restore(flags); + hpet_t1_cmp = cnt; cfg = hpet_readl(HPET_T1_CFG); - cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT; + cfg &= ~HPET_TN_PERIODIC; + cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; hpet_writel(cfg, HPET_T1_CFG); + local_irq_restore(flags); + return 1; } static void hpet_rtc_timer_reinit(void) { - unsigned int cfg, cnt; + unsigned int cfg, cnt, ticks_per_int, lost_ints; - if (!(PIE_on | AIE_on | UIE_on)) + if (unlikely(!(PIE_on | AIE_on | UIE_on))) { + cfg = hpet_readl(HPET_T1_CFG); + cfg &= ~HPET_TN_ENABLE; + hpet_writel(cfg, HPET_T1_CFG); return; + } if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) hpet_rtc_int_freq = PIE_freq; @@ -904,15 +1197,33 @@ static void hpet_rtc_timer_reinit(void) hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; /* It is more accurate to use the comparator value than current count.*/ - cnt = hpet_readl(HPET_T1_CMP); - cnt += hpet_tick*HZ/hpet_rtc_int_freq; - hpet_writel(cnt, HPET_T1_CMP); + ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq; + hpet_t1_cmp += ticks_per_int; + hpet_writel(hpet_t1_cmp, HPET_T1_CMP); - cfg = hpet_readl(HPET_T1_CFG); - cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT; - hpet_writel(cfg, HPET_T1_CFG); + /* + * If the interrupt handler was delayed too long, the write above tries + * to schedule the next interrupt in the past and the hardware would + * not interrupt until the counter had wrapped around. + * So we have to check that the comparator wasn't set to a past time. + */ + cnt = hpet_readl(HPET_COUNTER); + if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) { + lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1; + /* Make sure that, even with the time needed to execute + * this code, the next scheduled interrupt has been moved + * back to the future: */ + lost_ints++; - return; + hpet_t1_cmp += lost_ints * ticks_per_int; + hpet_writel(hpet_t1_cmp, HPET_T1_CMP); + + if (PIE_on) + PIE_count += lost_ints; + + printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", + hpet_rtc_int_freq); + } } /* @@ -1032,7 +1343,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) } if (call_rtc_interrupt) { rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); - rtc_interrupt(rtc_int_flag, dev_id, regs); + rtc_interrupt(rtc_int_flag, dev_id); } return IRQ_HANDLED; } @@ -1041,7 +1352,15 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) static int __init nohpet_setup(char *s) { nohpet = 1; - return 0; + return 1; } __setup("nohpet", nohpet_setup); + +int __init notsc_setup(char *s) +{ + notsc = 1; + return 1; +} + +__setup("notsc", notsc_setup);
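
Editorial note (not part of the patch): the diff above replaces open-coded shift constants with `NS_SCALE` (2^10) and `US_SCALE` (2^32) for the fixed-point TSC conversions used by `do_gettimeoffset_tsc()`, `cycles_2_ns()` and `sched_clock()`. The standalone sketch below reproduces that arithmetic in user space so the scaling can be checked by hand. The formulas and constant names come from the patch; the 2.4 GHz CPU (`cpu_khz = 2400000`) and the sample cycle delta are made-up values for illustration only, and the overflow remark in the trailing comment is an interpretation of why the patch calls NS_SCALE "carefully chosen", not something the patch states.

```c
/*
 * Sketch of the patch's fixed-point cycle-to-time scaling.
 * Assumed example values: cpu_khz = 2400000 (2.4 GHz), delta = 1.2M cycles.
 */
#include <stdio.h>
#include <stdint.h>

#define NS_SCALE 10			/* 2^10, as in the patch */
#define US_SCALE 32			/* 2^32, as in the patch */

#define USEC_PER_MSEC 1000ULL
#define NSEC_PER_MSEC 1000000ULL

int main(void)
{
	uint64_t cpu_khz = 2400000;	/* hypothetical 2.4 GHz CPU */

	/* vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz */
	uint64_t tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz;

	/* cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / cpu_khz */
	uint64_t cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / cpu_khz;

	/* Cycles elapsed since the last timer tick: 0.5 ms at 2.4 GHz. */
	uint64_t delta_cycles = 1200000;

	/* do_gettimeoffset_tsc(): microseconds since the last tick. */
	uint64_t usec = (delta_cycles * tsc_quot) >> US_SCALE;

	/* cycles_2_ns() / sched_clock(): nanoseconds from a cycle count. */
	uint64_t nsec = (delta_cycles * cyc2ns_scale) >> NS_SCALE;

	printf("tsc_quot     = %llu\n", (unsigned long long)tsc_quot);
	printf("cyc2ns_scale = %llu\n", (unsigned long long)cyc2ns_scale);
	printf("%llu cycles -> ~%llu us, ~%llu ns\n",
	       (unsigned long long)delta_cycles,
	       (unsigned long long)usec,
	       (unsigned long long)nsec);

	/*
	 * Expected output: roughly 500 us and 500000 ns (truncation makes the
	 * results slightly low).  The small NS_SCALE presumably matters for
	 * sched_clock(), which multiplies a full 64-bit TSC value: with a
	 * 10-bit shift the 64-bit product has headroom for years of uptime,
	 * whereas the 32-bit US_SCALE shift is only safe for the sub-tick
	 * deltas handled in do_gettimeoffset_tsc().
	 */
	return 0;
}
```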