VServer 1.9.2 (patch-2.6.8.1-vs1.9.2.diff)
[linux-2.6.git] / arch / i386 / kernel / timers / timer_tsc.c
1 /*
2  * This code largely moved from arch/i386/kernel/time.c.
3  * See comments there for proper credits.
4  *
5  * 2004-06-25    Jesper Juhl
6  *      moved mark_offset_tsc below cpufreq_delayed_get to avoid gcc 3.4
7  *      failing to inline.
8  */
9
10 #include <linux/spinlock.h>
11 #include <linux/init.h>
12 #include <linux/timex.h>
13 #include <linux/errno.h>
14 #include <linux/cpufreq.h>
15 #include <linux/string.h>
16 #include <linux/jiffies.h>
17
18 #include <asm/timer.h>
19 #include <asm/io.h>
20 /* processor.h for the tsc_disable flag */
21 #include <asm/processor.h>
22
23 #include "io_ports.h"
24 #include "mach_timer.h"
25
26 #include <asm/hpet.h>
27
28 #ifdef CONFIG_HPET_TIMER
29 static unsigned long hpet_usec_quotient;
30 static unsigned long hpet_last;
31 struct timer_opts timer_tsc;
32 #endif
33
34 static inline void cpufreq_delayed_get(void);
35
36 int tsc_disable __initdata = 0;
37
38 extern spinlock_t i8253_lock;
39
40 static int use_tsc;
41 /* Number of usecs that the last interrupt was delayed */
42 static int delay_at_last_interrupt;
43
44 static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */
45 static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */
46 static unsigned long long monotonic_base;
47 static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED;
48
/* convert from cycles(64bits) => nanoseconds (64bits)
 *  basic equation:
 *              ns = cycles / (freq / ns_per_sec)
 *              ns = cycles * (ns_per_sec / freq)
 *              ns = cycles * (10^9 / (cpu_mhz * 10^6))
 *              ns = cycles * (10^3 / cpu_mhz)
 *
 *      Then we use scaling math (suggested by george@mvista.com) to get:
 *              ns = cycles * (10^3 * SC / cpu_mhz) / SC
 *              ns = cycles * cyc2ns_scale / SC
 *
 *      And since SC is a constant power of two, we can convert the div
 *  into a shift.
 *                      -johnstul@us.ibm.com "math is hard, lets go shopping!"
 */
static unsigned long cyc2ns_scale;
#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */

/*
 * Recompute the cycles->ns multiplier for a CPU running at cpu_mhz MHz.
 * cpu_mhz must be non-zero: it is used as a divisor.
 */
static inline void set_cyc2ns_scale(unsigned long cpu_mhz)
{
        cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz;
}

/*
 * Convert a cycle count to nanoseconds using the current cyc2ns_scale.
 *
 * The straightforward (cyc * cyc2ns_scale) >> SC overflows 64 bits once
 * cyc exceeds 2^64 / cyc2ns_scale, which a free-running TSC reaches.
 * Writing cyc = q * 2^SC + r gives exactly
 *      (cyc * scale) >> SC  ==  q * scale + ((r * scale) >> SC)
 * so the two halves are scaled separately.  The result is identical to
 * the direct form wherever that form does not overflow, and now only
 * wraps when the nanosecond value itself no longer fits in 64 bits.
 */
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
        const unsigned long long rem_mask = (1ULL << CYC2NS_SCALE_FACTOR) - 1;
        unsigned long long quot = cyc >> CYC2NS_SCALE_FACTOR;
        unsigned long long rem = cyc & rem_mask;

        return quot * cyc2ns_scale +
                ((rem * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR);
}
76
/*
 * Cached *multiplier* to convert TSC counts to microseconds:
 *      2^32 * (1 / (clocks per usec))
 * Multiplying a 32-bit TSC delta by it and keeping the upper 32 bits of
 * the product yields microseconds.  Set by init_tsc() after calibration.
 */
static unsigned long fast_gettimeoffset_quotient;

/*
 * Older of the two remembered PIT readings used by the pit_latch_buggy
 * filter in mark_offset_tsc(); init_tsc() starts it at LATCH.
 */
static int count2;
85
86 static unsigned long get_offset_tsc(void)
87 {
88         register unsigned long eax, edx;
89
90         /* Read the Time Stamp Counter */
91
92         rdtsc(eax,edx);
93
94         /* .. relative to previous jiffy (32 bits is enough) */
95         eax -= last_tsc_low;    /* tsc_low delta */
96
97         /*
98          * Time offset = (tsc_low delta) * fast_gettimeoffset_quotient
99          *             = (tsc_low delta) * (usecs_per_clock)
100          *             = (tsc_low delta) * (usecs_per_jiffy / clocks_per_jiffy)
101          *
102          * Using a mull instead of a divl saves up to 31 clock cycles
103          * in the critical path.
104          */
105
106         __asm__("mull %2"
107                 :"=a" (eax), "=d" (edx)
108                 :"rm" (fast_gettimeoffset_quotient),
109                  "0" (eax));
110
111         /* our adjusted time offset in microseconds */
112         return delay_at_last_interrupt + edx;
113 }
114
115 static unsigned long long monotonic_clock_tsc(void)
116 {
117         unsigned long long last_offset, this_offset, base;
118         unsigned seq;
119         
120         /* atomically read monotonic base & last_offset */
121         do {
122                 seq = read_seqbegin(&monotonic_lock);
123                 last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
124                 base = monotonic_base;
125         } while (read_seqretry(&monotonic_lock, seq));
126
127         /* Read the Time Stamp Counter */
128         rdtscll(this_offset);
129
130         /* return the value in ns */
131         return base + cycles_2_ns(this_offset - last_offset);
132 }
133
134 /*
135  * Scheduler clock - returns current time in nanosec units.
136  */
137 unsigned long long sched_clock(void)
138 {
139         unsigned long long this_offset;
140
141         /*
142          * In the NUMA case we dont use the TSC as they are not
143          * synchronized across all CPUs.
144          */
145 #ifndef CONFIG_NUMA
146         if (!use_tsc)
147 #endif
148                 /* no locking but a rare wrong value is not a big deal */
149                 return jiffies_64 * (1000000000 / HZ);
150
151         /* Read the Time Stamp Counter */
152         rdtscll(this_offset);
153
154         /* return the value in ns */
155         return cycles_2_ns(this_offset);
156 }
157
/*
 * Busy-wait until at least 'loops' TSC cycles (low 32 bits) have passed.
 * The TSC is always sampled at least once after the starting value, and
 * rep_nop() is issued before every sample.
 */
static void delay_tsc(unsigned long loops)
{
        unsigned long start, current_tsc;

        rdtscl(start);
        for (;;) {
                rep_nop();
                rdtscl(current_tsc);
                /* unsigned subtraction stays correct across a 32-bit wrap */
                if ((current_tsc - start) >= loops)
                        break;
        }
}
169
#ifdef CONFIG_HPET_TIMER
/*
 * Timer-interrupt hook used when the HPET drives the tick: records the
 * TSC at this tick, compensates jiffies_64 for lost ticks, advances the
 * monotonic base and computes how late this interrupt was, in usecs.
 */
static void mark_offset_tsc_hpet(void)
{
        unsigned long long tsc_prev, tsc_now;
        unsigned long tick_start, product_lo, hpet_now;

        write_seqlock(&monotonic_lock);
        tsc_prev = ((unsigned long long)last_tsc_high << 32) | last_tsc_low;

        /*
         * The HPET counter and the TSC must be sampled as close together
         * as possible, so the two reads are back to back.  Interrupts are
         * just disabled locally since the timer irq has the SA_INTERRUPT
         * flag set. -arca
         */
        hpet_now = hpet_readl(HPET_COUNTER);
        rdtsc(last_tsc_low, last_tsc_high);

        /*
         * Lost tick compensation: tick_start is the HPET value at which
         * the current tick was due (comparator minus one tick).  If more
         * than one tick lies between it and the previous sample, credit
         * the whole ticks that were missed.  hpet_last == 0 means there
         * is no previous sample yet.
         */
        tick_start = hpet_readl(HPET_T0_CMP) - hpet_tick;
        if (unlikely((hpet_last != 0) &&
                     ((tick_start - hpet_last) > hpet_tick))) {
                int lost_ticks = (tick_start - hpet_last) / hpet_tick;

                jiffies_64 += lost_ticks;
        }
        hpet_last = hpet_now;

        /* update the monotonic base value */
        tsc_now = ((unsigned long long)last_tsc_high << 32) | last_tsc_low;
        monotonic_base += cycles_2_ns(tsc_now - tsc_prev);
        write_sequnlock(&monotonic_lock);

        /*
         * calculate delay_at_last_interrupt
         *
         * Time offset = (hpet delta) * ( usecs per HPET clock )
         *             = (hpet delta) * ( usecs per tick / HPET clocks per tick)
         *             = (hpet delta) * ( hpet_usec_quotient ) / (2^32)
         * Where,
         * hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick
         *
         * The high word of the product lands back in
         * delay_at_last_interrupt; the low word (product_lo) is discarded.
         */
        delay_at_last_interrupt = hpet_now - tick_start;
        ASM_MUL64_REG(product_lo, delay_at_last_interrupt,
                        hpet_usec_quotient, delay_at_last_interrupt);
}
#endif
219
220
221 #ifdef CONFIG_CPU_FREQ
222 #include <linux/workqueue.h>
223
224 static unsigned int cpufreq_delayed_issched = 0;
225 static unsigned int cpufreq_init = 0;
226 static struct work_struct cpufreq_delayed_get_work;
227
228 static void handle_cpufreq_delayed_get(void *v)
229 {
230         unsigned int cpu;
231         for_each_online_cpu(cpu) {
232                 cpufreq_get(cpu);
233         }
234         cpufreq_delayed_issched = 0;
235 }
236
237 /* if we notice lost ticks, schedule a call to cpufreq_get() as it tries
238  * to verify the CPU frequency the timing core thinks the CPU is running
239  * at is still correct.
240  */
241 static inline void cpufreq_delayed_get(void) 
242 {
243         if (cpufreq_init && !cpufreq_delayed_issched) {
244                 cpufreq_delayed_issched = 1;
245                 printk(KERN_DEBUG "Losing some ticks... checking if CPU frequency changed.\n");
246                 schedule_work(&cpufreq_delayed_get_work);
247         }
248 }
249
/* If the CPU frequency is scaled, TSC-based delays will need a different
 * loops_per_jiffy value to function properly.
 *
 * The values below are captured by time_cpufreq_notifier() on the first
 * transition it sees and serve as the reference point for all later
 * rescaling; ref_freq == 0 means nothing has been captured yet.
 */
static unsigned int  ref_freq;
static unsigned long loops_per_jiffy_ref;

#ifndef CONFIG_SMP
static unsigned long cpu_khz_ref;
static unsigned long fast_gettimeoffset_ref;
#endif
261
262 static int
263 time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
264                        void *data)
265 {
266         struct cpufreq_freqs *freq = data;
267
268         write_seqlock_irq(&xtime_lock);
269         if (!ref_freq) {
270                 ref_freq = freq->old;
271                 loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy;
272 #ifndef CONFIG_SMP
273                 fast_gettimeoffset_ref = fast_gettimeoffset_quotient;
274                 cpu_khz_ref = cpu_khz;
275 #endif
276         }
277
278         if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
279             (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
280             (val == CPUFREQ_RESUMECHANGE)) {
281                 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
282                         cpu_data[freq->cpu].loops_per_jiffy = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
283 #ifndef CONFIG_SMP
284                 if (cpu_khz)
285                         cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new);
286                 if (use_tsc) {
287                         if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
288                                 fast_gettimeoffset_quotient = cpufreq_scale(fast_gettimeoffset_ref, freq->new, ref_freq);
289                                 set_cyc2ns_scale(cpu_khz/1000);
290                         }
291                 }
292 #endif
293         }
294         write_sequnlock_irq(&xtime_lock);
295
296         return 0;
297 }
298
299 static struct notifier_block time_cpufreq_notifier_block = {
300         .notifier_call  = time_cpufreq_notifier
301 };
302
303
304 static int __init cpufreq_tsc(void)
305 {
306         int ret;
307         INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL);
308         ret = cpufreq_register_notifier(&time_cpufreq_notifier_block,
309                                         CPUFREQ_TRANSITION_NOTIFIER);
310         if (!ret)
311                 cpufreq_init = 1;
312         return ret;
313 }
314 core_initcall(cpufreq_tsc);
315
316 #else /* CONFIG_CPU_FREQ */
/* Without CONFIG_CPU_FREQ there is no frequency to re-check: no-op. */
static inline void cpufreq_delayed_get(void)
{
}
318 #endif 
319
320 static void mark_offset_tsc(void)
321 {
322         unsigned long lost,delay;
323         unsigned long delta = last_tsc_low;
324         int count;
325         int countmp;
326         static int count1 = 0;
327         unsigned long long this_offset, last_offset;
328         static int lost_count = 0;
329
330         write_seqlock(&monotonic_lock);
331         last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
332         /*
333          * It is important that these two operations happen almost at
334          * the same time. We do the RDTSC stuff first, since it's
335          * faster. To avoid any inconsistencies, we need interrupts
336          * disabled locally.
337          */
338
339         /*
340          * Interrupts are just disabled locally since the timer irq
341          * has the SA_INTERRUPT flag set. -arca
342          */
343
344         /* read Pentium cycle counter */
345
346         rdtsc(last_tsc_low, last_tsc_high);
347
348         spin_lock(&i8253_lock);
349         outb_p(0x00, PIT_MODE);     /* latch the count ASAP */
350
351         count = inb_p(PIT_CH0);    /* read the latched count */
352         count |= inb(PIT_CH0) << 8;
353
354         /*
355          * VIA686a test code... reset the latch if count > max + 1
356          * from timer_pit.c - cjb
357          */
358         if (count > LATCH) {
359                 outb_p(0x34, PIT_MODE);
360                 outb_p(LATCH & 0xff, PIT_CH0);
361                 outb(LATCH >> 8, PIT_CH0);
362                 count = LATCH - 1;
363         }
364
365         spin_unlock(&i8253_lock);
366
367         if (pit_latch_buggy) {
368                 /* get center value of last 3 time lutch */
369                 if ((count2 >= count && count >= count1)
370                     || (count1 >= count && count >= count2)) {
371                         count2 = count1; count1 = count;
372                 } else if ((count1 >= count2 && count2 >= count)
373                            || (count >= count2 && count2 >= count1)) {
374                         countmp = count;count = count2;
375                         count2 = count1;count1 = countmp;
376                 } else {
377                         count2 = count1; count1 = count; count = count1;
378                 }
379         }
380
381         /* lost tick compensation */
382         delta = last_tsc_low - delta;
383         {
384                 register unsigned long eax, edx;
385                 eax = delta;
386                 __asm__("mull %2"
387                 :"=a" (eax), "=d" (edx)
388                 :"rm" (fast_gettimeoffset_quotient),
389                  "0" (eax));
390                 delta = edx;
391         }
392         delta += delay_at_last_interrupt;
393         lost = delta/(1000000/HZ);
394         delay = delta%(1000000/HZ);
395         if (lost >= 2) {
396                 jiffies_64 += lost-1;
397
398                 /* sanity check to ensure we're not always losing ticks */
399                 if (lost_count++ > 100) {
400                         printk(KERN_WARNING "Losing too many ticks!\n");
401                         printk(KERN_WARNING "TSC cannot be used as a timesource.  \n");
402                         printk(KERN_WARNING "Possible reasons for this are:\n");
403                         printk(KERN_WARNING "  You're running with Speedstep,\n");
404                         printk(KERN_WARNING "  You don't have DMA enabled for your hard disk (see hdparm),\n");
405                         printk(KERN_WARNING "  Incorrect TSC synchronization on an SMP system (see dmesg).\n");
406                         printk(KERN_WARNING "Falling back to a sane timesource now.\n");
407
408                         clock_fallback();
409                 }
410                 /* ... but give the TSC a fair chance */
411                 if (lost_count > 25)
412                         cpufreq_delayed_get();
413         } else
414                 lost_count = 0;
415         /* update the monotonic base value */
416         this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
417         monotonic_base += cycles_2_ns(this_offset - last_offset);
418         write_sequnlock(&monotonic_lock);
419
420         /* calculate delay_at_last_interrupt */
421         count = ((LATCH-1) - count) * TICK_SIZE;
422         delay_at_last_interrupt = (count + LATCH/2) / LATCH;
423
424         /* catch corner case where tick rollover occured
425          * between tsc and pit reads (as noted when
426          * usec delta is > 90% # of usecs/tick)
427          */
428         if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ))
429                 jiffies_64++;
430 }
431
432 static int __init init_tsc(char* override)
433 {
434
435         /* check clock override */
436         if (override[0] && strncmp(override,"tsc",3)) {
437 #ifdef CONFIG_HPET_TIMER
438                 if (is_hpet_enabled()) {
439                         printk(KERN_ERR "Warning: clock= override failed. Defaulting to tsc\n");
440                 } else
441 #endif
442                 {
443                         return -ENODEV;
444                 }
445         }
446
447         /*
448          * If we have APM enabled or the CPU clock speed is variable
449          * (CPU stops clock on HLT or slows clock to save power)
450          * then the TSC timestamps may diverge by up to 1 jiffy from
451          * 'real time' but nothing will break.
452          * The most frequent case is that the CPU is "woken" from a halt
453          * state by the timer interrupt itself, so we get 0 error. In the
454          * rare cases where a driver would "wake" the CPU and request a
455          * timestamp, the maximum error is < 1 jiffy. But timestamps are
456          * still perfectly ordered.
457          * Note that the TSC counter will be reset if APM suspends
458          * to disk; this won't break the kernel, though, 'cuz we're
459          * smart.  See arch/i386/kernel/apm.c.
460          */
461         /*
462          *      Firstly we have to do a CPU check for chips with
463          *      a potentially buggy TSC. At this point we haven't run
464          *      the ident/bugs checks so we must run this hook as it
465          *      may turn off the TSC flag.
466          *
467          *      NOTE: this doesn't yet handle SMP 486 machines where only
468          *      some CPU's have a TSC. Thats never worked and nobody has
469          *      moaned if you have the only one in the world - you fix it!
470          */
471
472         count2 = LATCH; /* initialize counter for mark_offset_tsc() */
473
474         if (cpu_has_tsc) {
475                 unsigned long tsc_quotient;
476 #ifdef CONFIG_HPET_TIMER
477                 if (is_hpet_enabled()){
478                         unsigned long result, remain;
479                         printk("Using TSC for gettimeofday\n");
480                         tsc_quotient = calibrate_tsc_hpet(NULL);
481                         timer_tsc.mark_offset = &mark_offset_tsc_hpet;
482                         /*
483                          * Math to calculate hpet to usec multiplier
484                          * Look for the comments at get_offset_tsc_hpet()
485                          */
486                         ASM_DIV64_REG(result, remain, hpet_tick,
487                                         0, KERNEL_TICK_USEC);
488                         if (remain > (hpet_tick >> 1))
489                                 result++; /* rounding the result */
490
491                         hpet_usec_quotient = result;
492                 } else
493 #endif
494                 {
495                         tsc_quotient = calibrate_tsc();
496                 }
497
498                 if (tsc_quotient) {
499                         fast_gettimeoffset_quotient = tsc_quotient;
500                         use_tsc = 1;
501                         /*
502                          *      We could be more selective here I suspect
503                          *      and just enable this for the next intel chips ?
504                          */
505                         /* report CPU clock rate in Hz.
506                          * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) =
507                          * clock/second. Our precision is about 100 ppm.
508                          */
509                         {       unsigned long eax=0, edx=1000;
510                                 __asm__("divl %2"
511                                 :"=a" (cpu_khz), "=d" (edx)
512                                 :"r" (tsc_quotient),
513                                 "0" (eax), "1" (edx));
514                                 printk("Detected %lu.%03lu MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000);
515                         }
516                         set_cyc2ns_scale(cpu_khz/1000);
517                         return 0;
518                 }
519         }
520         return -ENODEV;
521 }
522
523 #ifndef CONFIG_X86_TSC
524 /* disable flag for tsc.  Takes effect by clearing the TSC cpu flag
525  * in cpu/common.c */
526 static int __init tsc_setup(char *str)
527 {
528         tsc_disable = 1;
529         return 1;
530 }
531 #else
532 static int __init tsc_setup(char *str)
533 {
534         printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
535                                 "cannot disable TSC.\n");
536         return 1;
537 }
538 #endif
539 __setup("notsc", tsc_setup);
540
541
542
543 /************************************************************/
544
545 /* tsc timer_opts struct */
546 struct timer_opts timer_tsc = {
547         .name =         "tsc",
548         .init =         init_tsc,
549         .mark_offset =  mark_offset_tsc, 
550         .get_offset =   get_offset_tsc,
551         .monotonic_clock =      monotonic_clock_tsc,
552         .delay = delay_tsc,
553 };