linux-2.6.git: arch/i386/kernel/time-xen.c
1 /*
2  *  linux/arch/i386/kernel/time.c
3  *
4  *  Copyright (C) 1991, 1992, 1995  Linus Torvalds
5  *
6  * This file contains the PC-specific time handling details:
7  * reading the RTC at bootup, etc..
8  * 1994-07-02    Alan Modra
9  *      fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
10  * 1995-03-26    Markus Kuhn
11  *      fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
12  *      precision CMOS clock update
13  * 1996-05-03    Ingo Molnar
14  *      fixed time warps in do_[slow|fast]_gettimeoffset()
15  * 1997-09-10   Updated NTP code according to technical memorandum Jan '96
16  *              "A Kernel Model for Precision Timekeeping" by Dave Mills
17  * 1998-09-05    (Various)
18  *      More robust do_fast_gettimeoffset() algorithm implemented
19  *      (works with APM, Cyrix 6x86MX and Centaur C6),
20  *      monotonic gettimeofday() with fast_get_timeoffset(),
21  *      drift-proof precision TSC calibration on boot
22  *      (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
23  *      Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
24  *      ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
25  * 1998-12-16    Andrea Arcangeli
26  *      Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
27  *      because it was not accounting lost_ticks.
28  * 1998-12-24 Copyright (C) 1998  Andrea Arcangeli
29  *      Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
30  *      serialize accesses to xtime/lost_ticks).
31  */
32
33 #include <linux/errno.h>
34 #include <linux/sched.h>
35 #include <linux/kernel.h>
36 #include <linux/param.h>
37 #include <linux/string.h>
38 #include <linux/mm.h>
39 #include <linux/interrupt.h>
40 #include <linux/time.h>
41 #include <linux/delay.h>
42 #include <linux/init.h>
43 #include <linux/smp.h>
44 #include <linux/module.h>
45 #include <linux/sysdev.h>
46 #include <linux/bcd.h>
47 #include <linux/efi.h>
48 #include <linux/mca.h>
49 #include <linux/sysctl.h>
50 #include <linux/percpu.h>
51 #include <linux/kernel_stat.h>
52 #include <linux/posix-timers.h>
53
54 #include <asm/io.h>
55 #include <asm/smp.h>
56 #include <asm/irq.h>
57 #include <asm/msr.h>
58 #include <asm/delay.h>
59 #include <asm/mpspec.h>
60 #include <asm/uaccess.h>
61 #include <asm/processor.h>
62 #include <asm/timer.h>
63 #include <asm/sections.h>
64
65 #include "mach_time.h"
66
67 #include <linux/timex.h>
68 #include <linux/config.h>
69
70 #include <asm/hpet.h>
71
72 #include <asm/arch_hooks.h>
73
74 #include <xen/evtchn.h>
75 #include <xen/interface/vcpu.h>
76
77 #if defined (__i386__)
78 #include <asm/i8259.h>
79 #endif
80
81 int pit_latch_buggy;              /* extern */
82
83 #if defined(__x86_64__)
84 unsigned long vxtime_hz = PIT_TICK_RATE;
85 struct vxtime_data __vxtime __section_vxtime;   /* for vsyscalls */
86 volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
87 unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
88 struct timespec __xtime __section_xtime;
89 struct timezone __sys_tz __section_sys_tz;
90 #endif
91
92 unsigned int cpu_khz;   /* Detected as we calibrate the TSC */
93 EXPORT_SYMBOL(cpu_khz);
94
95 extern unsigned long wall_jiffies;
96
97 DEFINE_SPINLOCK(rtc_lock);
98 EXPORT_SYMBOL(rtc_lock);
99
100 #if defined (__i386__)
101 #include <asm/i8253.h>
102 #endif
103
104 DEFINE_SPINLOCK(i8253_lock);
105 EXPORT_SYMBOL(i8253_lock);
106
107 extern struct init_timer_opts timer_tsc_init;
108 extern struct timer_opts timer_tsc;
109 #define timer_none timer_tsc
110 struct timer_opts *cur_timer __read_mostly = &timer_tsc;
111
112 /* These are periodically updated in shared_info, and then copied here. */
113 struct shadow_time_info {
114         u64 tsc_timestamp;     /* TSC at last update of time vals.  */
115         u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
116         u32 tsc_to_nsec_mul;
117         u32 tsc_to_usec_mul;
118         int tsc_shift;
119         u32 version;
120 };
121 static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
122 static struct timespec shadow_tv;
123 static u32 shadow_tv_version;
124
125 /* Keep track of last time we did processing/updating of jiffies and xtime. */
126 static u64 processed_system_time;   /* System time (ns) at last processing. */
127 static DEFINE_PER_CPU(u64, processed_system_time);
128
129 /* How much CPU time was spent blocked and how much was 'stolen'? */
130 static DEFINE_PER_CPU(u64, processed_stolen_time);
131 static DEFINE_PER_CPU(u64, processed_blocked_time);
132
133 /* Current runstate of each CPU (updated automatically by the hypervisor). */
134 static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
135
136 /* Must be signed, as it's compared with s64 quantities which can be negative. */
137 #define NS_PER_TICK (1000000000LL/HZ)
138
139 static inline void __normalize_time(time_t *sec, s64 *nsec)
140 {
141         while (*nsec >= NSEC_PER_SEC) {
142                 (*nsec) -= NSEC_PER_SEC;
143                 (*sec)++;
144         }
145         while (*nsec < 0) {
146                 (*nsec) += NSEC_PER_SEC;
147                 (*sec)--;
148         }
149 }
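
/*
 * Worked example (illustrative numbers): after subtracting an offset one
 * might have sec = 10, nsec = -300000000; the second loop above turns
 * that into sec = 9, nsec = 700000000.
 */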
150
151 /* Does this guest OS track Xen time, or set its wall clock independently? */
152 static int independent_wallclock = 0;
153 static int __init __independent_wallclock(char *str)
154 {
155         independent_wallclock = 1;
156         return 1;
157 }
158 __setup("independent_wallclock", __independent_wallclock);
159
160 /* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
161 static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
162 static int __init __permitted_clock_jitter(char *str)
163 {
164         permitted_clock_jitter = simple_strtoul(str, NULL, 0);
165         return 1;
166 }
167 __setup("permitted_clock_jitter=", __permitted_clock_jitter);
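
/*
 * Usage examples (kernel command line; the values are illustrative):
 *
 *     independent_wallclock            - wall clock managed by this guest
 *     permitted_clock_jitter=5000000   - warn above 5 ms of jitter
 *
 * Both are parsed by the __setup() handlers above.
 */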
168
169 int tsc_disable __devinitdata = 0;
170
171 static void delay_tsc(unsigned long loops)
172 {
173         unsigned long bclock, now;
174
175         rdtscl(bclock);
176         do {
177                 rep_nop();
178                 rdtscl(now);
179         } while ((now - bclock) < loops);
180 }
181
182 struct timer_opts timer_tsc = {
183         .name = "tsc",
184         .delay = delay_tsc,
185 };
186
187 /*
188  * Scale a 64-bit delta by shifting it and multiplying by a 32-bit fraction,
189  * yielding a 64-bit result.
190  */
191 static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
192 {
193         u64 product;
194 #ifdef __i386__
195         u32 tmp1, tmp2;
196 #endif
197
198         if (shift < 0)
199                 delta >>= -shift;
200         else
201                 delta <<= shift;
202
203 #ifdef __i386__
204         __asm__ (
205                 "mul  %5       ; "
206                 "mov  %4,%%eax ; "
207                 "mov  %%edx,%4 ; "
208                 "mul  %5       ; "
209                 "xor  %5,%5    ; "
210                 "add  %4,%%eax ; "
211                 "adc  %5,%%edx ; "
212                 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
213                 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
214 #else
215         __asm__ (
216                 "mul %%rdx ; shrd $32,%%rdx,%%rax"
217                 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
218 #endif
219
220         return product;
221 }
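
/*
 * For reference, a portable C sketch of the computation above, equivalent
 * to the inline assembly (which exists to get a 64x32->96 bit multiply
 * without 128-bit support). The helper name is hypothetical and is not
 * used elsewhere in this file.
 */
static inline u64 scale_delta_portable(u64 delta, u32 mul_frac, int shift)
{
        u32 d_lo, d_hi;

        if (shift < 0)
                delta >>= -shift;
        else
                delta <<= shift;

        d_lo = (u32)delta;
        d_hi = (u32)(delta >> 32);

        /* (d_hi * 2^32 + d_lo) * mul_frac / 2^32, keeping the low 64 bits. */
        return (u64)d_hi * mul_frac + (((u64)d_lo * mul_frac) >> 32);
}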
222
223 #if defined (__i386__)
224 int read_current_timer(unsigned long *timer_val)
225 {
226         rdtscl(*timer_val);
227         return 0;
228 }
229 #endif
230
231 void init_cpu_khz(void)
232 {
233         u64 __cpu_khz = 1000000ULL << 32;
234         struct vcpu_time_info *info;
235         info = &HYPERVISOR_shared_info->vcpu_info[0].time;
236         do_div(__cpu_khz, info->tsc_to_system_mul);
237         if (info->tsc_shift < 0)
238                 cpu_khz = __cpu_khz << -info->tsc_shift;
239         else
240                 cpu_khz = __cpu_khz >> info->tsc_shift;
241 }
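
/*
 * Worked example (illustrative numbers): for a 2.4 GHz CPU Xen might
 * report tsc_shift = 0 and tsc_to_system_mul = 2^32 / 2.4 ~= 1789569706
 * (each TSC cycle is ~0.4167 ns), giving
 *
 *     (1000000 << 32) / 1789569706 ~= 2400000
 *
 * i.e. cpu_khz = 2400000, as expected.
 */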
242
243 static u64 get_nsec_offset(struct shadow_time_info *shadow)
244 {
245         u64 now, delta;
246         rdtscll(now);
247         delta = now - shadow->tsc_timestamp;
248         return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
249 }
250
251 static unsigned long get_usec_offset(struct shadow_time_info *shadow)
252 {
253         u64 now, delta;
254         rdtscll(now);
255         delta = now - shadow->tsc_timestamp;
256         return scale_delta(delta, shadow->tsc_to_usec_mul, shadow->tsc_shift);
257 }
258
259 static void __update_wallclock(time_t sec, long nsec)
260 {
261         long wtm_nsec, xtime_nsec;
262         time_t wtm_sec, xtime_sec;
263         u64 tmp, wc_nsec;
264
265         /* Adjust wall-clock time base based on wall_jiffies ticks. */
266         wc_nsec = processed_system_time;
267         wc_nsec += sec * (u64)NSEC_PER_SEC;
268         wc_nsec += nsec;
269         wc_nsec -= (jiffies - wall_jiffies) * (u64)NS_PER_TICK;
270
271         /* Split wallclock base into seconds and nanoseconds. */
272         tmp = wc_nsec;
273         xtime_nsec = do_div(tmp, 1000000000);
274         xtime_sec  = (time_t)tmp;
275
276         wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec);
277         wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec);
278
279         set_normalized_timespec(&xtime, xtime_sec, xtime_nsec);
280         set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
281
282         ntp_clear();
283 }
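
/*
 * Note on the do_div() use above: it divides the 64-bit value in place
 * (tmp becomes the quotient, here whole seconds) and returns the
 * remainder (here the leftover nanoseconds). E.g. wc_nsec = 2500000000
 * yields tmp = 2 and a return value of 500000000.
 */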
284
285 static void update_wallclock(void)
286 {
287         shared_info_t *s = HYPERVISOR_shared_info;
288
289         do {
290                 shadow_tv_version = s->wc_version;
291                 rmb();
292                 shadow_tv.tv_sec  = s->wc_sec;
293                 shadow_tv.tv_nsec = s->wc_nsec;
294                 rmb();
295         } while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));
296
297         if (!independent_wallclock)
298                 __update_wallclock(shadow_tv.tv_sec, shadow_tv.tv_nsec);
299 }
300
301 /*
302  * Reads a consistent set of time-base values from Xen into a shadow data
303  * area.
304  */
305 static void get_time_values_from_xen(void)
306 {
307         shared_info_t           *s = HYPERVISOR_shared_info;
308         struct vcpu_time_info   *src;
309         struct shadow_time_info *dst;
310
311         src = &s->vcpu_info[smp_processor_id()].time;
312         dst = &per_cpu(shadow_time, smp_processor_id());
313
314         do {
315                 dst->version = src->version;
316                 rmb();
317                 dst->tsc_timestamp     = src->tsc_timestamp;
318                 dst->system_timestamp  = src->system_time;
319                 dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
320                 dst->tsc_shift         = src->tsc_shift;
321                 rmb();
322         } while ((src->version & 1) | (dst->version ^ src->version));
323
324         dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
325 }
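
/*
 * The loop above is the reader half of a seqlock-style protocol; the
 * matching writer in the hypervisor is assumed to look like:
 *
 *     src->version++;        (now odd: update in progress)
 *     wmb();
 *     ... update tsc_timestamp / system_time / mul / shift ...
 *     wmb();
 *     src->version++;        (even again: update complete)
 *
 * Hence the retry test: an odd version means we read mid-update, and a
 * changed version means the fields were rewritten underneath us.
 */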
326
327 static inline int time_values_up_to_date(int cpu)
328 {
329         struct vcpu_time_info   *src;
330         struct shadow_time_info *dst;
331
332         src = &HYPERVISOR_shared_info->vcpu_info[cpu].time;
333         dst = &per_cpu(shadow_time, cpu);
334
335         rmb();
336         return (dst->version == src->version);
337 }
338
339 /*
340  * This is a special lock that is owned by the CPU and holds the index
341  * register we are working with.  It is required for NMI access to the
342  * CMOS/RTC registers.  See include/asm-i386/mc146818rtc.h for details.
343  */
344 volatile unsigned long cmos_lock = 0;
345 EXPORT_SYMBOL(cmos_lock);
346
347 /* Routines for accessing the CMOS RAM/RTC. */
348 unsigned char rtc_cmos_read(unsigned char addr)
349 {
350         unsigned char val;
351         lock_cmos_prefix(addr);
352         outb_p(addr, RTC_PORT(0));
353         val = inb_p(RTC_PORT(1));
354         lock_cmos_suffix(addr);
355         return val;
356 }
357 EXPORT_SYMBOL(rtc_cmos_read);
358
359 void rtc_cmos_write(unsigned char val, unsigned char addr)
360 {
361         lock_cmos_prefix(addr);
362         outb_p(addr, RTC_PORT(0));
363         outb_p(val, RTC_PORT(1));
364         lock_cmos_suffix(addr);
365 }
366 EXPORT_SYMBOL(rtc_cmos_write);
367
368 /*
369  * This version of gettimeofday has microsecond resolution
370  * and better than microsecond precision on fast x86 machines with TSC.
371  */
372 void do_gettimeofday(struct timeval *tv)
373 {
374         unsigned long seq;
375         unsigned long usec, sec;
376         unsigned long max_ntp_tick;
377         s64 nsec;
378         unsigned int cpu;
379         struct shadow_time_info *shadow;
380         u32 local_time_version;
381
382         cpu = get_cpu();
383         shadow = &per_cpu(shadow_time, cpu);
384
385         do {
386                 unsigned long lost;
387
388                 local_time_version = shadow->version;
389                 seq = read_seqbegin(&xtime_lock);
390
391                 usec = get_usec_offset(shadow);
392                 lost = jiffies - wall_jiffies;
393
394                 /*
395                  * If time_adjust is negative then NTP is slowing the clock
396                  * so make sure not to go into the next possible interval.
397                  * Better to lose some accuracy than have time go backwards.
398                  */
399                 if (unlikely(time_adjust < 0)) {
400                         max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj;
401                         usec = min(usec, max_ntp_tick);
402
403                         if (lost)
404                                 usec += lost * max_ntp_tick;
405                 }
406                 else if (unlikely(lost))
407                         usec += lost * (USEC_PER_SEC / HZ);
408
409                 sec = xtime.tv_sec;
410                 usec += (xtime.tv_nsec / NSEC_PER_USEC);
411
412                 nsec = shadow->system_timestamp - processed_system_time;
413                 __normalize_time(&sec, &nsec);
414                 usec += (long)nsec / NSEC_PER_USEC;
415
416                 if (unlikely(!time_values_up_to_date(cpu))) {
417                         /*
418                          * We may have blocked for a long time,
419                          * rendering our calculations invalid
420                          * (e.g. the time delta may have
421                          * overflowed). Detect that and recalculate
422                          * with fresh values.
423                          */
424                         get_time_values_from_xen();
425                         continue;
426                 }
427         } while (read_seqretry(&xtime_lock, seq) ||
428                  (local_time_version != shadow->version));
429
430         put_cpu();
431
432         while (usec >= USEC_PER_SEC) {
433                 usec -= USEC_PER_SEC;
434                 sec++;
435         }
436
437         tv->tv_sec = sec;
438         tv->tv_usec = usec;
439 }
440
441 EXPORT_SYMBOL(do_gettimeofday);
442
443 int do_settimeofday(struct timespec *tv)
444 {
445         time_t sec;
446         s64 nsec;
447         unsigned int cpu;
448         struct shadow_time_info *shadow;
449         dom0_op_t op;
450
451         if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
452                 return -EINVAL;
453
454         cpu = get_cpu();
455         shadow = &per_cpu(shadow_time, cpu);
456
457         write_seqlock_irq(&xtime_lock);
458
459         /*
460          * Guard against being blocked for so long that our time delta
461          * overflows. If that happens, our shadow time values are stale,
462          * so retry with fresh ones.
463          */
464         for (;;) {
465                 nsec = tv->tv_nsec - get_nsec_offset(shadow);
466                 if (time_values_up_to_date(cpu))
467                         break;
468                 get_time_values_from_xen();
469         }
470         sec = tv->tv_sec;
471         __normalize_time(&sec, &nsec);
472
473         if ((xen_start_info->flags & SIF_INITDOMAIN) &&
474             !independent_wallclock) {
475                 op.cmd = DOM0_SETTIME;
476                 op.u.settime.secs        = sec;
477                 op.u.settime.nsecs       = nsec;
478                 op.u.settime.system_time = shadow->system_timestamp;
479                 HYPERVISOR_dom0_op(&op);
480                 update_wallclock();
481         } else if (independent_wallclock) {
482                 nsec -= shadow->system_timestamp;
483                 __normalize_time(&sec, &nsec);
484                 __update_wallclock(sec, nsec);
485         }
486
487         write_sequnlock_irq(&xtime_lock);
488
489         put_cpu();
490
491         clock_was_set();
492         return 0;
493 }
494
495 EXPORT_SYMBOL(do_settimeofday);
496
497 static void sync_xen_wallclock(unsigned long dummy);
498 static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
499 static void sync_xen_wallclock(unsigned long dummy)
500 {
501         time_t sec;
502         s64 nsec;
503         dom0_op_t op;
504
505         if (!ntp_synced() || independent_wallclock ||
506             !(xen_start_info->flags & SIF_INITDOMAIN))
507                 return;
508
509         write_seqlock_irq(&xtime_lock);
510
511         sec  = xtime.tv_sec;
512         nsec = xtime.tv_nsec + ((jiffies - wall_jiffies) * (u64)NS_PER_TICK);
513         __normalize_time(&sec, &nsec);
514
515         op.cmd = DOM0_SETTIME;
516         op.u.settime.secs        = sec;
517         op.u.settime.nsecs       = nsec;
518         op.u.settime.system_time = processed_system_time;
519         HYPERVISOR_dom0_op(&op);
520
521         update_wallclock();
522
523         write_sequnlock_irq(&xtime_lock);
524
525         /* Once per minute. */
526         mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
527 }
528
529 static int set_rtc_mmss(unsigned long nowtime)
530 {
531         int retval;
532
533         WARN_ON(irqs_disabled());
534
535         if (independent_wallclock || !(xen_start_info->flags & SIF_INITDOMAIN))
536                 return 0;
537
538         /* may be called again with interrupts locally disabled */
539         spin_lock_irq(&rtc_lock);
540         if (efi_enabled)
541                 retval = efi_set_rtc_mmss(nowtime);
542         else
543                 retval = mach_set_rtc_mmss(nowtime);
544         spin_unlock_irq(&rtc_lock);
545
546         return retval;
547 }
548
549 /* monotonic_clock(): returns # of nanoseconds passed since time_init()
550  *              Note: This function is required to return accurate
551  *              time even in the absence of multiple timer ticks.
552  */
553 unsigned long long monotonic_clock(void)
554 {
555         int cpu = get_cpu();
556         struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
557         u64 time;
558         u32 local_time_version;
559
560         do {
561                 local_time_version = shadow->version;
562                 barrier();
563                 time = shadow->system_timestamp + get_nsec_offset(shadow);
564                 if (!time_values_up_to_date(cpu))
565                         get_time_values_from_xen();
566                 barrier();
567         } while (local_time_version != shadow->version);
568
569         put_cpu();
570
571         return time;
572 }
573 EXPORT_SYMBOL(monotonic_clock);
574
575 unsigned long long sched_clock(void)
576 {
577         return monotonic_clock();
578 }
579
580 #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
581 unsigned long profile_pc(struct pt_regs *regs)
582 {
583         unsigned long pc = instruction_pointer(regs);
584
585 #ifdef __x86_64__
586         /* Assume the lock function has either no stack frame or only a single word.
587            This checks if the address on the stack looks like a kernel text address.
588            There is a small window for false hits, but in that case the tick
589            is just accounted to the spinlock function.
590            Better would be to write these functions in assembler again
591            and check exactly. */
592         if (in_lock_functions(pc)) {
593                 char *v = *(char **)regs->rsp;
594                 if ((v >= _stext && v <= _etext) ||
595                         (v >= _sinittext && v <= _einittext) ||
596                         (v >= (char *)MODULES_VADDR  && v <= (char *)MODULES_END))
597                         return (unsigned long)v;
598                 return ((unsigned long *)regs->rsp)[1];
599         }
600 #else
601         if (in_lock_functions(pc))
602                 return *(unsigned long *)(regs->ebp + 4);
603 #endif
604
605         return pc;
606 }
607 EXPORT_SYMBOL(profile_pc);
608 #endif
609
610 irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
611 {
612         s64 delta, delta_cpu, stolen, blocked;
613         u64 sched_time;
614         int i, cpu = smp_processor_id();
615         struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
616         struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
617
618         write_seqlock(&xtime_lock);
619
620         do {
621                 get_time_values_from_xen();
622
623                 /* Obtain a consistent snapshot of elapsed wallclock cycles. */
624                 delta = delta_cpu =
625                         shadow->system_timestamp + get_nsec_offset(shadow);
626                 delta     -= processed_system_time;
627                 delta_cpu -= per_cpu(processed_system_time, cpu);
628
629                 /*
630                  * Obtain a consistent snapshot of stolen/blocked cycles. We
631                  * can use state_entry_time to detect if we get preempted here.
632                  */
633                 do {
634                         sched_time = runstate->state_entry_time;
635                         barrier();
636                         stolen = runstate->time[RUNSTATE_runnable] +
637                                 runstate->time[RUNSTATE_offline] -
638                                 per_cpu(processed_stolen_time, cpu);
639                         blocked = runstate->time[RUNSTATE_blocked] -
640                                 per_cpu(processed_blocked_time, cpu);
641                         barrier();
642                 } while (sched_time != runstate->state_entry_time);
643         } while (!time_values_up_to_date(cpu));
644
645         if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
646              unlikely(delta_cpu < -(s64)permitted_clock_jitter))
647             && printk_ratelimit()) {
648                 printk("Timer ISR/%d: Time went backwards: "
649                        "delta=%lld delta_cpu=%lld shadow=%lld "
650                        "off=%lld processed=%lld cpu_processed=%lld\n",
651                        cpu, delta, delta_cpu, shadow->system_timestamp,
652                        (s64)get_nsec_offset(shadow),
653                        processed_system_time,
654                        per_cpu(processed_system_time, cpu));
655                 for (i = 0; i < num_online_cpus(); i++)
656                         printk(" %d: %lld\n", i,
657                                per_cpu(processed_system_time, i));
658         }
659
660         /* System-wide jiffy work. */
661         while (delta >= NS_PER_TICK) {
662                 delta -= NS_PER_TICK;
663                 processed_system_time += NS_PER_TICK;
664                 do_timer(regs);
665         }
666
667         if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
668                 update_wallclock();
669                 clock_was_set();
670         }
671
672         write_sequnlock(&xtime_lock);
673
674         /*
675          * Account stolen ticks.
676          * HACK: Passing NULL to account_steal_time()
677          * ensures that the ticks are accounted as stolen.
678          */
679         if ((stolen > 0) && (delta_cpu > 0)) {
680                 delta_cpu -= stolen;
681                 if (unlikely(delta_cpu < 0))
682                         stolen += delta_cpu; /* clamp local-time progress */
683                 do_div(stolen, NS_PER_TICK);
684                 per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
685                 per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
686                 account_steal_time(NULL, (cputime_t)stolen);
687         }
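
        /*
         * Worked example (illustrative, HZ=100 so NS_PER_TICK = 10ms):
         * with stolen = 25ms and delta_cpu = 40ms, delta_cpu drops to
         * 15ms, do_div() turns stolen into 2 whole ticks, both
         * processed_* counters advance by 20ms, and the 5ms remainder
         * is picked up by a later interrupt.
         */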
688
689         /*
690          * Account blocked ticks.
691          * HACK: Passing idle_task to account_steal_time()
692          * ensures that the ticks are accounted as idle/wait.
693          */
694         if ((blocked > 0) && (delta_cpu > 0)) {
695                 delta_cpu -= blocked;
696                 if (unlikely(delta_cpu < 0))
697                         blocked += delta_cpu; /* clamp local-time progress */
698                 do_div(blocked, NS_PER_TICK);
699                 per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
700                 per_cpu(processed_system_time, cpu)  += blocked * NS_PER_TICK;
701                 account_steal_time(idle_task(cpu), (cputime_t)blocked);
702         }
703
704         /* Account user/system ticks. */
705         if (delta_cpu > 0) {
706                 do_div(delta_cpu, NS_PER_TICK);
707                 per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
708                 if (user_mode(regs))
709                         account_user_time(current, (cputime_t)delta_cpu);
710                 else
711                         account_system_time(current, HARDIRQ_OFFSET,
712                                             (cputime_t)delta_cpu);
713         }
714
715         /* Local timer processing (see update_process_times()). */
716         run_local_timers();
717         if (rcu_pending(cpu))
718                 rcu_check_callbacks(cpu, user_mode(regs));
719         scheduler_tick();
720         run_posix_cpu_timers(current);
721
722         return IRQ_HANDLED;
723 }
724
725 static void init_missing_ticks_accounting(int cpu)
726 {
727         struct vcpu_register_runstate_memory_area area;
728         struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
729
730         memset(runstate, 0, sizeof(*runstate));
731
732         area.addr.v = runstate;
733         HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
734
735         per_cpu(processed_blocked_time, cpu) =
736                 runstate->time[RUNSTATE_blocked];
737         per_cpu(processed_stolen_time, cpu) =
738                 runstate->time[RUNSTATE_runnable] +
739                 runstate->time[RUNSTATE_offline];
740 }
741
742 /* not static: needed by APM */
743 unsigned long get_cmos_time(void)
744 {
745         unsigned long retval;
746
747         spin_lock(&rtc_lock);
748
749         if (efi_enabled)
750                 retval = efi_get_time();
751         else
752                 retval = mach_get_cmos_time();
753
754         spin_unlock(&rtc_lock);
755
756         return retval;
757 }
758 EXPORT_SYMBOL(get_cmos_time);
759
760 static void sync_cmos_clock(unsigned long dummy);
761
762 static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
763
764 static void sync_cmos_clock(unsigned long dummy)
765 {
766         struct timeval now, next;
767         int fail = 1;
768
769         /*
770          * If we have an externally synchronized Linux clock, then update
771          * CMOS clock accordingly every ~11 minutes. set_rtc_mmss() has to be
772          * called as close as possible to 500 ms before the new second starts.
773          * This code is run on a timer.  If the clock is set, that timer
774          * may not expire at the correct time.  Thus, we adjust...
775          */
776         if (!ntp_synced())
777                 /*
778                  * Not synced, exit, do not restart a timer (if one is
779                  * running, let it run out).
780                  */
781                 return;
782
783         do_gettimeofday(&now);
784         if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
785             now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
786                 fail = set_rtc_mmss(now.tv_sec);
787
788         next.tv_usec = USEC_AFTER - now.tv_usec;
789         if (next.tv_usec <= 0)
790                 next.tv_usec += USEC_PER_SEC;
791
792         if (!fail)
793                 next.tv_sec = 659;
794         else
795                 next.tv_sec = 0;
796
797         if (next.tv_usec >= USEC_PER_SEC) {
798                 next.tv_sec++;
799                 next.tv_usec -= USEC_PER_SEC;
800         }
801         mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
802 }
803
804 void notify_arch_cmos_timer(void)
805 {
806         mod_timer(&sync_cmos_timer, jiffies + 1);
807         mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
808 }
809
810 static long clock_cmos_diff, sleep_start;
811
812 static struct timer_opts *last_timer;
813 static int timer_suspend(struct sys_device *dev, pm_message_t state)
814 {
815         /*
816          * Record the system/CMOS time offset so that timer_resume() can restore the clock
817          */
818         clock_cmos_diff = -get_cmos_time();
819         clock_cmos_diff += get_seconds();
820         sleep_start = get_cmos_time();
821         last_timer = cur_timer;
822         cur_timer = &timer_none;
823         if (last_timer->suspend)
824                 last_timer->suspend(state);
825         return 0;
826 }
827
828 static int timer_resume(struct sys_device *dev)
829 {
830         unsigned long flags;
831         unsigned long sec;
832         unsigned long sleep_length;
833
834 #ifdef CONFIG_HPET_TIMER
835         if (is_hpet_enabled())
836                 hpet_reenable();
837 #endif
838         sec = get_cmos_time() + clock_cmos_diff;
839         sleep_length = (get_cmos_time() - sleep_start) * HZ;
840         write_seqlock_irqsave(&xtime_lock, flags);
841         xtime.tv_sec = sec;
842         xtime.tv_nsec = 0;
843         jiffies_64 += sleep_length;
844         wall_jiffies += sleep_length;
845         write_sequnlock_irqrestore(&xtime_lock, flags);
846         if (last_timer->resume)
847                 last_timer->resume();
848         cur_timer = last_timer;
849         last_timer = NULL;
850         touch_softlockup_watchdog();
851         return 0;
852 }
853
854 static struct sysdev_class timer_sysclass = {
855         .resume = timer_resume,
856         .suspend = timer_suspend,
857         set_kset_name("timer"),
858 };
859
860
861 /* XXX this driverfs stuff should probably go elsewhere later -john */
862 static struct sys_device device_timer = {
863         .id     = 0,
864         .cls    = &timer_sysclass,
865 };
866
867 static int time_init_device(void)
868 {
869         int error = sysdev_class_register(&timer_sysclass);
870         if (!error)
871                 error = sysdev_register(&device_timer);
872         return error;
873 }
874
875 device_initcall(time_init_device);
876
877 #ifdef CONFIG_HPET_TIMER
878 extern void (*late_time_init)(void);
879 /* Duplicate of time_init() below, with hpet_enable part added */
880 static void __init hpet_time_init(void)
881 {
882         xtime.tv_sec = get_cmos_time();
883         xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
884         set_normalized_timespec(&wall_to_monotonic,
885                 -xtime.tv_sec, -xtime.tv_nsec);
886
887         if ((hpet_enable() >= 0) && hpet_use_timer) {
888                 printk("Using HPET for base-timer\n");
889         }
890
891         cur_timer = select_timer();
892         printk(KERN_INFO "Using %s for high-res timesource\n", cur_timer->name);
893
894         time_init_hook();
895 }
896 #endif
897
898 /* Dynamically-mapped IRQ. */
899 DEFINE_PER_CPU(int, timer_irq);
900
901 extern void (*late_time_init)(void);
902 static void setup_cpu0_timer_irq(void)
903 {
904         per_cpu(timer_irq, 0) =
905                 bind_virq_to_irqhandler(
906                         VIRQ_TIMER,
907                         0,
908                         timer_interrupt,
909                         SA_INTERRUPT,
910                         "timer0",
911                         NULL);
912         BUG_ON(per_cpu(timer_irq, 0) < 0);
913 }
914
915 void __init time_init(void)
916 {
917 #ifdef CONFIG_HPET_TIMER
918         if (is_hpet_capable()) {
919                 /*
920                  * HPET initialization needs to do memory-mapped io. So, let
921                  * us do a late initialization after mem_init().
922                  */
923                 late_time_init = hpet_time_init;
924                 return;
925         }
926 #endif
927         get_time_values_from_xen();
928
929         processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
930         per_cpu(processed_system_time, 0) = processed_system_time;
931         init_missing_ticks_accounting(0);
932
933         update_wallclock();
934
935         init_cpu_khz();
936         printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
937                cpu_khz / 1000, cpu_khz % 1000);
938
939 #if defined(__x86_64__)
940         vxtime.mode = VXTIME_TSC;
941         vxtime.quot = (1000000L << 32) / vxtime_hz;
942         vxtime.tsc_quot = (1000L << 32) / cpu_khz;
943         sync_core();
944         rdtscll(vxtime.last_tsc);
945 #endif
946
947         /* Cannot request_irq() until kmem is initialised. */
948         late_time_init = setup_cpu0_timer_irq;
949 }
950
951 /* Convert jiffies to system time. */
952 u64 jiffies_to_st(unsigned long j)
953 {
954         unsigned long seq;
955         long delta;
956         u64 st;
957
958         do {
959                 seq = read_seqbegin(&xtime_lock);
960                 delta = j - jiffies;
961                 if (delta < 1) {
962                         /* Triggers in some wrap-around cases, but that's okay:
963                          * we just end up with a shorter timeout. */
964                         st = processed_system_time + NS_PER_TICK;
965                 } else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) {
966                         /* Very long timeout means there is no pending timer.
967                          * We indicate this to Xen by passing zero timeout. */
968                         st = 0;
969                 } else {
970                         st = processed_system_time + delta * (u64)NS_PER_TICK;
971                 }
972         } while (read_seqretry(&xtime_lock, seq));
973
974         return st;
975 }
976 EXPORT_SYMBOL(jiffies_to_st);
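
/*
 * Example (illustrative, HZ=100): a timer due 5 jiffies from now maps to
 * st = processed_system_time + 5 * NS_PER_TICK, i.e. 50ms into the
 * future in Xen system time; stop_hz_timer() below feeds this value to
 * HYPERVISOR_set_timer_op() to program a one-shot wakeup.
 */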
977
978 /*
979  * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
980  * These functions are based on implementations from arch/s390/kernel/time.c
981  */
982 static void stop_hz_timer(void)
983 {
984         unsigned int cpu = smp_processor_id();
985         unsigned long j;
986
987         cpu_set(cpu, nohz_cpu_mask);
988
989         /* See matching smp_mb in rcu_start_batch in rcupdate.c.  These mbs  */
990         /* ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a  */
991         /* value of rcp->cur that matches rdp->quiescbatch and allows us to  */
992         /* stop the hz timer then the cpumasks created for subsequent values */
993         /* of cur in rcu_start_batch are guaranteed to pick up the updated   */
994         /* nohz_cpu_mask and so will not depend on this cpu.                 */
995
996         smp_mb();
997
998         /* Leave ourselves in tick mode if rcu or softirq or timer pending. */
999         if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
1000             (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
1001                 cpu_clear(cpu, nohz_cpu_mask);
1002                 j = jiffies + 1;
1003         }
1004
1005         if (HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0)
1006                 BUG();
1007 }
1008
1009 static void start_hz_timer(void)
1010 {
1011         cpu_clear(smp_processor_id(), nohz_cpu_mask);
1012 }
1013
1014 void safe_halt(void)
1015 {
1016         stop_hz_timer();
1017         /* Blocking includes an implicit local_irq_enable(). */
1018         HYPERVISOR_block();
1019         start_hz_timer();
1020 }
1021 EXPORT_SYMBOL(safe_halt);
1022
1023 void halt(void)
1024 {
1025         if (irqs_disabled())
1026                 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
1027 }
1028 EXPORT_SYMBOL(halt);
1029
1030 /* No locking required. We are the only CPU running, and interrupts are off. */
1031 void time_resume(void)
1032 {
1033         init_cpu_khz();
1034
1035         get_time_values_from_xen();
1036
1037         processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
1038         per_cpu(processed_system_time, 0) = processed_system_time;
1039         init_missing_ticks_accounting(0);
1040
1041         update_wallclock();
1042 }
1043
1044 #ifdef CONFIG_SMP
1045 static char timer_name[NR_CPUS][15];
1046
1047 void local_setup_timer(unsigned int cpu)
1048 {
1049         int seq;
1050
1051         BUG_ON(cpu == 0);
1052
1053         do {
1054                 seq = read_seqbegin(&xtime_lock);
1055                 /* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
1056                 per_cpu(processed_system_time, cpu) =
1057                         per_cpu(shadow_time, 0).system_timestamp;
1058                 init_missing_ticks_accounting(cpu);
1059         } while (read_seqretry(&xtime_lock, seq));
1060
1061         sprintf(timer_name[cpu], "timer%d", cpu);
1062         per_cpu(timer_irq, cpu) =
1063                 bind_virq_to_irqhandler(
1064                         VIRQ_TIMER,
1065                         cpu,
1066                         timer_interrupt,
1067                         SA_INTERRUPT,
1068                         timer_name[cpu],
1069                         NULL);
1070         BUG_ON(per_cpu(timer_irq, cpu) < 0);
1071 }
1072
1073 void local_teardown_timer(unsigned int cpu)
1074 {
1075         BUG_ON(cpu == 0);
1076         unbind_from_irqhandler(per_cpu(timer_irq, cpu), NULL);
1077 }
1078 #endif
1079
1080 /*
1081  * /proc/sys/xen: This really belongs in another file. It can stay here for
1082  * now however.
1083  */
1084 static ctl_table xen_subtable[] = {
1085         {
1086                 .ctl_name       = 1,
1087                 .procname       = "independent_wallclock",
1088                 .data           = &independent_wallclock,
1089                 .maxlen         = sizeof(independent_wallclock),
1090                 .mode           = 0644,
1091                 .proc_handler   = proc_dointvec
1092         },
1093         {
1094                 .ctl_name       = 2,
1095                 .procname       = "permitted_clock_jitter",
1096                 .data           = &permitted_clock_jitter,
1097                 .maxlen         = sizeof(permitted_clock_jitter),
1098                 .mode           = 0644,
1099                 .proc_handler   = proc_doulongvec_minmax
1100         },
1101         { 0 }
1102 };
1103 static ctl_table xen_table[] = {
1104         {
1105                 .ctl_name       = 123,
1106                 .procname       = "xen",
1107                 .mode           = 0555,
1108                 .child          = xen_subtable },
1109         { 0 }
1110 };
1111 static int __init xen_sysctl_init(void)
1112 {
1113         (void)register_sysctl_table(xen_table, 0);
1114         return 0;
1115 }
1116 __initcall(xen_sysctl_init);
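
/*
 * Userspace example (assumes the sysctl table above has been registered):
 *
 *     # cat /proc/sys/xen/independent_wallclock
 *     0
 *     # echo 1 > /proc/sys/xen/independent_wallclock
 *     # echo 5000000 > /proc/sys/xen/permitted_clock_jitter
 */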