/*
 *  linux/arch/i386/kernel/time.c
 *
 *  Copyright (C) 1991, 1992, 1995  Linus Torvalds
 *
 * This file contains the PC-specific time handling details:
 * reading the RTC at bootup, etc.
 * 1994-07-02    Alan Modra
 *      fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
 * 1995-03-26    Markus Kuhn
 *      fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
 *      precision CMOS clock update
 * 1996-05-03    Ingo Molnar
 *      fixed time warps in do_[slow|fast]_gettimeoffset()
 * 1997-09-10   Updated NTP code according to technical memorandum Jan '96
 *              "A Kernel Model for Precision Timekeeping" by Dave Mills
 * 1998-09-05    (Various)
 *      More robust do_fast_gettimeoffset() algorithm implemented
 *      (works with APM, Cyrix 6x86MX and Centaur C6),
 *      monotonic gettimeofday() with fast_get_timeoffset(),
 *      drift-proof precision TSC calibration on boot
 *      (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
 *      Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
 *      ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
 * 1998-12-16    Andrea Arcangeli
 *      Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
 *      because it was not accounting lost_ticks.
 * 1998-12-24 Copyright (C) 1998  Andrea Arcangeli
 *      Fixed an xtime SMP race (we need the xtime_lock rw spinlock to
 *      serialize accesses to xtime/lost_ticks).
 */

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/param.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/time.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/module.h>
#include <linux/sysdev.h>
#include <linux/bcd.h>
#include <linux/efi.h>
#include <linux/mca.h>
#include <linux/sysctl.h>
#include <linux/percpu.h>
#include <linux/kernel_stat.h>
#include <linux/posix-timers.h>

#include <asm/io.h>
#include <asm/smp.h>
#include <asm/irq.h>
#include <asm/msr.h>
#include <asm/delay.h>
#include <asm/mpspec.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
#include <asm/timer.h>
#include <asm/sections.h>

#include "mach_time.h"

#include <linux/timex.h>

#include <asm/hpet.h>

#include <asm/arch_hooks.h>

#include <xen/evtchn.h>
#include <xen/interface/vcpu.h>

#if defined(__i386__)
#include <asm/i8259.h>
#endif

int pit_latch_buggy;              /* extern */

#if defined(__x86_64__)
unsigned long vxtime_hz = PIT_TICK_RATE;
struct vxtime_data __vxtime __section_vxtime;   /* for vsyscalls */
volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
struct timespec __xtime __section_xtime;
struct timezone __sys_tz __section_sys_tz;
#endif

#define USEC_PER_TICK (USEC_PER_SEC / HZ)
#define NSEC_PER_TICK (NSEC_PER_SEC / HZ)
#define FSEC_PER_TICK (FSEC_PER_SEC / HZ)

#define NS_SCALE        10 /* 2^10, carefully chosen */
#define US_SCALE        32 /* 2^32, arbitrarily chosen */

unsigned int cpu_khz;   /* Detected as we calibrate the TSC */
EXPORT_SYMBOL(cpu_khz);

extern unsigned long wall_jiffies;

DEFINE_SPINLOCK(rtc_lock);
EXPORT_SYMBOL(rtc_lock);

extern struct init_timer_opts timer_tsc_init;
extern struct timer_opts timer_tsc;
#define timer_none timer_tsc

/* These are periodically updated in shared_info, and then copied here. */
struct shadow_time_info {
        u64 tsc_timestamp;     /* TSC at last update of time vals.  */
        u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
        u32 tsc_to_nsec_mul;
        u32 tsc_to_usec_mul;
        int tsc_shift;
        u32 version;
};
static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
static struct timespec shadow_tv;
static u32 shadow_tv_version;

/* Keep track of last time we did processing/updating of jiffies and xtime. */
static u64 processed_system_time;   /* System time (ns) at last processing. */
static DEFINE_PER_CPU(u64, processed_system_time);

/* How much CPU time was spent blocked and how much was 'stolen'? */
static DEFINE_PER_CPU(u64, processed_stolen_time);
static DEFINE_PER_CPU(u64, processed_blocked_time);

/* Current runstate of each CPU (updated automatically by the hypervisor). */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);

/* Must be signed, as it's compared with s64 quantities which can be negative. */
#define NS_PER_TICK (1000000000LL/HZ)
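/* With HZ=100 this is 10,000,000 ns per tick; with HZ=250 it is 4,000,000. */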

static inline void __normalize_time(time_t *sec, s64 *nsec)
{
        while (*nsec >= NSEC_PER_SEC) {
                (*nsec) -= NSEC_PER_SEC;
                (*sec)++;
        }
        while (*nsec < 0) {
                (*nsec) += NSEC_PER_SEC;
                (*sec)--;
        }
}

/* Does this guest OS track Xen time, or set its wall clock independently? */
static int independent_wallclock = 0;
static int __init __independent_wallclock(char *str)
{
        independent_wallclock = 1;
        return 1;
}
__setup("independent_wallclock", __independent_wallclock);

/* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
static int __init __permitted_clock_jitter(char *str)
{
        permitted_clock_jitter = simple_strtoul(str, NULL, 0);
        return 1;
}
__setup("permitted_clock_jitter=", __permitted_clock_jitter);
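/*
 * Both options are kernel command-line parameters; for example (values
 * illustrative only) booting with "independent_wallclock
 * permitted_clock_jitter=20000000" decouples this guest's wall clock from
 * Xen and raises the jitter warning threshold to 20 ms.
 */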

#ifndef CONFIG_X86
int tsc_disable __devinitdata = 0;
#endif

static void delay_tsc(unsigned long loops)
{
        unsigned long bclock, now;

        rdtscl(bclock);
        do {
                rep_nop();
                rdtscl(now);
        } while ((now - bclock) < loops);
}

struct timer_opts timer_tsc = {
        .name = "tsc",
        .delay = delay_tsc,
};

/*
 * Scale a 64-bit delta by shifting and multiplying by a 32-bit fraction,
 * yielding a 64-bit result.
 */
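/*
 * Equivalent to ((delta << shift) * mul_frac) >> 32, where mul_frac is a
 * 0.32 fixed-point fraction. The i386 version below composes the result
 * from two 32x32->64 multiplies to avoid a 128-bit intermediate; x86_64
 * gets it from a single 64x64 MUL. E.g. mul_frac = 0x80000000 (0.5) and
 * shift = 0 scale a delta of 1000 cycles to 500 ns.
 */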
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
        u64 product;
#ifdef __i386__
        u32 tmp1, tmp2;
#endif

        if (shift < 0)
                delta >>= -shift;
        else
                delta <<= shift;

#ifdef __i386__
        __asm__ (
                "mul  %5       ; "
                "mov  %4,%%eax ; "
                "mov  %%edx,%4 ; "
                "mul  %5       ; "
                "xor  %5,%5    ; "
                "add  %4,%%eax ; "
                "adc  %5,%%edx ; "
                : "=A" (product), "=r" (tmp1), "=r" (tmp2)
                : "a" ((u32)delta), "1" ((u32)(delta >> US_SCALE)), "2" (mul_frac) );
#else
        __asm__ (
                "mul %%rdx ; shrd $32,%%rdx,%%rax"
                : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
#endif

        return product;
}

#if defined(__i386__)
int read_current_timer(unsigned long *timer_val)
{
        rdtscl(*timer_val);
        return 0;
}
#endif

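/*
 * Xen exports per-VCPU (tsc_to_system_mul, tsc_shift) such that
 *   ns = ((tsc << tsc_shift) * tsc_to_system_mul) >> 32.
 * Inverting that relation gives the TSC frequency, which is what
 * init_cpu_khz() computes below: (10^6 << 32) / tsc_to_system_mul,
 * adjusted by tsc_shift.
 */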
void init_cpu_khz(void)
{
        u64 __cpu_khz = 1000000ULL << US_SCALE;
        struct vcpu_time_info *info;
        info = &HYPERVISOR_shared_info->vcpu_info[0].time;
        do_div(__cpu_khz, info->tsc_to_system_mul);
        if (info->tsc_shift < 0)
                cpu_khz = __cpu_khz << -info->tsc_shift;
        else
                cpu_khz = __cpu_khz >> info->tsc_shift;
}

static u64 get_nsec_offset(struct shadow_time_info *shadow)
{
        u64 now, delta;
        rdtscll(now);
        delta = now - shadow->tsc_timestamp;
        return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
}

static unsigned long get_usec_offset(struct shadow_time_info *shadow)
{
        u64 now, delta;
        rdtscll(now);
        delta = now - shadow->tsc_timestamp;
        return scale_delta(delta, shadow->tsc_to_usec_mul, shadow->tsc_shift);
}

static void __update_wallclock(time_t sec, long nsec)
{
        long wtm_nsec, xtime_nsec;
        time_t wtm_sec, xtime_sec;
        u64 tmp, wc_nsec;

        /* Adjust wall-clock time base based on wall_jiffies ticks. */
        wc_nsec = processed_system_time;
        wc_nsec += sec * (u64)NSEC_PER_SEC;
        wc_nsec += nsec;
        wc_nsec -= (jiffies - wall_jiffies) * (u64)NS_PER_TICK;

        /* Split wallclock base into seconds and nanoseconds. */
        tmp = wc_nsec;
        xtime_nsec = do_div(tmp, 1000000000);
        xtime_sec  = (time_t)tmp;

        wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec);
        wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec);

        set_normalized_timespec(&xtime, xtime_sec, xtime_nsec);
        set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);

        ntp_clear();
}

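/*
 * Xen increments wc_version before and after updating the wallclock
 * fields, so an odd value means an update is in flight and a changed
 * value means we raced with one; retry in either case, as with a
 * seqlock reader.
 */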
static void update_wallclock(void)
{
        shared_info_t *s = HYPERVISOR_shared_info;

        do {
                shadow_tv_version = s->wc_version;
                rmb();
                shadow_tv.tv_sec  = s->wc_sec;
                shadow_tv.tv_nsec = s->wc_nsec;
                rmb();
        } while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));

        if (!independent_wallclock)
                __update_wallclock(shadow_tv.tv_sec, shadow_tv.tv_nsec);
}

/*
 * Reads a consistent set of time-base values from Xen, into a shadow data
 * area.
 */
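/* The copy loop below uses the same odd/even version-retry protocol,
 * applied to the per-VCPU vcpu_time_info version field. */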
static void get_time_values_from_xen(void)
{
        shared_info_t           *s = HYPERVISOR_shared_info;
        struct vcpu_time_info   *src;
        struct shadow_time_info *dst;

        src = &s->vcpu_info[smp_processor_id()].time;
        dst = &per_cpu(shadow_time, smp_processor_id());

        do {
                dst->version = src->version;
                rmb();
                dst->tsc_timestamp     = src->tsc_timestamp;
                dst->system_timestamp  = src->system_time;
                dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
                dst->tsc_shift         = src->tsc_shift;
                rmb();
        } while ((src->version & 1) | (dst->version ^ src->version));

        dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
}

static inline int time_values_up_to_date(int cpu)
{
        struct vcpu_time_info   *src;
        struct shadow_time_info *dst;

        src = &HYPERVISOR_shared_info->vcpu_info[cpu].time;
        dst = &per_cpu(shadow_time, cpu);

        rmb();
        return (dst->version == src->version);
}

/*
 * This is a special lock that is owned by the CPU and holds the index
 * register we are working with.  It is required for NMI access to the
 * CMOS/RTC registers.  See include/asm-i386/mc146818rtc.h for details.
 */
volatile unsigned long cmos_lock = 0;
EXPORT_SYMBOL(cmos_lock);

/* Routines for accessing the CMOS RAM/RTC. */
unsigned char rtc_cmos_read(unsigned char addr)
{
        unsigned char val;
        lock_cmos_prefix(addr);
        outb_p(addr, RTC_PORT(0));
        val = inb_p(RTC_PORT(1));
        lock_cmos_suffix(addr);
        return val;
}
EXPORT_SYMBOL(rtc_cmos_read);

void rtc_cmos_write(unsigned char val, unsigned char addr)
{
        lock_cmos_prefix(addr);
        outb_p(addr, RTC_PORT(0));
        outb_p(val, RTC_PORT(1));
        lock_cmos_suffix(addr);
}
EXPORT_SYMBOL(rtc_cmos_write);

/*
 * This version of gettimeofday has microsecond resolution
 * and better than microsecond precision on fast x86 machines with TSC.
 */
void do_gettimeofday(struct timeval *tv)
{
        unsigned long seq;
        unsigned long usec, sec;
        unsigned long max_ntp_tick;
        s64 nsec;
        unsigned int cpu;
        struct shadow_time_info *shadow;
        u32 local_time_version;

        cpu = get_cpu();
        shadow = &per_cpu(shadow_time, cpu);

        do {
                unsigned long lost;

                local_time_version = shadow->version;
                seq = read_seqbegin(&xtime_lock);

                usec = get_usec_offset(shadow);
                lost = jiffies - wall_jiffies;

                /*
                 * If time_adjust is negative then NTP is slowing the clock
                 * so make sure not to go into next possible interval.
                 * Better to lose some accuracy than have time go backwards..
                 */
                if (unlikely(time_adjust < 0)) {
                        max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj;
                        usec = min(usec, max_ntp_tick);

                        if (lost)
                                usec += lost * max_ntp_tick;
                }
                else if (unlikely(lost))
                        usec += lost * (USEC_PER_SEC / HZ);

                sec = xtime.tv_sec;
                usec += (xtime.tv_nsec / NSEC_PER_USEC);

                nsec = shadow->system_timestamp - processed_system_time;
                __normalize_time(&sec, &nsec);
                usec += (long)nsec / NSEC_PER_USEC;

                if (unlikely(!time_values_up_to_date(cpu))) {
                        /*
                         * We may have blocked for a long time,
                         * rendering our calculations invalid
                         * (e.g. the time delta may have
                         * overflowed). Detect that and recalculate
                         * with fresh values.
                         */
                        get_time_values_from_xen();
                        continue;
                }
        } while (read_seqretry(&xtime_lock, seq) ||
                 (local_time_version != shadow->version));

        put_cpu();

        while (usec >= USEC_PER_SEC) {
                usec -= USEC_PER_SEC;
                sec++;
        }

        tv->tv_sec = sec;
        tv->tv_usec = usec;
}

EXPORT_SYMBOL(do_gettimeofday);

int do_settimeofday(struct timespec *tv)
{
        time_t sec;
        s64 nsec;
        unsigned int cpu;
        struct shadow_time_info *shadow;
        dom0_op_t op;

        if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
                return -EINVAL;

        cpu = get_cpu();
        shadow = &per_cpu(shadow_time, cpu);

        write_seqlock_irq(&xtime_lock);

        /*
         * If we were blocked for a long time, our time delta may have
         * overflowed and the shadow time values would then be stale.
         * Detect that case and retry with freshly fetched values.
         */
        for (;;) {
                nsec = tv->tv_nsec - get_nsec_offset(shadow);
                if (time_values_up_to_date(cpu))
                        break;
                get_time_values_from_xen();
        }
        sec = tv->tv_sec;
        __normalize_time(&sec, &nsec);

        if (is_initial_xendomain() && !independent_wallclock) {
                op.cmd = DOM0_SETTIME;
                op.u.settime.secs        = sec;
                op.u.settime.nsecs       = nsec;
                op.u.settime.system_time = shadow->system_timestamp;
                HYPERVISOR_dom0_op(&op);
                update_wallclock();
        } else if (independent_wallclock) {
                nsec -= shadow->system_timestamp;
                __normalize_time(&sec, &nsec);
                __update_wallclock(sec, nsec);
        }

        write_sequnlock_irq(&xtime_lock);

        put_cpu();

        clock_was_set();
        return 0;
}

EXPORT_SYMBOL(do_settimeofday);

static void sync_xen_wallclock(unsigned long dummy);
static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
static void sync_xen_wallclock(unsigned long dummy)
{
        time_t sec;
        s64 nsec;
        dom0_op_t op;

        if (!ntp_synced() || independent_wallclock || !is_initial_xendomain())
                return;

        write_seqlock_irq(&xtime_lock);

        sec  = xtime.tv_sec;
        nsec = xtime.tv_nsec + ((jiffies - wall_jiffies) * (u64)NS_PER_TICK);
        __normalize_time(&sec, &nsec);

        op.cmd = DOM0_SETTIME;
        op.u.settime.secs        = sec;
        op.u.settime.nsecs       = nsec;
        op.u.settime.system_time = processed_system_time;
        HYPERVISOR_dom0_op(&op);

        update_wallclock();

        write_sequnlock_irq(&xtime_lock);

        /* Once per minute. */
        mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
}

static int set_rtc_mmss(unsigned long nowtime)
{
        int retval;
        unsigned long flags;

        if (independent_wallclock || !is_initial_xendomain())
                return 0;

        /* gets recalled with irq locally disabled */
        spin_lock_irqsave(&rtc_lock, flags);
        if (efi_enabled)
                retval = efi_set_rtc_mmss(nowtime);
        else
                retval = mach_set_rtc_mmss(nowtime);
        spin_unlock_irqrestore(&rtc_lock, flags);

        return retval;
}

/* monotonic_clock(): returns # of nanoseconds passed since time_init()
 *              Note: This function is required to return accurate
 *              time even in the absence of multiple timer ticks.
 */
unsigned long long monotonic_clock(void)
{
        int cpu = get_cpu();
        struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
        u64 time;
        u32 local_time_version;

        do {
                local_time_version = shadow->version;
                barrier();
                time = shadow->system_timestamp + get_nsec_offset(shadow);
                if (!time_values_up_to_date(cpu))
                        get_time_values_from_xen();
                barrier();
        } while (local_time_version != shadow->version);

        put_cpu();

        return time;
}
EXPORT_SYMBOL(monotonic_clock);

unsigned long long sched_clock(void)
{
        return monotonic_clock();
}

#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
unsigned long profile_pc(struct pt_regs *regs)
{
        unsigned long pc = instruction_pointer(regs);

#ifdef __x86_64__
        /* Assume the lock function has either no stack frame or only a single word.
           This checks if the address on the stack looks like a kernel text address.
           There is a small window for false hits, but in that case the tick
           is just accounted to the spinlock function.
           Better would be to write these functions in assembler again
           and check exactly. */
        if (!user_mode_vm(regs) && in_lock_functions(pc)) {
                char *v = *(char **)regs->rsp;
                if ((v >= _stext && v <= _etext) ||
                        (v >= _sinittext && v <= _einittext) ||
                        (v >= (char *)MODULES_VADDR  && v <= (char *)MODULES_END))
                        return (unsigned long)v;
                return ((unsigned long *)regs->rsp)[1];
        }
#else
        if (!user_mode_vm(regs) && in_lock_functions(pc))
                return *(unsigned long *)(regs->ebp + 4);
#endif

        return pc;
}
EXPORT_SYMBOL(profile_pc);
#endif

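/*
 * Per-tick bookkeeping: the system time elapsed since the last interrupt
 * is split three ways below -- time stolen by the hypervisor (VCPU
 * runnable or offline), time spent blocked (VCPU idle), and the
 * remainder, which is charged to the current task as user/system time.
 */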
irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
        s64 delta, delta_cpu, stolen, blocked;
        u64 sched_time;
        int i, cpu = smp_processor_id();
        struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
        struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);

        write_seqlock(&xtime_lock);

        do {
                get_time_values_from_xen();

                /* Obtain a consistent snapshot of elapsed wallclock cycles. */
                delta = delta_cpu =
                        shadow->system_timestamp + get_nsec_offset(shadow);
                delta     -= processed_system_time;
                delta_cpu -= per_cpu(processed_system_time, cpu);

                /*
                 * Obtain a consistent snapshot of stolen/blocked cycles. We
                 * can use state_entry_time to detect if we get preempted here.
                 */
                do {
                        sched_time = runstate->state_entry_time;
                        barrier();
                        stolen = runstate->time[RUNSTATE_runnable] +
                                runstate->time[RUNSTATE_offline] -
                                per_cpu(processed_stolen_time, cpu);
                        blocked = runstate->time[RUNSTATE_blocked] -
                                per_cpu(processed_blocked_time, cpu);
                        barrier();
                } while (sched_time != runstate->state_entry_time);
        } while (!time_values_up_to_date(cpu));

        if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
             unlikely(delta_cpu < -(s64)permitted_clock_jitter))
            && printk_ratelimit()) {
                printk("Timer ISR/%d: Time went backwards: "
                       "delta=%lld delta_cpu=%lld shadow=%lld "
                       "off=%lld processed=%lld cpu_processed=%lld\n",
                       cpu, delta, delta_cpu, shadow->system_timestamp,
                       (s64)get_nsec_offset(shadow),
                       processed_system_time,
                       per_cpu(processed_system_time, cpu));
                for (i = 0; i < num_online_cpus(); i++)
                        printk(" %d: %lld\n", i,
                               per_cpu(processed_system_time, i));
        }

        /* System-wide jiffy work. */
        while (delta >= NS_PER_TICK) {
                delta -= NS_PER_TICK;
                processed_system_time += NS_PER_TICK;
                do_timer(regs);
        }

        if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
                update_wallclock();
                clock_was_set();
        }

        write_sequnlock(&xtime_lock);

        /*
         * Account stolen ticks.
         * HACK: Passing NULL to account_steal_time()
         * ensures that the ticks are accounted as stolen.
         */
        if ((stolen > 0) && (delta_cpu > 0)) {
                delta_cpu -= stolen;
                if (unlikely(delta_cpu < 0))
                        stolen += delta_cpu; /* clamp local-time progress */
                do_div(stolen, NS_PER_TICK);
                per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
                per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
                account_steal_time(NULL, (cputime_t)stolen);
        }

        /*
         * Account blocked ticks.
         * HACK: Passing idle_task to account_steal_time()
         * ensures that the ticks are accounted as idle/wait.
         */
        if ((blocked > 0) && (delta_cpu > 0)) {
                delta_cpu -= blocked;
                if (unlikely(delta_cpu < 0))
                        blocked += delta_cpu; /* clamp local-time progress */
                do_div(blocked, NS_PER_TICK);
                per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
                per_cpu(processed_system_time, cpu)  += blocked * NS_PER_TICK;
                account_steal_time(idle_task(cpu), (cputime_t)blocked);
        }

        /* Account user/system ticks. */
        if (delta_cpu > 0) {
                do_div(delta_cpu, NS_PER_TICK);
                per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
                if (user_mode(regs))
                        account_user_time(current, (cputime_t)delta_cpu);
                else
                        account_system_time(current, HARDIRQ_OFFSET,
                                            (cputime_t)delta_cpu);
        }

        /* Local timer processing (see update_process_times()). */
        run_local_timers();
        if (rcu_pending(cpu))
                rcu_check_callbacks(cpu, user_mode(regs));
        scheduler_tick();
        run_posix_cpu_timers(current);

        return IRQ_HANDLED;
}

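/*
 * Register this VCPU's runstate area with Xen so the hypervisor keeps it
 * updated in place, and record the current stolen/blocked totals as the
 * baseline against which timer_interrupt() computes its deltas.
 */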
static void init_missing_ticks_accounting(int cpu)
{
        struct vcpu_register_runstate_memory_area area;
        struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);

        memset(runstate, 0, sizeof(*runstate));

        area.addr.v = runstate;
        HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);

        per_cpu(processed_blocked_time, cpu) =
                runstate->time[RUNSTATE_blocked];
        per_cpu(processed_stolen_time, cpu) =
                runstate->time[RUNSTATE_runnable] +
                runstate->time[RUNSTATE_offline];
}

/* not static: needed by APM */
unsigned long get_cmos_time(void)
{
        unsigned long retval;
        unsigned long flags;

        spin_lock_irqsave(&rtc_lock, flags);

        if (efi_enabled)
                retval = efi_get_time();
        else
                retval = mach_get_cmos_time();

        spin_unlock_irqrestore(&rtc_lock, flags);

        return retval;
}
EXPORT_SYMBOL(get_cmos_time);

static void sync_cmos_clock(unsigned long dummy);

static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);

static void sync_cmos_clock(unsigned long dummy)
{
        struct timeval now, next;
        int fail = 1;

        /*
         * If we have an externally synchronized Linux clock, then update
         * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
         * called as close as possible to 500 ms before the new second starts.
         * This code is run on a timer.  If the clock is set, that timer
         * may not expire at the correct time.  Thus, we adjust...
         */
        if (!ntp_synced())
                /*
                 * Not synced, exit, do not restart a timer (if one is
                 * running, let it run out).
                 */
                return;

        do_gettimeofday(&now);
        if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
            now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
                fail = set_rtc_mmss(now.tv_sec);

        next.tv_usec = USEC_AFTER - now.tv_usec;
        if (next.tv_usec <= 0)
                next.tv_usec += USEC_PER_SEC;

        if (!fail)
                next.tv_sec = 659;
        else
                next.tv_sec = 0;

        if (next.tv_usec >= USEC_PER_SEC) {
                next.tv_sec++;
                next.tv_usec -= USEC_PER_SEC;
        }
        mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
}

void notify_arch_cmos_timer(void)
{
        mod_timer(&sync_cmos_timer, jiffies + 1);
        mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
}

static long clock_cmos_diff, sleep_start;

static int timer_suspend(struct sys_device *dev, pm_message_t state)
{
        /*
         * Estimate time zone so that set_time can update the clock
         */
        clock_cmos_diff = -get_cmos_time();
        clock_cmos_diff += get_seconds();
        sleep_start = get_cmos_time();
        return 0;
}

static int timer_resume(struct sys_device *dev)
{
        unsigned long flags;
        unsigned long sec;
        unsigned long sleep_length;

#ifdef CONFIG_HPET_TIMER
        if (is_hpet_enabled())
                hpet_reenable();
#endif
        sec = get_cmos_time() + clock_cmos_diff;
        sleep_length = (get_cmos_time() - sleep_start) * HZ;
        write_seqlock_irqsave(&xtime_lock, flags);
        xtime.tv_sec = sec;
        xtime.tv_nsec = 0;
        jiffies_64 += sleep_length;
        wall_jiffies += sleep_length;
        write_sequnlock_irqrestore(&xtime_lock, flags);
        touch_softlockup_watchdog();
        return 0;
}

static struct sysdev_class timer_sysclass = {
        .resume = timer_resume,
        .suspend = timer_suspend,
        set_kset_name("timer"),
};


/* XXX this driverfs stuff should probably go elsewhere later -john */
static struct sys_device device_timer = {
        .id     = 0,
        .cls    = &timer_sysclass,
};

static int time_init_device(void)
{
        int error = sysdev_class_register(&timer_sysclass);
        if (!error)
                error = sysdev_register(&device_timer);
        return error;
}

device_initcall(time_init_device);

#ifdef CONFIG_HPET_TIMER
extern void (*late_time_init)(void);
/* Duplicate of time_init() below, with hpet_enable part added */
static void __init hpet_time_init(void)
{
        xtime.tv_sec = get_cmos_time();
        xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
        set_normalized_timespec(&wall_to_monotonic,
                -xtime.tv_sec, -xtime.tv_nsec);

        if ((hpet_enable() >= 0) && hpet_use_timer) {
                printk("Using HPET for base-timer\n");
        }
        time_init_hook();
}
#endif

/* Dynamically-mapped IRQ. */
DEFINE_PER_CPU(int, timer_irq);

extern void (*late_time_init)(void);
static void setup_cpu0_timer_irq(void)
{
        per_cpu(timer_irq, 0) =
                bind_virq_to_irqhandler(
                        VIRQ_TIMER,
                        0,
                        timer_interrupt,
                        SA_INTERRUPT,
                        "timer0",
                        NULL);
        BUG_ON(per_cpu(timer_irq, 0) < 0);
}

void __init time_init(void)
{
#ifdef CONFIG_HPET_TIMER
        if (is_hpet_capable()) {
                /*
                 * HPET initialization needs to do memory-mapped io. So, let
                 * us do a late initialization after mem_init().
                 */
                late_time_init = hpet_time_init;
                return;
        }
#endif
        get_time_values_from_xen();

        processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
        per_cpu(processed_system_time, 0) = processed_system_time;
        init_missing_ticks_accounting(0);

        update_wallclock();

        init_cpu_khz();
        printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
               cpu_khz / 1000, cpu_khz % 1000);

#if defined(__x86_64__)
        vxtime.mode = VXTIME_TSC;
        vxtime.quot = (1000000L << US_SCALE) / vxtime_hz;
        vxtime.tsc_quot = (1000L << US_SCALE) / cpu_khz;
        sync_core();
        rdtscll(vxtime.last_tsc);
#endif

        /* Cannot request_irq() until kmem is initialised. */
        late_time_init = setup_cpu0_timer_irq;
}

/* Convert jiffies to system time. */
u64 jiffies_to_st(unsigned long j)
{
        unsigned long seq;
        long delta;
        u64 st;

        do {
                seq = read_seqbegin(&xtime_lock);
                delta = j - jiffies;
                if (delta < 1) {
                        /* Triggers in some wrap-around cases, but that's okay:
                         * we just end up with a shorter timeout. */
                        st = processed_system_time + NS_PER_TICK;
                } else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) {
                        /* Very long timeout means there is no pending timer.
                         * We indicate this to Xen by passing zero timeout. */
                        st = 0;
                } else {
                        st = processed_system_time + delta * (u64)NS_PER_TICK;
                }
        } while (read_seqretry(&xtime_lock, seq));

        return st;
}
EXPORT_SYMBOL(jiffies_to_st);

/*
 * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
 * These functions are based on implementations from arch/s390/kernel/time.c
 */
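/*
 * Typical sequence: an idle VCPU calls raw_safe_halt() below, which stops
 * the periodic tick, programs a one-shot Xen timer for the next pending
 * timeout (via jiffies_to_st()), and blocks in the hypervisor until an
 * event arrives; start_hz_timer() then re-enters normal tick mode.
 */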
static void stop_hz_timer(void)
{
        unsigned int cpu = smp_processor_id();
        unsigned long j;

        cpu_set(cpu, nohz_cpu_mask);

        /* See matching smp_mb in rcu_start_batch in rcupdate.c.  These mbs  */
        /* ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a  */
        /* value of rcp->cur that matches rdp->quiescbatch and allows us to  */
        /* stop the hz timer then the cpumasks created for subsequent values */
        /* of cur in rcu_start_batch are guaranteed to pick up the updated   */
        /* nohz_cpu_mask and so will not depend on this cpu.                 */

        smp_mb();

        /* Leave ourselves in tick mode if rcu or softirq or timer pending. */
        if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
            (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
                cpu_clear(cpu, nohz_cpu_mask);
                j = jiffies + 1;
        }

        if (HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0)
                BUG();
}

static void start_hz_timer(void)
{
        cpu_clear(smp_processor_id(), nohz_cpu_mask);
}

void raw_safe_halt(void)
{
        stop_hz_timer();
        /* Blocking includes an implicit local_irq_enable(). */
        HYPERVISOR_block();
        start_hz_timer();
}
EXPORT_SYMBOL(raw_safe_halt);

void halt(void)
{
        if (irqs_disabled())
                HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
}
EXPORT_SYMBOL(halt);

/* No locking required. We are the only CPU running, and interrupts are off. */
void time_resume(void)
{
        init_cpu_khz();

        get_time_values_from_xen();

        processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
        per_cpu(processed_system_time, 0) = processed_system_time;
        init_missing_ticks_accounting(0);

        update_wallclock();
}

#ifdef CONFIG_SMP
static char timer_name[NR_CPUS][15];

void local_setup_timer(unsigned int cpu)
{
        int seq;

        BUG_ON(cpu == 0);

        do {
                seq = read_seqbegin(&xtime_lock);
                /* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
                per_cpu(processed_system_time, cpu) =
                        per_cpu(shadow_time, 0).system_timestamp;
                init_missing_ticks_accounting(cpu);
        } while (read_seqretry(&xtime_lock, seq));

        sprintf(timer_name[cpu], "timer%d", cpu);
        per_cpu(timer_irq, cpu) =
                bind_virq_to_irqhandler(
                        VIRQ_TIMER,
                        cpu,
                        timer_interrupt,
                        SA_INTERRUPT,
                        timer_name[cpu],
                        NULL);
        BUG_ON(per_cpu(timer_irq, cpu) < 0);
}

void local_teardown_timer(unsigned int cpu)
{
        BUG_ON(cpu == 0);
        unbind_from_irqhandler(per_cpu(timer_irq, cpu), NULL);
}
#endif

/*
 * /proc/sys/xen: This really belongs in another file. It can stay here for
 * now however.
 */
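/* The entries below appear as /proc/sys/xen/independent_wallclock and
 * /proc/sys/xen/permitted_clock_jitter; mode 0644 makes them writable by
 * root at runtime. */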
static ctl_table xen_subtable[] = {
        {
                .ctl_name       = 1,
                .procname       = "independent_wallclock",
                .data           = &independent_wallclock,
                .maxlen         = sizeof(independent_wallclock),
                .mode           = 0644,
                .proc_handler   = proc_dointvec
        },
        {
                .ctl_name       = 2,
                .procname       = "permitted_clock_jitter",
                .data           = &permitted_clock_jitter,
                .maxlen         = sizeof(permitted_clock_jitter),
                .mode           = 0644,
                .proc_handler   = proc_doulongvec_minmax
        },
        { 0 }
};
static ctl_table xen_table[] = {
        {
                .ctl_name       = 123,
                .procname       = "xen",
                .mode           = 0555,
                .child          = xen_subtable
        },
        { 0 }
};
static int __init xen_sysctl_init(void)
{
        (void)register_sysctl_table(xen_table, 0);
        return 0;
}
__initcall(xen_sysctl_init);