[linux-2.6.git] arch/x86_64/kernel/time-xen.c
1 /*
2  *  linux/arch/i386/kernel/time.c
3  *
4  *  Copyright (C) 1991, 1992, 1995  Linus Torvalds
5  *
6  * This file contains the PC-specific time handling details:
7  * reading the RTC at bootup, etc..
8  * 1994-07-02    Alan Modra
9  *      fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
10  * 1995-03-26    Markus Kuhn
11  *      fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
12  *      precision CMOS clock update
13  * 1996-05-03    Ingo Molnar
14  *      fixed time warps in do_[slow|fast]_gettimeoffset()
15  * 1997-09-10   Updated NTP code according to technical memorandum Jan '96
16  *              "A Kernel Model for Precision Timekeeping" by Dave Mills
17  * 1998-09-05    (Various)
18  *      More robust do_fast_gettimeoffset() algorithm implemented
19  *      (works with APM, Cyrix 6x86MX and Centaur C6),
20  *      monotonic gettimeofday() with fast_get_timeoffset(),
21  *      drift-proof precision TSC calibration on boot
22  *      (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
23  *      Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
24  *      ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
25  * 1998-12-16    Andrea Arcangeli
26  *      Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
27  *      because it was not accounting for lost_ticks.
28  * 1998-12-24 Copyright (C) 1998  Andrea Arcangeli
29  *      Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
30  *      serialize accesses to xtime/lost_ticks).
31  */
32
33 #include <linux/errno.h>
34 #include <linux/sched.h>
35 #include <linux/kernel.h>
36 #include <linux/param.h>
37 #include <linux/string.h>
38 #include <linux/mm.h>
39 #include <linux/interrupt.h>
40 #include <linux/time.h>
41 #include <linux/delay.h>
42 #include <linux/init.h>
43 #include <linux/smp.h>
44 #include <linux/module.h>
45 #include <linux/sysdev.h>
46 #include <linux/bcd.h>
47 #include <linux/efi.h>
48 #include <linux/mca.h>
49 #include <linux/sysctl.h>
50 #include <linux/percpu.h>
51 #include <linux/kernel_stat.h>
52 #include <linux/posix-timers.h>
53
54 #include <asm/io.h>
55 #include <asm/smp.h>
56 #include <asm/irq.h>
57 #include <asm/msr.h>
58 #include <asm/delay.h>
59 #include <asm/mpspec.h>
60 #include <asm/uaccess.h>
61 #include <asm/processor.h>
62 #include <asm/timer.h>
63 #include <asm/sections.h>
64
65 #include "mach_time.h"
66
67 #include <linux/timex.h>
68
69 #include <asm/hpet.h>
70
71 #include <asm/arch_hooks.h>
72
73 #include <xen/evtchn.h>
74 #include <xen/interface/vcpu.h>
75
76 int pit_latch_buggy;              /* extern */
77
78 unsigned long vxtime_hz = PIT_TICK_RATE;
79 struct vxtime_data __vxtime __section_vxtime;   /* for vsyscalls */
80 volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
81 struct timespec __xtime __section_xtime;
82 struct timezone __sys_tz __section_sys_tz;
83
84 #define USEC_PER_TICK (USEC_PER_SEC / HZ)
85 #define NSEC_PER_TICK (NSEC_PER_SEC / HZ)
86 #define FSEC_PER_TICK (FSEC_PER_SEC / HZ)
87
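/*
 * For example, assuming HZ=100: each tick is USEC_PER_TICK = 1000000/100 =
 * 10000 us, i.e. NSEC_PER_TICK = 10000000 ns and FSEC_PER_TICK = 10^13 fs.
 */
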
88 #define NS_SCALE        10 /* 2^10, carefully chosen */
89 #define US_SCALE        32 /* 2^32, arbitrarily chosen */
90
91 unsigned int cpu_khz;   /* Detected as we calibrate the TSC */
92 EXPORT_SYMBOL(cpu_khz);
93
94 DEFINE_SPINLOCK(rtc_lock);
95 EXPORT_SYMBOL(rtc_lock);
96
97 extern struct init_timer_opts timer_tsc_init;
98 extern struct timer_opts timer_tsc;
99 #define timer_none timer_tsc
100
101 /* These are periodically updated in shared_info, and then copied here. */
102 struct shadow_time_info {
103         u64 tsc_timestamp;     /* TSC at last update of time vals.  */
104         u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
105         u32 tsc_to_nsec_mul;
106         u32 tsc_to_usec_mul;
107         int tsc_shift;
108         u32 version;
109 };
110 static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
111 static struct timespec shadow_tv;
112 static u32 shadow_tv_version;
113
114 /* Keep track of last time we did processing/updating of jiffies and xtime. */
115 static u64 processed_system_time;   /* System time (ns) at last processing. */
116 static DEFINE_PER_CPU(u64, processed_system_time);
117
118 /* How much CPU time was spent blocked and how much was 'stolen'? */
119 static DEFINE_PER_CPU(u64, processed_stolen_time);
120 static DEFINE_PER_CPU(u64, processed_blocked_time);
121
122 /* Current runstate of each CPU (updated automatically by the hypervisor). */
123 static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
124
125 /* Must be signed, as it's compared with s64 quantities which can be -ve. */
126 #define NS_PER_TICK (1000000000LL/HZ)
127
128 static inline void __normalize_time(time_t *sec, s64 *nsec)
129 {
130         while (*nsec >= NSEC_PER_SEC) {
131                 (*nsec) -= NSEC_PER_SEC;
132                 (*sec)++;
133         }
134         while (*nsec < 0) {
135                 (*nsec) += NSEC_PER_SEC;
136                 (*sec)--;
137         }
138 }
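
/*
 * For example, __normalize_time() turns (sec=5, nsec=1300000000) into
 * (sec=6, nsec=300000000), and (sec=5, nsec=-100000000) into
 * (sec=4, nsec=900000000).
 */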
139
140 /* Does this guest OS track Xen time, or set its wall clock independently? */
141 static int independent_wallclock = 0;
142 static int __init __independent_wallclock(char *str)
143 {
144         independent_wallclock = 1;
145         return 1;
146 }
147 __setup("independent_wallclock", __independent_wallclock);
148
149 /* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
150 static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
151 static int __init __permitted_clock_jitter(char *str)
152 {
153         permitted_clock_jitter = simple_strtoul(str, NULL, 0);
154         return 1;
155 }
156 __setup("permitted_clock_jitter=", __permitted_clock_jitter);
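
/*
 * Both of the knobs above are kernel command-line parameters; for example,
 * booting with
 *	independent_wallclock permitted_clock_jitter=20000000
 * decouples the guest wallclock from Xen and raises the jitter warning
 * threshold to 20 ms (the value is illustrative).
 */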
157
158 #ifndef CONFIG_X86
159 int tsc_disable __devinitdata = 0;
160 #endif
161
162 static void delay_tsc(unsigned long loops)
163 {
164         unsigned long bclock, now;
165
166         rdtscl(bclock);
167         do {
168                 rep_nop();
169                 rdtscl(now);
170         } while ((now - bclock) < loops);
171 }
172
173 struct timer_opts timer_tsc = {
174         .name = "tsc",
175         .delay = delay_tsc,
176 };
177
178 /*
179  * Scale a 64-bit delta by shifting it and multiplying by a 32-bit fraction,
180  * yielding a 64-bit result.
181  */
182 static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
183 {
184         u64 product;
185
186         if (shift < 0)
187                 delta >>= -shift;
188         else
189                 delta <<= shift;
190
191         __asm__ (
192                 "mul %%rdx ; shrd $32,%%rdx,%%rax"
193                 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
194
195         return product;
196 }
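
#if 0
/*
 * A portable-C sketch of the same computation as scale_delta() above, kept
 * out of the build; unlike the inline assembly it assumes the compiler
 * provides the 128-bit __uint128_t type.
 */
static inline u64 scale_delta_portable(u64 delta, u32 mul_frac, int shift)
{
        if (shift < 0)
                delta >>= -shift;
        else
                delta <<= shift;

        /* 64x32 -> 96-bit product; the result is bits 32..95 of it. */
        return (u64)(((__uint128_t)delta * mul_frac) >> 32);
}
#endif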
197
198 void init_cpu_khz(void)
199 {
200         u64 __cpu_khz = 1000000ULL << US_SCALE;
201         struct vcpu_time_info *info;
202         info = &HYPERVISOR_shared_info->vcpu_info[0].time;
203         do_div(__cpu_khz, info->tsc_to_system_mul);
204         if (info->tsc_shift < 0)
205                 cpu_khz = __cpu_khz << -info->tsc_shift;
206         else
207                 cpu_khz = __cpu_khz >> info->tsc_shift;
208 }
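
/*
 * Derivation: scale_delta() computes ns = (tsc << tsc_shift) * mul / 2^32,
 * so 10^6 ns (one millisecond) corresponds to 10^6 * 2^32 / mul TSC cycles,
 * which is the TSC frequency in kHz; the shift is then undone in the
 * opposite direction, exactly as done above.
 */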
209
210 static u64 get_nsec_offset(struct shadow_time_info *shadow)
211 {
212         u64 now, delta;
213         rdtscll(now);
214         delta = now - shadow->tsc_timestamp;
215         return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
216 }
217
218 static unsigned long get_usec_offset(struct shadow_time_info *shadow)
219 {
220         u64 now, delta;
221         rdtscll(now);
222         delta = now - shadow->tsc_timestamp;
223         return scale_delta(delta, shadow->tsc_to_usec_mul, shadow->tsc_shift);
224 }
225
226 static void __update_wallclock(time_t sec, long nsec)
227 {
228         long wtm_nsec, xtime_nsec;
229         time_t wtm_sec, xtime_sec;
230         u64 tmp, wc_nsec;
231
232         /* Adjust wall-clock time base based on jiffies ticks. */
233         wc_nsec = processed_system_time;
234         wc_nsec += sec * (u64)NSEC_PER_SEC;
235         wc_nsec += nsec;
236
237         /* Split wallclock base into seconds and nanoseconds. */
238         tmp = wc_nsec;
239         xtime_nsec = do_div(tmp, 1000000000);
240         xtime_sec  = (time_t)tmp;
241
242         wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec);
243         wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec);
244
245         set_normalized_timespec(&xtime, xtime_sec, xtime_nsec);
246         set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
247
248         ntp_clear();
249 }
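
/*
 * As called from update_wallclock(), sec/nsec is the wall-clock time at
 * system-time zero exported by Xen, so adding processed_system_time yields
 * the wall-clock time corresponding to the jiffies accounted so far; e.g. a
 * base of 1000000000 s and processed_system_time of 2500000000 ns gives
 * xtime = (1000000002 s, 500000000 ns).
 */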
250
251 static void update_wallclock(void)
252 {
253         shared_info_t *s = HYPERVISOR_shared_info;
254
255         do {
256                 shadow_tv_version = s->wc_version;
257                 rmb();
258                 shadow_tv.tv_sec  = s->wc_sec;
259                 shadow_tv.tv_nsec = s->wc_nsec;
260                 rmb();
261         } while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));
262
263         if (!independent_wallclock)
264                 __update_wallclock(shadow_tv.tv_sec, shadow_tv.tv_nsec);
265 }
266
267 /*
268  * Reads a consistent set of time-base values from Xen, into a shadow data
269  * area.
270  */
271 static void get_time_values_from_xen(void)
272 {
273         shared_info_t           *s = HYPERVISOR_shared_info;
274         struct vcpu_time_info   *src;
275         struct shadow_time_info *dst;
276
277         src = &s->vcpu_info[smp_processor_id()].time;
278         dst = &per_cpu(shadow_time, smp_processor_id());
279
280         do {
281                 dst->version = src->version;
282                 rmb();
283                 dst->tsc_timestamp     = src->tsc_timestamp;
284                 dst->system_timestamp  = src->system_time;
285                 dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
286                 dst->tsc_shift         = src->tsc_shift;
287                 rmb();
288         } while ((src->version & 1) | (dst->version ^ src->version));
289
290         dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
291 }
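
/*
 * The version field follows the usual Xen convention: the hypervisor makes
 * it odd before updating the vcpu_time_info fields and even again
 * afterwards, so a torn read shows up either as an odd version
 * (src->version & 1) or as a version change across the copy
 * (dst->version ^ src->version), and the copy is simply retried.
 */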
292
293 static inline int time_values_up_to_date(int cpu)
294 {
295         struct vcpu_time_info   *src;
296         struct shadow_time_info *dst;
297
298         src = &HYPERVISOR_shared_info->vcpu_info[cpu].time;
299         dst = &per_cpu(shadow_time, cpu);
300
301         rmb();
302         return (dst->version == src->version);
303 }
304
305 /*
306  * This is a special lock that is owned by the CPU and holds the index
307  * register we are working with.  It is required for NMI access to the
308  * CMOS/RTC registers.  See include/asm-i386/mc146818rtc.h for details.
309  */
310 volatile unsigned long cmos_lock = 0;
311 EXPORT_SYMBOL(cmos_lock);
312
313 /* Routines for accessing the CMOS RAM/RTC. */
314 unsigned char rtc_cmos_read(unsigned char addr)
315 {
316         unsigned char val;
317         lock_cmos_prefix(addr);
318         outb_p(addr, RTC_PORT(0));
319         val = inb_p(RTC_PORT(1));
320         lock_cmos_suffix(addr);
321         return val;
322 }
323 EXPORT_SYMBOL(rtc_cmos_read);
324
325 void rtc_cmos_write(unsigned char val, unsigned char addr)
326 {
327         lock_cmos_prefix(addr);
328         outb_p(addr, RTC_PORT(0));
329         outb_p(val, RTC_PORT(1));
330         lock_cmos_suffix(addr);
331 }
332 EXPORT_SYMBOL(rtc_cmos_write);
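
/*
 * A minimal usage sketch: read a single register by index, e.g.
 *
 *	unsigned char sec = rtc_cmos_read(RTC_SECONDS);
 *
 * RTC register values are normally BCD-encoded unless the clock has been
 * put into binary mode.
 */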
333
334 /*
335  * This version of gettimeofday has microsecond resolution
336  * and better than microsecond precision on fast x86 machines with TSC.
337  */
338 void do_gettimeofday(struct timeval *tv)
339 {
340         unsigned long seq;
341         unsigned long usec, sec;
342         unsigned long max_ntp_tick;
343         s64 nsec;
344         unsigned int cpu;
345         struct shadow_time_info *shadow;
346         u32 local_time_version;
347
348         cpu = get_cpu();
349         shadow = &per_cpu(shadow_time, cpu);
350
351         do {
352                 local_time_version = shadow->version;
353                 seq = read_seqbegin(&xtime_lock);
354
355                 usec = get_usec_offset(shadow);
356
357                 /*
358                  * If time_adjust is negative then NTP is slowing the clock
359                  * so make sure not to go into next possible interval.
360                  * Better to lose some accuracy than have time go backwards..
361                  */
362                 if (unlikely(time_adjust < 0)) {
363                         max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj;
364                         usec = min(usec, max_ntp_tick);
365                 }
366
367                 sec = xtime.tv_sec;
368                 usec += (xtime.tv_nsec / NSEC_PER_USEC);
369
370                 nsec = shadow->system_timestamp - processed_system_time;
371                 __normalize_time(&sec, &nsec);
372                 usec += (long)nsec / NSEC_PER_USEC;
373
374                 if (unlikely(!time_values_up_to_date(cpu))) {
375                         /*
376                          * We may have blocked for a long time,
377                          * rendering our calculations invalid
378                          * (e.g. the time delta may have
379                          * overflowed). Detect that and recalculate
380                          * with fresh values.
381                          */
382                         get_time_values_from_xen();
383                         continue;
384                 }
385         } while (read_seqretry(&xtime_lock, seq) ||
386                  (local_time_version != shadow->version));
387
388         put_cpu();
389
390         while (usec >= USEC_PER_SEC) {
391                 usec -= USEC_PER_SEC;
392                 sec++;
393         }
394
395         tv->tv_sec = sec;
396         tv->tv_usec = usec;
397 }
398
399 EXPORT_SYMBOL(do_gettimeofday);
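
/*
 * To summarise do_gettimeofday() above: the result is xtime, plus the Xen
 * system time that has not yet been folded into xtime
 * (shadow->system_timestamp - processed_system_time), plus the TSC
 * nanoseconds elapsed since the per-cpu shadow snapshot was taken.
 */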
400
401 int do_settimeofday(struct timespec *tv)
402 {
403         time_t sec;
404         s64 nsec;
405         unsigned int cpu;
406         struct shadow_time_info *shadow;
407         dom0_op_t op;
408
409         if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
410                 return -EINVAL;
411
412         cpu = get_cpu();
413         shadow = &per_cpu(shadow_time, cpu);
414
415         write_seqlock_irq(&xtime_lock);
416
417         /*
418          * If we were blocked for so long that our time delta overflowed, the
419          * shadow time values would be stale; detect that case and retry the
420          * calculation with fresh values from Xen.
421          */
422         for (;;) {
423                 nsec = tv->tv_nsec - get_nsec_offset(shadow);
424                 if (time_values_up_to_date(cpu))
425                         break;
426                 get_time_values_from_xen();
427         }
428         sec = tv->tv_sec;
429         __normalize_time(&sec, &nsec);
430
431         if (is_initial_xendomain() && !independent_wallclock) {
432                 op.cmd = DOM0_SETTIME;
433                 op.u.settime.secs        = sec;
434                 op.u.settime.nsecs       = nsec;
435                 op.u.settime.system_time = shadow->system_timestamp;
436                 HYPERVISOR_dom0_op(&op);
437                 update_wallclock();
438         } else if (independent_wallclock) {
439                 nsec -= shadow->system_timestamp;
440                 __normalize_time(&sec, &nsec);
441                 __update_wallclock(sec, nsec);
442         }
443
444         write_sequnlock_irq(&xtime_lock);
445
446         put_cpu();
447
448         clock_was_set();
449         return 0;
450 }
451
452 EXPORT_SYMBOL(do_settimeofday);
453
454 static void sync_xen_wallclock(unsigned long dummy);
455 static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
456 static void sync_xen_wallclock(unsigned long dummy)
457 {
458         time_t sec;
459         s64 nsec;
460         dom0_op_t op;
461
462         if (!ntp_synced() || independent_wallclock || !is_initial_xendomain())
463                 return;
464
465         write_seqlock_irq(&xtime_lock);
466
467         sec  = xtime.tv_sec;
468         nsec = xtime.tv_nsec;
469         __normalize_time(&sec, &nsec);
470
471         op.cmd = DOM0_SETTIME;
472         op.u.settime.secs        = sec;
473         op.u.settime.nsecs       = nsec;
474         op.u.settime.system_time = processed_system_time;
475         HYPERVISOR_dom0_op(&op);
476
477         update_wallclock();
478
479         write_sequnlock_irq(&xtime_lock);
480
481         /* Once per minute. */
482         mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
483 }
484
485 static int set_rtc_mmss(unsigned long nowtime)
486 {
487         int retval;
488         unsigned long flags;
489
490         if (independent_wallclock || !is_initial_xendomain())
491                 return 0;
492
493         /* gets recalled with irq locally disabled */
494         spin_lock_irqsave(&rtc_lock, flags);
495         if (efi_enabled)
496                 retval = efi_set_rtc_mmss(nowtime);
497         else
498                 retval = mach_set_rtc_mmss(nowtime);
499         spin_unlock_irqrestore(&rtc_lock, flags);
500
501         return retval;
502 }
503
504 /* monotonic_clock(): returns # of nanoseconds passed since time_init()
505  *              Note: This function is required to return accurate
506  *              time even in the absence of multiple timer ticks.
507  */
508 unsigned long long monotonic_clock(void)
509 {
510         int cpu = get_cpu();
511         struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
512         u64 time;
513         u32 local_time_version;
514
515         do {
516                 local_time_version = shadow->version;
517                 barrier();
518                 time = shadow->system_timestamp + get_nsec_offset(shadow);
519                 if (!time_values_up_to_date(cpu))
520                         get_time_values_from_xen();
521                 barrier();
522         } while (local_time_version != shadow->version);
523
524         put_cpu();
525
526         return time;
527 }
528 EXPORT_SYMBOL(monotonic_clock);
529
530 unsigned long long sched_clock(void)
531 {
532         return monotonic_clock();
533 }
534
535 unsigned long profile_pc(struct pt_regs *regs)
536 {
537         unsigned long pc = instruction_pointer(regs);
538
539         /* Assume the lock function has either no stack frame or a copy
540            of eflags from PUSHF.
541            Eflags always has bits 22 and up cleared, unlike kernel addresses. */
542         if (!user_mode_vm(regs) && in_lock_functions(pc)) {
543                 unsigned long *sp = (unsigned long *)regs->rsp;
544                 if (sp[0] >> 22)
545                         return sp[0];
546                 if (sp[1] >> 22)
547                         return sp[1];
548         }
549         return pc;
550 }
551 EXPORT_SYMBOL(profile_pc);
552
553 irqreturn_t timer_interrupt(int irq, void *dev_id)
554 {
555         s64 delta, delta_cpu, stolen, blocked;
556         u64 sched_time;
557         int i, cpu = smp_processor_id();
558         struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
559         struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
560
561         write_seqlock(&xtime_lock);
562
563         do {
564                 get_time_values_from_xen();
565
566                 /* Obtain a consistent snapshot of elapsed wallclock cycles. */
567                 delta = delta_cpu =
568                         shadow->system_timestamp + get_nsec_offset(shadow);
569                 delta     -= processed_system_time;
570                 delta_cpu -= per_cpu(processed_system_time, cpu);
571
572                 /*
573                  * Obtain a consistent snapshot of stolen/blocked cycles. We
574                  * can use state_entry_time to detect if we get preempted here.
575                  */
576                 do {
577                         sched_time = runstate->state_entry_time;
578                         barrier();
579                         stolen = runstate->time[RUNSTATE_runnable] +
580                                 runstate->time[RUNSTATE_offline] -
581                                 per_cpu(processed_stolen_time, cpu);
582                         blocked = runstate->time[RUNSTATE_blocked] -
583                                 per_cpu(processed_blocked_time, cpu);
584                         barrier();
585                 } while (sched_time != runstate->state_entry_time);
586         } while (!time_values_up_to_date(cpu));
587
588         if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
589              unlikely(delta_cpu < -(s64)permitted_clock_jitter))
590             && printk_ratelimit()) {
591                 printk("Timer ISR/%d: Time went backwards: "
592                        "delta=%lld delta_cpu=%lld shadow=%lld "
593                        "off=%lld processed=%lld cpu_processed=%lld\n",
594                        cpu, delta, delta_cpu, shadow->system_timestamp,
595                        (s64)get_nsec_offset(shadow),
596                        processed_system_time,
597                        per_cpu(processed_system_time, cpu));
598                 for (i = 0; i < num_online_cpus(); i++)
599                         printk(" %d: %lld\n", i,
600                                per_cpu(processed_system_time, i));
601         }
602
603         /* System-wide jiffy work. */
604         while (delta >= NS_PER_TICK) {
605                 delta -= NS_PER_TICK;
606                 processed_system_time += NS_PER_TICK;
607                 do_timer(1);
608         }
609
610         if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
611                 update_wallclock();
612                 clock_was_set();
613         }
614
615         write_sequnlock(&xtime_lock);
616
617         /*
618          * Account stolen ticks.
619          * HACK: Passing NULL to account_steal_time()
620          * ensures that the ticks are accounted as stolen.
621          */
622         if ((stolen > 0) && (delta_cpu > 0)) {
623                 delta_cpu -= stolen;
624                 if (unlikely(delta_cpu < 0))
625                         stolen += delta_cpu; /* clamp local-time progress */
626                 do_div(stolen, NS_PER_TICK);
627                 per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
628                 per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
629                 account_steal_time(NULL, (cputime_t)stolen);
630         }
631
632         /*
633          * Account blocked ticks.
634          * HACK: Passing idle_task to account_steal_time()
635          * ensures that the ticks are accounted as idle/wait.
636          */
637         if ((blocked > 0) && (delta_cpu > 0)) {
638                 delta_cpu -= blocked;
639                 if (unlikely(delta_cpu < 0))
640                         blocked += delta_cpu; /* clamp local-time progress */
641                 do_div(blocked, NS_PER_TICK);
642                 per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
643                 per_cpu(processed_system_time, cpu)  += blocked * NS_PER_TICK;
644                 account_steal_time(idle_task(cpu), (cputime_t)blocked);
645         }
646
647         /* Account user/system ticks. */
648         if (delta_cpu > 0) {
649                 do_div(delta_cpu, NS_PER_TICK);
650                 per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
651                 if (user_mode(get_irq_regs()))
652                         account_user_time(current, (cputime_t)delta_cpu);
653                 else
654                         account_system_time(current, HARDIRQ_OFFSET,
655                                             (cputime_t)delta_cpu);
656         }
657
658         /* Local timer processing (see update_process_times()). */
659         run_local_timers();
660         if (rcu_pending(cpu))
661                 rcu_check_callbacks(cpu, user_mode(get_irq_regs()));
662         scheduler_tick();
663         run_posix_cpu_timers(current);
664
665         return IRQ_HANDLED;
666 }
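
/*
 * Worked example for the accounting above, assuming HZ=100 (one tick =
 * 10 ms): if 30 ms of system time passed on this vcpu since the last
 * interrupt, of which 10 ms was stolen and 5 ms was spent blocked, then one
 * tick is accounted as stolen and one tick as user/system time for the
 * current task; the 5 ms blocked and the 5 ms residue are each below a full
 * tick and simply carry over to the next interrupt.
 */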
667
668 static void init_missing_ticks_accounting(int cpu)
669 {
670         struct vcpu_register_runstate_memory_area area;
671         struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
672
673         memset(runstate, 0, sizeof(*runstate));
674
675         area.addr.v = runstate;
676         HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
677
678         per_cpu(processed_blocked_time, cpu) =
679                 runstate->time[RUNSTATE_blocked];
680         per_cpu(processed_stolen_time, cpu) =
681                 runstate->time[RUNSTATE_runnable] +
682                 runstate->time[RUNSTATE_offline];
683 }
684
685 /* not static: needed by APM */
686 unsigned long get_cmos_time(void)
687 {
688         unsigned long retval;
689         unsigned long flags;
690
691         spin_lock_irqsave(&rtc_lock, flags);
692
693         if (efi_enabled)
694                 retval = efi_get_time();
695         else
696                 retval = mach_get_cmos_time();
697
698         spin_unlock_irqrestore(&rtc_lock, flags);
699
700         return retval;
701 }
702 EXPORT_SYMBOL(get_cmos_time);
703
704 static void sync_cmos_clock(unsigned long dummy);
705
706 static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
707
708 static void sync_cmos_clock(unsigned long dummy)
709 {
710         struct timeval now, next;
711         int fail = 1;
712
713         /*
714          * If we have an externally synchronized Linux clock, then update
715          * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
716          * called as close as possible to 500 ms before the new second starts.
717          * This code is run on a timer.  If the clock is set, that timer
718          * may not expire at the correct time.  Thus, we adjust...
719          */
720         if (!ntp_synced())
721                 /*
722                  * Not synced, exit, do not restart a timer (if one is
723                  * running, let it run out).
724                  */
725                 return;
726
727         do_gettimeofday(&now);
728         if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
729             now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
730                 fail = set_rtc_mmss(now.tv_sec);
731
732         next.tv_usec = USEC_AFTER - now.tv_usec;
733         if (next.tv_usec <= 0)
734                 next.tv_usec += USEC_PER_SEC;
735
736         if (!fail)
737                 next.tv_sec = 659;
738         else
739                 next.tv_sec = 0;
740
741         if (next.tv_usec >= USEC_PER_SEC) {
742                 next.tv_sec++;
743                 next.tv_usec -= USEC_PER_SEC;
744         }
745         mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
746 }
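
/*
 * In other words: when the write lands in the ~500 ms window before a
 * second boundary and succeeds, the next attempt is scheduled 659 s
 * (roughly 11 minutes) later; otherwise it is retried at the same point in
 * the next second.
 */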
747
748 void notify_arch_cmos_timer(void)
749 {
750         mod_timer(&sync_cmos_timer, jiffies + 1);
751         mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
752 }
753
754 static long clock_cmos_diff;
755 static unsigned long sleep_start;
756
757 static int timer_suspend(struct sys_device *dev, pm_message_t state)
758 {
759         /*
760          * Estimate time zone so that set_time can update the clock
761          */
762         unsigned long ctime =  get_cmos_time();
763
764         clock_cmos_diff = -ctime;
765         clock_cmos_diff += get_seconds();
766         sleep_start = ctime;
767         return 0;
768 }
769
770 static int timer_resume(struct sys_device *dev)
771 {
772         unsigned long flags;
773         unsigned long sec;
774         unsigned long ctime = get_cmos_time();
775         long sleep_length = (ctime - sleep_start) * HZ;
776
777         if (sleep_length < 0) {
778                 printk(KERN_WARNING "CMOS clock skew detected in timer resume!\n");
779                 /* The time after the resume must not be earlier than the time
780                  * before the suspend or some nasty things will happen
781                  */
782                 sleep_length = 0;
783                 ctime = sleep_start;
784         }
785
786 #ifdef CONFIG_HPET_TIMER
787         if (is_hpet_enabled())
788                 hpet_reenable();
789 #endif
790         sec = ctime + clock_cmos_diff;
791         write_seqlock_irqsave(&xtime_lock, flags);
792         xtime.tv_sec = sec;
793         xtime.tv_nsec = 0;
794         jiffies_64 += sleep_length;
795         write_sequnlock_irqrestore(&xtime_lock, flags);
796         touch_softlockup_watchdog();
797         return 0;
798 }
799
800 static struct sysdev_class timer_sysclass = {
801         .resume = timer_resume,
802         .suspend = timer_suspend,
803         set_kset_name("timer"),
804 };
805
806
807 /* XXX this driverfs stuff should probably go elsewhere later -john */
808 static struct sys_device device_timer = {
809         .id     = 0,
810         .cls    = &timer_sysclass,
811 };
812
813 static int time_init_device(void)
814 {
815         int error = sysdev_class_register(&timer_sysclass);
816         if (!error)
817                 error = sysdev_register(&device_timer);
818         return error;
819 }
820
821 device_initcall(time_init_device);
822
823 #ifdef CONFIG_HPET_TIMER
824 extern void (*late_time_init)(void);
825 /* Duplicate of time_init() below, with hpet_enable part added */
826 static void __init hpet_time_init(void)
827 {
828         xtime.tv_sec = get_cmos_time();
829         xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
830         set_normalized_timespec(&wall_to_monotonic,
831                 -xtime.tv_sec, -xtime.tv_nsec);
832
833         if ((hpet_enable() >= 0) && hpet_use_timer) {
834                 printk("Using HPET for base-timer\n");
835         }
836
837         time_init_hook();
838 }
839 #endif
840
841 /* Dynamically-mapped IRQ. */
842 DEFINE_PER_CPU(int, timer_irq);
843
844 extern void (*late_time_init)(void);
845 static void setup_cpu0_timer_irq(void)
846 {
847         per_cpu(timer_irq, 0) =
848                 bind_virq_to_irqhandler(
849                         VIRQ_TIMER,
850                         0,
851                         timer_interrupt,
852                         SA_INTERRUPT,
853                         "timer0",
854                         NULL);
855         BUG_ON(per_cpu(timer_irq, 0) < 0);
856 }
857
858 void __init time_init(void)
859 {
860 #ifdef CONFIG_HPET_TIMER
861         if (is_hpet_capable()) {
862                 /*
863                  * HPET initialization needs to do memory-mapped io. So, let
864                  * us do a late initialization after mem_init().
865                  */
866                 late_time_init = hpet_time_init;
867                 return;
868         }
869 #endif
870         get_time_values_from_xen();
871
872         processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
873         per_cpu(processed_system_time, 0) = processed_system_time;
874         init_missing_ticks_accounting(0);
875
876         update_wallclock();
877
878         init_cpu_khz();
879         printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
880                cpu_khz / 1000, cpu_khz % 1000);
881
882         vxtime.mode = VXTIME_TSC;
883         vxtime.quot = (1000000L << US_SCALE) / vxtime_hz;
884         vxtime.tsc_quot = (1000L << US_SCALE) / cpu_khz;
885         sync_core();
886         rdtscll(vxtime.last_tsc);
887
888         /* Cannot request_irq() until kmem is initialised. */
889         late_time_init = setup_cpu0_timer_irq;
890 }
891
892 /* Convert jiffies to system time. */
893 u64 jiffies_to_st(unsigned long j)
894 {
895         unsigned long seq;
896         long delta;
897         u64 st;
898
899         do {
900                 seq = read_seqbegin(&xtime_lock);
901                 delta = j - jiffies;
902                 if (delta < 1) {
903                         /* Triggers in some wrap-around cases, but that's okay:
904                          * we just end up with a shorter timeout. */
905                         st = processed_system_time + NS_PER_TICK;
906                 } else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) {
907                         /* Very long timeout means there is no pending timer.
908                          * We indicate this to Xen by passing zero timeout. */
909                         st = 0;
910                 } else {
911                         st = processed_system_time + delta * (u64)NS_PER_TICK;
912                 }
913         } while (read_seqretry(&xtime_lock, seq));
914
915         return st;
916 }
917 EXPORT_SYMBOL(jiffies_to_st);
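
/*
 * Example, assuming HZ=100: for a timer due 5 jiffies from now,
 * jiffies_to_st() returns processed_system_time + 5 * 10000000 ns; a
 * timeout further away than 2^(BITS_PER_LONG-3) jiffies is treated as
 * "no pending timer" and reported to Xen as 0.
 */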
918
919 /*
920  * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
921  * These functions are based on implementations from arch/s390/kernel/time.c
922  */
923 static void stop_hz_timer(void)
924 {
925         unsigned int cpu = smp_processor_id();
926         unsigned long j;
927
928         cpu_set(cpu, nohz_cpu_mask);
929
930         /* See matching smp_mb in rcu_start_batch in rcupdate.c.  These mbs  */
931         /* ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a  */
932         /* value of rcp->cur that matches rdp->quiescbatch and allows us to  */
933         /* stop the hz timer then the cpumasks created for subsequent values */
934         /* of cur in rcu_start_batch are guaranteed to pick up the updated   */
935         /* nohz_cpu_mask and so will not depend on this cpu.                 */
936
937         smp_mb();
938
939         /* Leave ourselves in tick mode if rcu or softirq or timer pending. */
940         if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
941             (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
942                 cpu_clear(cpu, nohz_cpu_mask);
943                 j = jiffies + 1;
944         }
945
946         if (HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0)
947                 BUG();
948 }
949
950 static void start_hz_timer(void)
951 {
952         cpu_clear(smp_processor_id(), nohz_cpu_mask);
953 }
954
955 void raw_safe_halt(void)
956 {
957         stop_hz_timer();
958         /* Blocking includes an implicit local_irq_enable(). */
959         HYPERVISOR_block();
960         start_hz_timer();
961 }
962 EXPORT_SYMBOL(raw_safe_halt);
963
964 void halt(void)
965 {
966         if (irqs_disabled())
967                 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
968 }
969 EXPORT_SYMBOL(halt);
970
971 /* No locking required. We are the only CPU running, and interrupts are off. */
972 void time_resume(void)
973 {
974         init_cpu_khz();
975
976         get_time_values_from_xen();
977
978         processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
979         per_cpu(processed_system_time, 0) = processed_system_time;
980         init_missing_ticks_accounting(0);
981
982         update_wallclock();
983 }
984
985 #ifdef CONFIG_SMP
986 static char timer_name[NR_CPUS][15];
987
988 void local_setup_timer(unsigned int cpu)
989 {
990         int seq;
991
992         BUG_ON(cpu == 0);
993
994         do {
995                 seq = read_seqbegin(&xtime_lock);
996                 /* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
997                 per_cpu(processed_system_time, cpu) =
998                         per_cpu(shadow_time, 0).system_timestamp;
999                 init_missing_ticks_accounting(cpu);
1000         } while (read_seqretry(&xtime_lock, seq));
1001
1002         sprintf(timer_name[cpu], "timer%d", cpu);
1003         per_cpu(timer_irq, cpu) =
1004                 bind_virq_to_irqhandler(
1005                         VIRQ_TIMER,
1006                         cpu,
1007                         timer_interrupt,
1008                         SA_INTERRUPT,
1009                         timer_name[cpu],
1010                         NULL);
1011         BUG_ON(per_cpu(timer_irq, cpu) < 0);
1012 }
1013
1014 void local_teardown_timer(unsigned int cpu)
1015 {
1016         BUG_ON(cpu == 0);
1017         unbind_from_irqhandler(per_cpu(timer_irq, cpu), NULL);
1018 }
1019 #endif
1020
1021 /*
1022  * /proc/sys/xen: This really belongs in another file. It can stay here for
1023  * now however.
1024  */
1025 static ctl_table xen_subtable[] = {
1026         {
1027                 .ctl_name       = 1,
1028                 .procname       = "independent_wallclock",
1029                 .data           = &independent_wallclock,
1030                 .maxlen         = sizeof(independent_wallclock),
1031                 .mode           = 0644,
1032                 .proc_handler   = proc_dointvec
1033         },
1034         {
1035                 .ctl_name       = 2,
1036                 .procname       = "permitted_clock_jitter",
1037                 .data           = &permitted_clock_jitter,
1038                 .maxlen         = sizeof(permitted_clock_jitter),
1039                 .mode           = 0644,
1040                 .proc_handler   = proc_doulongvec_minmax
1041         },
1042         { 0 }
1043 };
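
/*
 * These entries appear as /proc/sys/xen/independent_wallclock and
 * /proc/sys/xen/permitted_clock_jitter once xen_sysctl_init() has run, so
 * the jitter warning threshold can, for example, be raised at run time with
 * something like:
 *	echo 20000000 > /proc/sys/xen/permitted_clock_jitter
 */
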
1044 static ctl_table xen_table[] = {
1045         {
1046                 .ctl_name       = 123,
1047                 .procname       = "xen",
1048                 .mode           = 0555,
1049                 .child          = xen_subtable},
1050         { 0 }
1051 };
1052 static int __init xen_sysctl_init(void)
1053 {
1054         (void)register_sysctl_table(xen_table, 0);
1055         return 0;
1056 }
1057 __initcall(xen_sysctl_init);