fedora core 6 1.2949 + vserver 2.2.0
[linux-2.6.git] / arch / i386 / kernel / time-xen.c
1 /*
2  *  linux/arch/i386/kernel/time.c
3  *
4  *  Copyright (C) 1991, 1992, 1995  Linus Torvalds
5  *
6  * This file contains the PC-specific time handling details:
7  * reading the RTC at bootup, etc..
8  * 1994-07-02    Alan Modra
9  *      fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
10  * 1995-03-26    Markus Kuhn
11  *      fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
12  *      precision CMOS clock update
13  * 1996-05-03    Ingo Molnar
14  *      fixed time warps in do_[slow|fast]_gettimeoffset()
15  * 1997-09-10   Updated NTP code according to technical memorandum Jan '96
16  *              "A Kernel Model for Precision Timekeeping" by Dave Mills
17  * 1998-09-05    (Various)
18  *      More robust do_fast_gettimeoffset() algorithm implemented
19  *      (works with APM, Cyrix 6x86MX and Centaur C6),
20  *      monotonic gettimeofday() with fast_get_timeoffset(),
21  *      drift-proof precision TSC calibration on boot
22  *      (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
23  *      Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
24  *      ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
25  * 1998-12-16    Andrea Arcangeli
26  *      Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
26  *      because it was not accounting for lost_ticks.
28  * 1998-12-24 Copyright (C) 1998  Andrea Arcangeli
29  *      Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
30  *      serialize accesses to xtime/lost_ticks).
31  */
32
33 #include <linux/errno.h>
34 #include <linux/sched.h>
35 #include <linux/kernel.h>
36 #include <linux/param.h>
37 #include <linux/string.h>
38 #include <linux/mm.h>
39 #include <linux/interrupt.h>
40 #include <linux/time.h>
41 #include <linux/delay.h>
42 #include <linux/init.h>
43 #include <linux/smp.h>
44 #include <linux/module.h>
45 #include <linux/sysdev.h>
46 #include <linux/bcd.h>
47 #include <linux/efi.h>
48 #include <linux/mca.h>
49 #include <linux/sysctl.h>
50 #include <linux/percpu.h>
51 #include <linux/kernel_stat.h>
52 #include <linux/posix-timers.h>
53
54 #include <asm/io.h>
55 #include <asm/smp.h>
56 #include <asm/irq.h>
57 #include <asm/msr.h>
58 #include <asm/delay.h>
59 #include <asm/mpspec.h>
60 #include <asm/uaccess.h>
61 #include <asm/processor.h>
62 #include <asm/timer.h>
63 #include <asm/time.h>
64
65 #include "mach_time.h"
66
67 #include <linux/timex.h>
68
69 #include <asm/hpet.h>
70
71 #include <asm/arch_hooks.h>
72
73 #include <xen/evtchn.h>
74 #include <xen/interface/vcpu.h>
75
76 #include <asm/i8259.h>
77
int pit_latch_buggy;              /* extern */

/* Length of one timer tick in microseconds / nanoseconds / femtoseconds. */
#define USEC_PER_TICK (USEC_PER_SEC / HZ)
#define NSEC_PER_TICK (NSEC_PER_SEC / HZ)
#define FSEC_PER_TICK (FSEC_PER_SEC / HZ)

#define NS_SCALE        10 /* 2^10, carefully chosen */
#define US_SCALE        32 /* 2^32, arbitrarily chosen */

unsigned int cpu_khz;   /* Detected as we calibrate the TSC */
EXPORT_SYMBOL(cpu_khz);

DEFINE_SPINLOCK(rtc_lock);
EXPORT_SYMBOL(rtc_lock);

extern struct init_timer_opts timer_tsc_init;
extern struct timer_opts timer_tsc;
#define timer_none timer_tsc
/* These are periodically updated in shared_info, and then copied here. */
struct shadow_time_info {
	u64 tsc_timestamp;     /* TSC at last update of time vals.  */
	u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
	u32 tsc_to_nsec_mul;   /* Fixed-point TSC->ns multiplier from Xen. */
	u32 tsc_to_usec_mul;   /* Derived locally: tsc_to_nsec_mul / 1000. */
	int tsc_shift;         /* Pre-multiply shift applied to TSC deltas. */
	u32 version;           /* Xen's per-vcpu time version at snapshot.  */
};
static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
static struct timespec shadow_tv;       /* Last wallclock read from Xen.   */
static u32 shadow_tv_version;           /* Xen wc_version at that read.    */
109
/* Keep track of last time we did processing/updating of jiffies and xtime. */
static u64 processed_system_time;   /* System time (ns) at last processing. */
static DEFINE_PER_CPU(u64, processed_system_time);  /* Per-CPU equivalent. */

/* How much CPU time was spent blocked and how much was 'stolen'? */
static DEFINE_PER_CPU(u64, processed_stolen_time);
static DEFINE_PER_CPU(u64, processed_blocked_time);

/* Current runstate of each CPU (updated automatically by the hypervisor). */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);

/* Must be signed, as it's compared with s64 quantities which can be -ve. */
#define NS_PER_TICK (1000000000LL/HZ)
123
124 static inline void __normalize_time(time_t *sec, s64 *nsec)
125 {
126         while (*nsec >= NSEC_PER_SEC) {
127                 (*nsec) -= NSEC_PER_SEC;
128                 (*sec)++;
129         }
130         while (*nsec < 0) {
131                 (*nsec) += NSEC_PER_SEC;
132                 (*sec)--;
133         }
134 }
135
/* Does this guest OS track Xen time, or set its wall clock independently? */
static int independent_wallclock = 0;
/* Kernel parameter "independent_wallclock": decouple us from Xen's clock. */
static int __init __independent_wallclock(char *str)
{
	independent_wallclock = 1;
	return 1;
}
__setup("independent_wallclock", __independent_wallclock);
144
/* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
/* Kernel parameter "permitted_clock_jitter=<nsecs>" overrides the default. */
static int __init __permitted_clock_jitter(char *str)
{
	permitted_clock_jitter = simple_strtoul(str, NULL, 0);
	return 1;
}
__setup("permitted_clock_jitter=", __permitted_clock_jitter);
153
#ifndef CONFIG_X86
/*
 * NOTE(review): an #ifndef CONFIG_X86 guard in i386 arch code looks odd --
 * presumably CONFIG_X86_TSC was intended; confirm before changing.
 */
int tsc_disable __devinitdata = 0;
#endif
157
/*
 * Busy-wait until at least 'loops' TSC ticks have elapsed, executing
 * rep_nop() between samples to be polite to the other hyperthread.
 */
static void delay_tsc(unsigned long loops)
{
	unsigned long start, current_tsc;

	rdtscl(start);
	for (;;) {
		rep_nop();
		rdtscl(current_tsc);
		if ((current_tsc - start) >= loops)
			break;
	}
}
168
/*
 * Only the TSC-based delay loop is supplied here; timekeeping itself is
 * driven by the hypervisor's shadow time values elsewhere in this file.
 */
struct timer_opts timer_tsc = {
	.name = "tsc",
	.delay = delay_tsc,
};
173
/*
 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
 * yielding a 64-bit result.  Effectively returns
 * ((delta << shift) * mul_frac) >> 32: the asm does two 32x32->64
 * multiplies (low and high halves of delta) and keeps the middle 64 bits
 * of the 96-bit product.
 */
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
	u64 product;
	u32 tmp1, tmp2;

	/* Apply Xen's tsc_shift before the fixed-point multiply. */
	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;

	__asm__ (
		"mul  %5       ; "
		"mov  %4,%%eax ; "
		"mov  %%edx,%4 ; "
		"mul  %5       ; "
		"xor  %5,%5    ; "
		"add  %4,%%eax ; "
		"adc  %5,%%edx ; "
		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
		: "a" ((u32)delta), "1" ((u32)(delta >> US_SCALE)), "2" (mul_frac) );

	return product;
}
201
/* Return the low 32 bits of the TSC in *timer_val; always succeeds. */
int read_current_timer(unsigned long *timer_val)
{
	rdtscl(*timer_val);
	return 0;
}
207
/*
 * Derive cpu_khz from the TSC->system-time scaling factors Xen publishes
 * for VCPU0: cpu_khz = (10^6 << 32) / tsc_to_system_mul, corrected by
 * tsc_shift in the opposite direction to scale_delta().
 */
void init_cpu_khz(void)
{
	u64 __cpu_khz = 1000000ULL << US_SCALE;
	struct vcpu_time_info *info;
	info = &HYPERVISOR_shared_info->vcpu_info[0].time;
	do_div(__cpu_khz, info->tsc_to_system_mul);
	if (info->tsc_shift < 0)
		cpu_khz = __cpu_khz << -info->tsc_shift;
	else
		cpu_khz = __cpu_khz >> info->tsc_shift;
}
219
220 static u64 get_nsec_offset(struct shadow_time_info *shadow)
221 {
222         u64 now, delta;
223         rdtscll(now);
224         delta = now - shadow->tsc_timestamp;
225         return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
226 }
227
228 static unsigned long get_usec_offset(struct shadow_time_info *shadow)
229 {
230         u64 now, delta;
231         rdtscll(now);
232         delta = now - shadow->tsc_timestamp;
233         return scale_delta(delta, shadow->tsc_to_usec_mul, shadow->tsc_shift);
234 }
235
/*
 * Recompute xtime and wall_to_monotonic from a new wallclock base
 * (sec, nsec) = wallclock at system-time zero.  Adding
 * processed_system_time converts the base into the wallclock "now"
 * that jiffies processing has caught up to.  Called with xtime_lock
 * write-held (see update_wallclock() callers).
 */
static void __update_wallclock(time_t sec, long nsec)
{
	long wtm_nsec, xtime_nsec;
	time_t wtm_sec, xtime_sec;
	u64 tmp, wc_nsec;

	/* Adjust wall-clock time base based on jiffies ticks. */
	wc_nsec = processed_system_time;
	wc_nsec += sec * (u64)NSEC_PER_SEC;
	wc_nsec += nsec;

	/* Split wallclock base into seconds and nanoseconds. */
	tmp = wc_nsec;
	xtime_nsec = do_div(tmp, 1000000000);
	xtime_sec  = (time_t)tmp;

	/* Preserve the monotonic clock across the step in xtime. */
	wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec);
	wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec);

	set_normalized_timespec(&xtime, xtime_sec, xtime_nsec);
	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);

	ntp_clear();
}
260
/*
 * Snapshot Xen's wallclock (wc_sec/wc_nsec) into shadow_tv using the
 * version-counter protocol: retry while the version is odd (update in
 * progress on the Xen side) or changed between the two reads.  Then,
 * unless the wallclock is independent, fold it into xtime.
 */
static void update_wallclock(void)
{
	shared_info_t *s = HYPERVISOR_shared_info;

	do {
		shadow_tv_version = s->wc_version;
		rmb();
		shadow_tv.tv_sec  = s->wc_sec;
		shadow_tv.tv_nsec = s->wc_nsec;
		rmb();
	} while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));

	if (!independent_wallclock)
		__update_wallclock(shadow_tv.tv_sec, shadow_tv.tv_nsec);
}
276
/*
 * Reads a consistent set of time-base values from Xen, into a shadow data
 * area.  Uses the same version-counter retry protocol as
 * update_wallclock(): an odd version means Xen is mid-update, and a
 * changed version means our copy is torn.
 */
static void get_time_values_from_xen(void)
{
	shared_info_t           *s = HYPERVISOR_shared_info;
	struct vcpu_time_info   *src;
	struct shadow_time_info *dst;

	src = &s->vcpu_info[smp_processor_id()].time;
	dst = &per_cpu(shadow_time, smp_processor_id());

	do {
		dst->version = src->version;
		rmb();
		dst->tsc_timestamp     = src->tsc_timestamp;
		dst->system_timestamp  = src->system_time;
		dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
		dst->tsc_shift         = src->tsc_shift;
		rmb();
	} while ((src->version & 1) | (dst->version ^ src->version));

	/* Xen only provides the ns multiplier; derive the usec one. */
	dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
}
302
303 static inline int time_values_up_to_date(int cpu)
304 {
305         struct vcpu_time_info   *src;
306         struct shadow_time_info *dst;
307
308         src = &HYPERVISOR_shared_info->vcpu_info[cpu].time;
309         dst = &per_cpu(shadow_time, cpu);
310
311         rmb();
312         return (dst->version == src->version);
313 }
314
/*
 * This is a special lock that is owned by the CPU and holds the index
 * register we are working with.  It is required for NMI access to the
 * CMOS/RTC registers.  See include/asm-i386/mc146818rtc.h for details.
 */
volatile unsigned long cmos_lock = 0;
EXPORT_SYMBOL(cmos_lock);
322
/* Routines for accessing the CMOS RAM/RTC. */

/* Read one byte from CMOS/RTC register 'addr' under the NMI-safe lock. */
unsigned char rtc_cmos_read(unsigned char addr)
{
	unsigned char byte;

	lock_cmos_prefix(addr);
	outb_p(addr, RTC_PORT(0));	/* select the register... */
	byte = inb_p(RTC_PORT(1));	/* ...then latch its value */
	lock_cmos_suffix(addr);

	return byte;
}
EXPORT_SYMBOL(rtc_cmos_read);
334
/* Write one byte to CMOS/RTC register 'addr' under the NMI-safe lock. */
void rtc_cmos_write(unsigned char val, unsigned char addr)
{
	lock_cmos_prefix(addr);
	outb_p(addr, RTC_PORT(0));	/* select the register... */
	outb_p(val, RTC_PORT(1));	/* ...then store the value */
	lock_cmos_suffix(addr);
}
EXPORT_SYMBOL(rtc_cmos_write);
343
/*
 * This version of gettimeofday has microsecond resolution
 * and better than microsecond precision on fast x86 machines with TSC.
 *
 * Combines xtime (under the seqlock), the TSC offset since this CPU's
 * shadow snapshot, and any system time that has accrued beyond
 * processed_system_time.  Retries if either xtime or the shadow values
 * change underneath us.
 */
void do_gettimeofday(struct timeval *tv)
{
	unsigned long seq;
	unsigned long usec, sec;
	unsigned long max_ntp_tick;
	s64 nsec;
	unsigned int cpu;
	struct shadow_time_info *shadow;
	u32 local_time_version;

	cpu = get_cpu();
	shadow = &per_cpu(shadow_time, cpu);

	do {
		local_time_version = shadow->version;
		seq = read_seqbegin(&xtime_lock);

		usec = get_usec_offset(shadow);

		/*
		 * If time_adjust is negative then NTP is slowing the clock
		 * so make sure not to go into next possible interval.
		 * Better to lose some accuracy than have time go backwards..
		 */
		if (unlikely(time_adjust < 0)) {
			max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj;
			usec = min(usec, max_ntp_tick);
		}

		sec = xtime.tv_sec;
		usec += (xtime.tv_nsec / NSEC_PER_USEC);

		/* Add system time accrued since xtime was last advanced. */
		nsec = shadow->system_timestamp - processed_system_time;
		__normalize_time(&sec, &nsec);
		usec += (long)nsec / NSEC_PER_USEC;

		if (unlikely(!time_values_up_to_date(cpu))) {
			/*
			 * We may have blocked for a long time,
			 * rendering our calculations invalid
			 * (e.g. the time delta may have
			 * overflowed). Detect that and recalculate
			 * with fresh values.
			 */
			get_time_values_from_xen();
			continue;
		}
	} while (read_seqretry(&xtime_lock, seq) ||
		 (local_time_version != shadow->version));

	put_cpu();

	/* Carry whole seconds out of the microsecond field. */
	while (usec >= USEC_PER_SEC) {
		usec -= USEC_PER_SEC;
		sec++;
	}

	tv->tv_sec = sec;
	tv->tv_usec = usec;
}

EXPORT_SYMBOL(do_gettimeofday);
410
/*
 * Set the wallclock time.  In the initial domain (dom0) the new time is
 * pushed to Xen via DOM0_SETTIME and then read back through
 * update_wallclock(); with independent_wallclock it is applied locally
 * only.  In an ordinary domU tracking Xen time this is a no-op beyond
 * validation.  Returns 0 on success, -EINVAL for an out-of-range nsec.
 */
int do_settimeofday(struct timespec *tv)
{
	time_t sec;
	s64 nsec;
	unsigned int cpu;
	struct shadow_time_info *shadow;
	dom0_op_t op;

	if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
		return -EINVAL;

	cpu = get_cpu();
	shadow = &per_cpu(shadow_time, cpu);

	write_seqlock_irq(&xtime_lock);

	/*
	 * Ensure we don't get blocked for a long time so that our time delta
	 * overflows. If that were to happen then our shadow time values would
	 * be stale, so we can retry with fresh ones.
	 */
	for (;;) {
		/* Back out the TSC offset so 'nsec' is relative to the
		 * shadow's system_timestamp. */
		nsec = tv->tv_nsec - get_nsec_offset(shadow);
		if (time_values_up_to_date(cpu))
			break;
		get_time_values_from_xen();
	}
	sec = tv->tv_sec;
	__normalize_time(&sec, &nsec);

	if (is_initial_xendomain() && !independent_wallclock) {
		op.cmd = DOM0_SETTIME;
		op.u.settime.secs        = sec;
		op.u.settime.nsecs       = nsec;
		op.u.settime.system_time = shadow->system_timestamp;
		HYPERVISOR_dom0_op(&op);
		update_wallclock();
	} else if (independent_wallclock) {
		/* Rebase to system-time zero before updating xtime. */
		nsec -= shadow->system_timestamp;
		__normalize_time(&sec, &nsec);
		__update_wallclock(sec, nsec);
	}

	write_sequnlock_irq(&xtime_lock);

	put_cpu();

	clock_was_set();
	return 0;
}

EXPORT_SYMBOL(do_settimeofday);
463
static void sync_xen_wallclock(unsigned long dummy);
static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
/*
 * Periodic timer callback (dom0 only): push our NTP-disciplined xtime
 * back to Xen so the hypervisor wallclock stays in sync.  Reschedules
 * itself once per minute while NTP remains synchronized.
 */
static void sync_xen_wallclock(unsigned long dummy)
{
	time_t sec;
	s64 nsec;
	dom0_op_t op;

	if (!ntp_synced() || independent_wallclock || !is_initial_xendomain())
		return;

	write_seqlock_irq(&xtime_lock);

	sec  = xtime.tv_sec;
	nsec = xtime.tv_nsec;
	__normalize_time(&sec, &nsec);

	op.cmd = DOM0_SETTIME;
	op.u.settime.secs        = sec;
	op.u.settime.nsecs       = nsec;
	op.u.settime.system_time = processed_system_time;
	HYPERVISOR_dom0_op(&op);

	update_wallclock();

	write_sequnlock_irq(&xtime_lock);

	/* Once per minute. */
	mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
}
494
495 static int set_rtc_mmss(unsigned long nowtime)
496 {
497         int retval;
498         unsigned long flags;
499
500         if (independent_wallclock || !is_initial_xendomain())
501                 return 0;
502
503         /* gets recalled with irq locally disabled */
504         spin_lock_irqsave(&rtc_lock, flags);
505         retval = set_wallclock(nowtime);
506         spin_unlock_irqrestore(&rtc_lock, flags);
507
508         return retval;
509 }
510
/* monotonic_clock(): returns # of nanoseconds passed since time_init()
 *              Note: This function is required to return accurate
 *              time even in the absence of multiple timer ticks.
 *
 * Computed as the shadow system timestamp plus the TSC offset since
 * the snapshot; retries (refreshing from Xen) if the shadow version
 * changes mid-read.
 */
unsigned long long monotonic_clock(void)
{
	int cpu = get_cpu();
	struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
	u64 time;
	u32 local_time_version;

	do {
		local_time_version = shadow->version;
		barrier();
		time = shadow->system_timestamp + get_nsec_offset(shadow);
		if (!time_values_up_to_date(cpu))
			get_time_values_from_xen();
		barrier();
	} while (local_time_version != shadow->version);

	put_cpu();

	return time;
}
EXPORT_SYMBOL(monotonic_clock);
536
/* Scheduler clock: nanoseconds since boot, straight from monotonic_clock(). */
unsigned long long sched_clock(void)
{
	return monotonic_clock();
}
541
/*
 * Best-effort program counter for profiling.  If a kernel-mode sample
 * landed inside a lock function, try to report the caller's PC instead
 * so lock overhead is attributed to its user.
 */
unsigned long profile_pc(struct pt_regs *regs)
{
	unsigned long pc = instruction_pointer(regs);

#ifdef CONFIG_SMP
	if (!user_mode_vm(regs) && in_lock_functions(pc)) {
#ifdef CONFIG_FRAME_POINTER
		/* With frame pointers the return address sits above ebp. */
		return *(unsigned long *)(regs->ebp + 4);
#else
		unsigned long *sp;
		if ((regs->xcs & 3) == 0)
			sp = (unsigned long *)&regs->esp;
		else
			sp = (unsigned long *)regs->esp;
		/* Return address is either directly at stack pointer
		   or above a saved eflags. Eflags has bits 22-31 zero,
		   kernel addresses don't. */
		if (sp[0] >> 22)
			return sp[0];
		if (sp[1] >> 22)
			return sp[1];
#endif
	}
#endif
	return pc;
}
EXPORT_SYMBOL(profile_pc);
569
/*
 * Xen VIRQ_TIMER handler.  Advances jiffies/xtime by whole ticks of
 * elapsed system time, accounts stolen/blocked/user/system CPU time from
 * the Xen runstate area, refreshes the wallclock when Xen's copy changed,
 * and runs the usual per-tick local work (timers, RCU, scheduler tick).
 */
irqreturn_t timer_interrupt(int irq, void *dev_id)
{
	s64 delta, delta_cpu, stolen, blocked;
	u64 sched_time;
	int i, cpu = smp_processor_id();
	struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
	struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);

	write_seqlock(&xtime_lock);

	do {
		get_time_values_from_xen();

		/* Obtain a consistent snapshot of elapsed wallclock cycles. */
		delta = delta_cpu =
			shadow->system_timestamp + get_nsec_offset(shadow);
		delta     -= processed_system_time;        /* system-wide */
		delta_cpu -= per_cpu(processed_system_time, cpu); /* this CPU */

		/*
		 * Obtain a consistent snapshot of stolen/blocked cycles. We
		 * can use state_entry_time to detect if we get preempted here.
		 */
		do {
			sched_time = runstate->state_entry_time;
			barrier();
			stolen = runstate->time[RUNSTATE_runnable] +
				runstate->time[RUNSTATE_offline] -
				per_cpu(processed_stolen_time, cpu);
			blocked = runstate->time[RUNSTATE_blocked] -
				per_cpu(processed_blocked_time, cpu);
			barrier();
		} while (sched_time != runstate->state_entry_time);
	} while (!time_values_up_to_date(cpu));

	/* Warn (rate-limited) if time appears to have gone backwards by
	 * more than the permitted jitter. */
	if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
	     unlikely(delta_cpu < -(s64)permitted_clock_jitter))
	    && printk_ratelimit()) {
		printk("Timer ISR/%d: Time went backwards: "
		       "delta=%lld delta_cpu=%lld shadow=%lld "
		       "off=%lld processed=%lld cpu_processed=%lld\n",
		       cpu, delta, delta_cpu, shadow->system_timestamp,
		       (s64)get_nsec_offset(shadow),
		       processed_system_time,
		       per_cpu(processed_system_time, cpu));
		for (i = 0; i < num_online_cpus(); i++)
			printk(" %d: %lld\n", i,
			       per_cpu(processed_system_time, i));
	}

	/* System-wide jiffy work. */
	while (delta >= NS_PER_TICK) {
		delta -= NS_PER_TICK;
		processed_system_time += NS_PER_TICK;
		do_timer(1);
	}

	/* Pick up any wallclock change Xen published since our last look. */
	if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
		update_wallclock();
		clock_was_set();
	}

	write_sequnlock(&xtime_lock);

	/*
	 * Account stolen ticks.
	 * HACK: Passing NULL to account_steal_time()
	 * ensures that the ticks are accounted as stolen.
	 */
	if ((stolen > 0) && (delta_cpu > 0)) {
		delta_cpu -= stolen;
		if (unlikely(delta_cpu < 0))
			stolen += delta_cpu; /* clamp local-time progress */
		do_div(stolen, NS_PER_TICK);
		per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
		per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
		account_steal_time(NULL, (cputime_t)stolen);
	}

	/*
	 * Account blocked ticks.
	 * HACK: Passing idle_task to account_steal_time()
	 * ensures that the ticks are accounted as idle/wait.
	 */
	if ((blocked > 0) && (delta_cpu > 0)) {
		delta_cpu -= blocked;
		if (unlikely(delta_cpu < 0))
			blocked += delta_cpu; /* clamp local-time progress */
		do_div(blocked, NS_PER_TICK);
		per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
		per_cpu(processed_system_time, cpu)  += blocked * NS_PER_TICK;
		account_steal_time(idle_task(cpu), (cputime_t)blocked);
	}

	/* Account user/system ticks. */
	if (delta_cpu > 0) {
		do_div(delta_cpu, NS_PER_TICK);
		per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
		if (user_mode(get_irq_regs()))
			account_user_time(current, (cputime_t)delta_cpu);
		else
			account_system_time(current, HARDIRQ_OFFSET,
					    (cputime_t)delta_cpu);
	}

	/* Offlined for more than a few seconds? Avoid lockup warnings. */
	if (stolen > 5*HZ)
		touch_softlockup_watchdog();

	/* Local timer processing (see update_process_times()). */
	run_local_timers();
	if (rcu_pending(cpu))
		rcu_check_callbacks(cpu, user_mode(get_irq_regs()));
	scheduler_tick();
	run_posix_cpu_timers(current);

	return IRQ_HANDLED;
}
688
/*
 * Register this CPU's runstate area with Xen and seed the stolen/blocked
 * accounting baselines from its current contents.
 * NOTE(review): the HYPERVISOR_vcpu_op() return value is ignored; if the
 * registration fails the runstate stays zeroed -- confirm that best-effort
 * behaviour is intended.
 */
static void init_missing_ticks_accounting(int cpu)
{
	struct vcpu_register_runstate_memory_area area;
	struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);

	memset(runstate, 0, sizeof(*runstate));

	area.addr.v = runstate;
	HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);

	per_cpu(processed_blocked_time, cpu) =
		runstate->time[RUNSTATE_blocked];
	per_cpu(processed_stolen_time, cpu) =
		runstate->time[RUNSTATE_runnable] +
		runstate->time[RUNSTATE_offline];
}
705
706 /* not static: needed by APM */
707 unsigned long get_cmos_time(void)
708 {
709         unsigned long retval;
710         unsigned long flags;
711
712         spin_lock_irqsave(&rtc_lock, flags);
713
714         retval = get_wallclock();
715
716         spin_unlock_irqrestore(&rtc_lock, flags);
717
718         return retval;
719 }
720 EXPORT_SYMBOL(get_cmos_time);
721
static void sync_cmos_clock(unsigned long dummy);

static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);

static void sync_cmos_clock(unsigned long dummy)
{
	struct timeval now, next;
	int fail = 1;

	/*
	 * If we have an externally synchronized Linux clock, then update
	 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
	 * called as close as possible to 500 ms before the new second starts.
	 * This code is run on a timer.  If the clock is set, that timer
	 * may not expire at the correct time.  Thus, we adjust...
	 */
	if (!ntp_synced())
		/*
		 * Not synced, exit, do not restart a timer (if one is
		 * running, let it run out).
		 */
		return;

	/* Only write the RTC inside the window around the 500ms point. */
	do_gettimeofday(&now);
	if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
	    now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
		fail = set_rtc_mmss(now.tv_sec);

	/* Aim the next expiry back at the write window; on success wait
	 * ~11 minutes (659s), on failure retry next second. */
	next.tv_usec = USEC_AFTER - now.tv_usec;
	if (next.tv_usec <= 0)
		next.tv_usec += USEC_PER_SEC;

	if (!fail)
		next.tv_sec = 659;
	else
		next.tv_sec = 0;

	if (next.tv_usec >= USEC_PER_SEC) {
		next.tv_sec++;
		next.tv_usec -= USEC_PER_SEC;
	}
	mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
}
765
/* Kick both wallclock sync timers to run at the next jiffy. */
void notify_arch_cmos_timer(void)
{
	mod_timer(&sync_cmos_timer, jiffies + 1);
	mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
}
771
772 static long clock_cmos_diff;
773 static unsigned long sleep_start;
774
775 static int timer_suspend(struct sys_device *dev, pm_message_t state)
776 {
777         /*
778          * Estimate time zone so that set_time can update the clock
779          */
780         unsigned long ctime =  get_cmos_time();
781
782         clock_cmos_diff = -ctime;
783         clock_cmos_diff += get_seconds();
784         sleep_start = ctime;
785         return 0;
786 }
787
/*
 * Resume hook: rebuild xtime from the CMOS clock plus the offset saved
 * at suspend, and advance jiffies by the slept interval.  Guards against
 * a CMOS clock that moved backwards across the suspend.
 */
static int timer_resume(struct sys_device *dev)
{
	unsigned long flags;
	unsigned long sec;
	unsigned long ctime = get_cmos_time();
	long sleep_length = (ctime - sleep_start) * HZ;

	if (sleep_length < 0) {
		printk(KERN_WARNING "CMOS clock skew detected in timer resume!\n");
		/* The time after the resume must not be earlier than the time
		 * before the suspend or some nasty things will happen
		 */
		sleep_length = 0;
		ctime = sleep_start;
	}

#ifdef CONFIG_HPET_TIMER
	if (is_hpet_enabled())
		hpet_reenable();
#endif
	sec = ctime + clock_cmos_diff;
	write_seqlock_irqsave(&xtime_lock, flags);
	xtime.tv_sec = sec;
	xtime.tv_nsec = 0;
	jiffies_64 += sleep_length;
	write_sequnlock_irqrestore(&xtime_lock, flags);
	touch_softlockup_watchdog();
	return 0;
}
817
/* Sysdev class hooking the timer into the suspend/resume path. */
static struct sysdev_class timer_sysclass = {
	.resume = timer_resume,
	.suspend = timer_suspend,
	set_kset_name("timer"),
};


/* XXX this driverfs stuff should probably go elsewhere later -john */
static struct sys_device device_timer = {
	.id     = 0,
	.cls    = &timer_sysclass,
};
830
831 static int time_init_device(void)
832 {
833         int error = sysdev_class_register(&timer_sysclass);
834         if (!error)
835                 error = sysdev_register(&device_timer);
836         return error;
837 }
838
839 device_initcall(time_init_device);
840
#ifdef CONFIG_HPET_TIMER
extern void (*late_time_init)(void);
/* Duplicate of time_init() below, with hpet_enable part added */
static void __init hpet_time_init(void)
{
	/* Seed xtime from CMOS and derive the monotonic offset from it. */
	xtime.tv_sec = get_cmos_time();
	xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
	set_normalized_timespec(&wall_to_monotonic,
		-xtime.tv_sec, -xtime.tv_nsec);

	if ((hpet_enable() >= 0) && hpet_use_timer) {
		printk("Using HPET for base-timer\n");
	}

	do_time_init();
}
#endif
858
/* Dynamically-mapped IRQ. */
DEFINE_PER_CPU(int, timer_irq);

extern void (*late_time_init)(void);
/* Bind CPU0's Xen timer virtual IRQ to timer_interrupt(). */
static void setup_cpu0_timer_irq(void)
{
	per_cpu(timer_irq, 0) =
		bind_virq_to_irqhandler(
			VIRQ_TIMER,
			0,
			timer_interrupt,
			SA_INTERRUPT,
			"timer0",
			NULL);
	BUG_ON(per_cpu(timer_irq, 0) < 0);
}
875
/*
 * Boot-time clock setup: snapshot Xen's time values, initialise the
 * per-CPU accounting baselines and the wallclock, then defer the timer
 * IRQ binding until allocators are up (late_time_init).
 */
void __init time_init(void)
{
#ifdef CONFIG_HPET_TIMER
	if (is_hpet_capable()) {
		/*
		 * HPET initialization needs to do memory-mapped io. So, let
		 * us do a late initialization after mem_init().
		 */
		late_time_init = hpet_time_init;
		return;
	}
#endif
	get_time_values_from_xen();

	/* Start the jiffies baseline from CPU0's shadow system time. */
	processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
	per_cpu(processed_system_time, 0) = processed_system_time;
	init_missing_ticks_accounting(0);

	update_wallclock();

	init_cpu_khz();
	printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
	       cpu_khz / 1000, cpu_khz % 1000);

	/* Cannot request_irq() until kmem is initialised. */
	late_time_init = setup_cpu0_timer_irq;
}
903
/* Convert jiffies to system time.  Returns the Xen system time (ns) at
 * which jiffy 'j' falls, or 0 to mean "no timeout" for very distant j.
 * Reads processed_system_time/jiffies consistently under xtime_lock. */
u64 jiffies_to_st(unsigned long j)
{
	unsigned long seq;
	long delta;
	u64 st;

	do {
		seq = read_seqbegin(&xtime_lock);
		delta = j - jiffies;
		if (delta < 1) {
			/* Triggers in some wrap-around cases, but that's okay:
			 * we just end up with a shorter timeout. */
			st = processed_system_time + NS_PER_TICK;
		} else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) {
			/* Very long timeout means there is no pending timer.
			 * We indicate this to Xen by passing zero timeout. */
			st = 0;
		} else {
			st = processed_system_time + delta * (u64)NS_PER_TICK;
		}
	} while (read_seqretry(&xtime_lock, seq));

	return st;
}
EXPORT_SYMBOL(jiffies_to_st);
930
/*
 * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
 * These functions are based on implementations from arch/s390/kernel/time.c
 */
static void stop_hz_timer(void)
{
	unsigned int cpu = smp_processor_id();
	unsigned long j;

	/* Advertise that this CPU is going tickless. */
	cpu_set(cpu, nohz_cpu_mask);

	/*
	 * See matching smp_mb in rcu_start_batch in rcupdate.c.  These mbs
	 * ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a
	 * value of rcp->cur that matches rdp->quiescbatch and allows us to
	 * stop the hz timer then the cpumasks created for subsequent values
	 * of cur in rcu_start_batch are guaranteed to pick up the updated
	 * nohz_cpu_mask and so will not depend on this cpu.
	 */
	smp_mb();

	/* Leave ourselves in tick mode if rcu or softirq or timer pending.
	 * Note: j is assigned inside the condition (next timer expiry) when
	 * the first two tests are false; otherwise it is set to jiffies + 1
	 * in the body below, so it is defined on every path. */
	if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
	    (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
		cpu_clear(cpu, nohz_cpu_mask);
		j = jiffies + 1;
	}

	/* Program Xen's one-shot timer for the chosen wakeup jiffy. */
	if (HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0)
		BUG();
}
961
962 static void start_hz_timer(void)
963 {
964         cpu_clear(smp_processor_id(), nohz_cpu_mask);
965 }
966
/*
 * Idle this VCPU until the next event: go tickless, block in the
 * hypervisor, then restore tick mode on wakeup.
 */
void raw_safe_halt(void)
{
	stop_hz_timer();
	/* Blocking includes an implicit local_irq_enable(). */
	HYPERVISOR_block();
	start_hz_timer();
}
EXPORT_SYMBOL(raw_safe_halt);
975
976 void halt(void)
977 {
978         if (irqs_disabled())
979                 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
980 }
981 EXPORT_SYMBOL(halt);
982
/* No locking required. We are only CPU running, and interrupts are off. */
void time_resume(void)
{
	/* Recalibrate cpu_khz: the host (and TSC rate) may have changed. */
	init_cpu_khz();

	/* Re-snapshot Xen's time records after suspend/resume. */
	get_time_values_from_xen();

	/* Re-seed processed-time cursors, as in time_init(). */
	processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
	per_cpu(processed_system_time, 0) = processed_system_time;
	init_missing_ticks_accounting(0);

	update_wallclock();
}
996
#ifdef CONFIG_SMP
/* Per-CPU IRQ name strings ("timerN"); must stay live while the IRQ is bound. */
static char timer_name[NR_CPUS][15];
999
1000 void local_setup_timer(unsigned int cpu)
1001 {
1002         int seq;
1003
1004         BUG_ON(cpu == 0);
1005
1006         do {
1007                 seq = read_seqbegin(&xtime_lock);
1008                 /* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
1009                 per_cpu(processed_system_time, cpu) =
1010                         per_cpu(shadow_time, 0).system_timestamp;
1011                 init_missing_ticks_accounting(cpu);
1012         } while (read_seqretry(&xtime_lock, seq));
1013
1014         sprintf(timer_name[cpu], "timer%d", cpu);
1015         per_cpu(timer_irq, cpu) =
1016                 bind_virq_to_irqhandler(
1017                         VIRQ_TIMER,
1018                         cpu,
1019                         timer_interrupt,
1020                         SA_INTERRUPT,
1021                         timer_name[cpu],
1022                         NULL);
1023         BUG_ON(per_cpu(timer_irq, cpu) < 0);
1024 }
1025
1026 void local_teardown_timer(unsigned int cpu)
1027 {
1028         BUG_ON(cpu == 0);
1029         unbind_from_irqhandler(per_cpu(timer_irq, cpu), NULL);
1030 }
1031 #endif
1032
/*
 * /proc/sys/xen: This really belongs in another file. It can stay here for
 * now however.
 */
static ctl_table xen_subtable[] = {
	{
		/* Boolean: let the guest keep a wallclock independent of
		 * the hypervisor's (writes via proc_dointvec). */
		.ctl_name	= 1,
		.procname	= "independent_wallclock",
		.data		= &independent_wallclock,
		.maxlen		= sizeof(independent_wallclock),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{
		/* Max clock jitter (ns) tolerated before resynchronising
		 * with Xen time. */
		.ctl_name	= 2,
		.procname	= "permitted_clock_jitter",
		.data		= &permitted_clock_jitter,
		.maxlen		= sizeof(permitted_clock_jitter),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax
	},
	{ 0 }	/* table terminator */
};
1056 static ctl_table xen_table[] = {
1057         {
1058                 .ctl_name       = 123,
1059                 .procname       = "xen",
1060                 .mode           = 0555,
1061                 .child          = xen_subtable},
1062         { 0 }
1063 };
/* Register /proc/sys/xen at boot.  The returned table header is
 * deliberately discarded: the tree is never unregistered. */
static int __init xen_sysctl_init(void)
{
	(void)register_sysctl_table(xen_table, 0);
	return 0;
}
__initcall(xen_sysctl_init);