This commit was generated by cvs2svn to compensate for changes in r925,
[linux-2.6.git] / arch / xen / i386 / kernel / time.c
1 /*
2  *  linux/arch/i386/kernel/time.c
3  *
4  *  Copyright (C) 1991, 1992, 1995  Linus Torvalds
5  *
6  * This file contains the PC-specific time handling details:
7  * reading the RTC at bootup, etc..
8  * 1994-07-02    Alan Modra
9  *      fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
10  * 1995-03-26    Markus Kuhn
11  *      fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
12  *      precision CMOS clock update
13  * 1996-05-03    Ingo Molnar
14  *      fixed time warps in do_[slow|fast]_gettimeoffset()
15  * 1997-09-10   Updated NTP code according to technical memorandum Jan '96
16  *              "A Kernel Model for Precision Timekeeping" by Dave Mills
17  * 1998-09-05    (Various)
18  *      More robust do_fast_gettimeoffset() algorithm implemented
19  *      (works with APM, Cyrix 6x86MX and Centaur C6),
20  *      monotonic gettimeofday() with fast_get_timeoffset(),
21  *      drift-proof precision TSC calibration on boot
22  *      (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
23  *      Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
24  *      ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
25  * 1998-12-16    Andrea Arcangeli
26  *      Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
27  *      because was not accounting lost_ticks.
28  * 1998-12-24 Copyright (C) 1998  Andrea Arcangeli
29  *      Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
30  *      serialize accesses to xtime/lost_ticks).
31  */
32
33 #include <linux/errno.h>
34 #include <linux/sched.h>
35 #include <linux/kernel.h>
36 #include <linux/param.h>
37 #include <linux/string.h>
38 #include <linux/mm.h>
39 #include <linux/interrupt.h>
40 #include <linux/time.h>
41 #include <linux/delay.h>
42 #include <linux/init.h>
43 #include <linux/smp.h>
44 #include <linux/module.h>
45 #include <linux/sysdev.h>
46 #include <linux/bcd.h>
47 #include <linux/efi.h>
48 #include <linux/mca.h>
49 #include <linux/sysctl.h>
50 #include <linux/percpu.h>
51
52 #include <asm/io.h>
53 #include <asm/smp.h>
54 #include <asm/irq.h>
55 #include <asm/msr.h>
56 #include <asm/delay.h>
57 #include <asm/mpspec.h>
58 #include <asm/uaccess.h>
59 #include <asm/processor.h>
60 #include <asm/timer.h>
61
62 #include "mach_time.h"
63
64 #include <linux/timex.h>
65 #include <linux/config.h>
66
67 #include <asm/hpet.h>
68
69 #include <asm/arch_hooks.h>
70
71 #include "io_ports.h"
72
73 spinlock_t i8259A_lock = SPIN_LOCK_UNLOCKED;
74 int pit_latch_buggy;              /* extern */
75
76 u64 jiffies_64 = INITIAL_JIFFIES;
77
78 EXPORT_SYMBOL(jiffies_64);
79
80 #if defined(__x86_64__)
81 unsigned long vxtime_hz = PIT_TICK_RATE;
82 struct vxtime_data __vxtime __section_vxtime;   /* for vsyscalls */
83 volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
84 unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
85 struct timespec __xtime __section_xtime;
86 struct timezone __sys_tz __section_sys_tz;
87 #endif
88
89 #if defined(__x86_64__)
90 unsigned int cpu_khz;   /* Detected as we calibrate the TSC */
91 #else
92 unsigned long cpu_khz;  /* Detected as we calibrate the TSC */
93 #endif
94
95 extern unsigned long wall_jiffies;
96
97 DEFINE_SPINLOCK(rtc_lock);
98
99 DEFINE_SPINLOCK(i8253_lock);
100 EXPORT_SYMBOL(i8253_lock);
101
102 extern struct init_timer_opts timer_tsc_init;
103 extern struct timer_opts timer_tsc;
104 struct timer_opts *cur_timer = &timer_tsc;
105
106 /* These are peridically updated in shared_info, and then copied here. */
107 u32 shadow_tsc_stamp;
108 u64 shadow_system_time;
109 static u32 shadow_time_version;
110 static struct timeval shadow_tv;
111
112 /*
113  * We use this to ensure that gettimeofday() is monotonically increasing. We
114  * only break this guarantee if the wall clock jumps backwards "a long way".
115  */
116 static struct timeval last_seen_tv = {0,0};
117
118 #ifdef CONFIG_XEN_PRIVILEGED_GUEST
119 /* Periodically propagate synchronised time base to the RTC and to Xen. */
120 static long last_rtc_update, last_update_to_xen;
121 #endif
122
123 /* Periodically take synchronised time base from Xen, if we need it. */
124 static long last_update_from_xen;   /* UTC seconds when last read Xen clock. */
125
126 /* Keep track of last time we did processing/updating of jiffies and xtime. */
127 static u64 processed_system_time;   /* System time (ns) at last processing. */
128 static DEFINE_PER_CPU(u64, processed_system_time);
129
/* Length of one timer tick, in nanoseconds. */
#define NS_PER_TICK (1000000000ULL/HZ)

/*
 * Normalise a struct timeval-like object whose tv_usec has gone negative:
 * borrow one second at a time from tv_sec until tv_usec is >= 0.
 * The argument is evaluated several times, so it must have no side effects.
 */
#define HANDLE_USEC_UNDERFLOW(_tv) do {				\
	for (; (_tv).tv_usec < 0; (_tv).tv_sec--)		\
		(_tv).tv_usec += USEC_PER_SEC;			\
} while (0)

/*
 * Normalise a struct timeval-like object whose tv_usec has reached or
 * passed one second: carry one second at a time into tv_sec until
 * tv_usec is < USEC_PER_SEC.
 * The argument is evaluated several times, so it must have no side effects.
 */
#define HANDLE_USEC_OVERFLOW(_tv) do {				\
	for (; (_tv).tv_usec >= USEC_PER_SEC; (_tv).tv_sec++)	\
		(_tv).tv_usec -= USEC_PER_SEC;			\
} while (0)
144 static inline void __normalize_time(time_t *sec, s64 *nsec)
145 {
146         while (*nsec >= NSEC_PER_SEC) {
147                 (*nsec) -= NSEC_PER_SEC;
148                 (*sec)++;
149         }
150         while (*nsec < 0) {
151                 (*nsec) += NSEC_PER_SEC;
152                 (*sec)--;
153         }
154 }
155
156 /* Does this guest OS track Xen time, or set its wall clock independently? */
157 static int independent_wallclock = 0;
158 static int __init __independent_wallclock(char *str)
159 {
160         independent_wallclock = 1;
161         return 1;
162 }
163 __setup("independent_wallclock", __independent_wallclock);
164 #define INDEPENDENT_WALLCLOCK() \
165     (independent_wallclock || (xen_start_info.flags & SIF_INITDOMAIN))
166
167 /*
168  * Reads a consistent set of time-base values from Xen, into a shadow data
169  * area. Must be called with the xtime_lock held for writing.
170  */
171 static void __get_time_values_from_xen(void)
172 {
173         shared_info_t *s = HYPERVISOR_shared_info;
174
175         do {
176                 shadow_time_version = s->time_version2;
177                 rmb();
178                 shadow_tv.tv_sec    = s->wc_sec;
179                 shadow_tv.tv_usec   = s->wc_usec;
180                 shadow_tsc_stamp    = (u32)s->tsc_timestamp;
181                 shadow_system_time  = s->system_time;
182                 rmb();
183         }
184         while (shadow_time_version != s->time_version1);
185
186         cur_timer->mark_offset();
187 }
188
189 #define TIME_VALUES_UP_TO_DATE \
190  ({ rmb(); (shadow_time_version == HYPERVISOR_shared_info->time_version2); })
191
/*
 * Special lock owned by the CPU; it also holds the index register that
 * CPU is working with. Needed for NMI access to the CMOS/RTC registers.
 * See include/asm-i386/mc146818rtc.h for details.
 */
volatile unsigned long cmos_lock = 0;
EXPORT_SYMBOL(cmos_lock);

/*
 * CMOS RAM/RTC accessors. Both select the register by writing @addr to
 * RTC_PORT(0) and then transfer the data byte through RTC_PORT(1), with
 * the access bracketed by lock_cmos_prefix()/lock_cmos_suffix().
 */

/* Return the byte stored in CMOS/RTC register @addr. */
unsigned char rtc_cmos_read(unsigned char addr)
{
	unsigned char data;

	lock_cmos_prefix(addr);
	outb_p(addr, RTC_PORT(0));
	data = inb_p(RTC_PORT(1));
	lock_cmos_suffix(addr);

	return data;
}
EXPORT_SYMBOL(rtc_cmos_read);

/* Store @val into CMOS/RTC register @addr. */
void rtc_cmos_write(unsigned char val, unsigned char addr)
{
	lock_cmos_prefix(addr);
	outb_p(addr, RTC_PORT(0));
	outb_p(val, RTC_PORT(1));
	lock_cmos_suffix(addr);
}
EXPORT_SYMBOL(rtc_cmos_write);
220
221 /*
222  * This version of gettimeofday has microsecond resolution
223  * and better than microsecond precision on fast x86 machines with TSC.
224  */
225 void do_gettimeofday(struct timeval *tv)
226 {
227         unsigned long seq;
228         unsigned long usec, sec;
229         unsigned long max_ntp_tick;
230         unsigned long flags;
231         s64 nsec;
232
233         do {
234                 unsigned long lost;
235
236                 seq = read_seqbegin(&xtime_lock);
237
238                 usec = cur_timer->get_offset();
239                 lost = jiffies - wall_jiffies;
240
241                 /*
242                  * If time_adjust is negative then NTP is slowing the clock
243                  * so make sure not to go into next possible interval.
244                  * Better to lose some accuracy than have time go backwards..
245                  */
246                 if (unlikely(time_adjust < 0)) {
247                         max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj;
248                         usec = min(usec, max_ntp_tick);
249
250                         if (lost)
251                                 usec += lost * max_ntp_tick;
252                 }
253                 else if (unlikely(lost))
254                         usec += lost * (USEC_PER_SEC / HZ);
255
256                 sec = xtime.tv_sec;
257                 usec += (xtime.tv_nsec / NSEC_PER_USEC);
258
259                 nsec = shadow_system_time - processed_system_time;
260                 __normalize_time(&sec, &nsec);
261                 usec += (long)nsec / NSEC_PER_USEC;
262
263                 if (unlikely(!TIME_VALUES_UP_TO_DATE)) {
264                         /*
265                          * We may have blocked for a long time,
266                          * rendering our calculations invalid
267                          * (e.g. the time delta may have
268                          * overflowed). Detect that and recalculate
269                          * with fresh values.
270                          */
271                         write_seqlock_irqsave(&xtime_lock, flags);
272                         __get_time_values_from_xen();
273                         write_sequnlock_irqrestore(&xtime_lock, flags);
274                         continue;
275                 }
276         } while (read_seqretry(&xtime_lock, seq));
277
278         while (usec >= USEC_PER_SEC) {
279                 usec -= USEC_PER_SEC;
280                 sec++;
281         }
282
283         /* Ensure that time-of-day is monotonically increasing. */
284         if ((sec < last_seen_tv.tv_sec) ||
285             ((sec == last_seen_tv.tv_sec) && (usec < last_seen_tv.tv_usec))) {
286                 sec = last_seen_tv.tv_sec;
287                 usec = last_seen_tv.tv_usec;
288         } else {
289                 last_seen_tv.tv_sec = sec;
290                 last_seen_tv.tv_usec = usec;
291         }
292
293         tv->tv_sec = sec;
294         tv->tv_usec = usec;
295 }
296
297 EXPORT_SYMBOL(do_gettimeofday);
298
299 int do_settimeofday(struct timespec *tv)
300 {
301         time_t wtm_sec, sec = tv->tv_sec;
302         long wtm_nsec;
303         s64 nsec;
304         struct timespec xentime;
305
306         if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
307                 return -EINVAL;
308
309         if (!INDEPENDENT_WALLCLOCK())
310                 return 0; /* Silent failure? */
311
312         write_seqlock_irq(&xtime_lock);
313
314         /*
315          * Ensure we don't get blocked for a long time so that our time delta
316          * overflows. If that were to happen then our shadow time values would
317          * be stale, so we can retry with fresh ones.
318          */
319  again:
320         nsec = (s64)tv->tv_nsec -
321             ((s64)cur_timer->get_offset() * (s64)NSEC_PER_USEC);
322         if (unlikely(!TIME_VALUES_UP_TO_DATE)) {
323                 __get_time_values_from_xen();
324                 goto again;
325         }
326
327         __normalize_time(&sec, &nsec);
328         set_normalized_timespec(&xentime, sec, nsec);
329
330         /*
331          * This is revolting. We need to set "xtime" correctly. However, the
332          * value in this location is the value at the most recent update of
333          * wall time.  Discover what correction gettimeofday() would have
334          * made, and then undo it!
335          */
336         nsec -= (jiffies - wall_jiffies) * TICK_NSEC;
337
338         nsec -= (shadow_system_time - processed_system_time);
339
340         __normalize_time(&sec, &nsec);
341         wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
342         wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
343
344         set_normalized_timespec(&xtime, sec, nsec);
345         set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
346
347         time_adjust = 0;                /* stop active adjtime() */
348         time_status |= STA_UNSYNC;
349         time_maxerror = NTP_PHASE_LIMIT;
350         time_esterror = NTP_PHASE_LIMIT;
351
352         /* Reset all our running time counts. They make no sense now. */
353         last_seen_tv.tv_sec = 0;
354         last_update_from_xen = 0;
355
356 #ifdef CONFIG_XEN_PRIVILEGED_GUEST
357         if (xen_start_info.flags & SIF_INITDOMAIN) {
358                 dom0_op_t op;
359                 last_rtc_update = last_update_to_xen = 0;
360                 op.cmd = DOM0_SETTIME;
361                 op.u.settime.secs        = xentime.tv_sec;
362                 op.u.settime.usecs       = xentime.tv_nsec / NSEC_PER_USEC;
363                 op.u.settime.system_time = shadow_system_time;
364                 write_sequnlock_irq(&xtime_lock);
365                 HYPERVISOR_dom0_op(&op);
366         } else
367 #endif
368                 write_sequnlock_irq(&xtime_lock);
369
370         clock_was_set();
371         return 0;
372 }
373
374 EXPORT_SYMBOL(do_settimeofday);
375
#ifdef CONFIG_XEN_PRIVILEGED_GUEST
/*
 * Write @nowtime to the hardware clock under rtc_lock, through the EFI
 * routine when efi_enabled is set and the machine-specific one otherwise.
 * Returns whatever the chosen routine returns.
 */
static int set_rtc_mmss(unsigned long nowtime)
{
	int status;

	/* gets recalled with irq locally disabled */
	spin_lock(&rtc_lock);
	status = efi_enabled ? efi_set_rtc_mmss(nowtime)
			     : mach_set_rtc_mmss(nowtime);
	spin_unlock(&rtc_lock);

	return status;
}
#endif
392
393 /* monotonic_clock(): returns # of nanoseconds passed since time_init()
394  *              Note: This function is required to return accurate
395  *              time even in the absence of multiple timer ticks.
396  */
397 unsigned long long monotonic_clock(void)
398 {
399         return cur_timer->monotonic_clock();
400 }
401 EXPORT_SYMBOL(monotonic_clock);
402
#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
/*
 * Program counter to charge a profiling hit to. When the interrupted
 * instruction lies inside the lock functions, the word stored at ebp + 4
 * is returned instead (presumably the return address into the caller,
 * given that frame pointers are enabled -- confirm).
 */
unsigned long profile_pc(struct pt_regs *regs)
{
	unsigned long pc = instruction_pointer(regs);

	if (!in_lock_functions(pc))
		return pc;

	return *(unsigned long *)(regs->ebp + 4);
}
EXPORT_SYMBOL(profile_pc);
#endif
415
416 /*
417  * timer_interrupt() needs to keep up the real-time clock,
418  * as well as call the "do_timer()" routine every clocktick
419  */
420 static inline void do_timer_interrupt(int irq, void *dev_id,
421                                         struct pt_regs *regs)
422 {
423         time_t wtm_sec, sec;
424         s64 delta, delta_cpu, nsec;
425         long sec_diff, wtm_nsec;
426         int cpu = smp_processor_id();
427
428         do {
429                 __get_time_values_from_xen();
430
431                 delta = delta_cpu = (s64)shadow_system_time +
432                         ((s64)cur_timer->get_offset() * (s64)NSEC_PER_USEC);
433                 delta     -= processed_system_time;
434                 delta_cpu -= per_cpu(processed_system_time, cpu);
435         }
436         while (!TIME_VALUES_UP_TO_DATE);
437
438         if (unlikely(delta < 0) || unlikely(delta_cpu < 0)) {
439                 printk("Timer ISR/%d: Time went backwards: "
440                        "delta=%lld cpu_delta=%lld shadow=%lld "
441                        "off=%lld processed=%lld cpu_processed=%lld\n",
442                        cpu, delta, delta_cpu, shadow_system_time,
443                        ((s64)cur_timer->get_offset() * (s64)NSEC_PER_USEC), 
444                        processed_system_time,
445                        per_cpu(processed_system_time, cpu));
446                 for (cpu = 0; cpu < num_online_cpus(); cpu++)
447                         printk(" %d: %lld\n", cpu,
448                                per_cpu(processed_system_time, cpu));
449                 return;
450         }
451
452         /* System-wide jiffy work. */
453         while (delta >= NS_PER_TICK) {
454                 delta -= NS_PER_TICK;
455                 processed_system_time += NS_PER_TICK;
456                 do_timer(regs);
457         }
458
459         /* Local CPU jiffy work. */
460         while (delta_cpu >= NS_PER_TICK) {
461                 delta_cpu -= NS_PER_TICK;
462                 per_cpu(processed_system_time, cpu) += NS_PER_TICK;
463                 update_process_times(user_mode(regs));
464                 profile_tick(CPU_PROFILING, regs);
465         }
466
467         if (cpu != 0)
468                 return;
469
470         /*
471          * Take synchronised time from Xen once a minute if we're not
472          * synchronised ourselves, and we haven't chosen to keep an independent
473          * time base.
474          */
475         if (!INDEPENDENT_WALLCLOCK() &&
476             ((time_status & STA_UNSYNC) != 0) &&
477             (xtime.tv_sec > (last_update_from_xen + 60))) {
478                 /* Adjust shadow for jiffies that haven't updated xtime yet. */
479                 shadow_tv.tv_usec -= 
480                         (jiffies - wall_jiffies) * (USEC_PER_SEC / HZ);
481                 HANDLE_USEC_UNDERFLOW(shadow_tv);
482
483                 /*
484                  * Reset our running time counts if they are invalidated by
485                  * a warp backwards of more than 500ms.
486                  */
487                 sec_diff = xtime.tv_sec - shadow_tv.tv_sec;
488                 if (unlikely(abs(sec_diff) > 1) ||
489                     unlikely(((sec_diff * USEC_PER_SEC) +
490                               (xtime.tv_nsec / NSEC_PER_USEC) -
491                               shadow_tv.tv_usec) > 500000)) {
492 #ifdef CONFIG_XEN_PRIVILEGED_GUEST
493                         last_rtc_update = last_update_to_xen = 0;
494 #endif
495                         last_seen_tv.tv_sec = 0;
496                 }
497
498                 /* Update our unsynchronised xtime appropriately. */
499                 sec = shadow_tv.tv_sec;
500                 nsec = shadow_tv.tv_usec * NSEC_PER_USEC;
501
502                 __normalize_time(&sec, &nsec);
503                 wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
504                 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
505
506                 set_normalized_timespec(&xtime, sec, nsec);
507                 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
508
509                 last_update_from_xen = sec;
510         }
511
512 #ifdef CONFIG_XEN_PRIVILEGED_GUEST
513         if (!(xen_start_info.flags & SIF_INITDOMAIN))
514                 return;
515
516         /* Send synchronised time to Xen approximately every minute. */
517         if (((time_status & STA_UNSYNC) == 0) &&
518             (xtime.tv_sec > (last_update_to_xen + 60))) {
519                 dom0_op_t op;
520                 struct timeval tv;
521
522                 tv.tv_sec   = xtime.tv_sec;
523                 tv.tv_usec  = xtime.tv_nsec / NSEC_PER_USEC;
524                 tv.tv_usec += (jiffies - wall_jiffies) * (USEC_PER_SEC/HZ);
525                 HANDLE_USEC_OVERFLOW(tv);
526
527                 op.cmd = DOM0_SETTIME;
528                 op.u.settime.secs        = tv.tv_sec;
529                 op.u.settime.usecs       = tv.tv_usec;
530                 op.u.settime.system_time = shadow_system_time;
531                 HYPERVISOR_dom0_op(&op);
532
533                 last_update_to_xen = xtime.tv_sec;
534         }
535
536         /*
537          * If we have an externally synchronized Linux clock, then update
538          * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
539          * called as close as possible to 500 ms before the new second starts.
540          */
541         if ((time_status & STA_UNSYNC) == 0 &&
542             xtime.tv_sec > last_rtc_update + 660 &&
543             (xtime.tv_nsec / 1000)
544                         >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
545             (xtime.tv_nsec / 1000)
546                         <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2) {
547                 last_rtc_update = xtime.tv_sec;
548                 if (efi_enabled) {
549                         if (efi_set_rtc_mmss(xtime.tv_sec))
550                                 last_rtc_update -= 600;
551                 } else if (set_rtc_mmss(xtime.tv_sec))
552                         last_rtc_update -= 600;
553         }
554 #endif
555 }
556
557 /*
558  * This is the same as the above, except we _also_ save the current
559  * Time Stamp Counter value at the time of the timer interrupt, so that
560  * we later on can estimate the time of day more exactly.
561  */
562 irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
563 {
564         /*
565          * Here we are in the timer irq handler. We just have irqs locally
566          * disabled but we don't know if the timer_bh is running on the other
567          * CPU. We need to avoid to SMP race with it. NOTE: we don' t need
568          * the irq version of write_lock because as just said we have irq
569          * locally disabled. -arca
570          */
571         write_seqlock(&xtime_lock);
572         do_timer_interrupt(irq, NULL, regs);
573         write_sequnlock(&xtime_lock);
574         return IRQ_HANDLED;
575 }
576
577 /* not static: needed by APM */
578 unsigned long get_cmos_time(void)
579 {
580         unsigned long retval;
581
582         spin_lock(&rtc_lock);
583
584         if (efi_enabled)
585                 retval = efi_get_time();
586         else
587                 retval = mach_get_cmos_time();
588
589         spin_unlock(&rtc_lock);
590
591         return retval;
592 }
593
594 static long clock_cmos_diff, sleep_start;
595
596 static int timer_suspend(struct sys_device *dev, u32 state)
597 {
598         /*
599          * Estimate time zone so that set_time can update the clock
600          */
601         clock_cmos_diff = -get_cmos_time();
602         clock_cmos_diff += get_seconds();
603         sleep_start = get_cmos_time();
604         return 0;
605 }
606
607 static int timer_resume(struct sys_device *dev)
608 {
609         unsigned long flags;
610         unsigned long sec;
611         unsigned long sleep_length;
612
613 #ifdef CONFIG_HPET_TIMER
614         if (is_hpet_enabled())
615                 hpet_reenable();
616 #endif
617         sec = get_cmos_time() + clock_cmos_diff;
618         sleep_length = (get_cmos_time() - sleep_start) * HZ;
619         write_seqlock_irqsave(&xtime_lock, flags);
620         xtime.tv_sec = sec;
621         xtime.tv_nsec = 0;
622         write_sequnlock_irqrestore(&xtime_lock, flags);
623         jiffies += sleep_length;
624         wall_jiffies += sleep_length;
625         return 0;
626 }
627
628 static struct sysdev_class timer_sysclass = {
629         .resume = timer_resume,
630         .suspend = timer_suspend,
631         set_kset_name("timer"),
632 };
633
634
635 /* XXX this driverfs stuff should probably go elsewhere later -john */
636 static struct sys_device device_timer = {
637         .id     = 0,
638         .cls    = &timer_sysclass,
639 };
640
641 static int time_init_device(void)
642 {
643         int error = sysdev_class_register(&timer_sysclass);
644         if (!error)
645                 error = sysdev_register(&device_timer);
646         return error;
647 }
648
649 device_initcall(time_init_device);
650
#ifdef CONFIG_HPET_TIMER
extern void (*late_time_init)(void);

/*
 * Late time initialisation used when HPET is available: seed xtime from
 * the CMOS clock, try to enable the HPET, select the time source and run
 * the platform hook. Installed as late_time_init by time_init().
 */
static void __init hpet_time_init(void)
{
	xtime.tv_sec = get_cmos_time();
	xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);

	/* wall_to_monotonic starts out as the negation of xtime. */
	set_normalized_timespec(&wall_to_monotonic,
				-xtime.tv_sec, -xtime.tv_nsec);

	if (hpet_enable() >= 0)
		printk("Using HPET for base-timer\n");

	cur_timer = select_timer();
	printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name);

	time_init_hook();
}
#endif
671
672 /* Dynamically-mapped IRQ. */
673 static DEFINE_PER_CPU(int, timer_irq);
674
675 static struct irqaction irq_timer = {
676         timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer0",
677         NULL, NULL
678 };
679
680 void __init time_init(void)
681 {
682 #ifdef CONFIG_HPET_TIMER
683         if (is_hpet_capable()) {
684                 /*
685                  * HPET initialization needs to do memory-mapped io. So, let
686                  * us do a late initialization after mem_init().
687                  */
688                 late_time_init = hpet_time_init;
689                 return;
690         }
691 #endif
692         __get_time_values_from_xen();
693         xtime.tv_sec = shadow_tv.tv_sec;
694         xtime.tv_nsec = shadow_tv.tv_usec * NSEC_PER_USEC;
695         set_normalized_timespec(&wall_to_monotonic,
696                 -xtime.tv_sec, -xtime.tv_nsec);
697         processed_system_time = shadow_system_time;
698         per_cpu(processed_system_time, 0) = processed_system_time;
699
700         if (timer_tsc_init.init(NULL) != 0)
701                 BUG();
702         printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name);
703
704 #if defined(__x86_64__)
705         vxtime.mode = VXTIME_TSC;
706         vxtime.quot = (1000000L << 32) / vxtime_hz;
707         vxtime.tsc_quot = (1000L << 32) / cpu_khz;
708         vxtime.hz = vxtime_hz;
709         sync_core();
710         rdtscll(vxtime.last_tsc);
711 #endif
712
713         per_cpu(timer_irq, 0) = bind_virq_to_irq(VIRQ_TIMER);
714         (void)setup_irq(per_cpu(timer_irq, 0), &irq_timer);
715 }
716
717 /* Convert jiffies to system time. Call with xtime_lock held for reading. */
718 static inline u64 __jiffies_to_st(unsigned long j) 
719 {
720         return processed_system_time + ((j - jiffies) * NS_PER_TICK);
721 }
722
723 /*
724  * This function works out when the the next timer function has to be
725  * executed (by looking at the timer list) and sets the Xen one-shot
726  * domain timer to the appropriate value. This is typically called in
727  * cpu_idle() before the domain blocks.
728  * 
729  * The function returns a non-0 value on error conditions.
730  * 
731  * It must be called with interrupts disabled.
732  */
733 int set_timeout_timer(void)
734 {
735         u64 alarm = 0;
736         int ret = 0;
737         unsigned long j;
738 #ifdef CONFIG_SMP
739         unsigned long seq;
740 #endif
741
742         /*
743          * This is safe against long blocking (since calculations are
744          * not based on TSC deltas). It is also safe against warped
745          * system time since suspend-resume is cooperative and we
746          * would first get locked out.
747          */
748 #ifdef CONFIG_SMP
749         do {
750                 seq = read_seqbegin(&xtime_lock);
751                 j = jiffies + 1;
752                 alarm = __jiffies_to_st(j);
753         } while (read_seqretry(&xtime_lock, seq));
754 #else
755         j = next_timer_interrupt();
756         if (j < (jiffies + 1))
757                 j = jiffies + 1;
758         alarm = __jiffies_to_st(j);
759 #endif
760
761         /* Failure is pretty bad, but we'd best soldier on. */
762         if ( HYPERVISOR_set_timer_op(alarm) != 0 )
763                 ret = -1;
764
765         return ret;
766 }
767
768 void time_suspend(void)
769 {
770         /* nothing */
771 }
772
773 /* No locking required. We are only CPU running, and interrupts are off. */
774 void time_resume(void)
775 {
776         if (timer_tsc_init.init(NULL) != 0)
777                 BUG();
778
779         /* Get timebases for new environment. */ 
780         __get_time_values_from_xen();
781
782         /* Reset our own concept of passage of system time. */
783         processed_system_time = shadow_system_time;
784         per_cpu(processed_system_time, 0) = processed_system_time;
785
786         /* Accept a warp in UTC (wall-clock) time. */
787         last_seen_tv.tv_sec = 0;
788
789         /* Make sure we resync UTC time with Xen on next timer interrupt. */
790         last_update_from_xen = 0;
791 }
792
#ifdef CONFIG_SMP
/* Per-CPU IRQ names ("timer<N>") passed to request_irq(). */
static char timer_name[NR_CPUS][15];

/*
 * Set up the timer interrupt for the calling CPU: start its processed
 * time at the current shadow system time, bind VIRQ_TIMER to an IRQ and
 * install timer_interrupt() on it. Failure to get the IRQ is fatal.
 */
void local_setup_timer(void)
{
	unsigned long seq;
	int cpu = smp_processor_id();
	int err;

	/* Sample shadow_system_time consistently against writers. */
	do {
		seq = read_seqbegin(&xtime_lock);
		per_cpu(processed_system_time, cpu) = shadow_system_time;
	} while (read_seqretry(&xtime_lock, seq));

	per_cpu(timer_irq, cpu) = bind_virq_to_irq(VIRQ_TIMER);

	/* Bounded write: the name buffer is only 15 bytes. */
	snprintf(timer_name[cpu], sizeof(timer_name[cpu]), "timer%d", cpu);

	/*
	 * Keep the call with side effects out of BUG_ON() so that the IRQ
	 * is requested regardless of how the assertion macro is defined.
	 */
	err = request_irq(per_cpu(timer_irq, cpu), timer_interrupt,
			  SA_INTERRUPT, timer_name[cpu], NULL);
	BUG_ON(err);
}
#endif
810
811 /*
812  * /proc/sys/xen: This really belongs in another file. It can stay here for
813  * now however.
814  */
815 static ctl_table xen_subtable[] = {
816         {1, "independent_wallclock", &independent_wallclock,
817          sizeof(independent_wallclock), 0644, NULL, proc_dointvec},
818         {0}
819 };
820 static ctl_table xen_table[] = {
821         {123, "xen", NULL, 0, 0555, xen_subtable},
822         {0}
823 };
824 static int __init xen_sysctl_init(void)
825 {
826         (void)register_sysctl_table(xen_table, 0);
827         return 0;
828 }
829 __initcall(xen_sysctl_init);