/*
 *  linux/arch/i386/kernel/time.c
 *
 *  Copyright (C) 1991, 1992, 1995  Linus Torvalds
 *
 * This file contains the PC-specific time handling details:
 * reading the RTC at bootup, etc..
 * 1994-07-02	Alan Modra
 *	fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
 * 1995-03-26	Markus Kuhn
 *	fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
 *	precision CMOS clock update
 * 1996-05-03	Ingo Molnar
 *	fixed time warps in do_[slow|fast]_gettimeoffset()
 * 1997-09-10	Updated NTP code according to technical memorandum Jan '96
 *	"A Kernel Model for Precision Timekeeping" by Dave Mills
 * 1998-09-05	(Various)
 *	More robust do_fast_gettimeoffset() algorithm implemented
 *	(works with APM, Cyrix 6x86MX and Centaur C6),
 *	monotonic gettimeofday() with fast_get_timeoffset(),
 *	drift-proof precision TSC calibration on boot
 *	(C. Scott Ananian, Andrew D. Balsa, Philip Gladstone;
 *	ported from 2.0.35 Jumbo-9 by Michael Krause).
 * 1998-12-16	Andrea Arcangeli
 *	Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
 *	because it was not accounting for lost_ticks.
 * 1998-12-24	Copyright (C) 1998  Andrea Arcangeli
 *	Fixed an xtime SMP race (we need the xtime_lock rw spinlock to
 *	serialize accesses to xtime/lost_ticks).
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "mach_time.h"
#include
#include
#include
#include
#include

int pit_latch_buggy;		/* extern */

unsigned long vxtime_hz = PIT_TICK_RATE;
struct vxtime_data __vxtime __section_vxtime;	/* for vsyscalls */
volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
struct timespec __xtime __section_xtime;
struct timezone __sys_tz __section_sys_tz;

#define USEC_PER_TICK (USEC_PER_SEC / HZ)
#define NSEC_PER_TICK (NSEC_PER_SEC / HZ)
#define FSEC_PER_TICK (FSEC_PER_SEC / HZ)

#define NS_SCALE	10	/* 2^10, carefully chosen */
#define US_SCALE	32	/* 2^32, arbitrarily chosen */

unsigned int cpu_khz;	/* Detected as we calibrate the TSC */
EXPORT_SYMBOL(cpu_khz);

DEFINE_SPINLOCK(rtc_lock);
EXPORT_SYMBOL(rtc_lock);

extern struct init_timer_opts timer_tsc_init;
extern struct timer_opts timer_tsc;
#define timer_none timer_tsc

/* These are periodically updated in shared_info, and then copied here. */
struct shadow_time_info {
	u64 tsc_timestamp;     /* TSC at last update of time vals.  */
	u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
	u32 tsc_to_nsec_mul;
	u32 tsc_to_usec_mul;
	int tsc_shift;
	u32 version;
};

static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
static struct timespec shadow_tv;
static u32 shadow_tv_version;

/* Keep track of last time we did processing/updating of jiffies and xtime. */
static u64 processed_system_time;   /* System time (ns) at last processing. */
static DEFINE_PER_CPU(u64, processed_system_time);

/* How much CPU time was spent blocked and how much was 'stolen'? */
static DEFINE_PER_CPU(u64, processed_stolen_time);
static DEFINE_PER_CPU(u64, processed_blocked_time);

/* Current runstate of each CPU (updated automatically by the hypervisor). */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
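/*
 * Overview of the scheme implemented below: Xen exports, per VCPU, a TSC
 * timestamp, a system time in nanoseconds and a TSC->ns scaling factor in
 * the shared_info page.  get_time_values_from_xen() snapshots these into
 * the per-CPU shadow_time structure; current time is then the snapshotted
 * system time plus the scaled TSC delta since the snapshot.
 * processed_system_time records how much of that system time has already
 * been folded into jiffies/xtime, and the per-CPU copies do the same for
 * the per-CPU stolen/blocked/process-time accounting.
 */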
/* Must be signed, as it's compared with s64 quantities which can be -ve. */
#define NS_PER_TICK (1000000000LL/HZ)

static inline void __normalize_time(time_t *sec, s64 *nsec)
{
	while (*nsec >= NSEC_PER_SEC) {
		(*nsec) -= NSEC_PER_SEC;
		(*sec)++;
	}
	while (*nsec < 0) {
		(*nsec) += NSEC_PER_SEC;
		(*sec)--;
	}
}

/* Does this guest OS track Xen time, or set its wall clock independently? */
static int independent_wallclock = 0;
static int __init __independent_wallclock(char *str)
{
	independent_wallclock = 1;
	return 1;
}
__setup("independent_wallclock", __independent_wallclock);

/* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
static int __init __permitted_clock_jitter(char *str)
{
	permitted_clock_jitter = simple_strtoul(str, NULL, 0);
	return 1;
}
__setup("permitted_clock_jitter=", __permitted_clock_jitter);

#ifndef CONFIG_X86
int tsc_disable __devinitdata = 0;
#endif

static void delay_tsc(unsigned long loops)
{
	unsigned long bclock, now;

	rdtscl(bclock);
	do {
		rep_nop();
		rdtscl(now);
	} while ((now - bclock) < loops);
}

struct timer_opts timer_tsc = {
	.name = "tsc",
	.delay = delay_tsc,
};

/*
 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
 * yielding a 64-bit result.
 */
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
	u64 product;

	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;

	__asm__ (
		"mul %%rdx ; shrd $32,%%rdx,%%rax"
		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );

	return product;
}

void init_cpu_khz(void)
{
	u64 __cpu_khz = 1000000ULL << US_SCALE;
	struct vcpu_time_info *info;
	info = &HYPERVISOR_shared_info->vcpu_info[0].time;
	do_div(__cpu_khz, info->tsc_to_system_mul);
	if (info->tsc_shift < 0)
		cpu_khz = __cpu_khz << -info->tsc_shift;
	else
		cpu_khz = __cpu_khz >> info->tsc_shift;
}

static u64 get_nsec_offset(struct shadow_time_info *shadow)
{
	u64 now, delta;
	rdtscll(now);
	delta = now - shadow->tsc_timestamp;
	return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
}

static unsigned long get_usec_offset(struct shadow_time_info *shadow)
{
	u64 now, delta;
	rdtscll(now);
	delta = now - shadow->tsc_timestamp;
	return scale_delta(delta, shadow->tsc_to_usec_mul, shadow->tsc_shift);
}

static void __update_wallclock(time_t sec, long nsec)
{
	long wtm_nsec, xtime_nsec;
	time_t wtm_sec, xtime_sec;
	u64 tmp, wc_nsec;

	/* Adjust wall-clock time base based on jiffies ticks. */
	wc_nsec = processed_system_time;
	wc_nsec += sec * (u64)NSEC_PER_SEC;
	wc_nsec += nsec;

	/* Split wallclock base into seconds and nanoseconds. */
	tmp = wc_nsec;
	xtime_nsec = do_div(tmp, 1000000000);
	xtime_sec  = (time_t)tmp;

	wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec);
	wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec);

	set_normalized_timespec(&xtime, xtime_sec, xtime_nsec);
	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);

	ntp_clear();
}

static void update_wallclock(void)
{
	shared_info_t *s = HYPERVISOR_shared_info;

	do {
		shadow_tv_version = s->wc_version;
		rmb();
		shadow_tv.tv_sec  = s->wc_sec;
		shadow_tv.tv_nsec = s->wc_nsec;
		rmb();
	} while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));

	if (!independent_wallclock)
		__update_wallclock(shadow_tv.tv_sec, shadow_tv.tv_nsec);
}
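/*
 * Note on the version protocol used above and below: Xen bumps the version
 * field before it starts updating a time record and again when it has
 * finished, so an odd version means an update is in progress.  Readers
 * therefore loop until they observe an even version that did not change
 * across the copy; the rmb()s order the data reads against the version
 * checks.
 */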
/*
 * Reads a consistent set of time-base values from Xen, into a shadow data
 * area.
 */
static void get_time_values_from_xen(void)
{
	shared_info_t *s = HYPERVISOR_shared_info;
	struct vcpu_time_info   *src;
	struct shadow_time_info *dst;

	src = &s->vcpu_info[smp_processor_id()].time;
	dst = &per_cpu(shadow_time, smp_processor_id());

	do {
		dst->version = src->version;
		rmb();
		dst->tsc_timestamp    = src->tsc_timestamp;
		dst->system_timestamp = src->system_time;
		dst->tsc_to_nsec_mul  = src->tsc_to_system_mul;
		dst->tsc_shift        = src->tsc_shift;
		rmb();
	} while ((src->version & 1) | (dst->version ^ src->version));

	dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
}

static inline int time_values_up_to_date(int cpu)
{
	struct vcpu_time_info   *src;
	struct shadow_time_info *dst;

	src = &HYPERVISOR_shared_info->vcpu_info[cpu].time;
	dst = &per_cpu(shadow_time, cpu);

	rmb();
	return (dst->version == src->version);
}

/*
 * This is a special lock that is owned by the CPU and holds the index
 * register we are working with.  It is required for NMI access to the
 * CMOS/RTC registers.  See include/asm-i386/mc146818rtc.h for details.
 */
volatile unsigned long cmos_lock = 0;
EXPORT_SYMBOL(cmos_lock);

/* Routines for accessing the CMOS RAM/RTC. */
unsigned char rtc_cmos_read(unsigned char addr)
{
	unsigned char val;
	lock_cmos_prefix(addr);
	outb_p(addr, RTC_PORT(0));
	val = inb_p(RTC_PORT(1));
	lock_cmos_suffix(addr);
	return val;
}
EXPORT_SYMBOL(rtc_cmos_read);

void rtc_cmos_write(unsigned char val, unsigned char addr)
{
	lock_cmos_prefix(addr);
	outb_p(addr, RTC_PORT(0));
	outb_p(val, RTC_PORT(1));
	lock_cmos_suffix(addr);
}
EXPORT_SYMBOL(rtc_cmos_write);

/*
 * This version of gettimeofday has microsecond resolution
 * and better than microsecond precision on fast x86 machines with TSC.
 */
void do_gettimeofday(struct timeval *tv)
{
	unsigned long seq;
	unsigned long usec, sec;
	unsigned long max_ntp_tick;
	s64 nsec;
	unsigned int cpu;
	struct shadow_time_info *shadow;
	u32 local_time_version;

	cpu = get_cpu();
	shadow = &per_cpu(shadow_time, cpu);

	do {
		local_time_version = shadow->version;
		seq = read_seqbegin(&xtime_lock);

		usec = get_usec_offset(shadow);

		/*
		 * If time_adjust is negative then NTP is slowing the clock
		 * so make sure not to go into next possible interval.
		 * Better to lose some accuracy than have time go backwards..
		 */
		if (unlikely(time_adjust < 0)) {
			max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj;
			usec = min(usec, max_ntp_tick);
		}

		sec = xtime.tv_sec;
		usec += (xtime.tv_nsec / NSEC_PER_USEC);

		nsec = shadow->system_timestamp - processed_system_time;
		__normalize_time(&sec, &nsec);
		usec += (long)nsec / NSEC_PER_USEC;

		if (unlikely(!time_values_up_to_date(cpu))) {
			/*
			 * We may have blocked for a long time,
			 * rendering our calculations invalid
			 * (e.g. the time delta may have
			 * overflowed).  Detect that and recalculate
			 * with fresh values.
			 */
			get_time_values_from_xen();
			continue;
		}
	} while (read_seqretry(&xtime_lock, seq) ||
		 (local_time_version != shadow->version));

	put_cpu();

	while (usec >= USEC_PER_SEC) {
		usec -= USEC_PER_SEC;
		sec++;
	}

	tv->tv_sec = sec;
	tv->tv_usec = usec;
}
EXPORT_SYMBOL(do_gettimeofday);
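/*
 * Summary of the two paths in do_settimeofday() below: in the initial
 * domain (and unless 'independent_wallclock' is set) the new wall clock is
 * pushed to Xen via DOM0_SETTIME and then re-read with update_wallclock();
 * with an independent wallclock the update is applied locally via
 * __update_wallclock() and Xen is left untouched.  In any other domain the
 * call leaves the wall clock unchanged.
 */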
int do_settimeofday(struct timespec *tv)
{
	time_t sec;
	s64 nsec;
	unsigned int cpu;
	struct shadow_time_info *shadow;
	dom0_op_t op;

	if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
		return -EINVAL;

	cpu = get_cpu();
	shadow = &per_cpu(shadow_time, cpu);

	write_seqlock_irq(&xtime_lock);

	/*
	 * If we were blocked for a long time our time delta may have
	 * overflowed, leaving the shadow time values stale; in that case
	 * retry with fresh values from Xen.
	 */
	for (;;) {
		nsec = tv->tv_nsec - get_nsec_offset(shadow);
		if (time_values_up_to_date(cpu))
			break;
		get_time_values_from_xen();
	}
	sec = tv->tv_sec;
	__normalize_time(&sec, &nsec);

	if (is_initial_xendomain() && !independent_wallclock) {
		op.cmd = DOM0_SETTIME;
		op.u.settime.secs        = sec;
		op.u.settime.nsecs       = nsec;
		op.u.settime.system_time = shadow->system_timestamp;
		HYPERVISOR_dom0_op(&op);
		update_wallclock();
	} else if (independent_wallclock) {
		nsec -= shadow->system_timestamp;
		__normalize_time(&sec, &nsec);
		__update_wallclock(sec, nsec);
	}

	write_sequnlock_irq(&xtime_lock);
	put_cpu();

	clock_was_set();
	return 0;
}
EXPORT_SYMBOL(do_settimeofday);

static void sync_xen_wallclock(unsigned long dummy);
static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
static void sync_xen_wallclock(unsigned long dummy)
{
	time_t sec;
	s64 nsec;
	dom0_op_t op;

	if (!ntp_synced() || independent_wallclock || !is_initial_xendomain())
		return;

	write_seqlock_irq(&xtime_lock);

	sec  = xtime.tv_sec;
	nsec = xtime.tv_nsec;
	__normalize_time(&sec, &nsec);

	op.cmd = DOM0_SETTIME;
	op.u.settime.secs        = sec;
	op.u.settime.nsecs       = nsec;
	op.u.settime.system_time = processed_system_time;
	HYPERVISOR_dom0_op(&op);

	update_wallclock();

	write_sequnlock_irq(&xtime_lock);

	/* Once per minute. */
	mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
}

static int set_rtc_mmss(unsigned long nowtime)
{
	int retval;
	unsigned long flags;

	if (independent_wallclock || !is_initial_xendomain())
		return 0;

	/* gets recalled with irq locally disabled */
	spin_lock_irqsave(&rtc_lock, flags);
	if (efi_enabled)
		retval = efi_set_rtc_mmss(nowtime);
	else
		retval = mach_set_rtc_mmss(nowtime);
	spin_unlock_irqrestore(&rtc_lock, flags);

	return retval;
}

/*
 * monotonic_clock(): returns # of nanoseconds passed since time_init()
 * Note: This function is required to return accurate
 * time even in the absence of multiple timer ticks.
 */
unsigned long long monotonic_clock(void)
{
	int cpu = get_cpu();
	struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
	u64 time;
	u32 local_time_version;

	do {
		local_time_version = shadow->version;
		barrier();
		time = shadow->system_timestamp + get_nsec_offset(shadow);
		if (!time_values_up_to_date(cpu))
			get_time_values_from_xen();
		barrier();
	} while (local_time_version != shadow->version);

	put_cpu();

	return time;
}
EXPORT_SYMBOL(monotonic_clock);

unsigned long long sched_clock(void)
{
	return monotonic_clock();
}

unsigned long profile_pc(struct pt_regs *regs)
{
	unsigned long pc = instruction_pointer(regs);

	/*
	 * Assume the lock function has either no stack frame or a copy of
	 * eflags from PUSHF.  Eflags always has bits 22 and up cleared,
	 * unlike kernel addresses.
	 */
	if (!user_mode_vm(regs) && in_lock_functions(pc)) {
		unsigned long *sp = (unsigned long *)regs->rsp;
		if (sp[0] >> 22)
			return sp[0];
		if (sp[1] >> 22)
			return sp[1];
	}
	return pc;
}
EXPORT_SYMBOL(profile_pc);
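/*
 * Summary of the interrupt handler below: the elapsed Xen system time since
 * the last run is split into a system-wide component ('delta', measured
 * against processed_system_time and folded into jiffies/xtime) and a
 * per-CPU component ('delta_cpu', measured against the per-CPU copy).  The
 * per-CPU component is further divided, using the hypervisor-maintained
 * runstate area, into stolen time (runnable/offline), blocked time and
 * genuine execution time, each accounted in whole ticks of NS_PER_TICK.
 * As an illustration only (HZ is configuration dependent): with HZ=100,
 * NS_PER_TICK is 10,000,000 ns, so a delta of 25,000,000 ns accounts two
 * jiffies and carries the remaining 5,000,000 ns forward to the next run.
 */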
irqreturn_t timer_interrupt(int irq, void *dev_id)
{
	s64 delta, delta_cpu, stolen, blocked;
	u64 sched_time;
	int i, cpu = smp_processor_id();
	struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
	struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);

	write_seqlock(&xtime_lock);

	do {
		get_time_values_from_xen();

		/* Obtain a consistent snapshot of elapsed wallclock cycles. */
		delta = delta_cpu =
			shadow->system_timestamp + get_nsec_offset(shadow);
		delta     -= processed_system_time;
		delta_cpu -= per_cpu(processed_system_time, cpu);

		/*
		 * Obtain a consistent snapshot of stolen/blocked cycles.  We
		 * can use state_entry_time to detect if we get preempted here.
		 */
		do {
			sched_time = runstate->state_entry_time;
			barrier();
			stolen = runstate->time[RUNSTATE_runnable] +
				 runstate->time[RUNSTATE_offline] -
				 per_cpu(processed_stolen_time, cpu);
			blocked = runstate->time[RUNSTATE_blocked] -
				  per_cpu(processed_blocked_time, cpu);
			barrier();
		} while (sched_time != runstate->state_entry_time);
	} while (!time_values_up_to_date(cpu));

	if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
	     unlikely(delta_cpu < -(s64)permitted_clock_jitter))
	    && printk_ratelimit()) {
		printk("Timer ISR/%d: Time went backwards: "
		       "delta=%lld delta_cpu=%lld shadow=%lld "
		       "off=%lld processed=%lld cpu_processed=%lld\n",
		       cpu, delta, delta_cpu, shadow->system_timestamp,
		       (s64)get_nsec_offset(shadow),
		       processed_system_time,
		       per_cpu(processed_system_time, cpu));
		for (i = 0; i < num_online_cpus(); i++)
			printk(" %d: %lld\n", i,
			       per_cpu(processed_system_time, i));
	}

	/* System-wide jiffy work. */
	while (delta >= NS_PER_TICK) {
		delta -= NS_PER_TICK;
		processed_system_time += NS_PER_TICK;
		do_timer(1);
	}

	if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
		update_wallclock();
		clock_was_set();
	}

	write_sequnlock(&xtime_lock);

	/*
	 * Account stolen ticks.
	 * HACK: Passing NULL to account_steal_time()
	 * ensures that the ticks are accounted as stolen.
	 */
	if ((stolen > 0) && (delta_cpu > 0)) {
		delta_cpu -= stolen;
		if (unlikely(delta_cpu < 0))
			stolen += delta_cpu; /* clamp local-time progress */
		do_div(stolen, NS_PER_TICK);
		per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
		per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
		account_steal_time(NULL, (cputime_t)stolen);
	}

	/*
	 * Account blocked ticks.
	 * HACK: Passing idle_task to account_steal_time()
	 * ensures that the ticks are accounted as idle/wait.
	 */
	if ((blocked > 0) && (delta_cpu > 0)) {
		delta_cpu -= blocked;
		if (unlikely(delta_cpu < 0))
			blocked += delta_cpu; /* clamp local-time progress */
		do_div(blocked, NS_PER_TICK);
		per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
		per_cpu(processed_system_time, cpu) += blocked * NS_PER_TICK;
		account_steal_time(idle_task(cpu), (cputime_t)blocked);
	}

	/* Account user/system ticks. */
	if (delta_cpu > 0) {
		do_div(delta_cpu, NS_PER_TICK);
		per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
		if (user_mode(get_irq_regs()))
			account_user_time(current, (cputime_t)delta_cpu);
		else
			account_system_time(current, HARDIRQ_OFFSET,
					    (cputime_t)delta_cpu);
	}

	/* Local timer processing (see update_process_times()). */
	run_local_timers();
	if (rcu_pending(cpu))
		rcu_check_callbacks(cpu, user_mode(get_irq_regs()));
	scheduler_tick();
	run_posix_cpu_timers(current);

	return IRQ_HANDLED;
}
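/*
 * Note on the registration below: VCPUOP_register_runstate_memory_area asks
 * the hypervisor to keep the per-CPU 'runstate' structure permanently up to
 * date, so timer_interrupt() can read stolen/blocked time from it without a
 * further hypercall.  The current totals are recorded as baselines so that
 * only time accrued after this point is accounted.
 */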
static void init_missing_ticks_accounting(int cpu)
{
	struct vcpu_register_runstate_memory_area area;
	struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);

	memset(runstate, 0, sizeof(*runstate));

	area.addr.v = runstate;
	HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);

	per_cpu(processed_blocked_time, cpu) =
		runstate->time[RUNSTATE_blocked];
	per_cpu(processed_stolen_time, cpu) =
		runstate->time[RUNSTATE_runnable] +
		runstate->time[RUNSTATE_offline];
}

/* not static: needed by APM */
unsigned long get_cmos_time(void)
{
	unsigned long retval;
	unsigned long flags;

	spin_lock_irqsave(&rtc_lock, flags);

	if (efi_enabled)
		retval = efi_get_time();
	else
		retval = mach_get_cmos_time();

	spin_unlock_irqrestore(&rtc_lock, flags);

	return retval;
}
EXPORT_SYMBOL(get_cmos_time);

static void sync_cmos_clock(unsigned long dummy);
static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);

static void sync_cmos_clock(unsigned long dummy)
{
	struct timeval now, next;
	int fail = 1;

	/*
	 * If we have an externally synchronized Linux clock, then update
	 * CMOS clock accordingly every ~11 minutes.  Set_rtc_mmss() has to be
	 * called as close as possible to 500 ms before the new second starts.
	 * This code is run on a timer.  If the clock is set, that timer
	 * may not expire at the correct time.  Thus, we adjust...
	 */
	if (!ntp_synced())
		/*
		 * Not synced, exit, do not restart a timer (if one is
		 * running, let it run out).
		 */
		return;

	do_gettimeofday(&now);
	if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
	    now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
		fail = set_rtc_mmss(now.tv_sec);

	next.tv_usec = USEC_AFTER - now.tv_usec;
	if (next.tv_usec <= 0)
		next.tv_usec += USEC_PER_SEC;

	if (!fail)
		next.tv_sec = 659;
	else
		next.tv_sec = 0;

	if (next.tv_usec >= USEC_PER_SEC) {
		next.tv_sec++;
		next.tv_usec -= USEC_PER_SEC;
	}
	mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
}

void notify_arch_cmos_timer(void)
{
	mod_timer(&sync_cmos_timer, jiffies + 1);
	mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
}

static long clock_cmos_diff;
static unsigned long sleep_start;

static int timer_suspend(struct sys_device *dev, pm_message_t state)
{
	/*
	 * Estimate time zone so that set_time can update the clock
	 */
	unsigned long ctime = get_cmos_time();

	clock_cmos_diff = -ctime;
	clock_cmos_diff += get_seconds();
	sleep_start = ctime;
	return 0;
}

static int timer_resume(struct sys_device *dev)
{
	unsigned long flags;
	unsigned long sec;
	unsigned long ctime = get_cmos_time();
	long sleep_length = (ctime - sleep_start) * HZ;

	if (sleep_length < 0) {
		printk(KERN_WARNING "CMOS clock skew detected in timer resume!\n");
		/*
		 * The time after the resume must not be earlier than the time
		 * before the suspend or some nasty things will happen
		 */
		sleep_length = 0;
		ctime = sleep_start;
	}

#ifdef CONFIG_HPET_TIMER
	if (is_hpet_enabled())
		hpet_reenable();
#endif

	sec = ctime + clock_cmos_diff;
	write_seqlock_irqsave(&xtime_lock, flags);
	xtime.tv_sec = sec;
	xtime.tv_nsec = 0;
	jiffies_64 += sleep_length;
	write_sequnlock_irqrestore(&xtime_lock, flags);
	touch_softlockup_watchdog();
	return 0;
}

static struct sysdev_class timer_sysclass = {
	.resume  = timer_resume,
	.suspend = timer_suspend,
	set_kset_name("timer"),
};

/* XXX this driverfs stuff should probably go elsewhere later -john */
static struct sys_device device_timer = {
	.id  = 0,
	.cls = &timer_sysclass,
};
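/*
 * Note: registering the "timer" sysdev class below is what hooks
 * timer_suspend()/timer_resume() into the suspend/resume path, so the
 * CMOS-derived sleep length gets folded back into xtime and jiffies_64
 * after a resume.
 */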
static int time_init_device(void)
{
	int error = sysdev_class_register(&timer_sysclass);
	if (!error)
		error = sysdev_register(&device_timer);
	return error;
}
device_initcall(time_init_device);

#ifdef CONFIG_HPET_TIMER
extern void (*late_time_init)(void);

/* Duplicate of time_init() below, with hpet_enable part added */
static void __init hpet_time_init(void)
{
	xtime.tv_sec = get_cmos_time();
	xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
	set_normalized_timespec(&wall_to_monotonic,
		-xtime.tv_sec, -xtime.tv_nsec);

	if ((hpet_enable() >= 0) && hpet_use_timer) {
		printk("Using HPET for base-timer\n");
	}

	time_init_hook();
}
#endif

/* Dynamically-mapped IRQ. */
DEFINE_PER_CPU(int, timer_irq);

extern void (*late_time_init)(void);

static void setup_cpu0_timer_irq(void)
{
	per_cpu(timer_irq, 0) =
		bind_virq_to_irqhandler(
			VIRQ_TIMER,
			0,
			timer_interrupt,
			SA_INTERRUPT,
			"timer0",
			NULL);
	BUG_ON(per_cpu(timer_irq, 0) < 0);
}

void __init time_init(void)
{
#ifdef CONFIG_HPET_TIMER
	if (is_hpet_capable()) {
		/*
		 * HPET initialization needs to do memory-mapped io. So, let
		 * us do a late initialization after mem_init().
		 */
		late_time_init = hpet_time_init;
		return;
	}
#endif

	get_time_values_from_xen();

	processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
	per_cpu(processed_system_time, 0) = processed_system_time;
	init_missing_ticks_accounting(0);

	update_wallclock();

	init_cpu_khz();
	printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
	       cpu_khz / 1000, cpu_khz % 1000);

	vxtime.mode = VXTIME_TSC;
	vxtime.quot = (1000000L << US_SCALE) / vxtime_hz;
	vxtime.tsc_quot = (1000L << US_SCALE) / cpu_khz;
	sync_core();
	rdtscll(vxtime.last_tsc);

	/* Cannot request_irq() until kmem is initialised. */
	late_time_init = setup_cpu0_timer_irq;
}

/* Convert jiffies to system time. */
u64 jiffies_to_st(unsigned long j)
{
	unsigned long seq;
	long delta;
	u64 st;

	do {
		seq = read_seqbegin(&xtime_lock);
		delta = j - jiffies;
		if (delta < 1) {
			/* Triggers in some wrap-around cases, but that's okay:
			 * we just end up with a shorter timeout. */
			st = processed_system_time + NS_PER_TICK;
		} else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) {
			/* Very long timeout means there is no pending timer.
			 * We indicate this to Xen by passing zero timeout. */
			st = 0;
		} else {
			st = processed_system_time + delta * (u64)NS_PER_TICK;
		}
	} while (read_seqretry(&xtime_lock, seq));

	return st;
}
EXPORT_SYMBOL(jiffies_to_st);
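/*
 * Note on the tickless-idle path below: raw_safe_halt() first calls
 * stop_hz_timer(), which uses jiffies_to_st() to translate the next pending
 * timer expiry into a Xen system-time value and programs it with
 * HYPERVISOR_set_timer_op(); it then blocks in the hypervisor, and
 * start_hz_timer() clears this CPU from nohz_cpu_mask on wakeup.
 */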
/*
 * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
 * These functions are based on implementations from arch/s390/kernel/time.c
 */
static void stop_hz_timer(void)
{
	unsigned int cpu = smp_processor_id();
	unsigned long j;

	cpu_set(cpu, nohz_cpu_mask);

	/* See matching smp_mb in rcu_start_batch in rcupdate.c.  These mbs  */
	/* ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a  */
	/* value of rcp->cur that matches rdp->quiescbatch and allows us to  */
	/* stop the hz timer then the cpumasks created for subsequent values */
	/* of cur in rcu_start_batch are guaranteed to pick up the updated   */
	/* nohz_cpu_mask and so will not depend on this cpu.                 */

	smp_mb();

	/* Leave ourselves in tick mode if rcu or softirq or timer pending. */
	if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
	    (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
		cpu_clear(cpu, nohz_cpu_mask);
		j = jiffies + 1;
	}

	if (HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0)
		BUG();
}

static void start_hz_timer(void)
{
	cpu_clear(smp_processor_id(), nohz_cpu_mask);
}

void raw_safe_halt(void)
{
	stop_hz_timer();
	/* Blocking includes an implicit local_irq_enable(). */
	HYPERVISOR_block();
	start_hz_timer();
}
EXPORT_SYMBOL(raw_safe_halt);

void halt(void)
{
	if (irqs_disabled())
		HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
}
EXPORT_SYMBOL(halt);

/* No locking required.  We are the only CPU running, and interrupts are off. */
void time_resume(void)
{
	init_cpu_khz();

	get_time_values_from_xen();

	processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
	per_cpu(processed_system_time, 0) = processed_system_time;
	init_missing_ticks_accounting(0);

	update_wallclock();
}

#ifdef CONFIG_SMP
static char timer_name[NR_CPUS][15];

void local_setup_timer(unsigned int cpu)
{
	int seq;

	BUG_ON(cpu == 0);

	do {
		seq = read_seqbegin(&xtime_lock);
		/* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
		per_cpu(processed_system_time, cpu) =
			per_cpu(shadow_time, 0).system_timestamp;
		init_missing_ticks_accounting(cpu);
	} while (read_seqretry(&xtime_lock, seq));

	sprintf(timer_name[cpu], "timer%d", cpu);
	per_cpu(timer_irq, cpu) =
		bind_virq_to_irqhandler(
			VIRQ_TIMER,
			cpu,
			timer_interrupt,
			SA_INTERRUPT,
			timer_name[cpu],
			NULL);
	BUG_ON(per_cpu(timer_irq, cpu) < 0);
}

void local_teardown_timer(unsigned int cpu)
{
	BUG_ON(cpu == 0);
	unbind_from_irqhandler(per_cpu(timer_irq, cpu), NULL);
}
#endif

/*
 * /proc/sys/xen: This really belongs in another file.  It can stay here for
 * now however.
 */
static ctl_table xen_subtable[] = {
	{
		.ctl_name	= 1,
		.procname	= "independent_wallclock",
		.data		= &independent_wallclock,
		.maxlen		= sizeof(independent_wallclock),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{
		.ctl_name	= 2,
		.procname	= "permitted_clock_jitter",
		.data		= &permitted_clock_jitter,
		.maxlen		= sizeof(permitted_clock_jitter),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax
	},
	{ 0 }
};
static ctl_table xen_table[] = {
	{
		.ctl_name	= 123,
		.procname	= "xen",
		.mode		= 0555,
		.child		= xen_subtable
	},
	{ 0 }
};
static int __init xen_sysctl_init(void)
{
	(void)register_sysctl_table(xen_table, 0);
	return 0;
}
__initcall(xen_sysctl_init);
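/*
 * The entries above appear as /proc/sys/xen/independent_wallclock and
 * /proc/sys/xen/permitted_clock_jitter.  For example (from a shell):
 *
 *   echo 1 > /proc/sys/xen/independent_wallclock
 *   echo 20000000 > /proc/sys/xen/permitted_clock_jitter   # 20 ms, in ns
 *
 * which mirrors the "independent_wallclock" and "permitted_clock_jitter="
 * kernel command-line options handled earlier in this file.
 */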