4 * Kernel internal timers, kernel timekeeping, basic process system calls
6 * Copyright (C) 1991, 1992 Linus Torvalds
8 * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better.
10 * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
11 * "A Kernel Model for Precision Timekeeping" by Dave Mills
12 * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
13 * serialize accesses to xtime/lost_ticks).
14 * Copyright (C) 1998 Andrea Arcangeli
15 * 1999-03-10 Improved NTP compatibility by Ulrich Windl
16 * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love
17 * 2000-10-05 Implemented scalable SMP per-CPU timer handling.
18 * Copyright (C) 2000, 2001, 2002 Ingo Molnar
19 * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
22 #include <linux/kernel_stat.h>
23 #include <linux/module.h>
24 #include <linux/interrupt.h>
25 #include <linux/percpu.h>
26 #include <linux/init.h>
28 #include <linux/swap.h>
29 #include <linux/notifier.h>
30 #include <linux/thread_info.h>
31 #include <linux/time.h>
32 #include <linux/jiffies.h>
33 #include <linux/posix-timers.h>
34 #include <linux/cpu.h>
35 #include <linux/syscalls.h>
36 #include <linux/vs_cvirt.h>
37 #include <linux/vserver/sched.h>
39 #include <asm/uaccess.h>
40 #include <asm/unistd.h>
41 #include <asm/div64.h>
42 #include <asm/timex.h>
/*
 * time_interpolator_update(): a prototype is declared when
 * CONFIG_TIME_INTERPOLATION is defined; the macro form accepts one argument
 * and expands to nothing.
 * NOTE(review): the directive separating the two forms is not visible in this
 * listing -- presumably the macro is the fallback used when the option is off.
 */
45 #ifdef CONFIG_TIME_INTERPOLATION
46 static void time_interpolator_update(long delta_nsec);
48 #define time_interpolator_update(x)
52 * per-CPU timer vector definitions:
/*
 * Slot-count parameters for the timer vectors.  TVN_BITS/TVR_BITS are 4/6 when
 * CONFIG_BASE_SMALL is non-zero and 6/8 otherwise; TVN_SIZE/TVR_SIZE are the
 * resulting number of list heads per vector, and TVN_MASK/TVR_MASK extract a
 * slot index from a jiffies-derived value.
 */
55 #define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
56 #define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
57 #define TVN_SIZE (1 << TVN_BITS)
58 #define TVR_SIZE (1 << TVR_BITS)
59 #define TVN_MASK (TVN_SIZE - 1)
60 #define TVR_MASK (TVR_SIZE - 1)
/*
 * One cascade-level timer vector: an array of TVN_SIZE list heads.
 * NOTE(review): the closing line naming the typedef is not visible in this
 * listing; the rest of the file refers to this type as tvec_t.
 */
62 typedef struct tvec_s {
63 struct list_head vec[TVN_SIZE];
/*
 * The root timer vector: an array of TVR_SIZE list heads.
 * NOTE(review): the closing line naming the typedef is not visible in this
 * listing.
 */
66 typedef struct tvec_root_s {
67 struct list_head vec[TVR_SIZE];
/*
 * Per-base timer state, aliased as tvec_base_t and cacheline-aligned on SMP.
 * Members visible here:
 *   timer_jiffies - compared against jiffies by __run_timers() and advanced
 *                   by one on every pass of its loop;
 *   running_timer - written through set_running_timer() around handler
 *                   execution and polled by del_timer_sync().
 * NOTE(review): other members used throughout this file (lock, tv1..tv5) are
 * not visible in this listing.
 */
70 struct tvec_t_base_s {
72 unsigned long timer_jiffies;
73 struct timer_list *running_timer;
79 } ____cacheline_aligned_in_smp;
81 typedef struct tvec_t_base_s tvec_base_t;
/*
 * Record @timer in @base->running_timer.  __run_timers() passes the timer it
 * is about to run and passes NULL once it has finished.
 */
83 static inline void set_running_timer(tvec_base_t *base,
84 struct timer_list *timer)
87 base->running_timer = timer;
/*
 * One tvec_base_t per CPU.  The static initializer only supplies
 * SPIN_LOCK_UNLOCKED; init_timers_cpu() initializes the lock, every list head
 * and timer_jiffies at run time.
 */
91 /* Fake initialization */
92 static DEFINE_PER_CPU(tvec_base_t, tvec_bases) = { SPIN_LOCK_UNLOCKED };
/*
 * Slow path of check_timer(): the magic field of @timer did not match
 * TIMER_MAGIC.  While the static whine_count is below 16 a warning with the
 * handler address and data word is logged; the visible tail then initializes
 * timer->lock and stamps timer->magic with TIMER_MAGIC.
 * NOTE(review): the line that advances whine_count is not visible in this
 * listing.
 */
94 static void check_timer_failed(struct timer_list *timer)
96 static int whine_count;
97 if (whine_count < 16) {
99 printk("Uninitialised timer!\n");
100 printk("This is just a warning. Your computer is OK\n");
101 printk("function=0x%p, data=0x%lx\n",
102 timer->function, timer->data);
108 spin_lock_init(&timer->lock);
109 timer->magic = TIMER_MAGIC;
/* Validate @timer->magic; hand off to check_timer_failed() on a mismatch. */
112 static inline void check_timer(struct timer_list *timer)
114 if (timer->magic != TIMER_MAGIC)
115 check_timer_failed(timer);
/*
 * Queue @timer on @base according to how far in the future it expires.
 *
 * idx is the distance from base->timer_jiffies to timer->expires.  Distances
 * below TVR_SIZE go to tv1, indexed by the low TVR_BITS of expires; each
 * further vector (tv2, tv3, tv4) covers TVN_BITS more bits of distance and is
 * indexed by the matching TVN_BITS-wide field of expires.  A distance that is
 * negative when read as signed long is queued in the tv1 slot selected by the
 * current base->timer_jiffies.  Everything else goes to tv5, with expires
 * recomputed from idx when idx exceeds 0xffffffffUL.  The timer is appended
 * at the tail of the chosen list; no locking is done here, and the callers in
 * this file hold the base lock around the call.
 */
119 static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
121 unsigned long expires = timer->expires;
122 unsigned long idx = expires - base->timer_jiffies;
123 struct list_head *vec;
125 if (idx < TVR_SIZE) {
126 int i = expires & TVR_MASK;
127 vec = base->tv1.vec + i;
128 } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
129 int i = (expires >> TVR_BITS) & TVN_MASK;
130 vec = base->tv2.vec + i;
131 } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
132 int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
133 vec = base->tv3.vec + i;
134 } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
135 int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
136 vec = base->tv4.vec + i;
137 } else if ((signed long) idx < 0) {
139 * Can happen if you add a timer with expires == jiffies,
140 * or you set a timer to go off in the past
142 vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
145 /* If the timeout is larger than 0xffffffff on 64-bit
146 * architectures then we use the maximum timeout:
148 if (idx > 0xffffffffUL) {
150 expires = idx + base->timer_jiffies;
152 i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
153 vec = base->tv5.vec + i;
158 list_add_tail(&timer->entry, vec);
/*
 * Set @timer to expire at @expires and (re)queue it on the base of the
 * current CPU.  BUG()s if timer->function is NULL.
 *
 * Locking: timer->lock is taken with interrupts saved, then the base lock(s).
 * When the timer already belongs to a different base, both base locks are
 * taken in an order chosen by comparing the two base addresses, and
 * timer->base is re-checked once they are held; on a mismatch the locks are
 * dropped again.  Otherwise only the new base lock is taken, with the same
 * re-check.
 *
 * The previous queue entry is removed with list_del(), timer->expires is
 * updated, internal_add_timer() inserts the timer, and timer->base is set to
 * the new base before the locks are released.
 *
 * NOTE(review): the retry jump after a failed re-check, the condition guarding
 * list_del() and the return statement are not visible in this listing.
 */
161 int __mod_timer(struct timer_list *timer, unsigned long expires)
163 tvec_base_t *old_base, *new_base;
167 BUG_ON(!timer->function);
171 spin_lock_irqsave(&timer->lock, flags);
172 new_base = &__get_cpu_var(tvec_bases);
174 old_base = timer->base;
177 * Prevent deadlocks via ordering by old_base < new_base.
179 if (old_base && (new_base != old_base)) {
180 if (old_base < new_base) {
181 spin_lock(&new_base->lock);
182 spin_lock(&old_base->lock);
184 spin_lock(&old_base->lock);
185 spin_lock(&new_base->lock);
188 * The timer base might have been cancelled while we were
189 * trying to take the lock(s):
191 if (timer->base != old_base) {
192 spin_unlock(&new_base->lock);
193 spin_unlock(&old_base->lock);
197 spin_lock(&new_base->lock);
198 if (timer->base != old_base) {
199 spin_unlock(&new_base->lock);
205 * Delete the previous timeout (if there was any), and install
209 list_del(&timer->entry);
212 timer->expires = expires;
213 internal_add_timer(new_base, timer);
214 timer->base = new_base;
216 if (old_base && (new_base != old_base))
217 spin_unlock(&old_base->lock);
218 spin_unlock(&new_base->lock);
219 spin_unlock_irqrestore(&timer->lock, flags);
224 EXPORT_SYMBOL(__mod_timer);
227 * add_timer_on - start a timer on a particular CPU
228 * @timer: the timer to be added
229 * @cpu: the CPU to start it on
231 * This is not very scalable on SMP. Double adds are not possible.
/*
 * Queue @timer on the timer base belonging to @cpu.  BUG()s if the timer is
 * already pending or has no handler.  The insertion through
 * internal_add_timer() runs under that base lock with interrupts saved and
 * restored.
 * NOTE(review): the statement between the insertion and the unlock is not
 * visible in this listing.
 */
233 void add_timer_on(struct timer_list *timer, int cpu)
235 tvec_base_t *base = &per_cpu(tvec_bases, cpu);
238 BUG_ON(timer_pending(timer) || !timer->function);
242 spin_lock_irqsave(&base->lock, flags);
243 internal_add_timer(base, timer);
245 spin_unlock_irqrestore(&base->lock, flags);
250 * mod_timer - modify a timer's timeout
251 * @timer: the timer to be modified
253 * mod_timer is a more efficient way to update the expire field of an
254 * active timer (if the timer is inactive it will be activated)
256 * mod_timer(timer, expires) is equivalent to:
258 * del_timer(timer); timer->expires = expires; add_timer(timer);
260 * Note that if there are multiple unserialized concurrent users of the
261 * same timer, then mod_timer() is the only safe way to modify the timeout,
262 * since add_timer() cannot modify an already running timer.
264 * The function returns whether it has modified a pending timer or not.
265 * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an
266 * active timer returns 1.)
/*
 * Re-arm @timer for @expires.  BUG()s if timer->function is NULL.  A pending
 * timer that already carries the requested expiry is detected up front
 * without taking any lock; in every other case the work and the return value
 * come from __mod_timer().
 * NOTE(review): the statement executed when the fast-path test matches is not
 * visible in this listing -- presumably an early return.
 */
268 int mod_timer(struct timer_list *timer, unsigned long expires)
270 BUG_ON(!timer->function);
275 * This is a common optimization triggered by the
276 * networking code - if the timer is re-modified
277 * to be the same thing then just return:
279 if (timer->expires == expires && timer_pending(timer))
282 return __mod_timer(timer, expires);
285 EXPORT_SYMBOL(mod_timer);
288 * del_timer - deactivate a timer.
289 * @timer: the timer to be deactivated
291 * del_timer() deactivates a timer - this works on both active and inactive
294 * The function returns whether it has deactivated a pending timer or not.
295 * (ie. del_timer() of an inactive timer returns 0, del_timer() of an
296 * active timer returns 1.)
/*
 * Unlink @timer from the list it is queued on.  The list_del() runs under the
 * base lock with interrupts saved.  If timer->base no longer equals the base
 * that was locked, the lock is released again before anything is unlinked.
 * NOTE(review): how base is obtained, the retry after a base change, the
 * clearing of timer->base and the return statement are not visible in this
 * listing.
 */
298 int del_timer(struct timer_list *timer)
309 spin_lock_irqsave(&base->lock, flags);
310 if (base != timer->base) {
311 spin_unlock_irqrestore(&base->lock, flags);
314 list_del(&timer->entry);
315 /* Need to make sure that anybody who sees a NULL base also sees the list ops */
318 spin_unlock_irqrestore(&base->lock, flags);
323 EXPORT_SYMBOL(del_timer);
327 * del_timer_sync - deactivate a timer and wait for the handler to finish.
328 * @timer: the timer to be deactivated
330 * This function only differs from del_timer() on SMP: besides deactivating
331 * the timer it also makes sure the handler has finished executing on other
334 * Synchronization rules: callers must prevent restarting of the timer,
335 * otherwise this function is meaningless. It must not be called from
336 * interrupt contexts. The caller must not hold locks which would prevent
337 * completion of the timer's handler. Upon exit the timer is not queued and
338 * the handler is not running on any CPU.
340 * The function returns whether it has deactivated a pending timer or not.
342 * del_timer_sync() is slow and complicated because it copes with timer
343 * handlers which re-arm the timer (periodic timers). If the timer handler
344 * is known to not do this (a single shot timer) then use
345 * del_singleshot_timer_sync() instead.
/*
 * Deactivate @timer and wait until no CPU is running its handler.
 * del_timer() is called and its result accumulated in ret; then the base of
 * every online CPU is inspected, and while a base reports @timer as its
 * running_timer this function spins, calling preempt_check_resched() inside
 * the wait loop.  A final timer_pending() test follows the scan.
 * NOTE(review): the action taken when that final test is true (presumably
 * starting over) and the return statement are not visible in this listing.
 */
347 int del_timer_sync(struct timer_list *timer)
355 ret += del_timer(timer);
357 for_each_online_cpu(i) {
358 base = &per_cpu(tvec_bases, i);
359 if (base->running_timer == timer) {
360 while (base->running_timer == timer) {
362 preempt_check_resched();
368 if (timer_pending(timer))
373 EXPORT_SYMBOL(del_timer_sync);
376 * del_singleshot_timer_sync - deactivate a non-recursive timer
377 * @timer: the timer to be deactivated
379 * This function is an optimization of del_timer_sync for the case where the
380 * caller can guarantee the timer does not reschedule itself in its timer
383 * Synchronization rules: callers must prevent restarting of the timer,
384 * otherwise this function is meaningless. It must not be called from
385 * interrupt contexts. The caller must not hold locks which would prevent
386 * completion of the timer's handler. Upon exit the timer is not queued and
387 * the handler is not running on any CPU.
389 * The function returns whether it has deactivated a pending timer or not.
/*
 * Deactivate a timer whose handler does not re-arm it.  del_timer() is tried
 * first; del_timer_sync() is the fallback whose result replaces ret.
 * NOTE(review): the condition that selects the fallback and the return
 * statement are not visible in this listing.
 */
391 int del_singleshot_timer_sync(struct timer_list *timer)
393 int ret = del_timer(timer);
396 ret = del_timer_sync(timer);
402 EXPORT_SYMBOL(del_singleshot_timer_sync);
/*
 * Re-distribute the timers queued in slot @index of @tv: each entry is passed
 * to internal_add_timer() so it lands in the vector matching its remaining
 * distance, and the slot is then reset to an empty list.  BUG()s if an entry
 * does not belong to @base.  __run_timers() tests the return value for zero
 * to decide whether the next vector must be cascaded as well.
 * NOTE(review): the loop cursor updates and the return statement are not
 * visible in this listing.
 */
405 static int cascade(tvec_base_t *base, tvec_t *tv, int index)
407 /* cascade all the timers from tv up one level */
408 struct list_head *head, *curr;
410 head = tv->vec + index;
413 * We are removing _all_ timers from the list, so we don't have to
414 * detach them individually, just clear the list afterwards.
416 while (curr != head) {
417 struct timer_list *tmp;
419 tmp = list_entry(curr, struct timer_list, entry);
420 BUG_ON(tmp->base != base);
422 internal_add_timer(base, tmp);
424 INIT_LIST_HEAD(head);
430 * __run_timers - run all expired timers (if any) on this CPU.
431 * @base: the timer vector to be processed.
433 * This function cascades all vectors and executes all expired timer
/*
 * INDEX(N) - slot index into the N-th cascade vector (0 selects tv2, 1 tv3,
 * 2 tv4, 3 tv5) for the current base->timer_jiffies.  Expects a tvec_base_t
 * pointer named base to be in scope at the point of use.
 *
 * Both the argument and the whole expansion are parenthesized so that
 * expressions such as INDEX(i + 1) and comparisons such as j < INDEX(i)
 * evaluate as intended regardless of operator precedence at the call site.
 */
#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
/*
 * Run every timer on @base that is due, one jiffy at a time.
 *
 * With the base lock held and interrupts off, the loop runs while jiffies has
 * reached base->timer_jiffies.  Each pass computes the tv1 slot for the
 * current timer_jiffies, cascades tv2..tv5 as far as the chained cascade()
 * results require, advances timer_jiffies and splices the tv1 slot onto a
 * local work list.  For each timer on that list the handler pointer is saved,
 * the timer is unlinked and published through set_running_timer(), and the
 * base lock is dropped around the handler invocation; a change in
 * preempt_count() across the handler is reported with printk().  On exit
 * running_timer is cleared and the lock released.
 *
 * NOTE(review): the condition that starts the cascade chain, the handler call
 * itself and the loop structure around the work list are not visible in this
 * listing.
 */
438 static inline void __run_timers(tvec_base_t *base)
440 struct timer_list *timer;
442 spin_lock_irq(&base->lock);
443 while (time_after_eq(jiffies, base->timer_jiffies)) {
444 struct list_head work_list = LIST_HEAD_INIT(work_list);
445 struct list_head *head = &work_list;
446 int index = base->timer_jiffies & TVR_MASK;
452 (!cascade(base, &base->tv2, INDEX(0))) &&
453 (!cascade(base, &base->tv3, INDEX(1))) &&
454 !cascade(base, &base->tv4, INDEX(2)))
455 cascade(base, &base->tv5, INDEX(3));
456 ++base->timer_jiffies;
457 list_splice_init(base->tv1.vec + index, &work_list);
459 if (!list_empty(head)) {
460 void (*fn)(unsigned long);
463 timer = list_entry(head->next,struct timer_list,entry);
464 fn = timer->function;
467 list_del(&timer->entry);
468 set_running_timer(base, timer);
471 spin_unlock_irq(&base->lock);
473 u32 preempt_count = preempt_count();
475 if (preempt_count != preempt_count()) {
476 printk("huh, entered %p with %08x, exited with %08x?\n", fn, preempt_count, preempt_count());
480 spin_lock_irq(&base->lock);
484 set_running_timer(base, NULL);
485 spin_unlock_irq(&base->lock);
488 #ifdef CONFIG_NO_IDLE_HZ
490 * Find out when the next timer event is due to happen. This
491 * is used on S/390 to stop all activity when a cpu is idle.
492 * This function needs to be called disabled.
/*
 * Compute the expiry of the earliest timer queued on the base of this CPU.
 *
 * Under the base lock, expires starts at timer_jiffies + (LONG_MAX >> 1).
 * tv1 is scanned slot by slot starting at the slot for the current
 * timer_jiffies; then tv2..tv5 are scanned through varray[], keeping the
 * smallest nte->expires seen (compared with time_before()).  When a scan
 * wrapped around its starting slot, list is pointed at the slot of the next
 * higher vector that would cascade into the vector just searched, and that
 * list is examined as well before the lock is released.
 *
 * NOTE(review): INDEX(i + 1) below yields the intended slot only if the
 * INDEX() macro parenthesizes its argument; with an unparenthesized
 * definition it expands to a shift by TVR_BITS + i + 1 * TVN_BITS.
 * NOTE(review): several declarations, the loop openers, the early exits and
 * the return statement are not visible in this listing.
 */
494 unsigned long next_timer_interrupt(void)
497 struct list_head *list;
498 struct timer_list *nte;
499 unsigned long expires;
503 base = &__get_cpu_var(tvec_bases);
504 spin_lock(&base->lock);
505 expires = base->timer_jiffies + (LONG_MAX >> 1);
508 /* Look for timer events in tv1. */
509 j = base->timer_jiffies & TVR_MASK;
511 list_for_each_entry(nte, base->tv1.vec + j, entry) {
512 expires = nte->expires;
513 if (j < (base->timer_jiffies & TVR_MASK))
514 list = base->tv2.vec + (INDEX(0));
517 j = (j + 1) & TVR_MASK;
518 } while (j != (base->timer_jiffies & TVR_MASK));
521 varray[0] = &base->tv2;
522 varray[1] = &base->tv3;
523 varray[2] = &base->tv4;
524 varray[3] = &base->tv5;
525 for (i = 0; i < 4; i++) {
528 if (list_empty(varray[i]->vec + j)) {
529 j = (j + 1) & TVN_MASK;
532 list_for_each_entry(nte, varray[i]->vec + j, entry)
533 if (time_before(nte->expires, expires))
534 expires = nte->expires;
535 if (j < (INDEX(i)) && i < 3)
536 list = varray[i + 1]->vec + (INDEX(i + 1));
538 } while (j != (INDEX(i)));
543 * The search wrapped. We need to look at the next list
544 * from next tv element that would cascade into tv element
545 * where we found the timer element.
547 list_for_each_entry(nte, list, entry) {
548 if (time_before(nte->expires, expires))
549 expires = nte->expires;
552 spin_unlock(&base->lock);
557 /******************************************************************/
560 * Timekeeping variables
/*
 * Global tick, wall-clock and NTP phase-lock-loop state.  These variables are
 * read and updated by second_overflow() and update_wall_time_one_tick() in
 * this file; xtime is exported to modules.
 *
 * tickadj uses the GNU conditional with omitted middle operand: it is 500/HZ
 * when that quotient is non-zero and 1 otherwise.
 * time_freq is initialised from the remainder of NSEC_PER_SEC divided by HZ,
 * centred around zero, shifted left by SHIFT_USEC and divided by
 * NSEC_PER_USEC.
 */
562 unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */
563 unsigned long tick_nsec = TICK_NSEC; /* ACTHZ period (nsec) */
567 * wall_to_monotonic is what we need to add to xtime (or xtime corrected
568 * for sub jiffie times) to get to monotonic time. Monotonic is pegged
569 * at zero at system boot time, so wall_to_monotonic will be negative,
570 * however, we will ALWAYS keep the tv_nsec part positive so we can use
571 * the usual normalization.
573 struct timespec xtime __attribute__ ((aligned (16)));
574 struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
576 EXPORT_SYMBOL(xtime);
578 /* Don't completely fail for HZ > 500. */
579 int tickadj = 500/HZ ? : 1; /* microsecs */
583 * phase-lock loop variables
585 /* TIME_ERROR prevents overwriting the CMOS clock */
586 int time_state = TIME_OK; /* clock synchronization status */
587 int time_status = STA_UNSYNC; /* clock status bits */
588 long time_offset; /* time adjustment (us) */
589 long time_constant = 2; /* pll time constant */
590 long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */
591 long time_precision = 1; /* clock precision (us) */
592 long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
593 long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
594 static long time_phase; /* phase offset (scaled us) */
595 long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC;
596 /* frequency offset (scaled ppm)*/
597 static long time_adj; /* tick adjust (scaled 1 / HZ) */
598 long time_reftime; /* time at last adjustment (s) */
600 long time_next_adjust;
603 * this routine handles the overflow of the microsecond field
605 * The tricky bits of code to handle the accurate clock support
606 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
607 * They were originally developed for SUN and DEC kernels.
608 * All the kudos should go to Dave for this stuff.
/*
 * NTP bookkeeping performed on seconds overflow.  Steps visible here:
 *
 *  1. time_maxerror grows by time_tolerance >> SHIFT_USEC and is clamped to
 *     NTP_PHASE_LIMIT, setting STA_UNSYNC when the clamp applies.
 *  2. The leap-second state in time_state is advanced.  When xtime.tv_sec is
 *     a multiple of 86400 a second is inserted (wall_to_monotonic.tv_sec is
 *     incremented, the interpolator is told -NSEC_PER_SEC, state becomes
 *     TIME_OOP); when xtime.tv_sec + 1 is a multiple of 86400 a second is
 *     deleted (the mirror image, state becomes TIME_WAIT).  Both events are
 *     logged at KERN_NOTICE.
 *  3. time_adj is derived from time_offset: the magnitude is shifted right by
 *     SHIFT_KG + time_constant unless STA_FLL is set, clamped to
 *     (MAXPHASE / MINSEC) << SHIFT_UPDATE, removed from time_offset and
 *     scaled by the SHIFT_SCALE / SHIFT_HZ / SHIFT_UPDATE difference.
 *  4. The frequency term time_freq + pps_freq is folded into time_adj.
 *  5. time_adj is stretched by fixed fractions to compensate for HZ values
 *     of 100 and 1000 not being powers of two.
 *
 * NOTE(review): the case labels of the switch, the xtime.tv_sec adjustments,
 * several else branches and the preprocessor conditionals selecting between
 * alternatives are not visible in this listing.
 */
611 static void second_overflow(void)
615 /* Bump the maxerror field */
616 time_maxerror += time_tolerance >> SHIFT_USEC;
617 if ( time_maxerror > NTP_PHASE_LIMIT ) {
618 time_maxerror = NTP_PHASE_LIMIT;
619 time_status |= STA_UNSYNC;
623 * Leap second processing. If in leap-insert state at
624 * the end of the day, the system clock is set back one
625 * second; if in leap-delete state, the system clock is
626 * set ahead one second. The microtime() routine or
627 * external clock driver will insure that reported time
628 * is always monotonic. The ugly divides should be
631 switch (time_state) {
634 if (time_status & STA_INS)
635 time_state = TIME_INS;
636 else if (time_status & STA_DEL)
637 time_state = TIME_DEL;
641 if (xtime.tv_sec % 86400 == 0) {
643 wall_to_monotonic.tv_sec++;
644 /* The timer interpolator will make time change gradually instead
645 * of an immediate jump by one second.
647 time_interpolator_update(-NSEC_PER_SEC);
648 time_state = TIME_OOP;
650 printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n");
655 if ((xtime.tv_sec + 1) % 86400 == 0) {
657 wall_to_monotonic.tv_sec--;
658 /* Use of time interpolator for a gradual change of time */
659 time_interpolator_update(NSEC_PER_SEC);
660 time_state = TIME_WAIT;
662 printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n");
667 time_state = TIME_WAIT;
671 if (!(time_status & (STA_INS | STA_DEL)))
672 time_state = TIME_OK;
676 * Compute the phase adjustment for the next second. In
677 * PLL mode, the offset is reduced by a fixed factor
678 * times the time constant. In FLL mode the offset is
679 * used directly. In either mode, the maximum phase
680 * adjustment for each second is clamped so as to spread
681 * the adjustment over not more than the number of
682 * seconds between updates.
684 if (time_offset < 0) {
685 ltemp = -time_offset;
686 if (!(time_status & STA_FLL))
687 ltemp >>= SHIFT_KG + time_constant;
688 if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
689 ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
690 time_offset += ltemp;
691 #if SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE > 0
692 time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
694 time_adj = -ltemp >> (SHIFT_HZ + SHIFT_UPDATE - SHIFT_SCALE);
698 if (!(time_status & STA_FLL))
699 ltemp >>= SHIFT_KG + time_constant;
700 if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
701 ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
702 time_offset -= ltemp;
703 #if SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE > 0
704 time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
706 time_adj = ltemp >> (SHIFT_HZ + SHIFT_UPDATE - SHIFT_SCALE);
711 * Compute the frequency estimate and additional phase
712 * adjustment due to frequency error for the next
713 * second. When the PPS signal is engaged, gnaw on the
714 * watchdog counter and update the frequency computed by
715 * the pll and the PPS signal.
718 if (pps_valid == PPS_VALID) { /* PPS signal lost */
719 pps_jitter = MAXTIME;
720 pps_stabil = MAXFREQ;
721 time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
722 STA_PPSWANDER | STA_PPSERROR);
724 ltemp = time_freq + pps_freq;
726 time_adj -= -ltemp >>
727 (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
730 (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
733 /* Compensate for (HZ==100) != (1 << SHIFT_HZ).
734 * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14)
737 time_adj -= (-time_adj >> 2) + (-time_adj >> 5);
739 time_adj += (time_adj >> 2) + (time_adj >> 5);
742 /* Compensate for (HZ==1000) != (1 << SHIFT_HZ).
743 * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14)
746 time_adj -= (-time_adj >> 6) + (-time_adj >> 7);
748 time_adj += (time_adj >> 6) + (time_adj >> 7);
752 /* in the NTP reference this is called "hardclock()" */
/*
 * Advance xtime.tv_nsec by one tick.
 *
 * A pending adjtime amount in time_adjust is applied in steps limited to the
 * range -tickadj .. +tickadj per tick; the step taken is subtracted from
 * time_adjust and contributes step * 1000 nanoseconds on top of tick_nsec.
 * time_phase accumulates time_adj, and once its magnitude reaches FINENSEC
 * the part above SHIFT_SCALE - 10 bits is shifted out of it.  The resulting
 * delta is added to xtime.tv_nsec and passed to time_interpolator_update().
 * Finally a queued time_next_adjust becomes the time_adjust used from the
 * next tick on.
 *
 * NOTE(review): the lines applying the shifted-out phase to delta_nsec are
 * not visible in this listing.
 */
753 static void update_wall_time_one_tick(void)
755 long time_adjust_step, delta_nsec;
757 if ( (time_adjust_step = time_adjust) != 0 ) {
758 /* We are doing an adjtime thing.
760 * Prepare time_adjust_step to be within bounds.
761 * Note that a positive time_adjust means we want the clock
764 * Limit the amount of the step to be in the range
765 * -tickadj .. +tickadj
767 if (time_adjust > tickadj)
768 time_adjust_step = tickadj;
769 else if (time_adjust < -tickadj)
770 time_adjust_step = -tickadj;
772 /* Reduce by this step the amount of time left */
773 time_adjust -= time_adjust_step;
775 delta_nsec = tick_nsec + time_adjust_step * 1000;
777 * Advance the phase, once it gets to one microsecond, then
778 * advance the tick more.
780 time_phase += time_adj;
781 if (time_phase <= -FINENSEC) {
782 long ltemp = -time_phase >> (SHIFT_SCALE - 10);
783 time_phase += ltemp << (SHIFT_SCALE - 10);
786 else if (time_phase >= FINENSEC) {
787 long ltemp = time_phase >> (SHIFT_SCALE - 10);
788 time_phase -= ltemp << (SHIFT_SCALE - 10);
791 xtime.tv_nsec += delta_nsec;
792 time_interpolator_update(delta_nsec);
794 /* Changes by adjtime() do not take effect till next tick. */
795 if (time_next_adjust != 0) {
796 time_adjust = time_next_adjust;
797 time_next_adjust = 0;
802 * Using a loop looks inefficient, but "ticks" is
803 * usually just one (we shouldn't be losing ticks,
804 * we're doing this this way mainly for interrupt
805 * latency reasons, not because we think we'll
806 * have lots of lost timer ticks
/*
 * Apply elapsed ticks to the wall clock: update_wall_time_one_tick() advances
 * xtime.tv_nsec, and a value of 1000000000 or more is carried out of the
 * nanosecond field.
 * NOTE(review): the loop over @ticks and the statements that follow the carry
 * are not visible in this listing.
 */
808 static void update_wall_time(unsigned long ticks)
812 update_wall_time_one_tick();
813 if (xtime.tv_nsec >= 1000000000) {
814 xtime.tv_nsec -= 1000000000;
822 * Called from the timer interrupt handler to charge one tick to the current
823 * process. user_tick is 1 if the tick is user time, 0 for system.
/*
 * Per-tick accounting for the current task: one jiffy of cputime is charged
 * through account_user_time() or account_system_time() (the latter with
 * HARDIRQ_OFFSET), RCU callbacks are checked when rcu_pending() reports work
 * for this CPU, and the POSIX CPU timers of the task are run.
 * NOTE(review): the test on @user_tick selecting between the two accounting
 * calls is not visible in this listing.
 */
825 void update_process_times(int user_tick)
827 struct task_struct *p = current;
828 int cpu = smp_processor_id();
830 /* Note: this timer irq context must be accounted for as well. */
832 account_user_time(p, jiffies_to_cputime(1));
834 account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
836 if (rcu_pending(cpu))
837 rcu_check_callbacks(cpu, user_tick);
839 run_posix_cpu_timers(p);
843 * Nr of active tasks - counted in fixed-point numbers
/*
 * Return nr_running() + nr_uninterruptible(), scaled by FIXED_1 so the result
 * is in the fixed-point format consumed by CALC_LOAD in calc_load().
 */
845 static unsigned long count_active_tasks(void)
847 return (nr_running() + nr_uninterruptible()) * FIXED_1;
851 * Hmm.. Changed this, as the GNU make sources (load.c) seems to
852 * imply that avenrun[] is the standard name for this kind of thing.
853 * Nothing else seems to be standardized: the fractional size etc
854 * all seem to differ on different machines.
856 * Requires xtime_lock to access.
/*
 * The three load-average accumulators updated by calc_load() with EXP_1,
 * EXP_5 and EXP_15 respectively; exported to modules.
 */
858 unsigned long avenrun[3];
860 EXPORT_SYMBOL(avenrun);
863 * calc_load - given tick count, update the avenrun load estimates.
864 * This is called while holding a write_lock on xtime_lock.
/*
 * Refresh avenrun[0..2] from count_active_tasks() using CALC_LOAD with the
 * EXP_1, EXP_5 and EXP_15 constants.  A function-static counter starts at
 * LOAD_FREQ.
 * NOTE(review): how @ticks drives that counter, and the condition under which
 * the averages are recomputed, are not visible in this listing.
 */
866 static inline void calc_load(unsigned long ticks)
868 unsigned long active_tasks; /* fixed-point */
869 static int count = LOAD_FREQ;
874 active_tasks = count_active_tasks();
875 CALC_LOAD(avenrun[0], EXP_1, active_tasks);
876 CALC_LOAD(avenrun[1], EXP_5, active_tasks);
877 CALC_LOAD(avenrun[2], EXP_15, active_tasks);
881 /* jiffies at the most recent update of wall time */
/*
 * wall_jiffies starts at INITIAL_JIFFIES and is advanced by update_times() by
 * the number of ticks it hands to update_wall_time().
 *
 * xtime_lock is defined here, cacheline-aligned on SMP and exported, unless
 * ARCH_HAVE_XTIME_LOCK is defined.  NOTE(review): the existing text calls it
 * a read-write spinlock, but the declared type is seqlock_t and readers in
 * this file use read_seqbegin()/read_seqretry().
 */
882 unsigned long wall_jiffies = INITIAL_JIFFIES;
885 * This read-write spinlock protects us from races in SMP while
886 * playing with xtime and avenrun.
888 #ifndef ARCH_HAVE_XTIME_LOCK
889 seqlock_t xtime_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED;
891 EXPORT_SYMBOL(xtime_lock);
895 * This function runs timers and the timer-tq in bottom half context.
/*
 * Softirq handler: looks up the timer base of the current CPU and checks
 * whether jiffies has reached base->timer_jiffies.  @h is not referenced in
 * the lines shown.
 * NOTE(review): the statement guarded by that check is not visible in this
 * listing -- presumably the call to __run_timers().
 */
897 static void run_timer_softirq(struct softirq_action *h)
899 tvec_base_t *base = &__get_cpu_var(tvec_bases);
901 if (time_after_eq(jiffies, base->timer_jiffies))
906 * Called by the local, per-CPU timer interrupt on SMP.
/* Raise TIMER_SOFTIRQ so that pending timers are processed in softirq context. */
908 void run_local_timers(void)
910 raise_softirq(TIMER_SOFTIRQ);
914 * Called by the timer interrupt. xtime_lock must already be taken
/*
 * Bring wall time up to date with jiffies: the number of ticks elapsed since
 * wall_jiffies is computed, wall_jiffies is advanced by it and the same count
 * is passed to update_wall_time().
 * NOTE(review): the declaration of ticks, any guard around the update and any
 * further calls are not visible in this listing.
 */
917 static inline void update_times(void)
921 ticks = jiffies - wall_jiffies;
923 wall_jiffies += ticks;
924 update_wall_time(ticks);
930 * The 64-bit jiffies value is not atomic - you MUST NOT read it
931 * without sampling the sequence number in xtime_lock.
932 * jiffies is defined in the linker script...
935 void do_timer(struct pt_regs *regs)
941 #ifdef __ARCH_WANT_SYS_ALARM
944 * For backwards compatibility? This can be done in libc so Alpha
945 * and all newer ports shouldn't need it.
/*
 * alarm(2): arm ITIMER_REAL as a one-shot timer of @seconds (interval zero)
 * through do_setitimer() and derive the result from the previous setting.
 * oldalarm is the whole-second part of the previous value; the final test
 * matches when that part is zero but microseconds remain, or when the
 * microsecond part is at least 500000.
 * NOTE(review): the statement executed when that test matches (presumably a
 * round-up of oldalarm) and the return statement are not visible in this
 * listing.
 */
947 asmlinkage unsigned long sys_alarm(unsigned int seconds)
949 struct itimerval it_new, it_old;
950 unsigned int oldalarm;
952 it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
953 it_new.it_value.tv_sec = seconds;
954 it_new.it_value.tv_usec = 0;
955 do_setitimer(ITIMER_REAL, &it_new, &it_old);
956 oldalarm = it_old.it_value.tv_sec;
957 /* ehhh.. We can't return 0 if we have an alarm pending.. */
958 /* And we'd better return too much than too little anyway */
959 if ((!oldalarm && it_old.it_value.tv_usec) || it_old.it_value.tv_usec >= 500000)
968 * sys_getpid - return the thread group id of the current process
970 * Note, despite the name, this returns the tgid not the pid. The tgid and
971 * the pid are identical unless CLONE_THREAD was specified on clone() in
972 * which case the tgid is the same in all threads of the same group.
974 * This is SMP safe as current->tgid does not change.
/* Return current->tgid after translation through vx_map_tgid(). */
976 asmlinkage long sys_getpid(void)
978 return vx_map_tgid(current->tgid);
982 * Accessing ->group_leader->real_parent is not SMP-safe, it could
983 * change from under us. However, rather than getting any lock
984 * we can use an optimistic algorithm: get the parent
985 * pid, and go back and check that the parent is still
986 * the same. If it has changed (which is extremely unlikely
987 * indeed), we just try again..
989 * NOTE! This depends on the fact that even if we _do_
990 * get an old value of "parent", we can happily dereference
991 * the pointer (it was and remains a dereferencable kernel pointer
992 * no matter what): we just can't necessarily trust the result
993 * until we know that the parent pointer is valid.
995 * NOTE2: ->group_leader never changes from under us.
/*
 * Return the parent pid of the calling thread group, translated through
 * vx_map_pid().  The parent is read from me->group_leader->real_parent, and
 * that pointer is read a second time and kept alongside the earlier value in
 * old.
 * NOTE(review): the read of the pid itself, the memory barrier and the
 * comparison that decides whether to retry are not visible in this listing.
 */
997 asmlinkage long sys_getppid(void)
1000 struct task_struct *me = current;
1001 struct task_struct *parent;
1003 parent = me->group_leader->real_parent;
1008 struct task_struct *old = parent;
1011 * Make sure we read the pid before re-reading the
1015 parent = me->group_leader->real_parent;
1022 return vx_map_pid(pid);
1028 * The Alpha uses getxpid, getxuid, and getxgid instead.
/*
 * Combined getpid/getppid: stores the sys_getppid() result through @ppid and
 * returns the sys_getpid() result.
 */
1031 asmlinkage long do_getxpid(long *ppid)
1033 *ppid = sys_getppid();
1034 return sys_getpid();
/* Return the uid field of the calling task. */
1039 asmlinkage long sys_getuid(void)
1041 /* Only we change this so SMP safe */
1042 return current->uid;
/* Return the euid field of the calling task. */
1045 asmlinkage long sys_geteuid(void)
1047 /* Only we change this so SMP safe */
1048 return current->euid;
/* Return the gid field of the calling task. */
1051 asmlinkage long sys_getgid(void)
1053 /* Only we change this so SMP safe */
1054 return current->gid;
/* Return the egid field of the calling task. */
1057 asmlinkage long sys_getegid(void)
1059 /* Only we change this so SMP safe */
1060 return current->egid;
/*
 * Timer handler installed by schedule_timeout(): @__data carries the task
 * pointer stored in timer.data, and that task is woken up.
 */
1065 static void process_timeout(unsigned long __data)
1067 wake_up_process((task_t *)__data);
1071 * schedule_timeout - sleep until timeout
1072 * @timeout: timeout value in jiffies
1074 * Make the current task sleep until @timeout jiffies have
1075 * elapsed. The routine will return immediately unless
1076 * the current task state has been set (see set_current_state()).
1078 * You can set the task state as follows -
1080 * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
1081 * pass before the routine returns. The routine will return 0
1083 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1084 * delivered to the current task. In this case the remaining time
1085 * in jiffies will be returned, or 0 if the timer expired in time
1087 * The current task state is guaranteed to be TASK_RUNNING when this
1090 * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
1091 * the CPU away without a bound on the timeout. In this case the return
1092 * value will be %MAX_SCHEDULE_TIMEOUT.
1094 * In all cases the return value is guaranteed to be non-negative.
/*
 * Sleep for up to @timeout jiffies using an on-stack timer.
 *
 * MAX_SCHEDULE_TIMEOUT is handled as a special case.  A wrong timeout value
 * is reported with printk() together with the caller address, and the task
 * state is reset to TASK_RUNNING.  Otherwise the expiry is timeout + jiffies;
 * the timer is set up with process_timeout() as handler and the current task
 * as data, and after the sleep it is removed with
 * del_singleshot_timer_sync().  The remaining time is expire - jiffies, and a
 * negative remainder is returned as 0.
 *
 * NOTE(review): the switch header, the test that identifies a wrong value,
 * the timer initialisation and arming calls and the schedule() call are not
 * visible in this listing.
 */
1096 fastcall signed long __sched schedule_timeout(signed long timeout)
1098 struct timer_list timer;
1099 unsigned long expire;
1103 case MAX_SCHEDULE_TIMEOUT:
1105 * These two special cases are useful to be comfortable
1106 * in the caller. Nothing more. We could take
1107 * MAX_SCHEDULE_TIMEOUT from one of the negative value
1108 * but I' d like to return a valid offset (>=0) to allow
1109 * the caller to do everything it want with the retval.
1115 * Another bit of PARANOID. Note that the retval will be
1116 * 0 since no piece of kernel is supposed to do a check
1117 * for a negative retval of schedule_timeout() (since it
1118 * should never happens anyway). You just have the printk()
1119 * that will tell you if something is gone wrong and where.
1123 printk(KERN_ERR "schedule_timeout: wrong timeout "
1124 "value %lx from %p\n", timeout,
1125 __builtin_return_address(0));
1126 current->state = TASK_RUNNING;
1131 expire = timeout + jiffies;
1134 timer.expires = expire;
1135 timer.data = (unsigned long) current;
1136 timer.function = process_timeout;
1140 del_singleshot_timer_sync(&timer);
1142 timeout = expire - jiffies;
1145 return timeout < 0 ? 0 : timeout;
1148 EXPORT_SYMBOL(schedule_timeout);
1150 /* Thread ID - the internal kernel "pid" */
/* Return the pid field of the calling task, without any translation. */
1151 asmlinkage long sys_gettid(void)
1153 return current->pid;
/*
 * Restart handler installed by sys_nanosleep().  restart->arg0 holds the
 * absolute expiry in jiffies and restart->arg1 the user pointer for the
 * remaining time.  If the expiry is not after the current jiffies the sleep
 * is already over.  Otherwise the task sleeps interruptibly for the
 * remainder; the jiffies left over are converted to a timespec, copied to
 * user space when a pointer was supplied, and -ERESTART_RESTARTBLOCK is set
 * up as the result.
 * NOTE(review): the return statements and the test on the value returned by
 * schedule_timeout() are not visible in this listing.
 */
1156 static long __sched nanosleep_restart(struct restart_block *restart)
1158 unsigned long expire = restart->arg0, now = jiffies;
1159 struct timespec __user *rmtp = (struct timespec __user *) restart->arg1;
1162 /* Did it expire while we handled signals? */
1163 if (!time_after(expire, now))
1166 current->state = TASK_INTERRUPTIBLE;
1167 expire = schedule_timeout(expire - now);
1172 jiffies_to_timespec(expire, &t);
1174 ret = -ERESTART_RESTARTBLOCK;
1175 if (rmtp && copy_to_user(rmtp, &t, sizeof(t)))
1177 /* The 'restart' block is already filled in */
1182 asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
1185 unsigned long expire;
1188 if (copy_from_user(&t, rqtp, sizeof(t)))
1191 if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0))
1194 expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
1195 current->state = TASK_INTERRUPTIBLE;
1196 expire = schedule_timeout(expire);
1200 struct restart_block *restart;
1201 jiffies_to_timespec(expire, &t);
1202 if (rmtp && copy_to_user(rmtp, &t, sizeof(t)))
1205 restart = &current_thread_info()->restart_block;
1206 restart->fn = nanosleep_restart;
1207 restart->arg0 = jiffies + expire;
1208 restart->arg1 = (unsigned long) rmtp;
1209 ret = -ERESTART_RESTARTBLOCK;
1215 * sys_sysinfo - fill in sysinfo struct
/*
 * sysinfo(2): fill a zeroed struct sysinfo and copy it to @info.
 *
 * Inside an xtime_lock read-side retry loop the monotonic time is built from
 * getnstimeofday() plus wall_to_monotonic (normalising tv_nsec), optionally
 * virtualised through vx_vsi_uptime() when VXF_VIRT_UPTIME is set, and
 * rounded up to whole seconds for val.uptime.  The three load averages are
 * shifted from FSHIFT to SI_LOAD_SHIFT precision and val.procs is taken from
 * nr_threads.
 *
 * Afterwards the memory figures are checked: if totalram + totalswap, scaled
 * up by mem_unit one bit at a time, does not overflow an unsigned long, every
 * memory field is shifted left by bitcount so that values are expressed in
 * bytes.
 *
 * NOTE(review): the calls that fill the memory fields, the overflow exits,
 * the reset of val.mem_unit and the return statements are not visible in
 * this listing.
 */
1217 asmlinkage long sys_sysinfo(struct sysinfo __user *info)
1220 unsigned long mem_total, sav_total;
1221 unsigned int mem_unit, bitcount;
1224 memset((char *)&val, 0, sizeof(struct sysinfo));
1228 seq = read_seqbegin(&xtime_lock);
1231 * This is annoying. The below is the same thing
1232 * posix_get_clock_monotonic() does, but it wants to
1233 * take the lock which we want to cover the loads stuff
1237 getnstimeofday(&tp);
1238 tp.tv_sec += wall_to_monotonic.tv_sec;
1239 tp.tv_nsec += wall_to_monotonic.tv_nsec;
1240 if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
1241 tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
1244 if (vx_flags(VXF_VIRT_UPTIME, 0))
1245 vx_vsi_uptime(&tp, NULL);
1246 val.uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
1248 val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
1249 val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
1250 val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
1252 val.procs = nr_threads;
1253 } while (read_seqretry(&xtime_lock, seq));
1259 * If the sum of all the available memory (i.e. ram + swap)
1260 * is less than can be stored in a 32 bit unsigned long then
1261 * we can be binary compatible with 2.2.x kernels. If not,
1262 * well, in that case 2.2.x was broken anyways...
1264 * -Erik Andersen <andersee@debian.org>
1267 mem_total = val.totalram + val.totalswap;
1268 if (mem_total < val.totalram || mem_total < val.totalswap)
1271 mem_unit = val.mem_unit;
1272 while (mem_unit > 1) {
1275 sav_total = mem_total;
1277 if (mem_total < sav_total)
1282 * If mem_total did not overflow, multiply all memory values by
1283 * val.mem_unit and set it to 1. This leaves things compatible
1284 * with 2.2.x, and also retains compatibility with earlier 2.4.x
1289 val.totalram <<= bitcount;
1290 val.freeram <<= bitcount;
1291 val.sharedram <<= bitcount;
1292 val.bufferram <<= bitcount;
1293 val.totalswap <<= bitcount;
1294 val.freeswap <<= bitcount;
1295 val.totalhigh <<= bitcount;
1296 val.freehigh <<= bitcount;
1299 if (copy_to_user(info, &val, sizeof(struct sysinfo)))
/*
 * Initialise the timer base of @cpu: the base lock, all TVN_SIZE list heads
 * of tv2..tv5, all TVR_SIZE list heads of tv1, and timer_jiffies, which
 * starts at the current jiffies value.
 */
1305 static void __devinit init_timers_cpu(int cpu)
1310 base = &per_cpu(tvec_bases, cpu);
1311 spin_lock_init(&base->lock);
1312 for (j = 0; j < TVN_SIZE; j++) {
1313 INIT_LIST_HEAD(base->tv5.vec + j);
1314 INIT_LIST_HEAD(base->tv4.vec + j);
1315 INIT_LIST_HEAD(base->tv3.vec + j);
1316 INIT_LIST_HEAD(base->tv2.vec + j);
1318 for (j = 0; j < TVR_SIZE; j++)
1319 INIT_LIST_HEAD(base->tv1.vec + j);
1321 base->timer_jiffies = jiffies;
1324 #ifdef CONFIG_HOTPLUG_CPU
1325 static int migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
1327 struct timer_list *timer;
1329 while (!list_empty(head)) {
1330 timer = list_entry(head->next, struct timer_list, entry);
1331 /* We're locking backwards from __mod_timer order here,
1333 if (!spin_trylock(&timer->lock))
1335 list_del(&timer->entry);
1336 internal_add_timer(new_base, timer);
1337 timer->base = new_base;
1338 spin_unlock(&timer->lock);
1343 static void __devinit migrate_timers(int cpu)
1345 tvec_base_t *old_base;
1346 tvec_base_t *new_base;
1349 BUG_ON(cpu_online(cpu));
1350 old_base = &per_cpu(tvec_bases, cpu);
1351 new_base = &get_cpu_var(tvec_bases);
1353 local_irq_disable();
1355 /* Prevent deadlocks via ordering by old_base < new_base. */
1356 if (old_base < new_base) {
1357 spin_lock(&new_base->lock);
1358 spin_lock(&old_base->lock);
1360 spin_lock(&old_base->lock);
1361 spin_lock(&new_base->lock);
1364 if (old_base->running_timer)
1366 for (i = 0; i < TVR_SIZE; i++)
1367 if (!migrate_timer_list(new_base, old_base->tv1.vec + i))
1369 for (i = 0; i < TVN_SIZE; i++)
1370 if (!migrate_timer_list(new_base, old_base->tv2.vec + i)
1371 || !migrate_timer_list(new_base, old_base->tv3.vec + i)
1372 || !migrate_timer_list(new_base, old_base->tv4.vec + i)
1373 || !migrate_timer_list(new_base, old_base->tv5.vec + i))
1375 spin_unlock(&old_base->lock);
1376 spin_unlock(&new_base->lock);
1378 put_cpu_var(tvec_bases);
1382 /* Avoid deadlock with __mod_timer, by backing off. */
1383 spin_unlock(&old_base->lock);
1384 spin_unlock(&new_base->lock);
1388 #endif /* CONFIG_HOTPLUG_CPU */
1390 static int __devinit timer_cpu_notify(struct notifier_block *self,
1391 unsigned long action, void *hcpu)
1393 long cpu = (long)hcpu;
1395 case CPU_UP_PREPARE:
1396 init_timers_cpu(cpu);
1398 #ifdef CONFIG_HOTPLUG_CPU
1400 migrate_timers(cpu);
1409 static struct notifier_block __devinitdata timers_nb = {
1410 .notifier_call = timer_cpu_notify,
1414 void __init init_timers(void)
1416 timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
1417 (void *)(long)smp_processor_id());
1418 register_cpu_notifier(&timers_nb);
1419 open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
1422 #ifdef CONFIG_TIME_INTERPOLATION
/* Interpolator currently in use; NULL when none is installed. */
struct time_interpolator *time_interpolator;
/* Singly linked list (via ->next) of every registered interpolator. */
static struct time_interpolator *time_interpolator_list;
/* Held by register/unregister while the list above is modified. */
static DEFINE_SPINLOCK(time_interpolator_lock);
1428 static inline u64 time_interpolator_get_cycles(unsigned int src)
1430 unsigned long (*x)(void);
1434 case TIME_SOURCE_FUNCTION:
1435 x = time_interpolator->addr;
1438 case TIME_SOURCE_MMIO64 :
1439 return readq((void __iomem *) time_interpolator->addr);
1441 case TIME_SOURCE_MMIO32 :
1442 return readl((void __iomem *) time_interpolator->addr);
1444 default: return get_cycles();
1448 static inline u64 time_interpolator_get_counter(void)
1450 unsigned int src = time_interpolator->source;
1452 if (time_interpolator->jitter)
1458 lcycle = time_interpolator->last_cycle;
1459 now = time_interpolator_get_cycles(src);
1460 if (lcycle && time_after(lcycle, now))
1462 /* Keep track of the last timer value returned. The use of cmpxchg here
1463 * will cause contention in an SMP environment.
1465 } while (unlikely(cmpxchg(&time_interpolator->last_cycle, lcycle, now) != lcycle));
1469 return time_interpolator_get_cycles(src);
1472 void time_interpolator_reset(void)
1474 time_interpolator->offset = 0;
1475 time_interpolator->last_counter = time_interpolator_get_counter();
/*
 * Convert a counter reading into a scaled time delta for interpolator @i:
 * the difference between @count and (i)->last_counter is masked with
 * (i)->mask, multiplied by (i)->nsec_per_cyc and shifted right by
 * (i)->shift.  Every use of @i is parenthesised so that an expression
 * argument expands correctly.
 */
#define GET_TI_NSECS(count, i) (((((count) - (i)->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift)
1480 unsigned long time_interpolator_get_offset(void)
1482 /* If we do not have a time interpolator set up then just return zero */
1483 if (!time_interpolator)
1486 return time_interpolator->offset +
1487 GET_TI_NSECS(time_interpolator_get_counter(), time_interpolator);
/* The tuning logic runs whenever jiffies is a multiple of this value. */
#define INTERPOLATOR_ADJUST 65536
/*
 * ns_skipped threshold above which the interpolator clock is sped up.
 * Parenthesised so the macro expands safely inside larger expressions.
 */
#define INTERPOLATOR_MAX_SKIP (10 * INTERPOLATOR_ADJUST)
1493 static void time_interpolator_update(long delta_nsec)
1496 unsigned long offset;
1498 /* If there is no time interpolator set up then do nothing */
1499 if (!time_interpolator)
1502 /* The interpolator compensates for late ticks by accumulating
1503 * the late time in time_interpolator->offset. A tick earlier than
1504 * expected will lead to a reset of the offset and a corresponding
1505 * jump of the clock forward. Again this only works if the
1506 * interpolator clock is running slightly slower than the regular clock
1507 * and the tuning logic insures that.
1510 counter = time_interpolator_get_counter();
1511 offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator);
1513 if (delta_nsec < 0 || (unsigned long) delta_nsec < offset)
1514 time_interpolator->offset = offset - delta_nsec;
1516 time_interpolator->skips++;
1517 time_interpolator->ns_skipped += delta_nsec - offset;
1518 time_interpolator->offset = 0;
1520 time_interpolator->last_counter = counter;
1522 /* Tuning logic for time interpolator invoked every minute or so.
1523 * Decrease interpolator clock speed if no skips occurred and an offset is carried.
1524 * Increase interpolator clock speed if we skip too much time.
1526 if (jiffies % INTERPOLATOR_ADJUST == 0)
1528 if (time_interpolator->skips == 0 && time_interpolator->offset > TICK_NSEC)
1529 time_interpolator->nsec_per_cyc--;
1530 if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0)
1531 time_interpolator->nsec_per_cyc++;
1532 time_interpolator->skips = 0;
1533 time_interpolator->ns_skipped = 0;
1538 is_better_time_interpolator(struct time_interpolator *new)
1540 if (!time_interpolator)
1542 return new->frequency > 2*time_interpolator->frequency ||
1543 (unsigned long)new->drift < (unsigned long)time_interpolator->drift;
1547 register_time_interpolator(struct time_interpolator *ti)
1549 unsigned long flags;
1552 if (ti->frequency == 0 || ti->mask == 0)
1555 ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency;
1556 spin_lock(&time_interpolator_lock);
1557 write_seqlock_irqsave(&xtime_lock, flags);
1558 if (is_better_time_interpolator(ti)) {
1559 time_interpolator = ti;
1560 time_interpolator_reset();
1562 write_sequnlock_irqrestore(&xtime_lock, flags);
1564 ti->next = time_interpolator_list;
1565 time_interpolator_list = ti;
1566 spin_unlock(&time_interpolator_lock);
1570 unregister_time_interpolator(struct time_interpolator *ti)
1572 struct time_interpolator *curr, **prev;
1573 unsigned long flags;
1575 spin_lock(&time_interpolator_lock);
1576 prev = &time_interpolator_list;
1577 for (curr = *prev; curr; curr = curr->next) {
1585 write_seqlock_irqsave(&xtime_lock, flags);
1586 if (ti == time_interpolator) {
1587 /* we lost the best time-interpolator: */
1588 time_interpolator = NULL;
1589 /* find the next-best interpolator */
1590 for (curr = time_interpolator_list; curr; curr = curr->next)
1591 if (is_better_time_interpolator(curr))
1592 time_interpolator = curr;
1593 time_interpolator_reset();
1595 write_sequnlock_irqrestore(&xtime_lock, flags);
1596 spin_unlock(&time_interpolator_lock);
1598 #endif /* CONFIG_TIME_INTERPOLATION */
1601 * msleep - sleep safely even with waitqueue interruptions
1602 * @msecs: Time in milliseconds to sleep for
1604 void msleep(unsigned int msecs)
1606 unsigned long timeout = msecs_to_jiffies(msecs) + 1;
1609 set_current_state(TASK_UNINTERRUPTIBLE);
1610 timeout = schedule_timeout(timeout);
1614 EXPORT_SYMBOL(msleep);
1617 * msleep_interruptible - sleep waiting for waitqueue interruptions
1618 * @msecs: Time in milliseconds to sleep for
1620 unsigned long msleep_interruptible(unsigned int msecs)
1622 unsigned long timeout = msecs_to_jiffies(msecs) + 1;
1624 while (timeout && !signal_pending(current)) {
1625 set_current_state(TASK_INTERRUPTIBLE);
1626 timeout = schedule_timeout(timeout);
1628 return jiffies_to_msecs(timeout);
1631 EXPORT_SYMBOL(msleep_interruptible);