vserver 2.0-rc4
[linux-2.6.git] / kernel / timer.c
index f7d42e8..b79489e 100644 (file)
@@ -31,7 +31,7 @@
 #include <linux/time.h>
 #include <linux/jiffies.h>
 #include <linux/cpu.h>
-#include <linux/vs_base.h>
+#include <linux/syscalls.h>
 #include <linux/vs_cvirt.h>
 #include <linux/vserver/sched.h>
 
 #include <asm/unistd.h>
 #include <asm/div64.h>
 #include <asm/timex.h>
+#include <asm/io.h>
+
+#ifdef CONFIG_TIME_INTERPOLATION
+static void time_interpolator_update(long delta_nsec);
+#else
+#define time_interpolator_update(x)
+#endif
 
 /*
  * per-CPU timer vector definitions:
@@ -236,6 +243,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
        spin_unlock_irqrestore(&base->lock, flags);
 }
 
+
 /***
  * mod_timer - modify a timer's timeout
  * @timer: the timer to be modified
@@ -302,6 +310,8 @@ repeat:
                goto repeat;
        }
        list_del(&timer->entry);
+       /* Need to make sure that anybody who sees a NULL base also sees the list ops */
+       smp_wmb();
        timer->base = NULL;
        spin_unlock_irqrestore(&base->lock, flags);
 
@@ -457,7 +467,14 @@ repeat:
                        smp_wmb();
                        timer->base = NULL;
                        spin_unlock_irq(&base->lock);
-                       fn(data);
+                       {
+                               u32 preempt_count = preempt_count();
+                               fn(data);
+                               if (preempt_count != preempt_count()) {
+                                       printk("huh, entered %p with %08x, exited with %08x?\n", fn, preempt_count, preempt_count());
+                                       BUG();
+                               }
+                       }
                        spin_lock_irq(&base->lock);
                        goto repeat;
                }
@@ -546,7 +563,7 @@ unsigned long tick_nsec = TICK_NSEC;                /* ACTHZ period (nsec) */
 /* 
  * The current time 
  * wall_to_monotonic is what we need to add to xtime (or xtime corrected 
- * for sub jiffie times) to get to monotonic time.  Monotonic is pegged at zero
+ * for sub jiffie times) to get to monotonic time.  Monotonic is pegged
  * at zero at system boot time, so wall_to_monotonic will be negative,
  * however, we will ALWAYS keep the tv_nsec part positive so we can use
  * the usual normalization.
@@ -622,6 +639,9 @@ static void second_overflow(void)
        if (xtime.tv_sec % 86400 == 0) {
            xtime.tv_sec--;
            wall_to_monotonic.tv_sec++;
+           /* The timer interpolator will make time change gradually instead
+            * of an immediate jump by one second.
+            */
            time_interpolator_update(-NSEC_PER_SEC);
            time_state = TIME_OOP;
            clock_was_set();
@@ -633,6 +653,7 @@ static void second_overflow(void)
        if ((xtime.tv_sec + 1) % 86400 == 0) {
            xtime.tv_sec++;
            wall_to_monotonic.tv_sec--;
+           /* Use of time interpolator for a gradual change of time */
            time_interpolator_update(NSEC_PER_SEC);
            time_state = TIME_WAIT;
            clock_was_set();
@@ -665,7 +686,11 @@ static void second_overflow(void)
        if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
            ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
        time_offset += ltemp;
+       #if SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE > 0
        time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
+       #else
+       time_adj = -ltemp >> (SHIFT_HZ + SHIFT_UPDATE - SHIFT_SCALE);
+       #endif
     } else {
        ltemp = time_offset;
        if (!(time_status & STA_FLL))
@@ -673,7 +698,11 @@ static void second_overflow(void)
        if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
            ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
        time_offset -= ltemp;
+       #if SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE > 0
        time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
+       #else
+       time_adj = ltemp >> (SHIFT_HZ + SHIFT_UPDATE - SHIFT_SCALE);
+       #endif
     }
 
     /*
@@ -779,67 +808,14 @@ static void update_wall_time(unsigned long ticks)
        do {
                ticks--;
                update_wall_time_one_tick();
-       } while (ticks);
-
-       if (xtime.tv_nsec >= 1000000000) {
-           xtime.tv_nsec -= 1000000000;
-           xtime.tv_sec++;
-           second_overflow();
-       }
-}
-
-static inline void do_process_times(struct task_struct *p,
-       unsigned long user, unsigned long system)
-{
-       unsigned long psecs;
-
-       psecs = (p->utime += user);
-       psecs += (p->stime += system);
-       if (psecs / HZ >= p->rlim[RLIMIT_CPU].rlim_cur) {
-               /* Send SIGXCPU every second.. */
-               if (!(psecs % HZ))
-                       send_sig(SIGXCPU, p, 1);
-               /* and SIGKILL when we go over max.. */
-               if (psecs / HZ >= p->rlim[RLIMIT_CPU].rlim_max)
-                       send_sig(SIGKILL, p, 1);
-       }
-}
-
-static inline void do_it_virt(struct task_struct * p, unsigned long ticks)
-{
-       unsigned long it_virt = p->it_virt_value;
-
-       if (it_virt) {
-               it_virt -= ticks;
-               if (!it_virt) {
-                       it_virt = p->it_virt_incr;
-                       send_sig(SIGVTALRM, p, 1);
-               }
-               p->it_virt_value = it_virt;
-       }
-}
-
-static inline void do_it_prof(struct task_struct *p)
-{
-       unsigned long it_prof = p->it_prof_value;
-
-       if (it_prof) {
-               if (--it_prof == 0) {
-                       it_prof = p->it_prof_incr;
-                       send_sig(SIGPROF, p, 1);
+               if (xtime.tv_nsec >= 1000000000) {
+                       xtime.tv_nsec -= 1000000000;
+                       xtime.tv_sec++;
+                       second_overflow();
                }
-               p->it_prof_value = it_prof;
-       }
+       } while (ticks);
 }
 
-static void update_one_process(struct task_struct *p, unsigned long user,
-                       unsigned long system, int cpu)
-{
-       do_process_times(p, user, system);
-       do_it_virt(p, user);
-       do_it_prof(p);
-}      
-
 /*
  * Called from the timer interrupt handler to charge one tick to the current 
  * process.  user_tick is 1 if the tick is user time, 0 for system.
@@ -847,11 +823,17 @@ static void update_one_process(struct task_struct *p, unsigned long user,
 void update_process_times(int user_tick)
 {
        struct task_struct *p = current;
-       int cpu = smp_processor_id(), system = user_tick ^ 1;
+       int cpu = smp_processor_id();
 
-       update_one_process(p, user_tick, system, cpu);
+       /* Note: this timer irq context must be accounted for as well. */
+       if (user_tick)
+               account_user_time(p, jiffies_to_cputime(1));
+       else
+               account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
        run_local_timers();
-       scheduler_tick(user_tick, system);
+       if (rcu_pending(cpu))
+               rcu_check_callbacks(cpu, user_tick);
+       scheduler_tick();
 }
 
 /*
@@ -948,11 +930,6 @@ static inline void update_times(void)
 void do_timer(struct pt_regs *regs)
 {
        jiffies_64++;
-#ifndef CONFIG_SMP
-       /* SMP process accounting uses the local APIC timer */
-
-       update_process_times(user_mode(regs));
-#endif
        update_times();
 }
 
@@ -999,7 +976,7 @@ asmlinkage unsigned long sys_alarm(unsigned int seconds)
  */
 asmlinkage long sys_getpid(void)
 {
-       return vx_map_tgid(current->vx_info, current->tgid);
+       return vx_map_tgid(current->tgid);
 }
 
 /*
@@ -1043,7 +1020,7 @@ asmlinkage long sys_getppid(void)
 #endif
                break;
        }
-       return vx_map_tgid(current->vx_info, pid);
+       return vx_map_pid(pid);
 }
 
 asmlinkage long sys_getuid(void)
@@ -1244,8 +1221,7 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info)
                 * too.
                 */
 
-               do_gettimeofday((struct timeval *)&tp);
-               tp.tv_nsec *= NSEC_PER_USEC;
+               getnstimeofday(&tp);
                tp.tv_sec += wall_to_monotonic.tv_sec;
                tp.tv_nsec += wall_to_monotonic.tv_nsec;
                if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
@@ -1263,9 +1239,6 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info)
                val.procs = nr_threads;
        } while (read_seqretry(&xtime_lock, seq));
 
-/*     if (vx_flags(VXF_VIRT_CPU, 0))
-               vx_vsi_cpu(val);
-*/
        si_meminfo(&val);
        si_swapinfo(&val);
 
@@ -1434,14 +1407,119 @@ void __init init_timers(void)
 }
 
 #ifdef CONFIG_TIME_INTERPOLATION
-volatile unsigned long last_nsec_offset;
-#ifndef __HAVE_ARCH_CMPXCHG
-spinlock_t last_nsec_offset_lock = SPIN_LOCK_UNLOCKED;
-#endif
 
 struct time_interpolator *time_interpolator;
 static struct time_interpolator *time_interpolator_list;
-static spinlock_t time_interpolator_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(time_interpolator_lock);
+
+static inline u64 time_interpolator_get_cycles(unsigned int src)
+{
+       unsigned long (*x)(void);
+
+       switch (src)
+       {
+               case TIME_SOURCE_FUNCTION:
+                       x = time_interpolator->addr;
+                       return x();
+
+               case TIME_SOURCE_MMIO64 :
+                       return readq((void __iomem *) time_interpolator->addr);
+
+               case TIME_SOURCE_MMIO32 :
+                       return readl((void __iomem *) time_interpolator->addr);
+
+               default: return get_cycles();
+       }
+}
+
+static inline u64 time_interpolator_get_counter(void)
+{
+       unsigned int src = time_interpolator->source;
+
+       if (time_interpolator->jitter)
+       {
+               u64 lcycle;
+               u64 now;
+
+               do {
+                       lcycle = time_interpolator->last_cycle;
+                       now = time_interpolator_get_cycles(src);
+                       if (lcycle && time_after(lcycle, now))
+                               return lcycle;
+                       /* Keep track of the last timer value returned. The use of cmpxchg here
+                        * will cause contention in an SMP environment.
+                        */
+               } while (unlikely(cmpxchg(&time_interpolator->last_cycle, lcycle, now) != lcycle));
+               return now;
+       }
+       else
+               return time_interpolator_get_cycles(src);
+}
+
+void time_interpolator_reset(void)
+{
+       time_interpolator->offset = 0;
+       time_interpolator->last_counter = time_interpolator_get_counter();
+}
+
+#define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift)
+
+unsigned long time_interpolator_get_offset(void)
+{
+       /* If we do not have a time interpolator set up then just return zero */
+       if (!time_interpolator)
+               return 0;
+
+       return time_interpolator->offset +
+               GET_TI_NSECS(time_interpolator_get_counter(), time_interpolator);
+}
+
+#define INTERPOLATOR_ADJUST 65536
+#define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST
+
+static void time_interpolator_update(long delta_nsec)
+{
+       u64 counter;
+       unsigned long offset;
+
+       /* If there is no time interpolator set up then do nothing */
+       if (!time_interpolator)
+               return;
+
+       /* The interpolator compensates for late ticks by accumulating
+        * the late time in time_interpolator->offset. A tick earlier than
+        * expected will lead to a reset of the offset and a corresponding
+        * jump of the clock forward. Again this only works if the
+        * interpolator clock is running slightly slower than the regular clock
+        * and the tuning logic ensures that.
+        */
+
+       counter = time_interpolator_get_counter();
+       offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator);
+
+       if (delta_nsec < 0 || (unsigned long) delta_nsec < offset)
+               time_interpolator->offset = offset - delta_nsec;
+       else {
+               time_interpolator->skips++;
+               time_interpolator->ns_skipped += delta_nsec - offset;
+               time_interpolator->offset = 0;
+       }
+       time_interpolator->last_counter = counter;
+
+       /* Tuning logic for time interpolator invoked every minute or so.
+        * Decrease interpolator clock speed if no skips occurred and an offset is carried.
+        * Increase interpolator clock speed if we skip too much time.
+        */
+       if (jiffies % INTERPOLATOR_ADJUST == 0)
+       {
+               if (time_interpolator->skips == 0 && time_interpolator->offset > TICK_NSEC)
+                       time_interpolator->nsec_per_cyc--;
+               if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0)
+                       time_interpolator->nsec_per_cyc++;
+               time_interpolator->skips = 0;
+               time_interpolator->ns_skipped = 0;
+       }
+}
 
 static inline int
 is_better_time_interpolator(struct time_interpolator *new)
@@ -1455,11 +1533,20 @@ is_better_time_interpolator(struct time_interpolator *new)
 void
 register_time_interpolator(struct time_interpolator *ti)
 {
+       unsigned long flags;
+
+       /* Sanity check */
+       if (ti->frequency == 0 || ti->mask == 0)
+               BUG();
+
+       ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency;
        spin_lock(&time_interpolator_lock);
-       write_seqlock_irq(&xtime_lock);
-       if (is_better_time_interpolator(ti))
+       write_seqlock_irqsave(&xtime_lock, flags);
+       if (is_better_time_interpolator(ti)) {
                time_interpolator = ti;
-       write_sequnlock_irq(&xtime_lock);
+               time_interpolator_reset();
+       }
+       write_sequnlock_irqrestore(&xtime_lock, flags);
 
        ti->next = time_interpolator_list;
        time_interpolator_list = ti;
@@ -1470,6 +1557,7 @@ void
 unregister_time_interpolator(struct time_interpolator *ti)
 {
        struct time_interpolator *curr, **prev;
+       unsigned long flags;
 
        spin_lock(&time_interpolator_lock);
        prev = &time_interpolator_list;
@@ -1481,7 +1569,7 @@ unregister_time_interpolator(struct time_interpolator *ti)
                prev = &curr->next;
        }
 
-       write_seqlock_irq(&xtime_lock);
+       write_seqlock_irqsave(&xtime_lock, flags);
        if (ti == time_interpolator) {
                /* we lost the best time-interpolator: */
                time_interpolator = NULL;
@@ -1489,8 +1577,9 @@ unregister_time_interpolator(struct time_interpolator *ti)
                for (curr = time_interpolator_list; curr; curr = curr->next)
                        if (is_better_time_interpolator(curr))
                                time_interpolator = curr;
+               time_interpolator_reset();
        }
-       write_sequnlock_irq(&xtime_lock);
+       write_sequnlock_irqrestore(&xtime_lock, flags);
        spin_unlock(&time_interpolator_lock);
 }
 #endif /* CONFIG_TIME_INTERPOLATION */
@@ -1501,7 +1590,7 @@ unregister_time_interpolator(struct time_interpolator *ti)
  */
 void msleep(unsigned int msecs)
 {
-       unsigned long timeout = msecs_to_jiffies(msecs);
+       unsigned long timeout = msecs_to_jiffies(msecs) + 1;
 
        while (timeout) {
                set_current_state(TASK_UNINTERRUPTIBLE);
@@ -1511,3 +1600,19 @@ void msleep(unsigned int msecs)
 
 EXPORT_SYMBOL(msleep);
 
+/**
+ * msleep_interruptible - sleep until the timeout elapses or a signal is pending
+ * @msecs: Time in milliseconds to sleep for
+ */
+unsigned long msleep_interruptible(unsigned int msecs)
+{
+       unsigned long timeout = msecs_to_jiffies(msecs) + 1;
+
+       while (timeout && !signal_pending(current)) {
+               set_current_state(TASK_INTERRUPTIBLE);
+               timeout = schedule_timeout(timeout);
+       }
+       return jiffies_to_msecs(timeout);
+}
+
+EXPORT_SYMBOL(msleep_interruptible);