vserver 1.9.3
[linux-2.6.git] / kernel / timer.c
index 08cec6a..69719ee 100644
 #include <linux/time.h>
 #include <linux/jiffies.h>
 #include <linux/cpu.h>
+#include <linux/vs_cvirt.h>
+#include <linux/vserver/sched.h>
 
 #include <asm/uaccess.h>
+#include <asm/unistd.h>
 #include <asm/div64.h>
 #include <asm/timex.h>
+#include <asm/io.h>
+
+#ifdef CONFIG_TIME_INTERPOLATION
+static void time_interpolator_update(long delta_nsec);
+#else
+#define time_interpolator_update(x)
+#endif
 
 /*
  * per-CPU timer vector definitions:
@@ -232,6 +242,8 @@ void add_timer_on(struct timer_list *timer, int cpu)
        spin_unlock_irqrestore(&base->lock, flags);
 }
 
+EXPORT_SYMBOL(add_timer_on);
+
 /***
  * mod_timer - modify a timer's timeout
  * @timer: the timer to be modified
@@ -317,10 +329,16 @@ EXPORT_SYMBOL(del_timer);
  *
  * Synchronization rules: callers must prevent restarting of the timer,
  * otherwise this function is meaningless. It must not be called from
- * interrupt contexts. Upon exit the timer is not queued and the handler
- * is not running on any CPU.
+ * interrupt contexts. The caller must not hold locks which would prevent
+ * completion of the timer's handler.  Upon exit the timer is not queued and
+ * the handler is not running on any CPU.
  *
  * The function returns whether it has deactivated a pending timer or not.
+ *
+ * del_timer_sync() is slow and complicated because it copes with timer
+ * handlers which re-arm the timer (periodic timers).  If the timer handler
+ * is known not to do this (a single-shot timer), then use
+ * del_singleshot_timer_sync() instead.
  */
 int del_timer_sync(struct timer_list *timer)
 {
@@ -332,7 +350,7 @@ int del_timer_sync(struct timer_list *timer)
 del_again:
        ret += del_timer(timer);
 
-       for_each_cpu(i) {
+       for_each_online_cpu(i) {
                base = &per_cpu(tvec_bases, i);
                if (base->running_timer == timer) {
                        while (base->running_timer == timer) {
@@ -348,8 +366,36 @@ del_again:
 
        return ret;
 }
-
 EXPORT_SYMBOL(del_timer_sync);
+
+/***
+ * del_singleshot_timer_sync - deactivate a non-recursive timer
+ * @timer: the timer to be deactivated
+ *
+ * This function is an optimization of del_timer_sync for the case where the
+ * caller can guarantee the timer does not reschedule itself in its timer
+ * function.
+ *
+ * Synchronization rules: callers must prevent restarting of the timer,
+ * otherwise this function is meaningless. It must not be called from
+ * interrupt contexts. The caller must not hold locks which would prevent
+ * completion of the timer's handler.  Upon exit the timer is not queued and
+ * the handler is not running on any CPU.
+ *
+ * The function returns whether it has deactivated a pending timer or not.
+ */
+int del_singleshot_timer_sync(struct timer_list *timer)
+{
+       int ret = del_timer(timer);
+
+       if (!ret) {
+               ret = del_timer_sync(timer);
+               BUG_ON(ret);
+       }
+
+       return ret;
+}
+EXPORT_SYMBOL(del_singleshot_timer_sync);
 #endif
 
 static int cascade(tvec_base_t *base, tvec_t *tv, int index)
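A minimal usage sketch (hypothetical driver code, not part of this patch): del_singleshot_timer_sync() is safe only when the timer handler never re-arms its own timer; periodic timers must keep using del_timer_sync(). The schedule_timeout() hunk further down is exactly such a single-shot caller.

    #include <linux/timer.h>
    #include <linux/jiffies.h>

    static struct timer_list my_timer;          /* hypothetical one-shot timer */

    static void my_timer_fn(unsigned long data)
    {
            /* does its work once; never calls mod_timer()/add_timer()
             * on my_timer, so the single-shot teardown below is legal */
    }

    static void my_start(void)
    {
            init_timer(&my_timer);
            my_timer.function = my_timer_fn;
            my_timer.expires = jiffies + HZ;    /* fire in one second */
            add_timer(&my_timer);
    }

    static void my_stop(void)
    {
            /* cheaper than del_timer_sync(); valid only for one-shot use */
            del_singleshot_timer_sync(&my_timer);
    }
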
@@ -584,6 +630,9 @@ static void second_overflow(void)
        if (xtime.tv_sec % 86400 == 0) {
            xtime.tv_sec--;
            wall_to_monotonic.tv_sec++;
+           /* The time interpolator will make the clock change gradually
+            * instead of jumping by a full second at once.
+            */
            time_interpolator_update(-NSEC_PER_SEC);
            time_state = TIME_OOP;
            clock_was_set();
@@ -595,6 +644,7 @@ static void second_overflow(void)
        if ((xtime.tv_sec + 1) % 86400 == 0) {
            xtime.tv_sec++;
            wall_to_monotonic.tv_sec--;
+           /* Use the time interpolator to change the time gradually */
            time_interpolator_update(NSEC_PER_SEC);
            time_state = TIME_WAIT;
            clock_was_set();
@@ -757,12 +807,12 @@ static inline void do_process_times(struct task_struct *p,
 
        psecs = (p->utime += user);
        psecs += (p->stime += system);
-       if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_cur) {
+       if (psecs / HZ >= p->rlim[RLIMIT_CPU].rlim_cur) {
                /* Send SIGXCPU every second.. */
                if (!(psecs % HZ))
                        send_sig(SIGXCPU, p, 1);
                /* and SIGKILL when we go over max.. */
-               if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_max)
+               if (psecs / HZ >= p->rlim[RLIMIT_CPU].rlim_max)
                        send_sig(SIGKILL, p, 1);
        }
 }
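Worked example for the change from > to >=: psecs counts CPU time in ticks, so with rlim_cur = 1 second the old test psecs / HZ > 1 first held at psecs == 2 * HZ, delivering the first SIGXCPU a full second past the soft limit (and likewise overshooting rlim_max before SIGKILL). With >=, the comparison first holds in the tick where psecs == HZ * rlim_cur, and the !(psecs % HZ) guard then repeats SIGXCPU once per further second of accumulated CPU time.
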
@@ -794,7 +844,7 @@ static inline void do_it_prof(struct task_struct *p)
        }
 }
 
-void update_one_process(struct task_struct *p, unsigned long user,
+static void update_one_process(struct task_struct *p, unsigned long user,
                        unsigned long system, int cpu)
 {
        do_process_times(p, user, system);
@@ -918,7 +968,7 @@ void do_timer(struct pt_regs *regs)
        update_times();
 }
 
-#if !defined(__alpha__) && !defined(__ia64__)
+#ifdef __ARCH_WANT_SYS_ALARM
 
 /*
  * For backwards compatibility?  This can be done in libc so Alpha
@@ -961,7 +1011,7 @@ asmlinkage unsigned long sys_alarm(unsigned int seconds)
  */
 asmlinkage long sys_getpid(void)
 {
-       return current->tgid;
+       return vx_map_tgid(current->tgid);
 }
 
 /*
@@ -1005,7 +1055,7 @@ asmlinkage long sys_getppid(void)
 #endif
                break;
        }
-       return pid;
+       return vx_map_pid(pid);
 }
 
 asmlinkage long sys_getuid(void)
@@ -1109,7 +1159,7 @@ fastcall signed long __sched schedule_timeout(signed long timeout)
 
        add_timer(&timer);
        schedule();
-       del_timer_sync(&timer);
+       del_singleshot_timer_sync(&timer);
 
        timeout = expire - jiffies;
 
@@ -1206,14 +1256,15 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info)
                 * too.
                 */
 
-               do_gettimeofday((struct timeval *)&tp);
-               tp.tv_nsec *= NSEC_PER_USEC;
+               getnstimeofday(&tp);
                tp.tv_sec += wall_to_monotonic.tv_sec;
                tp.tv_nsec += wall_to_monotonic.tv_nsec;
                if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
                        tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
                        tp.tv_sec++;
                }
+               if (vx_flags(VXF_VIRT_UPTIME, 0))
+                       vx_vsi_uptime(&tp, NULL);
                val.uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
 
                val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
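getnstimeofday() (assumed available in this tree) fills the timespec in nanoseconds directly, removing the do_gettimeofday() call and the usec-to-nsec multiply. The carry above is the usual normalization after adding wall_to_monotonic: both inputs have tv_nsec below NSEC_PER_SEC, so at most one carry is needed. The same step as a hypothetical stand-alone helper (not part of this patch):

    #include <linux/time.h>

    /* Fold an overflowing tv_nsec back into [0, NSEC_PER_SEC); one
     * carry suffices because both summands were already normalized.
     */
    static void timespec_carry(struct timespec *ts)
    {
            if (ts->tv_nsec >= NSEC_PER_SEC) {
                    ts->tv_nsec -= NSEC_PER_SEC;
                    ts->tv_sec++;
            }
    }
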
@@ -1223,6 +1274,9 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info)
                val.procs = nr_threads;
        } while (read_seqretry(&xtime_lock, seq));
 
+/*     if (vx_flags(VXF_VIRT_CPU, 0))
+               vx_vsi_cpu(val);
+*/
        si_meminfo(&val);
        si_swapinfo(&val);
 
@@ -1391,15 +1445,112 @@ void __init init_timers(void)
 }
 
 #ifdef CONFIG_TIME_INTERPOLATION
-volatile unsigned long last_nsec_offset;
-#ifndef __HAVE_ARCH_CMPXCHG
-spinlock_t last_nsec_offset_lock = SPIN_LOCK_UNLOCKED;
-#endif
 
 struct time_interpolator *time_interpolator;
 static struct time_interpolator *time_interpolator_list;
 static spinlock_t time_interpolator_lock = SPIN_LOCK_UNLOCKED;
 
+static inline unsigned long time_interpolator_get_cycles(unsigned int src)
+{
+       unsigned long (*x)(void);
+
+       switch (src) {
+       case TIME_SOURCE_FUNCTION:
+               x = time_interpolator->addr;
+               return x();
+
+       case TIME_SOURCE_MMIO64:
+               return readq(time_interpolator->addr);
+
+       case TIME_SOURCE_MMIO32:
+               return readl(time_interpolator->addr);
+
+       default:
+               return get_cycles();
+       }
+}
+
+static inline unsigned long time_interpolator_get_counter(void)
+{
+       unsigned int src = time_interpolator->source;
+
+       if (time_interpolator->jitter) {
+               unsigned long lcycle;
+               unsigned long now;
+
+               do {
+                       lcycle = time_interpolator->last_cycle;
+                       now = time_interpolator_get_cycles(src);
+                       if (lcycle && time_after(lcycle, now))
+                               return lcycle;
+                       /* Keep track of the last timer value returned.
+                        * The use of cmpxchg here will cause contention
+                        * in an SMP environment.
+                        */
+               } while (unlikely(cmpxchg(&time_interpolator->last_cycle,
+                                         lcycle, now) != lcycle));
+               return now;
+       }
+
+       return time_interpolator_get_cycles(src);
+}
+
+void time_interpolator_reset(void)
+{
+       time_interpolator->offset = 0;
+       time_interpolator->last_counter = time_interpolator_get_counter();
+}
+
+unsigned long time_interpolator_resolution(void)
+{
+       if (time_interpolator->frequency < NSEC_PER_SEC)
+               return NSEC_PER_SEC / time_interpolator->frequency;
+       else
+               return 1;
+}
+
+#define GET_TI_NSECS(count, i) \
+       ((((count) - (i)->last_counter) * (i)->nsec_per_cyc) >> (i)->shift)
+
+unsigned long time_interpolator_get_offset(void)
+{
+       return time_interpolator->offset +
+               GET_TI_NSECS(time_interpolator_get_counter(), time_interpolator);
+}
+
+static void time_interpolator_update(long delta_nsec)
+{
+       unsigned long counter = time_interpolator_get_counter();
+       unsigned long offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator);
+
+       /* The interpolator compensates for late ticks by accumulating
+        * the late time in time_interpolator->offset. A tick earlier than
+        * expected will lead to a reset of the offset and a corresponding
+        * jump of the clock forward. Again, this only works if the
+        * interpolator clock is running slightly slower than the regular
+        * clock, and the tuning logic ensures that.
+        */
+
+       if (delta_nsec < 0 || (unsigned long) delta_nsec < offset)
+               time_interpolator->offset = offset - delta_nsec;
+       else {
+               time_interpolator->skips++;
+               time_interpolator->ns_skipped += delta_nsec - offset;
+               time_interpolator->offset = 0;
+       }
+       time_interpolator->last_counter = counter;
+
+       /* Tuning logic for the time interpolator, invoked every minute or
+        * so.  Decrease the interpolator clock speed if no skips occurred
+        * and an offset is carried; increase it if we skipped too much time.
+        */
+       if (jiffies % INTERPOLATOR_ADJUST == 0) {
+               if (time_interpolator->skips == 0 && time_interpolator->offset > TICK_NSEC)
+                       time_interpolator->nsec_per_cyc--;
+               if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0)
+                       time_interpolator->nsec_per_cyc++;
+               time_interpolator->skips = 0;
+               time_interpolator->ns_skipped = 0;
+       }
+}
+
 static inline int
 is_better_time_interpolator(struct time_interpolator *new)
 {
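The jitter branch of time_interpolator_get_counter() above is a lock-free monotonicity filter: readers race to publish the newest cycle count via cmpxchg(), and a sample that time_after() shows to be older than the last published value is discarded in favor of that value, so the counter never appears to run backwards across CPUs. The same pattern in a generic, illustrative form (not part of this patch):

    #include <linux/jiffies.h>      /* time_after() */
    #include <asm/system.h>         /* cmpxchg() in this era */

    /* Return a sample that never decreases even if read() jitters
     * between CPUs; 'last' holds the newest value published so far.
     */
    static unsigned long monotonic_sample(unsigned long *last,
                                          unsigned long (*read)(void))
    {
            unsigned long old, now;

            do {
                    old = *last;
                    now = read();
                    if (old && time_after(old, now))
                            return old;     /* stale sample */
            } while (cmpxchg(last, old, now) != old);

            return now;
    }
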
@@ -1412,11 +1563,16 @@ is_better_time_interpolator(struct time_interpolator *new)
 void
 register_time_interpolator(struct time_interpolator *ti)
 {
+       unsigned long flags;
+
+       ti->nsec_per_cyc = (NSEC_PER_SEC << ti->shift) / ti->frequency;
        spin_lock(&time_interpolator_lock);
-       write_seqlock_irq(&xtime_lock);
-       if (is_better_time_interpolator(ti))
+       write_seqlock_irqsave(&xtime_lock, flags);
+       if (is_better_time_interpolator(ti)) {
                time_interpolator = ti;
-       write_sequnlock_irq(&xtime_lock);
+               time_interpolator_reset();
+       }
+       write_sequnlock_irqrestore(&xtime_lock, flags);
 
        ti->next = time_interpolator_list;
        time_interpolator_list = ti;
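nsec_per_cyc, precomputed above, is a fixed-point nanoseconds-per-cycle factor: nsec_per_cyc = (NSEC_PER_SEC << shift) / frequency, so GET_TI_NSECS() needs only a multiply and a shift on the read path, never a division. With illustrative numbers (not from this patch): a 10 MHz source and shift = 16 give nsec_per_cyc = (10^9 << 16) / 10^7 = 100 << 16 = 6553600, and a delta of 25 cycles yields (25 * 6553600) >> 16 = 2500 ns, exactly 100 ns per cycle. The tuning code in time_interpolator_update() nudges nsec_per_cyc by +/-1, i.e. by 2^-shift ns per cycle, which is how it keeps the interpolator clock marginally slower than the tick clock.
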
@@ -1427,6 +1583,7 @@ void
 unregister_time_interpolator(struct time_interpolator *ti)
 {
        struct time_interpolator *curr, **prev;
+       unsigned long flags;
 
        spin_lock(&time_interpolator_lock);
        prev = &time_interpolator_list;
@@ -1438,7 +1595,7 @@ unregister_time_interpolator(struct time_interpolator *ti)
                prev = &curr->next;
        }
 
-       write_seqlock_irq(&xtime_lock);
+       write_seqlock_irqsave(&xtime_lock, flags);
        if (ti == time_interpolator) {
                /* we lost the best time-interpolator: */
                time_interpolator = NULL;
@@ -1446,8 +1603,42 @@ unregister_time_interpolator(struct time_interpolator *ti)
                for (curr = time_interpolator_list; curr; curr = curr->next)
                        if (is_better_time_interpolator(curr))
                                time_interpolator = curr;
+               time_interpolator_reset();
        }
-       write_sequnlock_irq(&xtime_lock);
+       write_sequnlock_irqrestore(&xtime_lock, flags);
        spin_unlock(&time_interpolator_lock);
 }
 #endif /* CONFIG_TIME_INTERPOLATION */
+
+/**
+ * msleep - sleep safely even with waitqueue interruptions
+ * @msecs: Time in milliseconds to sleep for
+ */
+void msleep(unsigned int msecs)
+{
+       unsigned long timeout = msecs_to_jiffies(msecs);
+
+       while (timeout) {
+               set_current_state(TASK_UNINTERRUPTIBLE);
+               timeout = schedule_timeout(timeout);
+       }
+}
+
+EXPORT_SYMBOL(msleep);
+
+/**
+ * msleep_interruptible - sleep waiting for waitqueue interruptions
+ * @msecs: Time in milliseconds to sleep for
+ */
+unsigned long msleep_interruptible(unsigned int msecs)
+{
+       unsigned long timeout = msecs_to_jiffies(msecs);
+
+       while (timeout && !signal_pending(current)) {
+               set_current_state(TASK_INTERRUPTIBLE);
+               timeout = schedule_timeout(timeout);
+       }
+       return jiffies_to_msecs(timeout);
+}
+
+EXPORT_SYMBOL(msleep_interruptible);
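
A usage sketch for the two new helpers (hypothetical caller, assuming the usual declarations in linux/delay.h): msleep() always completes the full delay, while msleep_interruptible() returns the unslept remainder in milliseconds when a signal cuts it short.

    #include <linux/delay.h>
    #include <linux/errno.h>

    static int settle_hardware(void)
    {
            msleep(10);             /* sleeps at least 10 ms, signals or not */

            /* interruptible wait: give up early if a signal is pending */
            if (msleep_interruptible(500))
                    return -EINTR;  /* woken early; time was left over */

            return 0;
    }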