fedora core 6 1.2949 + vserver 2.2.0
diff --git a/kernel/vserver/sched.c b/kernel/vserver/sched.c
index d3e58be..f009a9b 100644
--- a/kernel/vserver/sched.c
+++ b/kernel/vserver/sched.c
@@ -3,10 +3,11 @@
  *
  *  Virtual Server: Scheduler Support
  *
- *  Copyright (C) 2004-2005  Herbert Pötzl
+ *  Copyright (C) 2004-2007  Herbert Pötzl
  *
  *  V0.01  adapted Sam Vilain's version to 2.6.3
  *  V0.02  removed legacy interface
+ *  V0.03  changed vcmds to vxi arg
  *
  */
 
 #include <asm/errno.h>
 #include <asm/uaccess.h>
 
+#define vxd_check_range(val, min, max) do {            \
+       vxlprintk(((val) < (min)) || ((val) > (max)),   \
+               "check_range(%ld,%ld,%ld)",             \
+               (long)(val), (long)(min), (long)(max),  \
+               __FILE__, __LINE__);                    \
+       } while (0)
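
An aside on the debug helper: as used here, vxlprintk() takes the condition, the format string, the format arguments, and finally the __FILE__/__LINE__ pair. A minimal userspace analogue of the same range check, with fprintf() standing in for vxlprintk() (that substitution is an assumption of the sketch, not the kernel API):

    #include <stdio.h>

    /* userspace stand-in for vxd_check_range(); logs only when the
     * value falls outside [min, max] */
    #define check_range(val, min, max) do {                             \
            if (((val) < (min)) || ((val) > (max)))                     \
                    fprintf(stderr, "check_range(%ld,%ld,%ld) @%s:%d\n",\
                            (long)(val), (long)(min), (long)(max),      \
                            __FILE__, __LINE__);                        \
    } while (0)

    int main(void)
    {
            check_range(5L, 0L, 10L);       /* silent: in range */
            check_range(42L, 0L, 10L);      /* logs: out of range */
            return 0;
    }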
+
+
+void vx_update_sched_param(struct _vx_sched *sched,
+       struct _vx_sched_pc *sched_pc)
+{
+       unsigned int set_mask = sched->update_mask;
+
+       if (set_mask & VXSM_FILL_RATE)
+               sched_pc->fill_rate[0] = sched->fill_rate[0];
+       if (set_mask & VXSM_INTERVAL)
+               sched_pc->interval[0] = sched->interval[0];
+       if (set_mask & VXSM_FILL_RATE2)
+               sched_pc->fill_rate[1] = sched->fill_rate[1];
+       if (set_mask & VXSM_INTERVAL2)
+               sched_pc->interval[1] = sched->interval[1];
+       if (set_mask & VXSM_TOKENS)
+               sched_pc->tokens = sched->tokens;
+       if (set_mask & VXSM_TOKENS_MIN)
+               sched_pc->tokens_min = sched->tokens_min;
+       if (set_mask & VXSM_TOKENS_MAX)
+               sched_pc->tokens_max = sched->tokens_max;
+       if (set_mask & VXSM_PRIO_BIAS)
+               sched_pc->prio_bias = sched->prio_bias;
+
+       if (set_mask & VXSM_IDLE_TIME)
+               sched_pc->flags |= VXSF_IDLE_TIME;
+       else
+               sched_pc->flags &= ~VXSF_IDLE_TIME;
+
+       /* reset time */
+       sched_pc->norm_time = jiffies;
+}
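
vx_update_sched_param() pushes only the masked fields from the per-context settings into one per-CPU copy. A self-contained sketch of the same mask-driven update; the struct layouts and mask values below are stand-ins, the real VXSM_* constants live in the vserver headers:

    #include <stdio.h>

    #define XSM_TOKENS      0x0010  /* stand-in mask bits */
    #define XSM_TOKENS_MAX  0x0040

    struct sched_global { unsigned int update_mask; int tokens, tokens_max; };
    struct sched_percpu { int tokens, tokens_max; };

    /* copy only the fields whose bit is set in update_mask */
    static void update_param(struct sched_global *g, struct sched_percpu *pc)
    {
            if (g->update_mask & XSM_TOKENS)
                    pc->tokens = g->tokens;
            if (g->update_mask & XSM_TOKENS_MAX)
                    pc->tokens_max = g->tokens_max;
    }

    int main(void)
    {
            struct sched_global g = { XSM_TOKENS, 500, 0 };
            struct sched_percpu pc = { 100, 1000 };

            update_param(&g, &pc);
            /* tokens updated to 500, tokens_max left at 1000 */
            printf("tokens=%d tokens_max=%d\n", pc.tokens, pc.tokens_max);
            return 0;
    }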
+
 
 /*
  * recalculate the context's scheduling tokens
  *
  * ret > 0 : number of tokens available
- * ret = 0 : context is paused
- * ret < 0 : number of jiffies until new tokens arrive
+ * ret = 0 : on hold while CONFIG_VSERVER_HARDCPU is disabled
+ * ret < 0 : on hold, check delta_min[]
+ *           -1 : jiffies bucket only
+ *           -2 : idle-time bucket as well
  *
  */
-int vx_tokens_recalc(struct vx_info *vxi)
+int vx_tokens_recalc(struct _vx_sched_pc *sched_pc,
+       unsigned long *norm_time, unsigned long *idle_time, int delta_min[2])
 {
-       long delta, tokens = 0;
+       long delta;
+       long tokens = 0;
+       int flags = sched_pc->flags;
 
-       if (vx_info_flags(vxi, VXF_SCHED_PAUSE, 0))
-               /* we are paused */
-               return 0;
+       /* how much time has passed? */
+       delta = *norm_time - sched_pc->norm_time;
+       vxd_check_range(delta, 0, INT_MAX);
 
-       delta = jiffies - vxi->sched.jiffies;
-
-       if (delta >= vxi->sched.interval) {
-               /* lockdown scheduler info */
-               spin_lock(&vxi->sched.tokens_lock);
+       if (delta >= sched_pc->interval[0]) {
+               long tokens, integral;
 
                /* calc integral token part */
-               delta = jiffies - vxi->sched.jiffies;
-               tokens = delta / vxi->sched.interval;
-               delta = tokens * vxi->sched.interval;
-               tokens *= vxi->sched.fill_rate;
-
-               atomic_add(tokens, &vxi->sched.tokens);
-               vxi->sched.jiffies += delta;
-               tokens = atomic_read(&vxi->sched.tokens);
-
-               if (tokens > vxi->sched.tokens_max) {
-                       tokens = vxi->sched.tokens_max;
-                       atomic_set(&vxi->sched.tokens, tokens);
+               tokens = delta / sched_pc->interval[0];
+               integral = tokens * sched_pc->interval[0];
+               tokens *= sched_pc->fill_rate[0];
+#ifdef CONFIG_VSERVER_HARDCPU
+               delta_min[0] = delta - integral;
+               vxd_check_range(delta_min[0], 0, sched_pc->interval[0]);
+#endif
+               /* advance time */
+               sched_pc->norm_time += delta;
+
+               /* add tokens */
+               sched_pc->tokens += tokens;
+               sched_pc->token_time += tokens;
+       }
+       else
+               delta_min[0] = delta;
+
+#ifdef CONFIG_VSERVER_IDLETIME
+       if (!(flags & VXSF_IDLE_TIME))
+               goto skip_idle;
+
+       /* how much idle time was skipped? */
+       delta = *idle_time - sched_pc->idle_time;
+       vxd_check_range(delta, 0, INT_MAX);
+
+       if (delta >= sched_pc->interval[1]) {
+               long tokens, integral;
+
+               /* calc fair share token part */
+               tokens = delta / sched_pc->interval[1];
+               integral = tokens * sched_pc->interval[1];
+               tokens *= sched_pc->fill_rate[1];
+               delta_min[1] = delta - integral;
+               vxd_check_range(delta_min[1], 0, sched_pc->interval[1]);
+
+               /* advance idle time */
+               sched_pc->idle_time += integral;
+
+               /* add tokens */
+               sched_pc->tokens += tokens;
+               sched_pc->token_time += tokens;
+       }
+       else
+               delta_min[1] = delta;
+skip_idle:
+#endif
+
+       /* clip at maximum */
+       if (sched_pc->tokens > sched_pc->tokens_max)
+               sched_pc->tokens = sched_pc->tokens_max;
+       tokens = sched_pc->tokens;
+
+       if (flags & VXSF_ONHOLD) {
+               /* can we unhold? */
+               if (tokens >= sched_pc->tokens_min) {
+                       flags &= ~VXSF_ONHOLD;
+                       sched_pc->hold_ticks +=
+                               *norm_time - sched_pc->onhold;
                }
-               spin_unlock(&vxi->sched.tokens_lock);
+               else
+                       goto on_hold;
        } else {
-               /* no new tokens */
-               tokens = vx_tokens_avail(vxi);
-               if (tokens <= 0)
-                       vxi->vx_state |= VXS_ONHOLD;
-               if (tokens < vxi->sched.tokens_min) {
-                       /* enough tokens will be available in */
-                       if (vxi->sched.tokens_min == 0)
-                               return delta - vxi->sched.interval;
-                       return delta - vxi->sched.interval *
-                               vxi->sched.tokens_min / vxi->sched.fill_rate;
+               /* put on hold? */
+               if (tokens <= 0) {
+                       flags |= VXSF_ONHOLD;
+                       sched_pc->onhold = *norm_time;
+                       goto on_hold;
                }
        }
+       sched_pc->flags = flags;
+       return tokens;
 
-       /* we have some tokens left */
-       if (vx_info_state(vxi, VXS_ONHOLD) &&
-               (tokens >= vxi->sched.tokens_min))
-               vxi->vx_state &= ~VXS_ONHOLD;
-       if (vx_info_state(vxi, VXS_ONHOLD))
-               tokens -= vxi->sched.tokens_min;
+on_hold:
+       tokens = sched_pc->tokens_min - tokens;
+       sched_pc->flags = flags;
+       BUG_ON(tokens < 0);
+
+#ifdef CONFIG_VSERVER_HARDCPU
+       /* next interval? */
+       if (!sched_pc->fill_rate[0])
+               delta_min[0] = HZ;
+       else if (tokens > sched_pc->fill_rate[0])
+               delta_min[0] += sched_pc->interval[0] *
+                       tokens / sched_pc->fill_rate[0];
+       else
+               delta_min[0] = sched_pc->interval[0] - delta_min[0];
+       vxd_check_range(delta_min[0], 0, INT_MAX);
+
+#ifdef CONFIG_VSERVER_IDLETIME
+       if (!(flags & VXSF_IDLE_TIME))
+               return -1;
+
+       /* next interval? */
+       if (!sched_pc->fill_rate[1])
+               delta_min[1] = HZ;
+       else if (tokens > sched_pc->fill_rate[1])
+               delta_min[1] += sched_pc->interval[1] *
+                       tokens / sched_pc->fill_rate[1];
+       else
+               delta_min[1] = sched_pc->interval[1] - delta_min[1];
+       vxd_check_range(delta_min[1], 0, INT_MAX);
+
+       return -2;
+#else
+       return -1;
+#endif /* CONFIG_VSERVER_IDLETIME */
+#else
+       return 0;
+#endif /* CONFIG_VSERVER_HARDCPU */
+}
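
At its core this is a token bucket: every interval[0] ticks, fill_rate[0] tokens arrive, and the balance is clipped at tokens_max. A standalone sketch of that refill arithmetic with invented numbers; note it advances time in whole intervals to keep the sub-interval remainder, mirroring the idle-time branch above (the jiffies branch advances norm_time by the full delta and tracks the remainder in delta_min[0] instead):

    #include <stdio.h>

    struct bucket { long norm_time, interval, fill_rate, tokens, tokens_max; };

    static long refill(struct bucket *b, long now)
    {
            long delta = now - b->norm_time;

            if (delta >= b->interval) {
                    long intervals = delta / b->interval;

                    /* integral token part */
                    b->tokens += intervals * b->fill_rate;
                    if (b->tokens > b->tokens_max)
                            b->tokens = b->tokens_max;
                    /* advance time in whole intervals only */
                    b->norm_time += intervals * b->interval;
            }
            return b->tokens;
    }

    int main(void)
    {
            struct bucket b = { 0, 4, 1, 0, 100 };  /* 1 token per 4 ticks */

            printf("%ld\n", refill(&b, 10));        /* 2 tokens, 2 ticks carried */
            return 0;
    }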
 
-       return tokens;
+static inline unsigned long msec_to_ticks(unsigned long msec)
+{
+       return msecs_to_jiffies(msec);
 }
 
-/*
- * effective_prio - return the priority that is based on the static
- * priority but is modified by bonuses/penalties.
- *
- * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
- * into a -4 ... 0 ... +4 bonus/penalty range.
- *
- * Additionally, we scale another amount based on the number of
- * CPU tokens currently held by the context, if the process is
- * part of a context (and the appropriate SCHED flag is set).
- * This ranges from -5 ... 0 ... +15, quadratically.
- *
- * So, the total bonus is -9 .. 0 .. +19
- * We use ~50% of the full 0...39 priority range so that:
- *
- * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
- * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
- *    unless that context is far exceeding its CPU allocation.
- *
- * Both properties are important to certain workloads.
- */
-int vx_effective_vavavoom(struct vx_info *vxi, int max_prio)
+static inline unsigned long ticks_to_msec(unsigned long ticks)
+{
+       return jiffies_to_msecs(ticks);
+}
+
+static inline unsigned long ticks_to_usec(unsigned long ticks)
+{
+       return jiffies_to_usecs(ticks);
+}
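
These wrappers let the rest of the file talk in milliseconds and microseconds while the scheduler state stays in ticks. Roughly, with HZ ticks per second (a simplified, truncating version of the conversion; the kernel's msecs_to_jiffies() actually rounds up and guards against overflow, and the HZ value here is assumed for the example):

    #include <stdio.h>

    #define HZ 250  /* assumed config value for the example */

    static unsigned long msec_to_ticks(unsigned long msec)
    {
            return msec * HZ / 1000;        /* simplified, truncating */
    }

    static unsigned long ticks_to_msec(unsigned long ticks)
    {
            return ticks * 1000 / HZ;
    }

    int main(void)
    {
            /* prints: 250 100 */
            printf("%lu %lu\n", msec_to_ticks(1000), ticks_to_msec(25));
            return 0;
    }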
+
+
+static int do_set_sched(struct vx_info *vxi, struct vcmd_sched_v5 *data)
+{
+       unsigned int set_mask = data->mask;
+       unsigned int update_mask;
+       int i, cpu;
+
+       /* Sanity check data values */
+       if (data->tokens_max <= 0)
+               data->tokens_max = HZ;
+       if (data->tokens_min < 0)
+               data->tokens_min = HZ/3;
+       if (data->tokens_min >= data->tokens_max)
+               data->tokens_min = data->tokens_max;
+
+       if (data->prio_bias > MAX_PRIO_BIAS)
+               data->prio_bias = MAX_PRIO_BIAS;
+       if (data->prio_bias < MIN_PRIO_BIAS)
+               data->prio_bias = MIN_PRIO_BIAS;
+
+       spin_lock(&vxi->sched.tokens_lock);
+
+       /* sync up on delayed updates */
+       for_each_cpu_mask(cpu, vxi->sched.update)
+               vx_update_sched_param(&vxi->sched,
+                       &vx_per_cpu(vxi, sched_pc, cpu));
+
+       if (set_mask & VXSM_FILL_RATE)
+               vxi->sched.fill_rate[0] = data->fill_rate[0];
+       if (set_mask & VXSM_FILL_RATE2)
+               vxi->sched.fill_rate[1] = data->fill_rate[1];
+       if (set_mask & VXSM_INTERVAL)
+               vxi->sched.interval[0] = (set_mask & VXSM_MSEC) ?
+                       msec_to_ticks(data->interval[0]) : data->interval[0];
+       if (set_mask & VXSM_INTERVAL2)
+               vxi->sched.interval[1] = (set_mask & VXSM_MSEC) ?
+                       msec_to_ticks(data->interval[1]) : data->interval[1];
+       if (set_mask & VXSM_TOKENS)
+               vxi->sched.tokens = data->tokens;
+       if (set_mask & VXSM_TOKENS_MIN)
+               vxi->sched.tokens_min = data->tokens_min;
+       if (set_mask & VXSM_TOKENS_MAX)
+               vxi->sched.tokens_max = data->tokens_max;
+       if (set_mask & VXSM_PRIO_BIAS)
+               vxi->sched.prio_bias = data->prio_bias;
+
+       /* Sanity check rate/interval on the stored values; the
+        * settings were already copied from data above, so checking
+        * data here would have no effect */
+       for (i = 0; i < 2; i++) {
+               if (vxi->sched.fill_rate[i] < 0)
+                       vxi->sched.fill_rate[i] = 0;
+               if (vxi->sched.interval[i] <= 0)
+                       vxi->sched.interval[i] = HZ;
+       }
+
+       update_mask = vxi->sched.update_mask & VXSM_SET_MASK;
+       update_mask |= (set_mask & (VXSM_SET_MASK|VXSM_IDLE_TIME));
+       vxi->sched.update_mask = update_mask;
+#ifdef CONFIG_SMP
+       rmb();
+       if (set_mask & VXSM_CPU_ID) {
+               vxi->sched.update = cpumask_of_cpu(data->cpu_id);
+               cpus_and(vxi->sched.update, cpu_online_map,
+                       vxi->sched.update);
+       } else
+               vxi->sched.update = cpu_online_map;
+
+       /* forced reload? */
+       if (set_mask & VXSM_FORCE) {
+               for_each_cpu_mask(cpu, vxi->sched.update)
+                       vx_update_sched_param(&vxi->sched,
+                               &vx_per_cpu(vxi, sched_pc, cpu));
+               vxi->sched.update = CPU_MASK_NONE;
+       }
+#else
+       /* on UP we update immediately */
+       vx_update_sched_param(&vxi->sched,
+               &vx_per_cpu(vxi, sched_pc, 0));
+#endif
+
+       spin_unlock(&vxi->sched.tokens_lock);
+       return 0;
+}
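
The update_mask bookkeeping above is worth unpacking: bits still pending from an earlier request survive (they have not been synced to every CPU yet), and the new request ORs in its own set bits plus the idle-time flag. The same accumulation in isolation, with stand-in mask values (the real VXSM_SET_MASK and friends are defined in the vserver headers):

    #include <stdio.h>

    #define XSM_INTERVAL    0x0002  /* stand-in mask bits */
    #define XSM_TOKENS      0x0010
    #define XSM_SET_MASK    0x00ff  /* stand-in for VXSM_SET_MASK */
    #define XSM_IDLE_TIME   0x0200

    int main(void)
    {
            unsigned int update_mask = XSM_TOKENS;  /* still pending */
            unsigned int set_mask = XSM_INTERVAL | XSM_IDLE_TIME;

            update_mask &= XSM_SET_MASK;
            update_mask |= set_mask & (XSM_SET_MASK | XSM_IDLE_TIME);
            /* prints 0x212: old pending bit, new bit, idle flag */
            printf("0x%x\n", update_mask);
            return 0;
    }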
+
+#define COPY_IDS(C) C(cpu_id); C(bucket_id)
+#define COPY_PRI(C) C(prio_bias)
+#define COPY_TOK(C) C(tokens); C(tokens_min); C(tokens_max)
+#define COPY_FRI(C) C(fill_rate[0]); C(interval[0]);   \
+                   C(fill_rate[1]); C(interval[1]);
+
+#define COPY_VALUE(name) vc_data.name = data->name
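
These are X-macro style field lists: COPY_TOK(COPY_VALUE) expands to three assignments (vc_data.tokens = data->tokens; and so on), so one field list can be replayed against whatever `data` happens to point at in the enclosing scope. A minimal self-contained version of the pattern:

    #include <stdio.h>

    struct src { int a, b; };
    struct dst { int a, b; } out;

    #define COPY_AB(C)      C(a); C(b)
    #define COPY_VALUE(name) out.name = in->name

    /* COPY_AB(COPY_VALUE) expands to: out.a = in->a; out.b = in->b; */
    static void copy_fields(const struct src *in)
    {
            COPY_AB(COPY_VALUE);
    }

    int main(void)
    {
            struct src in = { 1, 2 };

            copy_fields(&in);
            printf("%d %d\n", out.a, out.b);        /* 1 2 */
            return 0;
    }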
+
+static int do_set_sched_v4(struct vx_info *vxi, struct vcmd_set_sched_v4 *data)
 {
-       int vavavoom, max;
-
-       /* lots of tokens = lots of vavavoom
-        *      no tokens = no vavavoom      */
-       if ((vavavoom = atomic_read(&vxi->sched.tokens)) >= 0) {
-               max = vxi->sched.tokens_max;
-               vavavoom = max - vavavoom;
-               max = max * max;
-               vavavoom = max_prio * VAVAVOOM_RATIO / 100
-                       * (vavavoom*vavavoom - (max >> 2)) / max;
-       } else
-               vavavoom = 0;
-
-       vxi->sched.vavavoom = vavavoom;
-       return vavavoom + vxi->sched.priority_bias;
+       struct vcmd_sched_v5 vc_data;
+
+       vc_data.mask = data->set_mask;
+       COPY_IDS(COPY_VALUE);
+       COPY_PRI(COPY_VALUE);
+       COPY_TOK(COPY_VALUE);
+       vc_data.fill_rate[0] = vc_data.fill_rate[1] = data->fill_rate;
+       vc_data.interval[0] = vc_data.interval[1] = data->interval;
+       return do_set_sched(vxi, &vc_data);
 }
 
+#ifdef CONFIG_VSERVER_LEGACY
+
+#define COPY_MASK_V2(name, mask) do {                  \
+       if (vc_data.name != SCHED_KEEP) {               \
+               vc_data_v4.name = vc_data.name;         \
+               vc_data_v4.set_mask |= mask;            \
+       }                                               \
+       } while (0)
 
-int vc_set_sched_v2(uint32_t xid, void __user *data)
+int vc_set_sched_v2(struct vx_info *vxi, void __user *data)
 {
        struct vcmd_set_sched_v2 vc_data;
-       struct vx_info *vxi;
+       struct vcmd_set_sched_v4 vc_data_v4 = { .set_mask = 0 };
 
        if (copy_from_user (&vc_data, data, sizeof(vc_data)))
                return -EFAULT;
 
-       vxi = lookup_vx_info(xid);
-       if (!vxi)
-               return -EINVAL;
+       COPY_MASK_V2(fill_rate,  VXSM_FILL_RATE);
+       COPY_MASK_V2(interval,   VXSM_INTERVAL);
+       COPY_MASK_V2(tokens,     VXSM_TOKENS);
+       COPY_MASK_V2(tokens_min, VXSM_TOKENS_MIN);
+       COPY_MASK_V2(tokens_max, VXSM_TOKENS_MAX);
+       vc_data_v4.bucket_id = 0;
 
-       spin_lock(&vxi->sched.tokens_lock);
+       return do_set_sched_v4(vxi, &vc_data_v4);
+}
+#endif
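
The legacy v2 path shows how the old SCHED_KEEP sentinel ("leave this field alone") is translated into the newer mask-based interface, so everything past the shim deals only in mask bits. That translation step in isolation; the sentinel and mask values below are stand-ins:

    #include <stdio.h>

    #define KEEP            (-2)    /* stand-in for SCHED_KEEP */
    #define XSM_TOKENS      0x0010  /* stand-in mask bit */

    int main(void)
    {
            int tokens = 500;       /* legacy request field */
            int set_mask = 0, out_tokens = 0;

            /* any field not equal to the sentinel is copied and its
             * mask bit set; sentinel fields stay untouched */
            if (tokens != KEEP) {
                    out_tokens = tokens;
                    set_mask |= XSM_TOKENS;
            }
            printf("mask=0x%x tokens=%d\n", set_mask, out_tokens);
            return 0;
    }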
 
-       if (vc_data.interval != SCHED_KEEP)
-               vxi->sched.interval = vc_data.interval;
-       if (vc_data.fill_rate != SCHED_KEEP)
-               vxi->sched.fill_rate = vc_data.fill_rate;
-       if (vc_data.tokens_min != SCHED_KEEP)
-               vxi->sched.tokens_min = vc_data.tokens_min;
-       if (vc_data.tokens_max != SCHED_KEEP)
-               vxi->sched.tokens_max = vc_data.tokens_max;
-       if (vc_data.tokens != SCHED_KEEP)
-               atomic_set(&vxi->sched.tokens, vc_data.tokens);
-
-       /* Sanity check the resultant values */
-       if (vxi->sched.fill_rate <= 0)
-               vxi->sched.fill_rate = 1;
-       if (vxi->sched.interval <= 0)
-               vxi->sched.interval = HZ;
-       if (vxi->sched.tokens_max == 0)
-               vxi->sched.tokens_max = 1;
-       if (atomic_read(&vxi->sched.tokens) > vxi->sched.tokens_max)
-               atomic_set(&vxi->sched.tokens, vxi->sched.tokens_max);
-       if (vxi->sched.tokens_min > vxi->sched.tokens_max)
-               vxi->sched.tokens_min = vxi->sched.tokens_max;
+int vc_set_sched_v3(struct vx_info *vxi, void __user *data)
+{
+       struct vcmd_set_sched_v3 vc_data;
+       struct vcmd_set_sched_v4 vc_data_v4;
 
-       spin_unlock(&vxi->sched.tokens_lock);
-       put_vx_info(vxi);
+       if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+               return -EFAULT;
+
+       /* structures are binary compatible */
+       memcpy(&vc_data_v4, &vc_data, sizeof(vc_data));
+       vc_data_v4.set_mask &= VXSM_V3_MASK;
+       vc_data_v4.bucket_id = 0;
+
+       return do_set_sched_v4(vxi, &vc_data_v4);
+}
+
+int vc_set_sched_v4(struct vx_info *vxi, void __user *data)
+{
+       struct vcmd_set_sched_v4 vc_data;
+
+       if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+               return -EFAULT;
+
+       return do_set_sched_v4(vxi, &vc_data);
+}
+
+/* latest interface is v5 */
+
+int vc_set_sched(struct vx_info *vxi, void __user *data)
+{
+       struct vcmd_sched_v5 vc_data;
+
+       if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+               return -EFAULT;
+
+       return do_set_sched(vxi, &vc_data);
+}
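
So the version ladder is: v2 translates SCHED_KEEP sentinels into mask bits, v3 is a binary prefix of v4 (hence the memcpy above), v4 duplicates its single fill_rate/interval pair into both buckets, and v5 passes through unchanged. The v3-to-v4 prefix trick in isolation, with stand-in layouts (the real structs carry more fields):

    #include <stdio.h>
    #include <string.h>

    struct req_v3 { unsigned int set_mask; int fill_rate, interval; };
    struct req_v4 { unsigned int set_mask; int fill_rate, interval;
                    int bucket_id; };

    int main(void)
    {
            struct req_v3 old = { 0x3, 25, 100 };
            struct req_v4 new_req;

            /* the old struct is a binary prefix of the new one, so a
             * memcpy of the small struct fills the shared fields */
            memcpy(&new_req, &old, sizeof(old));
            new_req.bucket_id = 0;  /* new field, defaulted */
            printf("%u %d %d %d\n", new_req.set_mask, new_req.fill_rate,
                   new_req.interval, new_req.bucket_id);
            return 0;
    }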
+
+
+int vc_get_sched(struct vx_info *vxi, void __user *data)
+{
+       struct vcmd_sched_v5 vc_data;
+
+       if (copy_from_user (&vc_data, data, sizeof(vc_data)))
+               return -EFAULT;
+
+       if (vc_data.mask & VXSM_CPU_ID) {
+               int cpu = vc_data.cpu_id;
+               struct _vx_sched_pc *data;
+
+               if (!cpu_possible(cpu))
+                       return -EINVAL;
+
+               data = &vx_per_cpu(vxi, sched_pc, cpu);
+               COPY_TOK(COPY_VALUE);
+               COPY_PRI(COPY_VALUE);
+               COPY_FRI(COPY_VALUE);
+
+               if (data->flags & VXSF_IDLE_TIME)
+                       vc_data.mask |= VXSM_IDLE_TIME;
+       } else {
+               struct _vx_sched *data = &vxi->sched;
+
+               COPY_TOK(COPY_VALUE);
+               COPY_PRI(COPY_VALUE);
+               COPY_FRI(COPY_VALUE);
+       }
+
+       if (vc_data.mask & VXSM_MSEC) {
+               vc_data.interval[0] = ticks_to_msec(vc_data.interval[0]);
+               vc_data.interval[1] = ticks_to_msec(vc_data.interval[1]);
+       }
+
+       if (copy_to_user (data, &vc_data, sizeof(vc_data)))
+               return -EFAULT;
        return 0;
 }
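
Two details in the getter: the inner `data` pointers deliberately shadow the __user parameter so the COPY_* field lists above can be replayed against either the per-CPU or the global state, and intervals set via VXSM_MSEC quantize to tick granularity on the round trip. That quantization, with an assumed HZ of 250 and simplified truncating conversions (the kernel's msecs_to_jiffies() rounds up, so the exact numbers differ):

    #include <stdio.h>

    #define HZ 250  /* assumed for the example */

    int main(void)
    {
            unsigned long ticks = 110 * HZ / 1000;  /* msec -> ticks: 27 */
            unsigned long msec = ticks * 1000 / HZ; /* ticks -> msec: 108 */

            /* sub-tick precision is lost: 110 ms -> 27 ticks -> 108 ms */
            printf("110 ms -> %lu ticks -> %lu ms\n", ticks, msec);
            return 0;
    }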
 
 
-int vc_set_sched(uint32_t xid, void __user *data)
+int vc_sched_info(struct vx_info *vxi, void __user *data)
 {
-       struct vcmd_set_sched_v3 vc_data;
-       struct vx_info *vxi;
-       unsigned int set_mask;
+       struct vcmd_sched_info vc_data;
+       int cpu;
 
        if (copy_from_user (&vc_data, data, sizeof(vc_data)))
                return -EFAULT;
 
-       vxi = lookup_vx_info(xid);
-       if (!vxi)
+       cpu = vc_data.cpu_id;
+       if (!cpu_possible(cpu))
                return -EINVAL;
 
-       set_mask = vc_data.set_mask;
+       if (vxi) {
+               struct _vx_sched_pc *sched_pc =
+                       &vx_per_cpu(vxi, sched_pc, cpu);
 
-       spin_lock(&vxi->sched.tokens_lock);
-
-       if (set_mask & VXSM_FILL_RATE)
-               vxi->sched.fill_rate = vc_data.fill_rate;
-       if (set_mask & VXSM_INTERVAL)
-               vxi->sched.interval = vc_data.interval;
-       if (set_mask & VXSM_TOKENS)
-               atomic_set(&vxi->sched.tokens, vc_data.tokens);
-       if (set_mask & VXSM_TOKENS_MIN)
-               vxi->sched.tokens_min = vc_data.tokens_min;
-       if (set_mask & VXSM_TOKENS_MAX)
-               vxi->sched.tokens_max = vc_data.tokens_max;
-       if (set_mask & VXSM_PRIO_BIAS)
-               vxi->sched.priority_bias = vc_data.priority_bias;
-
-       /* Sanity check the resultant values */
-       if (vxi->sched.fill_rate <= 0)
-               vxi->sched.fill_rate = 1;
-       if (vxi->sched.interval <= 0)
-               vxi->sched.interval = HZ;
-       if (vxi->sched.tokens_max == 0)
-               vxi->sched.tokens_max = 1;
-       if (atomic_read(&vxi->sched.tokens) > vxi->sched.tokens_max)
-               atomic_set(&vxi->sched.tokens, vxi->sched.tokens_max);
-       if (vxi->sched.tokens_min > vxi->sched.tokens_max)
-               vxi->sched.tokens_min = vxi->sched.tokens_max;
-       if (vxi->sched.priority_bias > MAX_PRIO_BIAS)
-               vxi->sched.priority_bias = MAX_PRIO_BIAS;
-       if (vxi->sched.priority_bias < MIN_PRIO_BIAS)
-               vxi->sched.priority_bias = MIN_PRIO_BIAS;
+               vc_data.user_msec = ticks_to_msec(sched_pc->user_ticks);
+               vc_data.sys_msec = ticks_to_msec(sched_pc->sys_ticks);
+               vc_data.hold_msec = ticks_to_msec(sched_pc->hold_ticks);
+               vc_data.vavavoom = sched_pc->vavavoom;
+       }
+       vc_data.token_usec = ticks_to_usec(1);
 
-       spin_unlock(&vxi->sched.tokens_lock);
-       put_vx_info(vxi);
+       if (copy_to_user (data, &vc_data, sizeof(vc_data)))
+               return -EFAULT;
        return 0;
 }
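
The counters reported here are raw per-CPU scheduler ticks scaled to milliseconds, plus token_usec, the length of one token (one tick) in microseconds, which lets userspace do its own conversions. What a caller might get back, with invented counter values and an assumed HZ of 250:

    #include <stdio.h>

    #define HZ 250  /* assumed for the example */

    int main(void)
    {
            /* invented per-CPU tick counters */
            unsigned long user_ticks = 1250, sys_ticks = 250, hold_ticks = 75;

            /* user=5000 ms sys=1000 ms hold=300 ms token=4000 us */
            printf("user=%lu ms sys=%lu ms hold=%lu ms token=%d us\n",
                   user_ticks * 1000 / HZ, sys_ticks * 1000 / HZ,
                   hold_ticks * 1000 / HZ, 1000000 / HZ);
            return 0;
    }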