Changes to support Andy's new CPU scheduler
author Andy Bavier <acb@cs.princeton.edu>
Thu, 8 Sep 2005 03:10:24 +0000 (03:10 +0000)
committer Andy Bavier <acb@cs.princeton.edu>
Thu, 8 Sep 2005 03:10:24 +0000 (03:10 +0000)
configs/kernel-2.6.12-i686-planetlab.config
include/linux/vs_sched.h
include/linux/vserver/context.h
include/linux/vserver/sched_def.h
kernel/sched.c
kernel/vserver/Kconfig
kernel/vserver/sched.c
kernel/vserver/sched_init.h

configs/kernel-2.6.12-i686-planetlab.config
index 0e3ae64..241cc9b 100644
@@ -1408,7 +1408,8 @@ CONFIG_VSERVER_LEGACY=y
 # CONFIG_VSERVER_NGNET is not set
 # CONFIG_VSERVER_PROC_SECURE is not set
 CONFIG_VSERVER_HARDCPU=y
-# CONFIG_VSERVER_HARDCPU_IDLE is not set
+CONFIG_VSERVER_HARDCPU_IDLE=y
+CONFIG_VSERVER_ACB_SCHED=y
 # CONFIG_INOXID_NONE is not set
 # CONFIG_INOXID_UID16 is not set
 # CONFIG_INOXID_GID16 is not set
include/linux/vs_sched.h
index 42fca7d..aa0ff71 100644
 #define MAX_PRIO_BIAS           20
 #define MIN_PRIO_BIAS          -20
 
+#ifdef CONFIG_VSERVER_ACB_SCHED
+
+#define VX_INVALID_TICKS        -1000000  /* sentinel: no token deficit recorded */
+#define IS_BEST_EFFORT(vxi)     (vx_info_flags(vxi, VXF_SCHED_SHARE, 0))
+
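+/* out-of-line versions implemented in kernel/vserver/sched.c */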
+int vx_tokens_avail(struct vx_info *vxi);
+void vx_consume_token(struct vx_info *vxi);
+void vx_scheduler_tick(void);
+void vx_advance_best_effort_ticks(int ticks);
+void vx_advance_guaranteed_ticks(int ticks);
+
+#else
 
 static inline int vx_tokens_avail(struct vx_info *vxi)
 {
@@ -24,6 +36,8 @@ static inline void vx_consume_token(struct vx_info *vxi)
        atomic_dec(&vxi->sched.tokens);
 }
 
+#endif /* CONFIG_VSERVER_ACB_SCHED */
+
 static inline int vx_need_resched(struct task_struct *p)
 {
 #ifdef CONFIG_VSERVER_HARDCPU
include/linux/vserver/context.h
index 79f9053..48ed9ca 100644
@@ -24,7 +24,8 @@
 #define VXF_SCHED_HARD         0x00000100
 #define VXF_SCHED_PRIO         0x00000200
 #define VXF_SCHED_PAUSE                0x00000400
-
+#define VXF_SCHED_SHARE         0x00000800
 #define VXF_VIRT_MEM           0x00010000
 #define VXF_VIRT_UPTIME                0x00020000
 #define VXF_VIRT_CPU           0x00040000
include/linux/vserver/sched_def.h
index 3a57761..4d915a2 100644
@@ -15,9 +15,26 @@ struct _vx_ticks {
        uint64_t unused[5];             /* cacheline ? */
 };
 
+#ifdef CONFIG_VSERVER_ACB_SCHED
+enum {
+    /* different scheduling classes */
+    SCH_GUARANTEE = 0,
+    SCH_BEST_EFFORT = 1,
+    SCH_NUM_CLASSES = 2,
+    /* states */
+    SCH_UNINITIALIZED,
+    SCH_INITIALIZED,
+};
+#endif
+
 /* context sub struct */
 
 struct _vx_sched {
+#ifdef CONFIG_VSERVER_ACB_SCHED
+        uint64_t ticks[SCH_NUM_CLASSES];      /* baseline on the class tick clock */
+        uint64_t last_ticks[SCH_NUM_CLASSES]; /* class clock value at last recalc */
+        int      state[SCH_NUM_CLASSES];      /* SCH_UNINITIALIZED or SCH_INITIALIZED */
+#endif
        atomic_t tokens;                /* number of CPU tokens */
        spinlock_t tokens_lock;         /* lock for token bucket */
 
kernel/sched.c
index daf0319..4e10dc6 100644
@@ -2453,6 +2453,10 @@ void scheduler_tick(void)
 
        rq->timestamp_last_tick = now;
 
+#if defined(CONFIG_VSERVER_HARDCPU) && defined(CONFIG_VSERVER_ACB_SCHED) 
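+	/* advance the global per-class token clocks once per timer tick */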
+       vx_scheduler_tick();
+#endif
+
        if (p == rq->idle) {
                if (wake_priority_sleeper(rq))
                        goto out;
@@ -2711,6 +2715,10 @@ asmlinkage void __sched schedule(void)
        struct vx_info *vxi;
 #ifdef CONFIG_VSERVER_HARDCPU
        int maxidle = -HZ;
+# ifdef CONFIG_VSERVER_ACB_SCHED
+        int min_guarantee_ticks = VX_INVALID_TICKS;
+        int min_best_effort_ticks = VX_INVALID_TICKS;
+# endif
 #endif
        int cpu, idx;
 
@@ -2781,6 +2789,9 @@ need_resched_nonpreemptible:
        }
 
 #ifdef CONFIG_VSERVER_HARDCPU
+# ifdef CONFIG_VSERVER_ACB_SCHED
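+/*
+ * Re-entry point: switch_tasks below jumps back here after fast-forwarding
+ * a class tick clock, so held contexts can be rechecked for release.
+ */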
+drain_hold_queue:
+# endif        
        if (!list_empty(&rq->hold_queue)) {
                struct list_head *l, *n;
                int ret;
@@ -2800,6 +2811,17 @@ need_resched_nonpreemptible:
                        }
                        if ((ret < 0) && (maxidle < ret))
                                maxidle = ret;
+# ifdef CONFIG_VSERVER_ACB_SCHED
+                       if (ret < 0) {
+                               if (IS_BEST_EFFORT(vxi)) {
+                                       if (min_best_effort_ticks < ret) 
+                                               min_best_effort_ticks = ret;
+                               } else {
+                                       if (min_guarantee_ticks < ret)
+                                               min_guarantee_ticks = ret;
+                               }
+                       }
+# endif
                }
        }
        rq->idle_tokens = -maxidle;
@@ -2860,8 +2882,19 @@ go_idle:
                int ret = vx_tokens_recalc(vxi);
 
                if (unlikely(ret <= 0)) {
-                       if (ret && (rq->idle_tokens > -ret))
-                               rq->idle_tokens = -ret;
+                       if (ret) {
+                               if ((rq->idle_tokens > -ret))
+                                       rq->idle_tokens = -ret;
+# ifdef CONFIG_VSERVER_ACB_SCHED
+                               if (IS_BEST_EFFORT(vxi)) {
+                                       if (min_best_effort_ticks < ret) 
+                                               min_best_effort_ticks = ret;
+                               } else {
+                                       if (min_guarantee_ticks < ret)
+                                               min_guarantee_ticks = ret;
+                               }
+# endif
+                       }
                        vx_hold_task(vxi, next, rq);
                        goto pick_next;
                }
@@ -2885,6 +2918,18 @@ go_idle:
        }
        next->activated = 0;
 switch_tasks:
+#if defined(CONFIG_VSERVER_HARDCPU) && defined(CONFIG_VSERVER_ACB_SCHED)
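+	/*
+	 * About to idle while contexts sit on the hold queue: fast-forward
+	 * the matching class clock by the smallest recorded token deficit,
+	 * so that at least one held context becomes runnable, then retry.
+	 */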
+       if (next == rq->idle && !list_empty(&rq->hold_queue)) {
+               if (min_best_effort_ticks != VX_INVALID_TICKS) {
+                       vx_advance_best_effort_ticks(-min_best_effort_ticks);
+                       goto drain_hold_queue;
+               } 
+               if (min_guarantee_ticks != VX_INVALID_TICKS) {
+                       vx_advance_guaranteed_ticks(-min_guarantee_ticks);
+                       goto drain_hold_queue;
+               }
+       }
+#endif
        if (next == rq->idle)
                schedstat_inc(rq, sched_goidle);
        prefetch(next);
kernel/vserver/Kconfig
index f820729..b4ee9b6 100644
@@ -101,6 +101,13 @@ config     VSERVER_HARDCPU_IDLE
          This might improve interactivity and latency, but
          will also marginally increase scheduling overhead.
 
+config VSERVER_ACB_SCHED
+       bool    "Guaranteed/fair share scheduler"
+       depends on VSERVER_HARDCPU
+       default n
+       help
+	  Andy Bavier's experimental CPU scheduler.  Contexts flagged
+	  with VXF_SCHED_SHARE share the CPU best-effort; all other
+	  contexts receive tokens at their guaranteed fill rate.
+
 choice
        prompt  "Persistent Inode Context Tagging"
        default INOXID_UGID24
kernel/vserver/sched.c
index 60f3c6a..e978e7a 100644
 #include <asm/errno.h>
 #include <asm/uaccess.h>
 
+#ifdef CONFIG_VSERVER_ACB_SCHED
+
+#define TICK_SCALE 1000
+#define TICKS_PER_TOKEN(vxi) \
+        ((vxi->sched.interval * TICK_SCALE) / vxi->sched.fill_rate)
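+/* e.g. a fill_rate of 1 token per interval of 4 jiffies gives
+ * TICKS_PER_TOKEN = (4 * 1000) / 1 = 4000 scaled ticks per token
+ * (integer division; assumes a non-zero fill_rate) */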
+#define CLASS(vxi) \
+    (IS_BEST_EFFORT(vxi) ? SCH_BEST_EFFORT : SCH_GUARANTEE)
+#define GLOBAL_TICKS(vxi) \
+    (IS_BEST_EFFORT(vxi) ? vx_best_effort_ticks : vx_guaranteed_ticks)
+
+uint64_t vx_guaranteed_ticks = 0;
+uint64_t vx_best_effort_ticks = 0;
+
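+/*
+ * Accounting invariant: sched.ticks[class] is a per-context baseline on
+ * the class-wide clock, so the current token count is always
+ *
+ *     tokens = (GLOBAL_TICKS(vxi) - ticks[class]) / TICKS_PER_TOKEN(vxi)
+ *
+ * Tokens accrue implicitly as the global clock advances; consuming one
+ * just moves the baseline forward (see vx_consume_token below).
+ */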
+void vx_tokens_set(struct vx_info *vxi, int tokens)
+{
+    int class = CLASS(vxi);
+
+    vxi->sched.ticks[class] = GLOBAL_TICKS(vxi);
+    vxi->sched.ticks[class] -= tokens * TICKS_PER_TOKEN(vxi);
+}
+
+void vx_scheduler_tick(void)
+{
+    vx_guaranteed_ticks += TICK_SCALE;
+    vx_best_effort_ticks += TICK_SCALE;
+}
+
+void vx_advance_best_effort_ticks(int ticks)
+{
+    vx_best_effort_ticks += TICK_SCALE * ticks;
+}
+
+void vx_advance_guaranteed_ticks(int ticks)
+{
+    vx_guaranteed_ticks += TICK_SCALE * ticks;
+}
+
+int vx_tokens_avail(struct vx_info *vxi)
+{
+    uint64_t diff;
+    int tokens;
+    long rem;
+    int class = CLASS(vxi);
+
+    if (vxi->sched.state[class] == SCH_UNINITIALIZED) {
+       /* Set the "real" token count */
+       tokens = atomic_read(&vxi->sched.tokens);
+       vx_tokens_set(vxi, tokens);
+       vxi->sched.state[class] = SCH_INITIALIZED;
+       goto out;
+    } 
+
+    if (vxi->sched.last_ticks[class] == GLOBAL_TICKS(vxi)) {
+       tokens = atomic_read(&vxi->sched.tokens);
+       goto out;
+    }
+
+    /* Fixed-point arithmetic (TICK_SCALE = 1000) limits these
+     * calculations: TICKS_PER_TOKEN() truncates toward zero, the
+     * division below floors the token count, and fill_rate must be
+     * non-zero.
+     */
+    diff = GLOBAL_TICKS(vxi) - vxi->sched.ticks[class];
+    tokens = div_long_long_rem(diff, TICKS_PER_TOKEN(vxi), &rem);
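+    /* the remainder is carried implicitly, since ticks[class] is not
+     * advanced here */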
+
+    if (tokens > vxi->sched.tokens_max) {
+       vx_tokens_set(vxi, vxi->sched.tokens_max);
+       tokens = vxi->sched.tokens_max;
+    }
+
+    atomic_set(&vxi->sched.tokens, tokens);
+
+out:
+    vxi->sched.last_ticks[class] = GLOBAL_TICKS(vxi);
+    return tokens;
+}
+
+void vx_consume_token(struct vx_info *vxi)
+{
+    int class = CLASS(vxi);
+
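+    /* one token == TICKS_PER_TOKEN(vxi) ticks on the class clock, so
+     * advancing the baseline charges exactly one token */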
+    vxi->sched.ticks[class] += TICKS_PER_TOKEN(vxi);
+}
+
+/*
+ * recalculate the context's scheduling tokens
+ *
+ * ret > 0 : number of tokens available
+ * ret = 0 : context is paused
+ * ret < 0 : number of jiffies until new tokens arrive
+ *
+ */
+int vx_tokens_recalc(struct vx_info *vxi)
+{
+        long delta, tokens;
+
+       if (vx_info_flags(vxi, VXF_SCHED_PAUSE, 0))
+               /* we are paused */
+               return 0;
+
+       tokens = vx_tokens_avail(vxi);
+       if (tokens <= 0)
+           vxi->vx_state |= VXS_ONHOLD;
+       if (tokens < vxi->sched.tokens_min) {
+           delta = tokens - vxi->sched.tokens_min;
+           /* enough tokens will be available in */
+           return (delta * vxi->sched.interval) / vxi->sched.fill_rate;
+       }
+
+       /* we have some tokens left */
+       if (vx_info_state(vxi, VXS_ONHOLD) &&
+               (tokens >= vxi->sched.tokens_min))
+               vxi->vx_state &= ~VXS_ONHOLD;
+       if (vx_info_state(vxi, VXS_ONHOLD))
+               tokens -= vxi->sched.tokens_min;
+
+       return tokens;
+}
+
+#else
 
 /*
  * recalculate the context's scheduling tokens
@@ -81,6 +195,8 @@ int vx_tokens_recalc(struct vx_info *vxi)
        return tokens;
 }
 
+#endif /* CONFIG_VSERVER_ACB_SCHED */
+
 /*
  * effective_prio - return the priority that is based on the static
  * priority but is modified by bonuses/penalties.
@@ -159,6 +275,10 @@ int vc_set_sched_v2(uint32_t xid, void __user *data)
        if (vxi->sched.tokens_min > vxi->sched.tokens_max)
                vxi->sched.tokens_min = vxi->sched.tokens_max;
 
+#ifdef CONFIG_VSERVER_ACB_SCHED
+       vx_tokens_set(vxi, atomic_read(&vxi->sched.tokens));
+#endif
+
        spin_unlock(&vxi->sched.tokens_lock);
        put_vx_info(vxi);
        return 0;
@@ -211,6 +331,10 @@ int vc_set_sched(uint32_t xid, void __user *data)
        if (vxi->sched.priority_bias < MIN_PRIO_BIAS)
                vxi->sched.priority_bias = MIN_PRIO_BIAS;
 
+#ifdef CONFIG_VSERVER_ACB_SCHED
+       vx_tokens_set(vxi, atomic_read(&vxi->sched.tokens));
+#endif
+
        spin_unlock(&vxi->sched.tokens_lock);
        put_vx_info(vxi);
        return 0;
kernel/vserver/sched_init.h
index 90d1396..6724c28 100644
@@ -11,6 +11,14 @@ static inline void vx_info_init_sched(struct _vx_sched *sched)
        sched->jiffies          = jiffies;
        sched->tokens_lock      = SPIN_LOCK_UNLOCKED;
 
+#ifdef CONFIG_VSERVER_ACB_SCHED
+	{
+		int i;
+
+		/* We can't set the "real" token count here because we don't
+		 * have access to the vx_info struct.  Do it later... */
+		for (i = 0; i < SCH_NUM_CLASSES; i++)
+			sched->state[i] = SCH_UNINITIALIZED;
+	}
+#endif
+
        atomic_set(&sched->tokens, HZ >> 2);
        sched->cpus_allowed     = CPU_MASK_ALL;
        sched->priority_bias    = 0;