From 43bd81b87d6440efd70ac30c1e957aba54a7fd8e Mon Sep 17 00:00:00 2001
From: Andy Bavier <acb@cs.princeton.edu>
Date: Thu, 8 Sep 2005 03:10:24 +0000
Subject: [PATCH] Changes to support Andy's new CPU scheduler

---
 configs/kernel-2.6.12-i686-planetlab.config |   3 +-
 include/linux/vs_sched.h                    |  14 +++
 include/linux/vserver/context.h             |   3 +-
 include/linux/vserver/sched_def.h           |  17 +++
 kernel/sched.c                              |  49 +++++++-
 kernel/vserver/Kconfig                      |   7 ++
 kernel/vserver/sched.c                      | 124 ++++++++++++++++++++
 kernel/vserver/sched_init.h                 |   8 ++
 8 files changed, 221 insertions(+), 4 deletions(-)

diff --git a/configs/kernel-2.6.12-i686-planetlab.config b/configs/kernel-2.6.12-i686-planetlab.config
index 0e3ae6456..241cc9b18 100644
--- a/configs/kernel-2.6.12-i686-planetlab.config
+++ b/configs/kernel-2.6.12-i686-planetlab.config
@@ -1408,7 +1408,8 @@ CONFIG_VSERVER_LEGACY=y
 # CONFIG_VSERVER_NGNET is not set
 # CONFIG_VSERVER_PROC_SECURE is not set
 CONFIG_VSERVER_HARDCPU=y
-# CONFIG_VSERVER_HARDCPU_IDLE is not set
+CONFIG_VSERVER_HARDCPU_IDLE=y
+CONFIG_VSERVER_ACB_SCHED=y
 # CONFIG_INOXID_NONE is not set
 # CONFIG_INOXID_UID16 is not set
 # CONFIG_INOXID_GID16 is not set
diff --git a/include/linux/vs_sched.h b/include/linux/vs_sched.h
index 42fca7de7..aa0ff7103 100644
--- a/include/linux/vs_sched.h
+++ b/include/linux/vs_sched.h
@@ -13,6 +13,18 @@
 #define MAX_PRIO_BIAS		 20
 #define MIN_PRIO_BIAS		-20
 
+#ifdef CONFIG_VSERVER_ACB_SCHED
+
+#define VX_INVALID_TICKS        (-1000000)	/* sentinel: no wait recorded */
+#define IS_BEST_EFFORT(vxi)     (vx_info_flags(vxi, VXF_SCHED_SHARE, 0))
+/* Tick-clock based token accounting, defined in kernel/vserver/sched.c. */
+int vx_tokens_avail(struct vx_info *vxi);
+void vx_consume_token(struct vx_info *vxi);
+void vx_scheduler_tick(void);
+void vx_advance_best_effort_ticks(int ticks);
+void vx_advance_guaranteed_ticks(int ticks);
+
+#else
 
 static inline int vx_tokens_avail(struct vx_info *vxi)
 {
@@ -24,6 +36,8 @@ static inline void vx_consume_token(struct vx_info *vxi)
 	atomic_dec(&vxi->sched.tokens);
 }
 
+#endif /* CONFIG_VSERVER_ACB_SCHED */
+
 static inline int vx_need_resched(struct task_struct *p)
 {
 #ifdef	CONFIG_VSERVER_HARDCPU
diff --git a/include/linux/vserver/context.h b/include/linux/vserver/context.h
index 79f90537f..48ed9ca53 100644
--- a/include/linux/vserver/context.h
+++ b/include/linux/vserver/context.h
@@ -24,7 +24,8 @@
 #define VXF_SCHED_HARD		0x00000100
 #define VXF_SCHED_PRIO		0x00000200
 #define VXF_SCHED_PAUSE		0x00000400
-
+#define VXF_SCHED_SHARE         0x00000800
+ 
 #define VXF_VIRT_MEM		0x00010000
 #define VXF_VIRT_UPTIME		0x00020000
 #define VXF_VIRT_CPU		0x00040000
diff --git a/include/linux/vserver/sched_def.h b/include/linux/vserver/sched_def.h
index 3a577616f..4d915a2e4 100644
--- a/include/linux/vserver/sched_def.h
+++ b/include/linux/vserver/sched_def.h
@@ -15,9 +15,26 @@ struct _vx_ticks {
 	uint64_t unused[5];		/* cacheline ? */
 };
 
+#ifdef CONFIG_VSERVER_ACB_SCHED
+enum {
+	/* scheduling classes; they index ticks[], last_ticks[] and state[] */
+	SCH_GUARANTEE = 0,
+	SCH_BEST_EFFORT = 1,
+	SCH_NUM_CLASSES = 2,
+	/* per-class state values, numbered on from the class count */
+	SCH_UNINITIALIZED = SCH_NUM_CLASSES + 1,
+	SCH_INITIALIZED = SCH_NUM_CLASSES + 2,
+};
+#endif
+
 /* context sub struct */
 
 struct _vx_sched {
+#ifdef CONFIG_VSERVER_ACB_SCHED
+        uint64_t ticks[SCH_NUM_CLASSES];
+        uint64_t last_ticks[SCH_NUM_CLASSES];
+        int      state[SCH_NUM_CLASSES];
+#endif
 	atomic_t tokens;		/* number of CPU tokens */
 	spinlock_t tokens_lock;		/* lock for token bucket */
 
diff --git a/kernel/sched.c b/kernel/sched.c
index daf0319ea..4e10dc63a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2453,6 +2453,10 @@ void scheduler_tick(void)
 
 	rq->timestamp_last_tick = now;
 
+#if defined(CONFIG_VSERVER_HARDCPU) && defined(CONFIG_VSERVER_ACB_SCHED) 
+	vx_scheduler_tick();
+#endif
+
 	if (p == rq->idle) {
 		if (wake_priority_sleeper(rq))
 			goto out;
@@ -2711,6 +2715,10 @@ asmlinkage void __sched schedule(void)
 	struct vx_info *vxi;
 #ifdef	CONFIG_VSERVER_HARDCPU
 	int maxidle = -HZ;
+# ifdef CONFIG_VSERVER_ACB_SCHED
+        int min_guarantee_ticks = VX_INVALID_TICKS;
+        int min_best_effort_ticks = VX_INVALID_TICKS;
+# endif
 #endif
 	int cpu, idx;
 
@@ -2781,6 +2789,9 @@ need_resched_nonpreemptible:
 	}
 
 #ifdef CONFIG_VSERVER_HARDCPU
+# ifdef CONFIG_VSERVER_ACB_SCHED
+drain_hold_queue:
+# endif	
 	if (!list_empty(&rq->hold_queue)) {
 		struct list_head *l, *n;
 		int ret;
@@ -2800,6 +2811,17 @@ need_resched_nonpreemptible:
 			}
 			if ((ret < 0) && (maxidle < ret))
 				maxidle = ret;
+# ifdef CONFIG_VSERVER_ACB_SCHED
+			if (ret < 0) {
+			        if (IS_BEST_EFFORT(vxi)) {
+				        if (min_best_effort_ticks < ret) 
+					        min_best_effort_ticks = ret;
+				} else {
+				        if (min_guarantee_ticks < ret)
+					        min_guarantee_ticks = ret;
+				}
+			}
+# endif
 		}
 	}
 	rq->idle_tokens = -maxidle;
@@ -2860,8 +2882,19 @@ go_idle:
 		int ret = vx_tokens_recalc(vxi);
 
 		if (unlikely(ret <= 0)) {
-			if (ret && (rq->idle_tokens > -ret))
-				rq->idle_tokens = -ret;
+			if (ret) {
+			        if ((rq->idle_tokens > -ret))
+				        rq->idle_tokens = -ret;
+# ifdef CONFIG_VSERVER_ACB_SCHED
+				if (IS_BEST_EFFORT(vxi)) {
+				        if (min_best_effort_ticks < ret) 
+					        min_best_effort_ticks = ret;
+				} else {
+				        if (min_guarantee_ticks < ret)
+					        min_guarantee_ticks = ret;
+				}
+# endif
+			}
 			vx_hold_task(vxi, next, rq);
 			goto pick_next;
 		}
@@ -2885,6 +2918,18 @@ go_idle:
 	}
 	next->activated = 0;
 switch_tasks:
+#if defined(CONFIG_VSERVER_HARDCPU) && defined(CONFIG_VSERVER_ACB_SCHED)
+	if (next == rq->idle && !list_empty(&rq->hold_queue)) {
+	        if (min_best_effort_ticks != VX_INVALID_TICKS) {
+		        vx_advance_best_effort_ticks(-min_best_effort_ticks);
+			goto drain_hold_queue;
+		} 
+		if (min_guarantee_ticks != VX_INVALID_TICKS) {
+		        vx_advance_guaranteed_ticks(-min_guarantee_ticks);
+			goto drain_hold_queue;
+		}
+	}
+#endif
 	if (next == rq->idle)
 		schedstat_inc(rq, sched_goidle);
 	prefetch(next);
diff --git a/kernel/vserver/Kconfig b/kernel/vserver/Kconfig
index f8207296e..b4ee9b6c3 100644
--- a/kernel/vserver/Kconfig
+++ b/kernel/vserver/Kconfig
@@ -101,6 +101,13 @@ config	VSERVER_HARDCPU_IDLE
 	  This might improve interactivity and latency, but
 	  will also marginally increase scheduling overhead.
 
+config	VSERVER_ACB_SCHED
+	bool	"Guaranteed/fair share scheduler"
+	depends on VSERVER_HARDCPU
+	default n
+	help
+	  Andy Bavier's experimental guaranteed/best-effort scheduler.  If unsure, say N.
+
 choice
 	prompt	"Persistent Inode Context Tagging"
 	default	INOXID_UGID24
diff --git a/kernel/vserver/sched.c b/kernel/vserver/sched.c
index 60f3c6ac5..e978e7a6b 100644
--- a/kernel/vserver/sched.c
+++ b/kernel/vserver/sched.c
@@ -19,6 +19,120 @@
 #include <asm/errno.h>
 #include <asm/uaccess.h>
 
+#ifdef CONFIG_VSERVER_ACB_SCHED
+
+#define TICK_SCALE 1000	/* clock units per timer tick (fixed-point scale) */
+#define TICKS_PER_TOKEN(vxi)	/* clock units that pay for one token */ \
+        (((vxi)->sched.interval * TICK_SCALE) / (vxi)->sched.fill_rate)
+#define CLASS(vxi)		/* index of the context's scheduling class */ \
+    (IS_BEST_EFFORT(vxi) ? SCH_BEST_EFFORT : SCH_GUARANTEE)
+#define GLOBAL_TICKS(vxi)	/* clock of the context's scheduling class */ \
+    (IS_BEST_EFFORT(vxi) ? vx_best_effort_ticks : vx_guaranteed_ticks)
+/* Per-class clocks; vx_scheduler_tick() adds TICK_SCALE to each of them. */
+uint64_t vx_guaranteed_ticks = 0;
+uint64_t vx_best_effort_ticks = 0;
+
+/* Back-date the class tick stamp so the context owns @tokens right now. */
+void vx_tokens_set(struct vx_info *vxi, int tokens) {
+    int class = CLASS(vxi);
+    /* multiply in 64 bits: tokens * ticks-per-token can overflow an int */
+    vxi->sched.ticks[class] = GLOBAL_TICKS(vxi) - (int64_t)tokens * TICKS_PER_TOKEN(vxi);
+}
+
+void vx_scheduler_tick(void) {	/* timer tick: step both class clocks */
+    vx_best_effort_ticks += TICK_SCALE;	/* NOTE(review): no locking here -- */
+    vx_guaranteed_ticks += TICK_SCALE;	/* confirm this is safe on SMP */
+}
+
+void vx_advance_best_effort_ticks(int ticks) {	/* @ticks: timer ticks to skip */
+    vx_best_effort_ticks = vx_best_effort_ticks + ticks * TICK_SCALE;
+}
+
+void vx_advance_guaranteed_ticks(int ticks) {	/* @ticks: timer ticks to skip */
+    vx_guaranteed_ticks = vx_guaranteed_ticks + ticks * TICK_SCALE;
+}
+
+int vx_tokens_avail(struct vx_info *vxi)
+{
+    uint64_t diff;
+    int tokens;
+    long rem;		/* remainder of the division below; otherwise unused */
+    int class = CLASS(vxi);
+    /* Return the context's token count, derived from its class clock. */
+    if (vxi->sched.state[class] == SCH_UNINITIALIZED) {
+	/* First call for this class: seed ticks[] from the stored count */
+	tokens = atomic_read(&vxi->sched.tokens);
+	vx_tokens_set(vxi, tokens);
+	vxi->sched.state[class] = SCH_INITIALIZED;
+	goto out;
+    } 
+    /* Class clock unchanged since the last call: reuse the stored count. */
+    if (vxi->sched.last_ticks[class] == GLOBAL_TICKS(vxi)) {
+	tokens = atomic_read(&vxi->sched.tokens);
+	goto out;
+    }
+
+    /* tokens = (class clock - context stamp) / TICKS_PER_TOKEN.
+     * NOTE(review): diff is an unsigned difference and the divisor is not
+     * checked for zero; these fixed-point limits should be made explicit. */
+    diff = GLOBAL_TICKS(vxi) - vxi->sched.ticks[class];
+    tokens = div_long_long_rem(diff, TICKS_PER_TOKEN(vxi), &rem);
+    /* Clamp to tokens_max and re-stamp ticks[] so the surplus is dropped. */
+    if (tokens > vxi->sched.tokens_max) {
+	vx_tokens_set(vxi, vxi->sched.tokens_max);
+	tokens = vxi->sched.tokens_max;
+    }
+
+    atomic_set(&vxi->sched.tokens, tokens);
+
+out:
+    vxi->sched.last_ticks[class] = GLOBAL_TICKS(vxi);	/* clock value this result is for */
+    return tokens;
+}
+
+void vx_consume_token(struct vx_info *vxi)
+{
+    /* Charge one token: move the stamp TICKS_PER_TOKEN closer to the clock. */
+    uint64_t *stamp = &vxi->sched.ticks[CLASS(vxi)];
+    *stamp += TICKS_PER_TOKEN(vxi);
+}
+
+/*
+ * recalculate the context's scheduling tokens
+ *
+ * ret > 0 : number of tokens available
+ * ret = 0 : context is paused
+ * ret < 0 : number of jiffies until new tokens arrive (rounded up, so a
+ *           context that is merely short of tokens never reports 0)
+ */
+int vx_tokens_recalc(struct vx_info *vxi)
+{
+        long delta, tokens;
+
+	if (vx_info_flags(vxi, VXF_SCHED_PAUSE, 0))
+		/* we are paused */
+		return 0;
+
+	tokens = vx_tokens_avail(vxi);
+	if (tokens <= 0)
+	    vxi->vx_state |= VXS_ONHOLD;
+	if (tokens < vxi->sched.tokens_min) {
+	    delta = (vxi->sched.tokens_min - tokens) * vxi->sched.interval;
+	    /* jiffies until tokens_min is reached; round up, not toward 0 */
+	    return -((delta + vxi->sched.fill_rate - 1) / vxi->sched.fill_rate);
+	}
+
+	/* we have some tokens left */
+	if (vx_info_state(vxi, VXS_ONHOLD) &&
+		(tokens >= vxi->sched.tokens_min))	/* always true here */
+		vxi->vx_state &= ~VXS_ONHOLD;
+	if (vx_info_state(vxi, VXS_ONHOLD))
+		tokens -= vxi->sched.tokens_min;
+
+	return tokens;
+}
+
+#else
 
 /*
  * recalculate the context's scheduling tokens
@@ -81,6 +195,8 @@ int vx_tokens_recalc(struct vx_info *vxi)
 	return tokens;
 }
 
+#endif /* CONFIG_VSERVER_ACB_SCHED */
+
 /*
  * effective_prio - return the priority that is based on the static
  * priority but is modified by bonuses/penalties.
@@ -159,6 +275,10 @@ int vc_set_sched_v2(uint32_t xid, void __user *data)
 	if (vxi->sched.tokens_min > vxi->sched.tokens_max)
 		vxi->sched.tokens_min = vxi->sched.tokens_max;
 
+#ifdef CONFIG_VSERVER_ACB_SCHED
+	vx_tokens_set(vxi, atomic_read(&vxi->sched.tokens));
+#endif
+
 	spin_unlock(&vxi->sched.tokens_lock);
 	put_vx_info(vxi);
 	return 0;
@@ -211,6 +331,10 @@ int vc_set_sched(uint32_t xid, void __user *data)
 	if (vxi->sched.priority_bias < MIN_PRIO_BIAS)
 		vxi->sched.priority_bias = MIN_PRIO_BIAS;
 
+#ifdef CONFIG_VSERVER_ACB_SCHED
+	vx_tokens_set(vxi, atomic_read(&vxi->sched.tokens));
+#endif
+
 	spin_unlock(&vxi->sched.tokens_lock);
 	put_vx_info(vxi);
 	return 0;
diff --git a/kernel/vserver/sched_init.h b/kernel/vserver/sched_init.h
index 90d13960f..6724c286f 100644
--- a/kernel/vserver/sched_init.h
+++ b/kernel/vserver/sched_init.h
@@ -11,6 +11,14 @@ static inline void vx_info_init_sched(struct _vx_sched *sched)
 	sched->jiffies		= jiffies;
 	sched->tokens_lock	= SPIN_LOCK_UNLOCKED;
 
+#ifdef CONFIG_VSERVER_ACB_SCHED
+	/* We can't set the "real" token count here because we don't have
+	 * access to the vx_info struct.  Do it later... */
+	for (i = 0; i < SCH_NUM_CLASSES; i++) {
+	    sched->state[i] = SCH_UNINITIALIZED;
+	}
+#endif
+
 	atomic_set(&sched->tokens, HZ >> 2);
 	sched->cpus_allowed	= CPU_MASK_ALL;
 	sched->priority_bias	= 0;
-- 
2.47.0