From: Marc Fiuczynski
Date: Sat, 17 Jul 2004 13:00:36 +0000 (+0000)
Subject: Merged ckrm-E15 CPU controller
X-Git-Tag: before-ipod-patch~14
X-Git-Url: http://git.onelab.eu/?a=commitdiff_plain;h=92ade5af0325d0597318bb21d38bb34c99726835;p=linux-2.6.git

Merged ckrm-E15 CPU controller
---

diff --git a/.config b/.config
index 7f9bc8d9d..a1a300be4 100644
--- a/.config
+++ b/.config
@@ -29,6 +29,8 @@ CONFIG_CKRM=y
 CONFIG_RCFS_FS=y
 CONFIG_CKRM_TYPE_TASKCLASS=y
 CONFIG_CKRM_RES_NUMTASKS=y
+CONFIG_CKRM_CPU_SCHEDULE=y
+CONFIG_CKRM_CPU_MONITOR=y
 CONFIG_CKRM_TYPE_SOCKETCLASS=y
 CONFIG_CKRM_RES_LISTENAQ=m
 CONFIG_CKRM_RBCE=m
diff --git a/.config.old b/.config.old
index 55ca63656..b22fb717d 100644
--- a/.config.old
+++ b/.config.old
@@ -29,8 +29,13 @@ CONFIG_CKRM=y
 CONFIG_RCFS_FS=y
 CONFIG_CKRM_TYPE_TASKCLASS=y
 CONFIG_CKRM_RES_NUMTASKS=y
+CONFIG_CKRM_CPU_SCHEDULE=y
+CONFIG_CKRM_CPU_MONITOR=m
 CONFIG_CKRM_TYPE_SOCKETCLASS=y
 CONFIG_CKRM_RES_LISTENAQ=m
+CONFIG_CKRM_RBCE=m
+CONFIG_CKRM_CRBCE=m
+# CONFIG_BSD_PROCESS_ACCT_V3 is not set
 CONFIG_SYSCTL=y
 CONFIG_AUDIT=y
 CONFIG_AUDITSYSCALL=y
@@ -414,7 +419,6 @@ CONFIG_BLK_DEV_IDE=y
 # CONFIG_BLK_DEV_HD_IDE is not set
 CONFIG_BLK_DEV_IDEDISK=y
 CONFIG_IDEDISK_MULTI_MODE=y
-# CONFIG_IDEDISK_STROKE is not set
 CONFIG_BLK_DEV_IDECS=m
 CONFIG_BLK_DEV_IDECD=y
 CONFIG_BLK_DEV_IDETAPE=m
@@ -506,6 +510,7 @@ CONFIG_SCSI_FC_ATTRS=m
 # SCSI low-level drivers
 #
 CONFIG_BLK_DEV_3W_XXXX_RAID=m
+# CONFIG_SCSI_3W_9XXX is not set
 # CONFIG_SCSI_7000FASST is not set
 # CONFIG_SCSI_ACARD is not set
 CONFIG_SCSI_AHA152X=m
@@ -542,7 +547,6 @@ CONFIG_SCSI_SATA_VIA=m
 CONFIG_SCSI_SATA_VITESSE=m
 CONFIG_SCSI_BUSLOGIC=m
 # CONFIG_SCSI_OMIT_FLASHPOINT is not set
-# CONFIG_SCSI_CPQFCTS is not set
 # CONFIG_SCSI_DMX3191D is not set
 # CONFIG_SCSI_DTC3280 is not set
 # CONFIG_SCSI_EATA is not set
@@ -613,6 +617,9 @@ CONFIG_MD_RAID6=m
 CONFIG_MD_MULTIPATH=m
 CONFIG_BLK_DEV_DM=m
 CONFIG_DM_CRYPT=m
+# CONFIG_DM_SNAPSHOT is not set
+# CONFIG_DM_MIRROR is not set
+# CONFIG_DM_ZERO is not set
 
 #
 # Fusion MPT device support
 #
@@ -1083,6 +1090,7 @@ CONFIG_SUNDANCE=m
 CONFIG_TLAN=m
 CONFIG_VIA_RHINE=m
 CONFIG_VIA_RHINE_MMIO=y
+# CONFIG_VIA_VELOCITY is not set
 CONFIG_NET_POCKET=y
 CONFIG_ATP=m
 CONFIG_DE600=m
@@ -1576,6 +1584,7 @@ CONFIG_DRM_SIS=m
 CONFIG_SYNCLINK_CS=m
 CONFIG_MWAVE=m
 # CONFIG_RAW_DRIVER is not set
+# CONFIG_HPET is not set
 CONFIG_HANGCHECK_TIMER=m
 
 #
@@ -1783,6 +1792,7 @@ CONFIG_VIDEO_SELECT=y
 CONFIG_FB_HGA=m
 # CONFIG_FB_HGA_ACCEL is not set
 CONFIG_FB_RIVA=m
+# CONFIG_FB_RIVA_I2C is not set
 CONFIG_FB_I810=m
 CONFIG_FB_I810_GTF=y
 CONFIG_FB_MATROX=m
@@ -1993,6 +2003,7 @@ CONFIG_USB_ACM=m
 CONFIG_USB_PRINTER=m
 CONFIG_USB_STORAGE=m
 # CONFIG_USB_STORAGE_DEBUG is not set
+# CONFIG_USB_STORAGE_RW_DETECT is not set
 CONFIG_USB_STORAGE_DATAFAB=y
 CONFIG_USB_STORAGE_FREECOM=y
 CONFIG_USB_STORAGE_ISD200=y
@@ -2284,7 +2295,6 @@ CONFIG_MINIX_SUBPARTITION=y
 CONFIG_SOLARIS_X86_PARTITION=y
 CONFIG_UNIXWARE_DISKLABEL=y
 # CONFIG_LDM_PARTITION is not set
-# CONFIG_NEC98_PARTITION is not set
 CONFIG_SGI_PARTITION=y
 # CONFIG_ULTRIX_PARTITION is not set
 CONFIG_SUN_PARTITION=y
@@ -2411,5 +2421,4 @@ CONFIG_LIBCRC32C=m
 CONFIG_ZLIB_INFLATE=y
 CONFIG_ZLIB_DEFLATE=m
 CONFIG_X86_BIOS_REBOOT=y
-CONFIG_X86_STD_RESOURCES=y
 CONFIG_PC=y
diff --git a/include/linux/autoconf.h b/include/linux/autoconf.h
index 5ee385a8a..307d630b3 100644
--- a/include/linux/autoconf.h
+++ b/include/linux/autoconf.h
@@ -30,6 +30,8 @@
 #define CONFIG_RCFS_FS 1
 #define CONFIG_CKRM_TYPE_TASKCLASS 1
 #define CONFIG_CKRM_RES_NUMTASKS 1
+#define CONFIG_CKRM_CPU_SCHEDULE 1
+#define CONFIG_CKRM_CPU_MONITOR 1
 #define CONFIG_CKRM_TYPE_SOCKETCLASS 1
 #define CONFIG_CKRM_RES_LISTENAQ_MODULE 1
 #define CONFIG_CKRM_RBCE_MODULE 1
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 385b390fe..0b1efea4e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -551,6 +551,9 @@ struct task_struct {
 	// .. Hubertus should change to CONFIG_CKRM_TYPE_TASKCLASS
 	struct ckrm_task_class *taskclass;
 	struct list_head taskclass_link;
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+	struct ckrm_cpu_class *cpu_class;
+#endif
 #endif // CONFIG_CKRM_TYPE_TASKCLASS
 #endif // CONFIG_CKRM
 
@@ -1009,8 +1012,7 @@ static inline struct mm_struct * get_task_mm(struct task_struct * task)
 	return mm;
 }
 
-
-
+ 
 /* set thread flags in other task's structures
  * - see asm/thread_info.h for TIF_xxxx flags available
  */
diff --git a/init/Kconfig b/init/Kconfig
index d9153c5aa..94af58d28 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -168,6 +168,24 @@ config CKRM_RES_NUMTASKS
 
 	  Say N if unsure, Y to use the feature.
 
+config CKRM_CPU_SCHEDULE
+	bool "CKRM CPU scheduler"
+	depends on CKRM_TYPE_TASKCLASS
+	default m
+	help
+	  Use CKRM CPU scheduler instead of Linux Scheduler
+
+	  Say N if unsure, Y to use the feature.
+
+config CKRM_CPU_MONITOR
+	tristate "CKRM CPU Resoure Monitor"
+	depends on CKRM_CPU_SCHEDULE
+	default m
+	help
+	  Monitor CPU Resource Usage of the classes
+
+	  Say N if unsure, Y to use the feature.
+
 config CKRM_TYPE_SOCKETCLASS
 	bool "Class Manager for socket groups"
 	depends on CKRM
diff --git a/init/main.c b/init/main.c
index 155d05f35..0dfaf47af 100644
--- a/init/main.c
+++ b/init/main.c
@@ -49,6 +49,8 @@
 #include 
 #include 
 
+int __init init_ckrm_sched_res(void);
+
 /*
  * This is one of the first .c files built. Error out early
@@ -419,7 +421,6 @@ asmlinkage void __init start_kernel(void)
 	 * printk() and can access its per-cpu storage.
 	 */
 	smp_prepare_boot_cpu();
-
 	/*
 	 * Set up the scheduler prior starting any interrupts (such as the
 	 * timer interrupt). Full topology setup happens at smp_init()
@@ -638,8 +639,8 @@ static int init(void * unused)
 	 * firmware files.
 	 */
 	populate_rootfs();
-
 	do_basic_setup();
+	init_ckrm_sched_res();
 
 	/*
 	 * check if there is an early userspace init. If yes, let it do all
diff --git a/kernel/Makefile b/kernel/Makefile
index e1b650130..e0a5febdb 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -26,8 +26,12 @@ obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_IKCONFIG_PROC) += configs.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
+obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_classqueue.o
+obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_sched.o
 obj-$(CONFIG_AUDIT) += audit.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
+obj-$(CONFIG_KGDB) += kgdbstub.o
+
 ifneq ($(CONFIG_IA64),y)
 # According to Alan Modra , the -fno-omit-frame-pointer is
diff --git a/kernel/sched.c b/kernel/sched.c
index a5058d6ee..9667edb46 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -17,7 +17,6 @@
  *  2003-09-03	Interactivity tuning by Con Kolivas.
  *  2004-04-02	Scheduler domains code by Nick Piggin
  */
-
 #include 
 #include 
 #include 
@@ -161,9 +160,6 @@
 #define LOW_CREDIT(p) \
 	((p)->interactive_credit < -CREDIT_LIMIT)
 
-#define TASK_PREEMPTS_CURR(p, rq) \
-	((p)->prio < (rq)->curr->prio)
-
 /*
  * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
  * to time slice values.
@@ -189,16 +185,24 @@ static unsigned int task_timeslice(task_t *p)
 /*
  * These are the runqueue data structures:
  */
+typedef struct runqueue runqueue_t;
 
-#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+#include 
+#endif
 
-typedef struct runqueue runqueue_t;
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
 
-struct prio_array {
-	unsigned int nr_active;
-	unsigned long bitmap[BITMAP_SIZE];
-	struct list_head queue[MAX_PRIO];
-};
+/**
+ * if belong to different class, compare class priority
+ * otherwise compare task priority
+ */
+#define TASK_PREEMPTS_CURR(p, rq) \
+	(((p)->cpu_class != (rq)->curr->cpu_class) && ((rq)->curr != (rq)->idle))? class_preempts_curr((p),(rq)->curr) : ((p)->prio < (rq)->curr->prio)
+#else
+#define TASK_PREEMPTS_CURR(p, rq) \
+	((p)->prio < (rq)->curr->prio)
+#endif
 
 /*
  * This is the main, per-CPU runqueue data structure.
@@ -215,7 +219,7 @@ struct runqueue {
 	 * remote CPUs use both these fields when doing load calculation.
 	 */
 	unsigned long nr_running;
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP)
 	unsigned long cpu_load;
 #endif
 	unsigned long long nr_switches;
@@ -223,7 +227,12 @@ struct runqueue {
 	unsigned long long timestamp_last_tick;
 	task_t *curr, *idle;
 	struct mm_struct *prev_mm;
-	prio_array_t *active, *expired, arrays[2];
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+	unsigned long ckrm_cpu_load;
+	struct classqueue_struct classqueue;
+#else
+	prio_array_t *active, *expired, arrays[2];
+#endif
 	int best_expired_prio;
 	atomic_t nr_iowait;
@@ -260,12 +269,158 @@ static DEFINE_PER_CPU(struct runqueue, runqueues);
 # define task_running(rq, p)		((rq)->curr == (p))
 #endif
 
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+#include 
+spinlock_t cvt_lock = SPIN_LOCK_UNLOCKED;
+rwlock_t class_list_lock = RW_LOCK_UNLOCKED;
+LIST_HEAD(active_cpu_classes);   // list of active cpu classes; anchor
+struct ckrm_cpu_class default_cpu_class_obj;
+
+/*
+ * the minimum CVT allowed is the base_cvt
+ * otherwise, it will starve others
+ */
+CVT_t get_min_cvt(int cpu)
+{
+	cq_node_t *node;
+	struct ckrm_local_runqueue * lrq;
+	CVT_t min_cvt;
+
+	node = classqueue_get_head(bpt_queue(cpu));
+	lrq = (node) ? class_list_entry(node) : NULL;
+
+	if (lrq)
+		min_cvt = lrq->local_cvt;
+	else
+		min_cvt = 0;
+
+	return min_cvt;
+}
+
+/*
+ * update the classueue base for all the runqueues
+ * TODO: we can only update half of the min_base to solve the movebackward issue
+ */
+static inline void check_update_class_base(int this_cpu) {
+	unsigned long min_base = 0xFFFFFFFF;
+	cq_node_t *node;
+	int i;
+
+	if (! cpu_online(this_cpu)) return;
+
+	/*
+	 * find the min_base across all the processors
+	 */
+	for_each_online_cpu(i) {
+		/*
+		 * I should change it to directly use bpt->base
+		 */
+		node = classqueue_get_head(bpt_queue(i));
+		if (node && node->prio < min_base) {
+			min_base = node->prio;
+		}
+	}
+	if (min_base != 0xFFFFFFFF)
+		classqueue_update_base(bpt_queue(this_cpu),min_base);
+}
+
+static inline void ckrm_rebalance_tick(int j,int this_cpu)
+{
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+	read_lock(&class_list_lock);
+	if (!(j % CVT_UPDATE_TICK))
+		update_global_cvts(this_cpu);
+
+#define CKRM_BASE_UPDATE_RATE 400
+	if (! (jiffies % CKRM_BASE_UPDATE_RATE))
+		check_update_class_base(this_cpu);
+
+	read_unlock(&class_list_lock);
+#endif
+}
+
+static inline struct ckrm_local_runqueue *rq_get_next_class(struct runqueue *rq)
+{
+	cq_node_t *node = classqueue_get_head(&rq->classqueue);
+	return ((node) ? class_list_entry(node) : NULL);
+}
+
+static inline struct task_struct * rq_get_next_task(struct runqueue* rq)
+{
+	prio_array_t *array;
+	struct task_struct *next;
+	struct ckrm_local_runqueue *queue;
+	int cpu = smp_processor_id();
+
+	next = rq->idle;
+ retry_next_class:
+	if ((queue = rq_get_next_class(rq))) {
+		array = queue->active;
+		//check switch active/expired queue
+		if (unlikely(!queue->active->nr_active)) {
+			queue->active = queue->expired;
+			queue->expired = array;
+			queue->expired_timestamp = 0;
+
+			if (queue->active->nr_active)
+				set_top_priority(queue,
+						 find_first_bit(queue->active->bitmap, MAX_PRIO));
+			else {
+				classqueue_dequeue(queue->classqueue,
+						   &queue->classqueue_linkobj);
+				cpu_demand_event(get_rq_local_stat(queue,cpu),CPU_DEMAND_DEQUEUE,0);
+			}
+
+			goto retry_next_class;
+		}
+		BUG_ON(!queue->active->nr_active);
+		next = task_list_entry(array->queue[queue->top_priority].next);
+	}
+	return next;
+}
+
+static inline void rq_load_inc(runqueue_t *rq, struct task_struct *p) { rq->ckrm_cpu_load += cpu_class_weight(p->cpu_class); }
+static inline void rq_load_dec(runqueue_t *rq, struct task_struct *p) { rq->ckrm_cpu_load -= cpu_class_weight(p->cpu_class); }
+
+#else /*CONFIG_CKRM_CPU_SCHEDULE*/
+
+static inline struct task_struct * rq_get_next_task(struct runqueue* rq)
+{
+	prio_array_t *array;
+	struct list_head *queue;
+	int idx;
+
+	array = rq->active;
+	if (unlikely(!array->nr_active)) {
+		/*
+		 * Switch the active and expired arrays.
+		 */
+		rq->active = rq->expired;
+		rq->expired = array;
+		array = rq->active;
+		rq->expired_timestamp = 0;
+		rq->best_expired_prio = MAX_PRIO;
+	}
+
+	idx = sched_find_first_bit(array->bitmap);
+	queue = array->queue + idx;
+	return list_entry(queue->next, task_t, run_list);
+}
+
+static inline void class_enqueue_task(struct task_struct* p, prio_array_t *array) { }
+static inline void class_dequeue_task(struct task_struct* p, prio_array_t *array) { }
+static inline void init_cpu_classes(void) { }
+static inline void rq_load_inc(runqueue_t *rq, struct task_struct *p) { }
+static inline void rq_load_dec(runqueue_t *rq, struct task_struct *p) { }
+#endif /* CONFIG_CKRM_CPU_SCHEDULE */
+
+
 /*
  * task_rq_lock - lock the runqueue a given task resides on and disable
  * interrupts. Note the ordering: we can safely lookup the task_rq without
 * explicitly disabling preemption.
 */
-static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
+runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
 {
 	struct runqueue *rq;
 
@@ -280,7 +435,7 @@ repeat_lock_task:
 	return rq;
 }
 
-static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
+void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
 {
 	spin_unlock_irqrestore(&rq->lock, *flags);
 }
@@ -307,20 +462,23 @@ static inline void rq_unlock(runqueue_t *rq)
 /*
  * Adding/removing a task to/from a priority array:
  */
-static void dequeue_task(struct task_struct *p, prio_array_t *array)
+void dequeue_task(struct task_struct *p, prio_array_t *array)
 {
+	BUG_ON(!array);
 	array->nr_active--;
 	list_del(&p->run_list);
 	if (list_empty(array->queue + p->prio))
 		__clear_bit(p->prio, array->bitmap);
+	class_dequeue_task(p,array);
 }
 
-static void enqueue_task(struct task_struct *p, prio_array_t *array)
+void enqueue_task(struct task_struct *p, prio_array_t *array)
 {
 	list_add_tail(&p->run_list, array->queue + p->prio);
 	__set_bit(p->prio, array->bitmap);
 	array->nr_active++;
 	p->array = array;
+	class_enqueue_task(p,array);
 }
 
 /*
@@ -334,6 +492,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
 	__set_bit(p->prio, array->bitmap);
 	array->nr_active++;
 	p->array = array;
+	class_enqueue_task(p,array);
 }
 
 /*
@@ -375,8 +534,9 @@ static int effective_prio(task_t *p)
  */
 static inline void __activate_task(task_t *p, runqueue_t *rq)
 {
-	enqueue_task(p, rq->active);
+	enqueue_task(p, rq_active(p,rq));
 	rq->nr_running++;
+	rq_load_inc(rq,p);
 }
 
 /*
@@ -384,8 +544,9 @@ static inline void __activate_task(task_t *p, runqueue_t *rq)
  */
 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 {
-	enqueue_task_head(p, rq->active);
+	enqueue_task_head(p, rq_active(p,rq));
 	rq->nr_running++;
+	rq_load_inc(rq,p);
 }
 
 static void recalc_task_prio(task_t *p, unsigned long long now)
@@ -517,6 +678,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
 static void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
 	rq->nr_running--;
+	rq_load_dec(rq,p);
 	if (p->state == TASK_UNINTERRUPTIBLE)
 		rq->nr_uninterruptible++;
 	dequeue_task(p, p->array);
@@ -966,6 +1128,7 @@ void fastcall wake_up_forked_process(task_t * p)
 		p->array = current->array;
 		p->array->nr_active++;
 		rq->nr_running++;
+		rq_load_inc(rq,p);
 	}
 	task_rq_unlock(rq, &flags);
 }
@@ -1288,6 +1451,7 @@ lock_again:
 			p->array = current->array;
 			p->array->nr_active++;
 			rq->nr_running++;
+			rq_load_inc(rq,p);
 		}
 	} else {
 		/* Not the local CPU - must adjust timestamp */
@@ -1392,9 +1556,13 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
 {
 	dequeue_task(p, src_array);
 	src_rq->nr_running--;
+	rq_load_dec(src_rq,p);
+
 	set_task_cpu(p, this_cpu);
 	this_rq->nr_running++;
+	rq_load_inc(this_rq,p);
 	enqueue_task(p, this_array);
+
 	p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
 				+ this_rq->timestamp_last_tick;
 	/*
@@ -1433,6 +1601,194 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
 	return 1;
 }
 
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+
+struct ckrm_cpu_class *find_unbalanced_class(int busiest_cpu, int this_cpu, unsigned long *cls_imbalance)
+{
+	struct ckrm_cpu_class *most_unbalanced_class = NULL;
+	struct ckrm_cpu_class *clsptr;
+	int max_unbalance = 0;
+
+	list_for_each_entry(clsptr,&active_cpu_classes,links) {
+		struct ckrm_local_runqueue *this_lrq    = get_ckrm_local_runqueue(clsptr,this_cpu);
+		struct ckrm_local_runqueue *busiest_lrq = get_ckrm_local_runqueue(clsptr,busiest_cpu);
+		int unbalance_degree;
+
+		unbalance_degree = (local_queue_nr_running(busiest_lrq) - local_queue_nr_running(this_lrq)) * cpu_class_weight(clsptr);
+		if (unbalance_degree >= *cls_imbalance)
+			continue; // already looked at this class
+
+		if (unbalance_degree > max_unbalance) {
+			max_unbalance = unbalance_degree;
+			most_unbalanced_class = clsptr;
+		}
+	}
+	*cls_imbalance = max_unbalance;
+	return most_unbalanced_class;
+}
+
+
+/*
+ * find_busiest_queue - find the busiest runqueue among the cpus in cpumask.
+ */
+static int find_busiest_cpu(runqueue_t *this_rq, int this_cpu, int idle,
+			    int *imbalance)
+{
+	int cpu_load, load, max_load, i, busiest_cpu;
+	runqueue_t *busiest, *rq_src;
+
+
+	/*Hubertus ... the concept of nr_running is replace with cpu_load */
+	cpu_load = this_rq->ckrm_cpu_load;
+
+	busiest = NULL;
+	busiest_cpu = -1;
+
+	max_load = -1;
+	for_each_online_cpu(i) {
+		rq_src = cpu_rq(i);
+		load = rq_src->ckrm_cpu_load;
+
+		if ((load > max_load) && (rq_src != this_rq)) {
+			busiest = rq_src;
+			busiest_cpu = i;
+			max_load = load;
+		}
+	}
+
+	if (likely(!busiest))
+		goto out;
+
+	*imbalance = max_load - cpu_load;
+
+	/* It needs an at least ~25% imbalance to trigger balancing. */
+	if (!idle && ((*imbalance)*4 < max_load)) {
+		busiest = NULL;
+		goto out;
+	}
+
+	double_lock_balance(this_rq, busiest);
+	/*
+	 * Make sure nothing changed since we checked the
+	 * runqueue length.
+	 */
+	if (busiest->ckrm_cpu_load <= cpu_load) {
+		spin_unlock(&busiest->lock);
+		busiest = NULL;
+	}
+out:
+	return (busiest ? busiest_cpu : -1);
+}
+
+static int load_balance(int this_cpu, runqueue_t *this_rq,
+			struct sched_domain *sd, enum idle_type idle)
+{
+	int imbalance, idx;
+	int busiest_cpu;
+	runqueue_t *busiest;
+	prio_array_t *array;
+	struct list_head *head, *curr;
+	task_t *tmp;
+	struct ckrm_local_runqueue * busiest_local_queue;
+	struct ckrm_cpu_class *clsptr;
+	int weight;
+	unsigned long cls_imbalance;      // so we can retry other classes
+
+	// need to update global CVT based on local accumulated CVTs
+	read_lock(&class_list_lock);
+	busiest_cpu = find_busiest_cpu(this_rq, this_cpu, idle, &imbalance);
+	if (busiest_cpu == -1)
+		goto out;
+
+	busiest = cpu_rq(busiest_cpu);
+
+	/*
+	 * We only want to steal a number of tasks equal to 1/2 the imbalance,
+	 * otherwise we'll just shift the imbalance to the new queue:
+	 */
+	imbalance /= 2;
+
+	/* now find class on that runqueue with largest inbalance */
+	cls_imbalance = 0xFFFFFFFF;
+
+ retry_other_class:
+	clsptr = find_unbalanced_class(busiest_cpu, this_cpu, &cls_imbalance);
+	if (!clsptr)
+		goto out_unlock;
+
+	busiest_local_queue = get_ckrm_local_runqueue(clsptr,busiest_cpu);
+	weight = cpu_class_weight(clsptr);
+
+	/*
+	 * We first consider expired tasks. Those will likely not be
+	 * executed in the near future, and they are most likely to
+	 * be cache-cold, thus switching CPUs has the least effect
+	 * on them.
+	 */
+	if (busiest_local_queue->expired->nr_active)
+		array = busiest_local_queue->expired;
+	else
+		array = busiest_local_queue->active;
+
+ new_array:
+	/* Start searching at priority 0: */
+	idx = 0;
+ skip_bitmap:
+	if (!idx)
+		idx = sched_find_first_bit(array->bitmap);
+	else
+		idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
+	if (idx >= MAX_PRIO) {
+		if (array == busiest_local_queue->expired && busiest_local_queue->active->nr_active) {
+			array = busiest_local_queue->active;
+			goto new_array;
+		}
+		goto retry_other_class;
+	}
+
+	head = array->queue + idx;
+	curr = head->prev;
+ skip_queue:
+	tmp = list_entry(curr, task_t, run_list);
+
+	curr = curr->prev;
+
+	if (!can_migrate_task(tmp, busiest, this_cpu, sd,idle)) {
+		if (curr != head)
+			goto skip_queue;
+		idx++;
+		goto skip_bitmap;
+	}
+	pull_task(busiest, array, tmp, this_rq, rq_active(tmp,this_rq),this_cpu);
+	/*
+	 * tmp BUG FIX: hzheng
+	 * load balancing can make the busiest local queue empty
+	 * thus it should be removed from bpt
+	 */
+	if (!local_queue_nr_running(busiest_local_queue)) {
+		classqueue_dequeue(busiest_local_queue->classqueue,&busiest_local_queue->classqueue_linkobj);
+		cpu_demand_event(get_rq_local_stat(busiest_local_queue,busiest_cpu),CPU_DEMAND_DEQUEUE,0);
+	}
+
+	imbalance -= weight;
+	if (!idle && (imbalance>0)) {
+		if (curr != head)
+			goto skip_queue;
+		idx++;
+		goto skip_bitmap;
+	}
+ out_unlock:
+	spin_unlock(&busiest->lock);
+ out:
+	read_unlock(&class_list_lock);
+	return 0;
+}
+
+
+static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
+{
+}
+#else /* CONFIG_CKRM_CPU_SCHEDULE */
 /*
  * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
  * as part of a balancing operation within "domain". Returns the number of
@@ -1883,6 +2239,7 @@ next_group:
 	} while (group != sd->groups);
 >>>>>>> 1.1.9.3
 }
+#endif /* CONFIG_CKRM_CPU_SCHEDULE*/
 
 /*
  * rebalance_tick will get called every timer tick, on every CPU.
@@ -1903,6 +2260,8 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
 	unsigned long j = jiffies + CPU_OFFSET(this_cpu);
 	struct sched_domain *sd;
 
+	ckrm_rebalance_tick(j,this_cpu);
+
 	/* Update our load */
 	old_load = this_rq->cpu_load;
 	this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
@@ -1935,13 +2294,15 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
 		}
 	}
 }
-#else
+#else /* SMP*/
 /*
  * on UP we do not need to balance between CPUs:
  */
 static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
 {
+	ckrm_rebalance_tick(jiffies,cpu);
 }
+
 static inline void idle_balance(int cpu, runqueue_t *rq)
 {
 }
@@ -1962,7 +2323,7 @@ static inline int wake_priority_sleeper(runqueue_t *rq)
 	return 0;
 }
 
-DEFINE_PER_CPU(struct kernel_stat, kstat);
+DEFINE_PER_CPU(struct kernel_stat, kstat) = { { 0 } };
 
 EXPORT_PER_CPU_SYMBOL(kstat);
 
@@ -1976,11 +2337,19 @@ EXPORT_PER_CPU_SYMBOL(kstat);
  * increasing number of running tasks. We also ignore the interactivity
  * if a better static_prio task has expired:
  */
+
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
 #define EXPIRED_STARVING(rq) \
 	((STARVATION_LIMIT && ((rq)->expired_timestamp && \
 		(jiffies - (rq)->expired_timestamp >= \
 			STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
 			((rq)->curr->static_prio > (rq)->best_expired_prio))
+#else
+#define EXPIRED_STARVING(rq) \
+	(STARVATION_LIMIT && ((rq)->expired_timestamp && \
+		(jiffies - (rq)->expired_timestamp >= \
+			STARVATION_LIMIT * (local_queue_nr_running(rq)) + 1)))
+#endif
 
 /*
  * This function gets called by the timer code, with HZ frequency.
@@ -2030,7 +2399,7 @@ void scheduler_tick(int user_ticks, int sys_ticks)
 		cpustat->system += sys_ticks;
 
 	/* Task might have expired already, but not scheduled off yet */
-	if (p->array != rq->active) {
+	if (p->array != rq_active(p,rq)) {
 		set_tsk_need_resched(p);
 		goto out;
 	}
@@ -2053,12 +2422,17 @@ void scheduler_tick(int user_ticks, int sys_ticks)
 			set_tsk_need_resched(p);
 
 			/* put it at the end of the queue: */
-			dequeue_task(p, rq->active);
-			enqueue_task(p, rq->active);
+			dequeue_task(p, rq_active(p,rq));
+			enqueue_task(p, rq_active(p,rq));
 		}
 		goto out_unlock;
 	}
+#warning MEF PLANETLAB: "if (vx_need_resched(p)) was if (!--p->time_slice) */"
 	if (vx_need_resched(p)) {
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+		/* Hubertus ... we can abstract this out */
+		struct ckrm_local_runqueue* rq = get_task_class_queue(p);
+#endif
 		dequeue_task(p, rq->active);
 		set_tsk_need_resched(p);
 		p->prio = effective_prio(p);
@@ -2069,8 +2443,8 @@ void scheduler_tick(int user_ticks, int sys_ticks)
 			rq->expired_timestamp = jiffies;
 		if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
 			enqueue_task(p, rq->expired);
-			if (p->static_prio < rq->best_expired_prio)
-				rq->best_expired_prio = p->static_prio;
+			if (p->static_prio < this_rq()->best_expired_prio)
+				this_rq()->best_expired_prio = p->static_prio;
 		} else
 			enqueue_task(p, rq->active);
 	} else {
@@ -2093,12 +2467,12 @@ void scheduler_tick(int user_ticks, int sys_ticks)
 		if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
 			p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
 			(p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
-			(p->array == rq->active)) {
+			(p->array == rq_active(p,rq))) {
 
-			dequeue_task(p, rq->active);
+			dequeue_task(p, rq_active(p,rq));
 			set_tsk_need_resched(p);
 			p->prio = effective_prio(p);
-			enqueue_task(p, rq->active);
+			enqueue_task(p, rq_active(p,rq));
 		}
 	}
 out_unlock:
@@ -2201,10 +2575,9 @@ asmlinkage void __sched schedule(void)
 	task_t *prev, *next;
 	runqueue_t *rq;
 	prio_array_t *array;
-	struct list_head *queue;
 	unsigned long long now;
 	unsigned long run_time;
-	int cpu, idx;
+	int cpu;
 #ifdef CONFIG_VSERVER_HARDCPU
 	struct vx_info *vxi;
 	int maxidle = -HZ;
@@ -2300,21 +2673,9 @@ pick_next:
 		}
 	}
 
-	array = rq->active;
-	if (unlikely(!array->nr_active)) {
-		/*
-		 * Switch the active and expired arrays.
-		 */
-		rq->active = rq->expired;
-		rq->expired = array;
-		array = rq->active;
-		rq->expired_timestamp = 0;
-		rq->best_expired_prio = MAX_PRIO;
-	}
-
-	idx = sched_find_first_bit(array->bitmap);
-	queue = array->queue + idx;
-	next = list_entry(queue->next, task_t, run_list);
+	next = rq_get_next_task(rq);
+	if (next == rq->idle)
+		goto switch_tasks;
 
 	if (dependent_sleeper(cpu, rq, next)) {
 		next = rq->idle;
@@ -2355,6 +2716,14 @@ switch_tasks:
 	clear_tsk_need_resched(prev);
 	RCU_qsctr(task_cpu(prev))++;
 
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+	if (prev != rq->idle) {
+		unsigned long long run = now - prev->timestamp;
+		cpu_demand_event(get_task_local_stat(prev),CPU_DEMAND_DESCHEDULE,run);
+		update_local_cvt(prev, run);
+	}
+#endif
+
 	prev->sleep_avg -= run_time;
 	if ((long)prev->sleep_avg <= 0) {
 		prev->sleep_avg = 0;
@@ -3062,7 +3431,7 @@ asmlinkage long sys_sched_yield(void)
 {
 	runqueue_t *rq = this_rq_lock();
 	prio_array_t *array = current->array;
-	prio_array_t *target = rq->expired;
+	prio_array_t *target = rq_expired(current,rq);
 
 	/*
 	 * We implement yielding by moving the task into the expired
 	 * queue.
 	 *
 	 * (special rule: RT tasks will just roundrobin in the active
 	 * array.)
 	 */
 	if (unlikely(rt_task(current)))
-		target = rq->active;
+		target = rq_active(current,rq);
 
 	dequeue_task(current, array);
 	enqueue_task(current, target);
@@ -3497,7 +3866,9 @@ static int migration_thread(void * data)
 		}
 
 		if (rq->active_balance) {
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
 			active_load_balance(rq, cpu);
+#endif
 			rq->active_balance = 0;
 		}
 
@@ -3973,7 +4344,10 @@ int in_sched_functions(unsigned long addr)
 void __init sched_init(void)
 {
 	runqueue_t *rq;
-	int i, j, k;
+	int i;
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
+	int j, k;
+#endif
 
 #ifdef CONFIG_SMP
 	/* Set up an initial dummy domain for early boot */
@@ -3993,13 +4367,21 @@ void __init sched_init(void)
 	sched_group_init.cpu_power = SCHED_LOAD_SCALE;
 #endif
 
+	init_cpu_classes();
+
 	for (i = 0; i < NR_CPUS; i++) {
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
 		prio_array_t *array;
-
+#endif
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
+
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
 		rq->active = rq->arrays;
 		rq->expired = rq->arrays + 1;
+#else
+		rq->ckrm_cpu_load = 0;
+#endif
 		rq->best_expired_prio = MAX_PRIO;
 
 #ifdef CONFIG_SMP
@@ -4013,6 +4395,7 @@ void __init sched_init(void)
 		INIT_LIST_HEAD(&rq->hold_queue);
 		atomic_set(&rq->nr_iowait, 0);
 
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
 		for (j = 0; j < 2; j++) {
 			array = rq->arrays + j;
 			for (k = 0; k < MAX_PRIO; k++) {
@@ -4022,7 +4405,9 @@ void __init sched_init(void)
 			// delimiter for bitsearch
 			__set_bit(MAX_PRIO, array->bitmap);
 		}
+#endif
 	}
+
 	/*
 	 * We have to do a little magic to get the first
 	 * thread right in SMP mode.
@@ -4031,6 +4416,10 @@ void __init sched_init(void)
 	rq->curr = current;
 	rq->idle = current;
 	set_task_cpu(current, smp_processor_id());
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+	current->cpu_class = default_cpu_class;
+	current->array = NULL;
+#endif
 	wake_up_forked_process(current);
 
 	/*
@@ -4116,3 +4505,13 @@ int task_running_sys(struct task_struct *p)
 EXPORT_SYMBOL(task_running_sys);
 #endif
 
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+/**
+ * return the classqueue object of a certain processor
+ * Note: not supposed to be used in performance sensitive functions
+ */
+struct classqueue_struct * get_cpu_classqueue(int cpu)
+{
+	return (& (cpu_rq(cpu)->classqueue) );
+}
+#endif
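
The behavioural core of this merge is that, under CONFIG_CKRM_CPU_SCHEDULE, picking the next task becomes a two-level decision: rq_get_next_task() first asks the per-CPU classqueue for the highest-priority class (rq_get_next_class()) and only then takes the best-priority task from that class's local runqueue, and the reworked TASK_PREEMPTS_CURR likewise compares class priority before task priority. The short user-space C program below is only a sketch of that selection order; its types, names and numbers are invented for illustration and are not code from this commit or from the CKRM tree.

/*
 * Toy illustration of the CKRM-style two-level pick:
 * first choose the best class, then the best task within it.
 */
#include <stdio.h>

#define MAX_PRIO  8   /* toy task-priority range   */
#define MAX_CLASS 4   /* toy number of CPU classes */

struct toy_class {
	const char *name;
	int class_prio;           /* classqueue key: lower runs first   */
	int nr_active[MAX_PRIO];  /* runnable tasks per task priority   */
};

/* level 1: pick the class with the lowest class_prio that has work */
static struct toy_class *pick_class(struct toy_class *cls, int n)
{
	struct toy_class *best = NULL;
	for (int i = 0; i < n; i++) {
		int has_work = 0;
		for (int p = 0; p < MAX_PRIO; p++)
			has_work |= cls[i].nr_active[p];
		if (has_work && (!best || cls[i].class_prio < best->class_prio))
			best = &cls[i];
	}
	return best;
}

/* level 2: within the chosen class, take the best non-empty priority */
static int pick_prio(const struct toy_class *c)
{
	for (int p = 0; p < MAX_PRIO; p++)
		if (c->nr_active[p])
			return p;
	return -1;
}

int main(void)
{
	struct toy_class classes[MAX_CLASS] = {
		{ "default", 3, { [5] = 2 } },  /* two tasks at prio 5 */
		{ "batch",   7, { [1] = 1 } },  /* one task at prio 1  */
		{ "gold",    1, { [6] = 1 } },  /* one task at prio 6  */
		{ "idleish", 9, { 0 } },        /* nothing runnable    */
	};

	struct toy_class *c = pick_class(classes, MAX_CLASS);
	if (c)
		printf("next task: class \"%s\", task prio %d\n",
		       c->name, pick_prio(c));
	return 0;
}

Built with any C99 compiler, the sketch selects the task from the "gold" class even though the "batch" class holds a task with a numerically better task priority, which is the ordering change the new TASK_PREEMPTS_CURR encodes.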