From fff615ac2fec079542bc1cc3a5fc5fe1ec4d09af Mon Sep 17 00:00:00 2001 From: Marc Fiuczynski Date: Mon, 10 Jan 2005 19:02:49 +0000 Subject: [PATCH] ckrm-e16 cpu controller v9rc1 --- init/Kconfig | 12 + kernel/ckrm/Makefile | 4 +- kernel/ckrm/ckrm.c | 10 +- kernel/ckrm/ckrm_cpu_class.c | 400 ++++++-- kernel/ckrm/ckrm_cpu_monitor.c | 1117 ++++++++++++++-------- kernel/ckrm/ckrm_listenaq.c | 12 +- kernel/ckrm/rbce/rbcemod.c | 2 +- kernel/ckrm_classqueue.c | 109 ++- kernel/ckrm_sched.c | 182 +++- kernel/sched.c | 1580 +++++++++++++++++++------------- 10 files changed, 2267 insertions(+), 1161 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index e5480f047..3db7aa02e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -181,6 +181,18 @@ config CKRM_CPU_SCHEDULE Say N if unsure, Y to use the feature. +config CKRM_CPU_SCHEDULE_AT_BOOT + bool "Turn on at boot time" + depends on CKRM_CPU_SCHEDULE + default n + help + Enable CKRM CPU Scheduler at boot time. Otherwise + it can be turned on dynamically at runtime. If not + turned on the default Linux Scheduler behavior + will be obtained. + + Say N if unsure, Y to use this feature + config CKRM_TYPE_SOCKETCLASS bool "Class Manager for socket groups" depends on CKRM diff --git a/kernel/ckrm/Makefile b/kernel/ckrm/Makefile index de490232b..0d5b1b776 100644 --- a/kernel/ckrm/Makefile +++ b/kernel/ckrm/Makefile @@ -8,5 +8,5 @@ endif obj-$(CONFIG_CKRM_TYPE_TASKCLASS) += ckrm_tc.o obj-$(CONFIG_CKRM_RES_NUMTASKS) += ckrm_numtasks.o obj-$(CONFIG_CKRM_TYPE_SOCKETCLASS) += ckrm_sockc.o - obj-$(CONFIG_CKRM_RES_LISTENAQ) += ckrm_laq.o - obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_cpu_class.o ckrm_cpu_monitor.o + obj-$(CONFIG_CKRM_RES_LISTENAQ) += ckrm_listenaq.o + obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_cpu_class.o ckrm_cpu_monitor.o diff --git a/kernel/ckrm/ckrm.c b/kernel/ckrm/ckrm.c index 2a611c1d0..12b7c2691 100644 --- a/kernel/ckrm/ckrm.c +++ b/kernel/ckrm/ckrm.c @@ -82,6 +82,7 @@ inline unsigned int is_res_regd(struct ckrm_classtype *clstype, int resid) ); } +static struct ckrm_res_ctlr *ckrm_resctlr_lookup(struct ckrm_classtype *clstype, const char *resname) { @@ -101,10 +102,8 @@ struct ckrm_res_ctlr *ckrm_resctlr_lookup(struct ckrm_classtype *clstype, return NULL; } -EXPORT_SYMBOL(ckrm_resctlr_lookup); - /* given a classname return the class handle and its classtype*/ -void *ckrm_classobj(char *classname, int *classTypeID) +void *ckrm_classobj(const char *classname, int *classTypeID) { int i; @@ -864,7 +863,10 @@ int ckrm_class_show_shares(struct ckrm_core_class *core, struct seq_file *seq) atomic_inc(&clstype->nr_resusers[i]); rcbs = clstype->res_ctlrs[i]; if (rcbs && rcbs->get_share_values) { - (*rcbs->get_share_values) (core->res_class[i], &shares); + int rc = (*rcbs->get_share_values)(core->res_class[i], + &shares); + if (rc == -ENOSYS) + continue; seq_printf(seq,"res=%s,guarantee=%d,limit=%d," "total_guarantee=%d,max_limit=%d\n", rcbs->res_name, shares.my_guarantee, diff --git a/kernel/ckrm/ckrm_cpu_class.c b/kernel/ckrm/ckrm_cpu_class.c index 09ea6ba80..3b4bd2a8c 100644 --- a/kernel/ckrm/ckrm_cpu_class.c +++ b/kernel/ckrm/ckrm_cpu_class.c @@ -22,9 +22,35 @@ #include #include #include +#include + +#define CPU_CTRL_NAME "cpu" struct ckrm_res_ctlr cpu_rcbs; +#define CKRM_CPU_USAGE_DETAIL_MAX 3 +static int usage_detail = 3; /* 0: show usage + * 1: show settings + * 2: show effectives + * 3: show per runqueue stats + */ + +static int ckrm_cpu_set_mode(enum ckrm_sched_mode mode); + +/* + * update effective share setting after: + * -- remove class + * -- 
change class share + * we don't need to call update_effectives() when add new class since + * the defaults grt of new class is 0 + * CAUTION: might need a lock here + */ +static inline void update_class_effectives(void) +{ + // update_effectives(); + ckrm_cpu_monitor(0); +} + /** * insert_cpu_class - insert a class to active_cpu_class list * @@ -38,49 +64,81 @@ static inline void insert_cpu_class(struct ckrm_cpu_class *cls) /* * initialize a class object and its local queues */ + +CVT_t get_min_cvt_locking(int cpu); +ckrm_lrq_t *rq_get_dflt_lrq(int cpu); + +static void init_cpu_class_lrq(struct ckrm_cpu_class *cls, + int cpu, int isdflt) +{ + int j,k; + ckrm_lrq_t *queue = cls->local_queues[cpu]; + + queue->active = queue->arrays; + queue->expired = queue->arrays+1; + + for (j = 0; j < 2; j++) { + prio_array_t *array = queue->arrays + j; + for (k = 0; k < MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(MAX_PRIO, array->bitmap); + array->nr_active = 0; + } + + queue->expired_timestamp = 0; + queue->best_expired_prio = MAX_PRIO; + + queue->cpu_class = cls; + queue->classqueue = get_cpu_classqueue(cpu); + queue->top_priority = MAX_PRIO; + cq_node_init(&queue->classqueue_linkobj); + queue->local_cvt = isdflt ? 0 : get_min_cvt_locking(cpu); + queue->lrq_load = 0; + queue->local_weight = cpu_class_weight(cls); + if (queue->local_weight == 0) + queue->local_weight = 1; + queue->over_weight = 0; + queue->skewed_weight = CKRM_MAX_WEIGHT/2; /*otherwise class might starve on start*/ + queue->uncounted_ns = 0; + queue->savings = 0; + queue->magic = CKRM_LRQ_MAGIC; +} + void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares) { - int i,j,k; - prio_array_t *array; - ckrm_lrq_t* queue; + int i; + int isdflt; + struct ckrm_cpu_class *dfltcls; + + dfltcls = get_default_cpu_class(); + + isdflt = (cls==dfltcls); cls->shares = *shares; cls->cnt_lock = SPIN_LOCK_UNLOCKED; - ckrm_cpu_stat_init(&cls->stat); + ckrm_cpu_stat_init(&cls->stat,isdflt ? 
CKRM_SHARE_MAX : 1); ckrm_usage_init(&cls->usage); cls->magic = CKRM_CPU_CLASS_MAGIC; - for (i = 0 ; i < NR_CPUS ; i++) { - queue = &cls->local_queues[i]; - queue->active = queue->arrays; - queue->expired = queue->arrays+1; - - for (j = 0; j < 2; j++) { - array = queue->arrays + j; - for (k = 0; k < MAX_PRIO; k++) { - INIT_LIST_HEAD(array->queue + k); - __clear_bit(k, array->bitmap); - } - // delimiter for bitsearch - __set_bit(MAX_PRIO, array->bitmap); - array->nr_active = 0; + memset(cls->local_queues,0,NR_CPUS*sizeof(ckrm_lrq_t*)); + + if (isdflt) { + for (i=0; i< NR_CPUS; i++) { + cls->local_queues[i] = rq_get_dflt_lrq(i); + init_cpu_class_lrq(cls,i,1); + } + } else { + for_each_cpu(i) { + cls->local_queues[i] = kmalloc(sizeof(ckrm_lrq_t), + GFP_KERNEL); + BUG_ON(cls->local_queues[i]==NULL); + init_cpu_class_lrq(cls,i,0); } - - queue->expired_timestamp = 0; - - queue->cpu_class = cls; - queue->classqueue = get_cpu_classqueue(i); - queue->top_priority = MAX_PRIO; - cq_node_init(&queue->classqueue_linkobj); - queue->local_cvt = 0; - queue->lrq_load = 0; - queue->local_weight = cpu_class_weight(cls); - queue->uncounted_ns = 0; - queue->savings = 0; - queue->magic = 0x43FF43D7; } - // add to class list write_lock(&class_list_lock); insert_cpu_class(cls); write_unlock(&class_list_lock); @@ -100,14 +158,14 @@ struct ckrm_cpu_class * ckrm_get_cpu_class(struct ckrm_core_class *core) { struct ckrm_cpu_class * cls; cls = ckrm_get_res_class(core, cpu_rcbs.resid, struct ckrm_cpu_class); - if (valid_cpu_class(cls)) - return cls; + if (valid_cpu_class(cls)) + return (ckrm_cpu_enabled() ? cls : get_default_cpu_class()); else return NULL; } - -void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class *parent) +void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, + struct ckrm_core_class *parent) { struct ckrm_cpu_class *cls; @@ -128,7 +186,7 @@ void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class set_default_share(&shares); init_cpu_class(cls,&shares); cls->core = core; - cls->parent = parent; + cls->parent = parent; } } else printk(KERN_ERR"alloc_cpu_class failed\n"); @@ -136,15 +194,14 @@ void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class return cls; } -/* - * hzheng: this is not a stable implementation - * need to check race condition issue here - */ +void ckrm_cpu_class_queue_delete_sync(struct ckrm_cpu_class *clsptr); + static void ckrm_free_cpu_class(void *my_res) { struct ckrm_cpu_class *cls = my_res, *parres, *childres; ckrm_core_class_t *child = NULL; int maxlimit; + int i; if (!cls) return; @@ -179,10 +236,19 @@ static void ckrm_free_cpu_class(void *my_res) list_del(&cls->links); write_unlock(&class_list_lock); + ckrm_cpu_class_queue_delete_sync(cls); + + for_each_cpu(i) { + ckrm_lrq_t *lrq = get_ckrm_lrq(cls,i); + if (!lrq) continue; + lrq->magic = -99; + kfree(lrq); + } kfree(cls); - //call ckrm_cpu_monitor after class removed - ckrm_cpu_monitor(0); + //call ckrm_cpu_monitor after class is removed + if (ckrm_cpu_enabled()) + update_class_effectives(); } /* @@ -194,8 +260,12 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share) struct ckrm_shares *cur = &cls->shares, *par; int rc = -EINVAL; - if (!cls) - return rc; + if (ckrm_cpu_disabled()) + return -ENOSYS; + if (!cls) + return rc; + if (new_share->total_guarantee > CKRM_SHARE_MAX) + return -E2BIG; if (cls->parent) { parres = ckrm_get_cpu_class(cls->parent); @@ -215,7 +285,7 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares 
*new_share) new_share->my_guarantee = 0; rc = set_shares(new_share, cur, par); - if (cur->my_limit == CKRM_SHARE_DONTCARE) + if (!rc && cur->my_limit == CKRM_SHARE_DONTCARE) cur->my_limit = cur->max_limit; @@ -225,7 +295,7 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share) } //call ckrm_cpu_monitor after changes are changed - ckrm_cpu_monitor(0); + update_class_effectives(); return rc; } @@ -235,22 +305,90 @@ static int ckrm_cpu_get_share(void *my_res, { struct ckrm_cpu_class *cls = my_res; - if (!cls) + if (ckrm_cpu_disabled()) + return -ENOSYS; + if (!cls) return -EINVAL; + *shares = cls->shares; return 0; } +/* + * get_ckrm_usage(): + * obtain a sequence of usage informations + * returns number of usages reported. + * + * report IN: specifies the sequence of jiffies for which to report + * must be ordered (smallest first) + * OUT: returns the usage in each field + * + */ + + +int ckrm_cpu_get_usage(struct ckrm_cpu_class* clsptr, + int num, ulong report[]) +{ + struct ckrm_usage* usage = &clsptr->usage; + unsigned long long total = 0; + int i, idx, cur, num_ofs; + + num_ofs = cur = i = 0; + idx = usage->sample_pointer; + + for ( num_ofs = 0; num_ofs < num ; num_ofs++ ) { + int nr_samples; + int duration = report[num_ofs]; + unsigned long long totval = 0; + + nr_samples = duration/USAGE_SAMPLE_FREQ?:1; + + if (nr_samples > USAGE_MAX_HISTORY) + nr_samples = USAGE_MAX_HISTORY; + + for ( ; i< nr_samples; i++) { + if (! idx) + idx = USAGE_MAX_HISTORY; + idx --; + total += usage->samples[idx]; + } + totval = total * 1000; + do_div(totval,NS_PER_SAMPLE); + do_div(totval,nr_samples * cpus_weight(cpu_online_map)); + report[num_ofs] = totval; + } + + return num; +} + int ckrm_cpu_get_stats(void *my_res, struct seq_file * sfile) { struct ckrm_cpu_class *cls = my_res; struct ckrm_cpu_class_stat* stat = &cls->stat; ckrm_lrq_t* lrq; int i; + ulong usage[3] = { 2*HZ, 10*HZ, 60*HZ }; - if (!cls) + if (!cls || ckrm_cpu_disabled()) return -EINVAL; + ckrm_cpu_get_usage(cls,3,usage); + + /* this will after full stabilization become the only cpu usage stats + */ + + seq_printf(sfile, "cpu-usage(2,10,60)= %lu %lu %lu\n", + usage[0],usage[1],usage[2]); + + if (usage_detail < 1) + return 0; + + /* the extended statistics we can decide whether we want to make the + * additional statistics available over config options + * eitherway they should be reported in a more concised form + * during stabilization, this is OK + */ + seq_printf(sfile, "-------- CPU Class Status Start---------\n"); seq_printf(sfile, "Share:\n\tgrt= %d limit= %d total_grt= %d max_limit= %d\n", cls->shares.my_guarantee, @@ -261,26 +399,35 @@ int ckrm_cpu_get_stats(void *my_res, struct seq_file * sfile) cls->shares.unused_guarantee, cls->shares.cur_max_limit); + if (usage_detail < 2) + goto out; + seq_printf(sfile, "Effective:\n\tegrt= %d\n",stat->egrt); seq_printf(sfile, "\tmegrt= %d\n",stat->megrt); seq_printf(sfile, "\tehl= %d\n",stat->ehl); seq_printf(sfile, "\tmehl= %d\n",stat->mehl); seq_printf(sfile, "\teshare= %d\n",stat->eshare); - seq_printf(sfile, "\tmeshare= %d\n",cpu_class_weight(cls)); + seq_printf(sfile, "\tmeshare= %d\n",stat->meshare); seq_printf(sfile, "\tmax_demand= %lu\n",stat->max_demand); seq_printf(sfile, "\ttotal_ns= %llu\n",stat->total_ns); - seq_printf(sfile, "\tusage(2,10,60)= %d %d %d\n", - get_ckrm_usage(cls,2*HZ), - get_ckrm_usage(cls,10*HZ), - get_ckrm_usage(cls,60*HZ) - ); + seq_printf(sfile, "\tusage(2,10,60)= %lu %lu %lu\n", + usage[0],usage[1],usage[2]); + + if (usage_detail < 3) + goto 
out; + + /* provide per run queue information */ for_each_online_cpu(i) { lrq = get_ckrm_lrq(cls,i); - seq_printf(sfile, "\tlrq %d demand= %lu weight= %d lrq_load= %lu cvt= %llu sav= %llu\n",i,stat->local_stats[i].cpu_demand,local_class_weight(lrq),lrq->lrq_load,lrq->local_cvt,lrq->savings); + seq_printf(sfile, "\tlrq %d demand= %lu weight= %d " + "lrq_load= %lu cvt= %llu sav= %llu\n", + i,stat->local_stats[i].cpu_demand, + local_class_weight(lrq),lrq->lrq_load, + lrq->local_cvt,lrq->savings); } +out: seq_printf(sfile, "-------- CPU Class Status END ---------\n"); - return 0; } @@ -296,10 +443,34 @@ void ckrm_cpu_change_class(void *task, void *old, void *new) if (!task || ! old || !new) return; + if (ckrm_cpu_disabled()) + newcls = get_default_cpu_class(); _ckrm_cpu_change_class(tsk,newcls); } -/*dummy function, not used*/ +enum config_token_t { + config_usage_detail, /* define usage level */ + config_disable, /* always use default linux scheduling */ + /* effectively disables the ckrm scheduler */ + config_enable, /* always uses ckrm scheduling behavior */ + config_err /* parsing error */ +}; + +#define CKRM_SCHED_MODE_DISABLED_STR "disabled" +#define CKRM_SCHED_MODE_ENABLED_STR "enabled" + +static char *ckrm_sched_mode_str[] = { + CKRM_SCHED_MODE_DISABLED_STR, + CKRM_SCHED_MODE_ENABLED_STR +}; + +static match_table_t config_tokens = { + { config_disable, "mode="CKRM_SCHED_MODE_DISABLED_STR }, + { config_enable, "mode="CKRM_SCHED_MODE_ENABLED_STR }, + { config_usage_detail, "usage_detail=%u" }, + { config_err, NULL } +}; + static int ckrm_cpu_show_config(void *my_res, struct seq_file *sfile) { struct ckrm_cpu_class *cls = my_res; @@ -307,23 +478,61 @@ static int ckrm_cpu_show_config(void *my_res, struct seq_file *sfile) if (!cls) return -EINVAL; - seq_printf(sfile, "cls=%s,parameter=somevalue\n","ckrm_cpu class"); + seq_printf(sfile, "res=%s,mode=%s", + CPU_CTRL_NAME,ckrm_sched_mode_str[ckrm_sched_mode]); + if (!ckrm_cpu_disabled()) /* enabled || mixed */ + seq_printf(sfile, ",usage_detail=%u",usage_detail); + seq_printf(sfile,"\n"); return 0; } -/*dummy function, not used*/ static int ckrm_cpu_set_config(void *my_res, const char *cfgstr) { struct ckrm_cpu_class *cls = my_res; + char *p; + char **cfgstr_p = (char**)&cfgstr; + substring_t args[MAX_OPT_ARGS]; + int option,rc; + enum ckrm_sched_mode new_sched_mode; if (!cls) return -EINVAL; - printk("ckrm_cpu config='%s'\n",cfgstr); - return 0; + + new_sched_mode = ckrm_sched_mode; + rc = 0; + + while ((p = strsep(cfgstr_p, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, config_tokens, args); + switch (token) { + case config_usage_detail: + if (ckrm_cpu_disabled() || + (match_int(&args[0], &option)) || + (option > CKRM_CPU_USAGE_DETAIL_MAX)) + { + return -EINVAL; + } + usage_detail = option; + break; + case config_disable: + new_sched_mode = CKRM_SCHED_MODE_DISABLED; + break; + case config_enable: + new_sched_mode = CKRM_SCHED_MODE_ENABLED; + break; + case config_err: + return -EINVAL; + } + } + rc = ckrm_cpu_set_mode(new_sched_mode); + return rc; } struct ckrm_res_ctlr cpu_rcbs = { - .res_name = "cpu", + .res_name = CPU_CTRL_NAME, .res_hdepth = 1, .resid = -1, .res_alloc = ckrm_alloc_cpu_class, @@ -364,14 +573,69 @@ void init_cpu_classes(void) //init classqueues for each processor for (i=0; i < NR_CPUS; i++) - classqueue_init(get_cpu_classqueue(i)); + classqueue_init(get_cpu_classqueue(i),ckrm_cpu_enabled()); - /* - * hzheng: initialize the default cpu class - * required for E14/E15 since ckrm_init is called 
after sched_init - */ ckrm_alloc_cpu_class(NULL,NULL); } +void ckrm_cpu_class_queue_update(int on); +void ckrm_cpu_start_monitor(void); +void ckrm_cpu_kill_monitor(void); + +static int ckrm_cpu_set_mode(enum ckrm_sched_mode mode) +{ + struct task_struct *proc, *tsk; + struct ckrm_cpu_class *new_cls = NULL; + int i; + + if (mode == ckrm_sched_mode) + return 0; + + printk("ckrm_cpu_set_mode from <%s> to <%s> pid=%d\n", + ckrm_sched_mode_str[ckrm_sched_mode], + ckrm_sched_mode_str[mode], + current->pid); + + if (mode == CKRM_SCHED_MODE_DISABLED) { + ckrm_cpu_kill_monitor(); + new_cls = get_default_cpu_class(); + } else { + ckrm_cpu_class_queue_update(1); + } + + /* run twice through the list to catch everyone, + * current and transient once + */ + + read_lock(&tasklist_lock); + + ckrm_sched_mode = mode; + /* we have to run through the list twice + * first catch all existing tasks + * and then deal with some potential race condition + */ + for ( i=2 ; i-- ; ) { + /* lock class_list_lock ? */ + + do_each_thread(proc, tsk) { + if (mode == CKRM_SCHED_MODE_ENABLED) { + new_cls = ckrm_get_res_class(class_core(tsk->taskclass), + cpu_rcbs.resid, + struct ckrm_cpu_class); + } + _ckrm_cpu_change_class(tsk,new_cls); + } while_each_thread(proc, tsk); + } + read_unlock(&tasklist_lock); + + if (mode == CKRM_SCHED_MODE_DISABLED) + ckrm_cpu_class_queue_update(0); + else + ckrm_cpu_start_monitor(); + return 0; +} EXPORT_SYMBOL(ckrm_get_cpu_class); + + + diff --git a/kernel/ckrm/ckrm_cpu_monitor.c b/kernel/ckrm/ckrm_cpu_monitor.c index 11f65d73b..3cb8225c4 100644 --- a/kernel/ckrm/ckrm_cpu_monitor.c +++ b/kernel/ckrm/ckrm_cpu_monitor.c @@ -28,21 +28,30 @@ #include #include +// #define CONFIG_CKRM_SUPPORT_MAXLIMITS + #define CPU_MONITOR_INTERVAL (HZ) /*how often do we adjust the shares*/ -#define CKRM_SHARE_MAX (1<shares.unused_guarantee; +} + static inline int get_soft_limit(struct ckrm_cpu_class *cls) { return cls->shares.my_limit; @@ -63,6 +72,57 @@ static inline int get_myhard_limit(struct ckrm_cpu_class *cls) return cls->shares.total_guarantee; } +static inline void set_eshare(struct ckrm_cpu_class_stat *stat, + int new_share) +{ + if (!new_share) + new_share = 1; + + BUG_ON(new_share < 0); + stat->eshare = new_share; +} + +static inline void set_meshare(struct ckrm_cpu_class_stat *stat, + int new_share) +{ + if (!new_share) + new_share = 1; + + BUG_ON(new_share < 0); + stat->meshare = new_share; +} + +/** + *get_self_cpu_demand - get cpu demand of the class itself (excluding children) + * + * self_cpu_demand = sum(cpu demand of all local queues) + */ +static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat) +{ + int cpu_demand = 0; + int i; + int cpuonline = 0; + + for_each_online_cpu(i) { + cpu_demand_check_sleep(stat,i); + cpu_demand += stat->local_stats[i].cpu_demand; + cpuonline ++; + } + + return (cpu_demand/cpuonline); +} + +/* + * my max demand = min(cpu_demand, my effective hard limit) + */ +static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat) +{ + unsigned long mmax_demand = get_self_cpu_demand(stat); + if (mmax_demand > stat->mehl) + mmax_demand = stat->mehl; + + return mmax_demand; +} static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, int type) { @@ -85,7 +145,7 @@ static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, } } -void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat) +void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat, int eshares) { int i; @@ -93,7 +153,7 @@ void 
ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat) stat->total_ns = 0; stat->max_demand = 0; - for (i=0; i< NR_CPUS; i++) { + for (i=0; ilocal_stats[i],CPU_DEMAND_TP_CLASS); } @@ -102,10 +162,517 @@ void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat) stat->ehl = CKRM_SHARE_MAX; /*default: no limit*/ stat->mehl = CKRM_SHARE_MAX; /*default: no limit */ - stat->eshare = CKRM_SHARE_MAX; - stat->meshare = CKRM_SHARE_MAX; + stat->eshare = eshares; + stat->meshare = eshares; + + stat->has_savings = 0; + stat->demand_per_share = 0; + } +#if 0 // keep handy for debugging if necessary +void ckrm_cpu_class_dump(struct ckrm_cpu_class *clsptr,int num) +{ + struct ckrm_cpu_class_stat* stat = &clsptr->stat; + printk("%d> %p[%d] mg=%d lim=%d tg=%d maxlim=%d ug=%d\n",num, + clsptr, (clsptr == get_default_cpu_class()), + clsptr->shares.my_guarantee, + clsptr->shares.my_limit, + clsptr->shares.total_guarantee, + clsptr->shares.max_limit, + clsptr->shares.unused_guarantee); + printk(" egrt=%d megrt=%d ehl=%d mehl=%d esh=%d mesh=%d\n", + stat->egrt,stat->megrt,stat->ehl,stat->mehl, + stat->eshare,stat->meshare); +} +#endif + +/**********************************************/ +/* surplus allocation */ +/**********************************************/ + +/* + * surplus = egrt - demand + * if surplus < 0, surplus = 0 + */ +static inline int get_node_surplus(struct ckrm_cpu_class *cls) +{ + int surplus = cls->stat.egrt - cls->stat.max_demand; + + if (surplus < 0) + surplus = 0; + + return surplus; +} + +/* + * consume savings in advance because this class give surplus to others + * this is a quick hack, should be integrated with balance_savings() + */ +static inline void consumed_surplus_savings(struct ckrm_cpu_class *clsptr, + int savings_consumed) +{ + long long total_savings; + ckrm_lrq_t* lrq; + int i; + int cpu_online = 0; + + total_savings = 0; + for_each_online_cpu(i) { + lrq = get_ckrm_lrq(clsptr,i); + total_savings += lrq->savings; + cpu_online ++; + } + + total_savings -= savings_consumed; + if (total_savings < 0) + total_savings = 0; + + //get the average savings + do_div(total_savings,cpu_online); + for_each_online_cpu(i) { + lrq = get_ckrm_lrq(clsptr,i); + lrq->savings = total_savings; + } +} + +static inline int get_my_node_surplus(struct ckrm_cpu_class *cls) +{ + int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat); + int savings_consumed; + + if (surplus < 0) + surplus = 0; + + /* + * a quick hack about the hierarchy savings distribution + * may not be the right way to do + * + * since this node give its surplus to other nodes, + * it's savings should be consumed + * suppose CPU_MONITOR_INTERVAL = (HZ) + * savings_consumed is roughly how much savings will be consumed for the next second + */ + if (surplus) { + savings_consumed = surplus * HZ * (NSEC_PER_MS >> CKRM_SHARE_SHIFT); + consumed_surplus_savings(cls, savings_consumed) ; + } + + return surplus; +} + +/* + * all the class in the queue consume the surplus in order + * each class consume the amount propotional to its egrt + */ +static int consume_surplus_in_order(struct list_head* queue, + struct ckrm_cpu_class *p_cls, + int total_surplus) +{ + int total_grt = 0; + struct ckrm_cpu_class *clsptr; + + /* + * get total_grt of the classes in the queue + * total_grt can be maintained instead of re-calcuated each time + */ + list_for_each_entry(clsptr,queue,surplus_queue) { + if (unlikely(clsptr == p_cls)) + total_grt += clsptr->stat.megrt; + else + total_grt += clsptr->stat.egrt; + } + + if (! 
total_grt) + goto consume_out; + + //allocate in order + list_for_each_entry(clsptr,queue,surplus_queue) { + int surplus_per_share; + int consumed, my_grt; + + BUG_ON(! total_grt); + surplus_per_share = + (total_surplus << CKRM_SHARE_SHIFT) / total_grt; + + if (surplus_per_share <= 0) + break; + + if (unlikely(clsptr == p_cls)) //self_node consuming + my_grt = clsptr->stat.megrt; + else + my_grt = clsptr->stat.egrt; + + BUG_ON(clsptr->stat.demand_per_share <= 0); + + if (clsptr->stat.demand_per_share < surplus_per_share) + surplus_per_share = clsptr->stat.demand_per_share; + + consumed = surplus_per_share * my_grt; + consumed >>= CKRM_SHARE_SHIFT; + total_surplus -= consumed; + BUG_ON(total_surplus < 0); + total_grt -= my_grt; + + if (unlikely(clsptr == p_cls)) + set_meshare(&clsptr->stat,clsptr->stat.meshare + consumed); + else + set_eshare(&clsptr->stat,clsptr->stat.eshare + consumed); + } + consume_out: + if (total_surplus <= 1) //if total_suplus too small, no need to allocate again + total_surplus = 0; + return total_surplus; +} + +/* + * link all the children of parent and the parent itself using their surplus_queue field + * link the whole queue using src_queue + * if anything wrong return -1 + */ +static int get_class_surplus_queue(struct ckrm_core_class *parent, + struct list_head* src_queue) +{ + struct ckrm_core_class *child_core = NULL; + struct ckrm_cpu_class *p_cls,*c_cls; + int ret = -1; + + p_cls = ckrm_get_cpu_class(parent); + if (! p_cls) + goto link_out; + + INIT_LIST_HEAD(src_queue); + + //add the parent node itself + list_add(&p_cls->surplus_queue,src_queue); + do { + child_core = ckrm_get_next_child(parent, child_core); + if (child_core) { + c_cls = ckrm_get_cpu_class(child_core); + if (! c_cls) + goto link_out; + list_add(&c_cls->surplus_queue,src_queue); + } + } while (child_core); + + ret = 0; + + link_out: + return ret; +} + +/* + * insert the class to queue based on stat->demand_per_share + * status: tested + */ +static void insert_surplus_queue(struct list_head* queue, struct ckrm_cpu_class *clsptr) +{ + struct ckrm_cpu_class *cur_cls = NULL; + int end_of_queue = 1; + + list_for_each_entry(cur_cls,queue,surplus_queue) { + if (cur_cls->stat.demand_per_share >= clsptr->stat.demand_per_share) { + end_of_queue = 0; + break; + } + } + + //insert the clsptr + if (! 
cur_cls || end_of_queue) + list_add_tail(&clsptr->surplus_queue,queue); + else + list_add_tail(&clsptr->surplus_queue,&cur_cls->surplus_queue); +} + +/* + * copy all classes in src_queue to dst_queue, + * reorder the classes based on their normalized demand + * if a class already saturate (eshare >= demand), also remove it from src_queue + * return the total guarantee of the selected classes + * + * @src_queue: source queue + * @dst_queue: destination queue + * @check_sl: check soft limit + * @check_savings: only class has savings should be considered + */ + +static unsigned long reorder_surplus_queue(struct list_head* src_queue, + struct list_head* dst_queue, + int check_sl, int check_savings, + struct ckrm_cpu_class *p_cls) +{ + struct ckrm_cpu_class *clsptr, *tmp; + + INIT_LIST_HEAD(dst_queue); + + list_for_each_entry_safe(clsptr,tmp,src_queue,surplus_queue) { + struct ckrm_cpu_class_stat* stat = &clsptr->stat; + int inc_limit; + int max_demand, eshare, esl,grt; + + if (unlikely(clsptr == p_cls)) { + max_demand = get_mmax_demand(stat); + eshare = stat->meshare; + esl = get_mysoft_limit(clsptr); + grt = stat->megrt; + } else { + max_demand = stat->max_demand; + eshare = stat->eshare; + esl = get_soft_limit(clsptr); + grt = stat->egrt; + } + + //hard limit and demand limit + inc_limit = max_demand - eshare; + + //no additional share needed + if (inc_limit <= 0 || ! grt) { + list_del(&clsptr->surplus_queue); + continue; + } + + //or no more savings + if (check_savings && ! stat->has_savings) + continue; + + //check soft limit + if (check_sl) { + int soft_limit; + + soft_limit = p_cls->stat.eshare * esl + / p_cls->shares.total_guarantee; + + if (soft_limit < max_demand) + inc_limit = soft_limit - eshare; + if ( inc_limit <= 0) /* can turn negative */ + continue; + } + + BUG_ON(! grt); + //get the stat->demand_per_share + stat->demand_per_share = + (inc_limit << CKRM_SHARE_SHIFT) / grt; + + list_del_init(&clsptr->surplus_queue); + //insert the class to the queue + insert_surplus_queue(dst_queue,clsptr); + } + return 0; +} + +/* + * get all the surplus that should be reallocated to the children + */ +static inline int get_total_surplus(struct ckrm_cpu_class *p_cls, + struct ckrm_core_class *parent) +{ + struct ckrm_cpu_class *c_cls; + int total_surplus; + struct ckrm_core_class *child_core = NULL; + + //additional share assigned to this sub node from parent + total_surplus = p_cls->stat.eshare - p_cls->stat.egrt; + BUG_ON(total_surplus < 0); + + //surplus of this node + total_surplus += get_my_node_surplus(p_cls); + do { + child_core = ckrm_get_next_child(parent, child_core); + if (child_core) { + c_cls = ckrm_get_cpu_class(child_core); + if (! c_cls) { + total_surplus = 0; + break; + } + + total_surplus += get_node_surplus(c_cls); + } + } while (child_core); + + return total_surplus; +} +/** + * alloc_surplus_node: re-allocate the shares for a single level + * @parent: parent node + * return the remaining surplus + * + * The surplus reallocation policy is like below. + * -- the classes that have eshare >= demand don't need any additional share. + * So they don't participate the surplus allocation. + * -- all the other classes received share in this order: + * 1. has savings, not over soft limit + * 2. has savings, but over soft limit + * 3. no savings, not over soft limit + * 4. 
no savings, over soft limit + * + * In each of the 4 levels above, classes get surplus propotionally to its guarantee + */ +static int alloc_surplus_node(struct ckrm_core_class *parent) +{ + struct ckrm_cpu_class *p_cls; + int total_surplus; + int ret = -1; + struct list_head src_queue, dst_queue; + + p_cls = ckrm_get_cpu_class(parent); + if (! p_cls) //safty check + goto realloc_out; + + ret = 0; + total_surplus = get_total_surplus(p_cls,parent); + + if (! total_surplus) //no surplus to be allocated + goto realloc_out; + + /* + * first round, allocated to tasks with savings, check_sl + */ + get_class_surplus_queue(parent,&src_queue); + reorder_surplus_queue(&src_queue, &dst_queue, 1, 1,p_cls); + if (! list_empty(&dst_queue)) { + total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus); + if (! total_surplus) + goto realloc_out; + } + + /* + * second round, check savings, but no check_sl + */ + //merge the src_queue and dst_queue and reorder + list_splice(&dst_queue, &src_queue); + reorder_surplus_queue(&src_queue, &dst_queue, 0, 1,p_cls); + if (! list_empty(&dst_queue)) { + total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus); + if (! total_surplus) + goto realloc_out; + } + + /* + * third round, no check savings, but check_sl + */ + //merge the src_queue and dst_queue and reorder + list_splice(&dst_queue, &src_queue); + reorder_surplus_queue(&src_queue, &dst_queue, 1, 0,p_cls); + if (! list_empty(&dst_queue)) { + total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus); + if (! total_surplus) + goto realloc_out; + } + /* + * fourth round, no check savings, no check_sl + */ + //merge the src_queue and dst_queue and reorder + list_splice(&dst_queue, &src_queue); + reorder_surplus_queue(&src_queue, &dst_queue, 0, 0,p_cls); + if (! 
list_empty(&dst_queue)) + total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus); + + realloc_out: + return ret; +} + +/* + * return true if the class total savings > MIN_SAVINGS + */ +static int balance_local_savings(struct ckrm_cpu_class *clsptr, int cpu_online) +{ + unsigned long long total_savings; + ckrm_lrq_t* lrq; + int i; +#define CLASS_MIN_SAVINGS (10 * NSEC_PER_MS) + + total_savings = 0; + for_each_online_cpu(i) { + lrq = get_ckrm_lrq(clsptr,i); + total_savings += lrq->savings; + } + + if (total_savings < CLASS_MIN_SAVINGS) + return 0; + + //get the average savings + do_div(total_savings,cpu_online); + for_each_online_cpu(i) { + lrq = get_ckrm_lrq(clsptr,i); + lrq->savings = total_savings; + } + + /* + * hzheng: this is another quick hack + * only say I have savings when this node has more demand + * ignoring the requirement of child classes + */ + if (clsptr->stat.megrt < get_mmax_demand(&clsptr->stat)) + return 1; + else + return 0; +} + +/* + * check savings status + * set has_savings field if the class or its sub class has savings + */ +static void check_savings_status(struct ckrm_core_class *root_core) +{ + struct ckrm_cpu_class *clsptr; + int cpu_online; + + cpu_online = cpus_weight(cpu_online_map); + + //class status: demand, share,total_ns prio, index + list_for_each_entry(clsptr,&active_cpu_classes,links) + clsptr->stat.has_savings = balance_local_savings(clsptr,cpu_online); +} + +/** + * alloc_surplus - reallocate unused shares + * + * class A's usused share should be allocated to its siblings + * the re-allocation goes downward from the top + */ +int alloc_surplus(struct ckrm_core_class *root_core) +{ + struct ckrm_core_class *cur_core, *child_core; + // struct ckrm_cpu_class *cls; + int ret = -1; + + check_savings_status(root_core); + + /*initialize*/ + cur_core = root_core; + child_core = NULL; + // cls = ckrm_get_cpu_class(cur_core); + + /*the ckrm idle tasks get all what's remaining*/ + /*hzheng: uncomment the following like for hard limit support */ + // update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand); + + repeat: + //check exit + if (!cur_core) + return 0; + + //visit this node only once + if (! 
child_core) + if ( alloc_surplus_node(cur_core) < 0 ) + return ret; + + //next child + child_core = ckrm_get_next_child(cur_core, child_core); + if (child_core) { + //go down + cur_core = child_core; + child_core = NULL; + goto repeat; + } else { //no more child, go back + child_core = cur_core; + cur_core = child_core->hnode.parent; + } + goto repeat; +} + + + /**********************************************/ /* cpu demand */ /**********************************************/ @@ -134,27 +701,29 @@ void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat) * how often should we recalculate the cpu demand * the number is in ns */ -static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat,int state, unsigned long long len) +static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat, + int state, unsigned long long len) { local_stat->total += len; if (state == CKRM_CPU_DEMAND_RUN) local_stat->run += len; if (local_stat->total >= local_stat->recalc_interval) { - local_stat->total >>= CKRM_SHARE_ACCURACY; - if (unlikely(local_stat->run > 0xFFFFFFFF)) - local_stat->run = 0xFFFFFFFF; + local_stat->total >>= CKRM_SHARE_SHIFT; + if (unlikely(local_stat->run > ULONG_MAX)) + local_stat->run = ULONG_MAX; - if (local_stat->total > 0xFFFFFFFF) - local_stat->total = 0xFFFFFFFF; + if (unlikely(local_stat->total > ULONG_MAX)) + local_stat->total = ULONG_MAX; do_div(local_stat->run,(unsigned long)local_stat->total); - if (local_stat->total > 0xFFFFFFFF) //happens after very long sleep + if (unlikely(local_stat->total > ULONG_MAX)) { + //happens after very long sleep local_stat->cpu_demand = local_stat->run; - else { - local_stat->cpu_demand += local_stat->run; - local_stat->cpu_demand >>= 1; + } else { + local_stat->cpu_demand = + (local_stat->cpu_demand + local_stat->run) >> 1; } local_stat->total = 0; local_stat->run = 0; @@ -187,60 +756,28 @@ void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsign break; case CPU_DEMAND_INIT: //for task init only cpu_demand_stat_init(local_stat,CPU_DEMAND_TP_TASK); - break; - default: - BUG(); - } -} - -/** - * check all the class local queue - * - * to deal with excessive long run/sleep state - * -- whenever the the ckrm_cpu_monitor is called, check if the class is in sleep state, if yes, then update sleep record - */ -static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu) -{ - struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu]; - unsigned long long sleep,now; - if (local_stat->last_sleep) { - now = sched_clock(); - sleep = now - local_stat->last_sleep; - local_stat->last_sleep = now; - update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep); - } -} - -/** - *get_self_cpu_demand - get cpu demand of the class itself (excluding children) - * - * self_cpu_demand = sum(cpu demand of all local queues) - */ -static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat) -{ - int cpu_demand = 0; - int i; - int cpuonline = 0; - - for_each_online_cpu(i) { - cpu_demand_check_sleep(stat,i); - cpu_demand += stat->local_stats[i].cpu_demand; - cpuonline ++; + break; + default: + BUG(); } - - return (cpu_demand/cpuonline); } -/* - * my max demand = min(cpu_demand, my effective hard limit) +/** + * check all the class local queue + * + * to deal with excessive long run/sleep state + * -- whenever the the ckrm_cpu_monitor is called, check if the class is in sleep state, if yes, then update sleep record */ -static inline unsigned long 
get_mmax_demand(struct ckrm_cpu_class_stat* stat) +void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu) { - unsigned long mmax_demand = get_self_cpu_demand(stat); - if (mmax_demand > stat->mehl) - mmax_demand = stat->mehl; - - return mmax_demand; + struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu]; + unsigned long long sleep,now; + if (local_stat->last_sleep) { + now = sched_clock(); + sleep = now - local_stat->last_sleep; + local_stat->last_sleep = now; + update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep); + } } /** @@ -301,26 +838,6 @@ static int update_max_demand(struct ckrm_core_class *root_core) /**********************************************/ /* effective guarantee & limit */ /**********************************************/ -static inline void set_eshare(struct ckrm_cpu_class_stat *stat, - int new_share) -{ - if (!new_share) - new_share = 1; - - BUG_ON(new_share < 0); - stat->eshare = new_share; -} - -static inline void set_meshare(struct ckrm_cpu_class_stat *stat, - int new_share) -{ - if (!new_share) - new_share = 1; - - BUG_ON(new_share < 0); - stat->meshare = new_share; -} - /** *update_child_effective - update egrt, ehl, mehl for all children of parent *@parent: the parent node @@ -346,7 +863,7 @@ static int update_child_effective(struct ckrm_core_class *parent) p_cls->stat.egrt * c_cls->shares.my_guarantee / p_cls->shares.total_guarantee; - c_cls->stat.megrt = c_cls->stat.egrt * c_cls->shares.unused_guarantee + c_cls->stat.megrt = c_cls->stat.egrt * get_my_grt(c_cls) / c_cls->shares.total_guarantee; c_cls->stat.ehl = @@ -372,8 +889,9 @@ static int update_child_effective(struct ckrm_core_class *parent) * * return -1 if anything wrong happened (eg: the structure changed during the process) */ -static int update_effectives(struct ckrm_core_class *root_core) +int update_effectives(void) { + struct ckrm_core_class *root_core = get_default_cpu_class()->core; struct ckrm_core_class *cur_core, *child_core; struct ckrm_cpu_class *cls; int ret = -1; @@ -384,7 +902,7 @@ static int update_effectives(struct ckrm_core_class *root_core) //initialize the effectives for root cls->stat.egrt = CKRM_SHARE_MAX; /*egrt of the root is always 100% */ - cls->stat.megrt = cls->stat.egrt * cls->shares.unused_guarantee + cls->stat.megrt = cls->stat.egrt * get_my_grt(cls) / cls->shares.total_guarantee; cls->stat.ehl = CKRM_SHARE_MAX * get_hard_limit(cls) / cls->shares.total_guarantee; @@ -418,288 +936,11 @@ static int update_effectives(struct ckrm_core_class *root_core) } /**********************************************/ -/* surplus allocation */ +/* CKRM Idle Tasks */ /**********************************************/ -/* - * surplus = egrt - demand - * if surplus < 0, surplus = 0 - */ -static inline int get_node_surplus(struct ckrm_cpu_class *cls) -{ - int surplus = cls->stat.egrt - cls->stat.max_demand; - - if (surplus < 0) - surplus = 0; - - return surplus; -} - -static inline int get_my_node_surplus(struct ckrm_cpu_class *cls) -{ - int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat); - - if (surplus < 0) - surplus = 0; - - return surplus; -} - -/** - * consume_surplus: decides how much surplus a node can consume - * @ckeck_sl: if check_sl is set, then check soft_limitx - * return how much consumed - * - * implements all the CKRM Scheduling Requirement - * assume c_cls is valid - */ -static inline int consume_surplus(int surplus, - struct ckrm_cpu_class *c_cls, - struct ckrm_cpu_class *p_cls, - int check_sl - ) -{ - int consumed = 0; - int inc_limit; 
- int total_grt = p_cls->shares.total_guarantee; - - BUG_ON(surplus < 0); - - /*can't consume more than demand or hard limit*/ - if (c_cls->stat.eshare >= c_cls->stat.max_demand) - goto out; - - //the surplus allocation is propotional to grt - consumed = - surplus * c_cls->shares.my_guarantee / total_grt; - - if (! consumed) //no more share - goto out; - - //hard limit and demand limit - inc_limit = c_cls->stat.max_demand - c_cls->stat.eshare; - - if (check_sl) { - int esl = p_cls->stat.eshare * get_soft_limit(c_cls) - /total_grt; - if (esl < c_cls->stat.max_demand) - inc_limit = esl - c_cls->stat.eshare; - } - - if (consumed > inc_limit) - consumed = inc_limit; - - BUG_ON(consumed < 0); - out: - return consumed; -} - -/* - * how much a node can consume for itself? - */ -static inline int consume_self_surplus(int surplus, - struct ckrm_cpu_class *p_cls, - int check_sl - ) -{ - int consumed = 0; - int inc_limit; - int total_grt = p_cls->shares.total_guarantee; - int max_demand = get_mmax_demand(&p_cls->stat); - - BUG_ON(surplus < 0); - - /*can't consume more than demand or hard limit*/ - if (p_cls->stat.meshare >= max_demand) - goto out; - - //the surplus allocation is propotional to grt - consumed = - surplus * p_cls->shares.unused_guarantee / total_grt; - - if (! consumed) //no more share - goto out; - - //hard limit and demand limit - inc_limit = max_demand - p_cls->stat.meshare; - - if (check_sl) { - int mesl = p_cls->stat.eshare * get_mysoft_limit(p_cls) - /total_grt; - if (mesl < max_demand) - inc_limit = mesl - p_cls->stat.meshare; - } - - if (consumed > inc_limit) - consumed = inc_limit; - - BUG_ON(consumed < 0); - out: - return consumed; -} - - -/* - * allocate surplus to all its children and also its default class - */ -static int alloc_surplus_single_round( - int surplus, - struct ckrm_core_class *parent, - struct ckrm_cpu_class *p_cls, - int check_sl) -{ - struct ckrm_cpu_class *c_cls; - struct ckrm_core_class *child_core = NULL; - int total_consumed = 0,consumed; - - //first allocate to the default class - consumed = - consume_self_surplus(surplus,p_cls,check_sl); - - if (consumed > 0) { - set_meshare(&p_cls->stat,p_cls->stat.meshare + consumed); - total_consumed += consumed; - } - - do { - child_core = ckrm_get_next_child(parent, child_core); - if (child_core) { - c_cls = ckrm_get_cpu_class(child_core); - if (! c_cls) - return -1; - - consumed = - consume_surplus(surplus, c_cls, - p_cls,check_sl); - if (consumed > 0) { - set_eshare(&c_cls->stat,c_cls->stat.eshare + consumed); - total_consumed += consumed; - } - } - } while (child_core); - - return total_consumed; -} - -/** - * alloc_surplus_node: re-allocate the shares for children under parent - * @parent: parent node - * return the remaining surplus - * - * task: - * 1. get total surplus - * 2. allocate surplus - * 3. set the effective_share of each node - */ -static int alloc_surplus_node(struct ckrm_core_class *parent) -{ - struct ckrm_cpu_class *p_cls,*c_cls; - int total_surplus,consumed; - int check_sl; - int ret = -1; - struct ckrm_core_class *child_core = NULL; - - p_cls = ckrm_get_cpu_class(parent); - if (! p_cls) - goto realloc_out; - - /* - * get total surplus - */ - total_surplus = p_cls->stat.eshare - p_cls->stat.egrt; - BUG_ON(total_surplus < 0); - total_surplus += get_my_node_surplus(p_cls); - - do { - child_core = ckrm_get_next_child(parent, child_core); - if (child_core) { - c_cls = ckrm_get_cpu_class(child_core); - if (! 
c_cls) - goto realloc_out; - - total_surplus += get_node_surplus(c_cls); - } - } while (child_core); - - - if (! total_surplus) { - ret = 0; - goto realloc_out; - } - - /* - * distributing the surplus - * first with the check_sl enabled - * once all the tasks has research the soft limit, disable check_sl and try again - */ - - check_sl = 1; - do { - consumed = alloc_surplus_single_round(total_surplus,parent,p_cls,check_sl); - if (consumed < 0) //something is wrong - goto realloc_out; - - if (! consumed) - check_sl = 0; - else - total_surplus -= consumed; - - } while ((total_surplus > 0) && (consumed || check_sl) ); - - ret = 0; - - realloc_out: - return ret; -} - -/** - * alloc_surplus - reallocate unused shares - * - * class A's usused share should be allocated to its siblings - * the re-allocation goes downward from the top - */ -static int alloc_surplus(struct ckrm_core_class *root_core) -{ - struct ckrm_core_class *cur_core, *child_core; - // struct ckrm_cpu_class *cls; - int ret = -1; - - /*initialize*/ - cur_core = root_core; - child_core = NULL; - // cls = ckrm_get_cpu_class(cur_core); - - /*the ckrm idle tasks get all what's remaining*/ - /*hzheng: uncomment the following like for hard limit support */ - // update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand); - - repeat: - //check exit - if (!cur_core) - return 0; - - //visit this node only once - if (! child_core) - if ( alloc_surplus_node(cur_core) < 0 ) - return ret; - - //next child - child_core = ckrm_get_next_child(cur_core, child_core); - if (child_core) { - //go down - cur_core = child_core; - child_core = NULL; - goto repeat; - } else { //no more child, go back - child_core = cur_core; - cur_core = child_core->hnode.parent; - } - goto repeat; -} +#ifdef CONFIG_CKRM_SUPPORT_MAXLIMITS -/**********************************************/ -/* CKRM Idle Tasks */ -/**********************************************/ struct ckrm_cpu_class ckrm_idle_class_obj, *ckrm_idle_class; struct task_struct* ckrm_idle_tasks[NR_CPUS]; @@ -710,7 +951,7 @@ static inline int get_nr_idle(unsigned long surplus) int nr_idle = 0; nr_idle = surplus * cpu_online; - nr_idle >>= CKRM_SHARE_ACCURACY; + nr_idle >>= CKRM_SHARE_SHIFT; if (surplus) nr_idle ++; @@ -722,7 +963,8 @@ static inline int get_nr_idle(unsigned long surplus) } /** - * update_ckrm_idle: update the status of the idle class according to the new surplus + * update_ckrm_idle: update the status of the idle class according + * to the new surplus * surplus: new system surplus * * Task: @@ -816,6 +1058,20 @@ void ckrm_start_ckrm_idle(void) } } +void ckrm_stop_ckrm_idle(void) +{ + BUG_ON(1); // not yet implemented +} + +#else + +static inline void ckrm_start_ckrm_idle(void) { }; +static inline void ckrm_stop_ckrm_idle(void) { }; +static inline void update_ckrm_idle(unsigned long surplus) { }; + +#endif + + /**********************************************/ /* Local Weight */ /**********************************************/ @@ -831,8 +1087,19 @@ static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online) int i; unsigned long class_weight; unsigned long long lw; - - //get total pressure + struct ckrm_cpu_class_stat *stat; + unsigned long oweight; + unsigned long skewed_limit; + /* + * if a local queue gets less than 1/SKEWED_SHARE_RATIO of the eshare + * then we set the skewed_share + */ +#define SKEWED_SHARE_RATIO 8 +#define SKEWED_WEIGHT_MIN 3 + + /* get total pressure of the class, if there is not pressure (.. 
class is + * idle, then leave the weights as is + */ for_each_online_cpu(i) { lrq = get_ckrm_lrq(clsptr,i); total_pressure += lrq->lrq_load; @@ -841,32 +1108,61 @@ static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online) if (! total_pressure) return; + stat = &clsptr->stat; + class_weight = cpu_class_weight(clsptr) * cpu_online; + /* calculate or skewed limit weight */ + skewed_limit = SHARE_TO_WEIGHT(stat->meshare/SKEWED_SHARE_RATIO); + if (skewed_limit < SKEWED_WEIGHT_MIN) + skewed_limit = SKEWED_WEIGHT_MIN; + + /* calculate over_weight */ + BUG_ON(stat->meshare < stat->megrt); + oweight = ((stat->meshare - stat->megrt) << CKRM_SHARE_SHIFT) / stat->meshare; + oweight = SHARE_TO_WEIGHT(oweight); + /* * update weight for each cpu, minimun is 1 */ for_each_online_cpu(i) { lrq = get_ckrm_lrq(clsptr,i); - if (! lrq->lrq_load) - /*give idle class a high share to boost interactiveness */ + lrq->over_weight = oweight; + if (! lrq->lrq_load) { + /* give idle class a high share to boost + * interactiveness + */ lw = cpu_class_weight(clsptr); - else { - lw = lrq->lrq_load * class_weight; + if (unlikely(lw==0)) + lw = 1; + } else { + lw = lrq->lrq_load; + lw *= class_weight; do_div(lw,total_pressure); - if (!lw) + if (unlikely(lw==0)) lw = 1; - else if (lw > CKRM_SHARE_MAX) - lw = CKRM_SHARE_MAX; - } - + else if (unlikely(lw > CKRM_MAX_WEIGHT)) + lw = CKRM_MAX_WEIGHT; + } + BUG_ON(lw > CKRM_MAX_WEIGHT); + + /* + * set is_skewed and local_weight in proper order + * to avoid race condition + */ lrq->local_weight = lw; + if (lw < skewed_limit) + lrq->skewed_weight = skewed_limit; + else + lrq->skewed_weight = 0; + BUG_ON((local_class_weight(lrq) == 1) && (! lrq->skewed_weight)); } } /* * assume called with class_list_lock read lock held */ + void adjust_local_weight(void) { static spinlock_t lock = SPIN_LOCK_UNLOCKED; @@ -904,9 +1200,11 @@ void ckrm_cpu_monitor(int check_min) static unsigned long long last_check = 0; struct ckrm_core_class *root_core = get_default_cpu_class()->core; unsigned long long now; -#define MIN_CPU_MONITOR_INTERVAL 100000000UL + int loc; + +#define MIN_CPU_MONITOR_INTERVAL (100*1000*1000) /* 100 MSEC */ - if (!root_core) + if (ckrm_cpu_disabled() || !root_core) return; //do nothing if someone already holding the lock @@ -919,24 +1217,35 @@ void ckrm_cpu_monitor(int check_min) //consecutive check should be at least 100ms apart if (check_min && (now - last_check < MIN_CPU_MONITOR_INTERVAL)) - goto outunlock; + goto outunlock_np; last_check = now; - if (update_effectives(root_core) != 0) + if (update_effectives() != 0) { + loc = 0; goto outunlock; + } - if (update_max_demand(root_core) != 0) + if (update_max_demand(root_core) != 0) { + loc = 1; goto outunlock; + } - if (alloc_surplus(root_core) != 0) + if (alloc_surplus(root_core) != 0) { + loc = 2; goto outunlock; + } adjust_local_weight(); - outunlock: + outunlock_np: read_unlock(&class_list_lock); spin_unlock(&lock); + return; + + outunlock: + printk("ckrm_cpu_monitor(%d) exits prematurely cause=%d\n",check_min,loc); + goto outunlock_np; } /*****************************************************/ @@ -948,6 +1257,8 @@ static int thread_exit = 0; static int ckrm_cpu_monitord(void *nothing) { daemonize("ckrm_cpu_ctrld"); + printk("cpu_monitord started\n"); + thread_exit = 0; for (;;) { /*sleep for sometime before next try*/ set_current_state(TASK_INTERRUPTIBLE); @@ -963,15 +1274,19 @@ static int ckrm_cpu_monitord(void *nothing) return 0; } -void ckrm_start_monitor(void) +void ckrm_cpu_start_monitor(void) { + if 
(cpu_monitor_pid != -1) { + /* already started ... */ + return; + } cpu_monitor_pid = kernel_thread(ckrm_cpu_monitord, 0, CLONE_KERNEL); if (cpu_monitor_pid < 0) { printk("ckrm_cpu_monitord for failed\n"); } } -void ckrm_kill_monitor(void) +void ckrm_cpu_kill_monitor(void) { printk("killing process %d\n", cpu_monitor_pid); if (cpu_monitor_pid > 0) { @@ -983,22 +1298,12 @@ void ckrm_kill_monitor(void) } } -int ckrm_cpu_monitor_init(void) +static int __init ckrm_cpu_init_monitor(void) { - ckrm_start_monitor(); - /*hzheng: uncomment the following like for hard limit support */ - // ckrm_start_ckrm_idle(); + if (ckrm_cpu_enabled()) + ckrm_cpu_start_monitor(); return 0; } -void ckrm_cpu_monitor_exit(void) -{ - ckrm_kill_monitor(); -} - -module_init(ckrm_cpu_monitor_init); -module_exit(ckrm_cpu_monitor_exit); +__initcall(ckrm_cpu_init_monitor); -MODULE_AUTHOR("Haoqiang Zheng "); -MODULE_DESCRIPTION("Hierarchical CKRM CPU Resource Monitor"); -MODULE_LICENSE("GPL"); diff --git a/kernel/ckrm/ckrm_listenaq.c b/kernel/ckrm/ckrm_listenaq.c index 0fe858633..103e3f957 100644 --- a/kernel/ckrm/ckrm_listenaq.c +++ b/kernel/ckrm/ckrm_listenaq.c @@ -1,4 +1,4 @@ -/* ckrm_socketaq.c - accept queue resource controller +/* ckrm_listenaq.c - accept queue resource controller * * Copyright (C) Vivek Kashyap, IBM Corp. 2004 * @@ -251,7 +251,7 @@ static int laq_set_share_values(void *my_res, struct ckrm_shares *shares) } parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t); - if (!parent) // socket_class does not have a share interface + if (!parent) // socketclass does not have a share interface return -EINVAL; // Ensure that we ignore limit values @@ -380,7 +380,7 @@ static int laq_get_stats(void *my_res, struct seq_file *sfile) } parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t); - if (!parent) { // socket_class does not have a stat interface + if (!parent) { // socketclass does not have a stat interface printk(KERN_ERR "socketaq internal fs inconsistency\n"); return -EINVAL; } @@ -451,7 +451,7 @@ static void laq_change_resclass(void *n, void *old, void *r) } struct ckrm_res_ctlr laq_rcbs = { - .res_name = "laq", + .res_name = "listenaq", .resid = -1, // dynamically assigned .res_alloc = laq_res_alloc, .res_free = laq_res_free, @@ -467,9 +467,9 @@ int __init init_ckrm_laq_res(void) struct ckrm_classtype *clstype; int resid; - clstype = ckrm_find_classtype_by_name("socket_class"); + clstype = ckrm_find_classtype_by_name("socketclass"); if (clstype == NULL) { - printk(KERN_INFO " Unknown ckrm classtype"); + printk(KERN_INFO " Unknown ckrm classtype"); return -ENOENT; } diff --git a/kernel/ckrm/rbce/rbcemod.c b/kernel/ckrm/rbce/rbcemod.c index 40d6407a3..37a2ae9f8 100644 --- a/kernel/ckrm/rbce/rbcemod.c +++ b/kernel/ckrm/rbce/rbcemod.c @@ -409,7 +409,7 @@ static struct rbce_class *create_rbce_class(const char *classname, return cls; } -static struct rbce_class *get_class(char *classname, int *classtype) +static struct rbce_class *get_class(const char *classname, int *classtype) { struct rbce_class *cls; void *classobj; diff --git a/kernel/ckrm_classqueue.c b/kernel/ckrm_classqueue.c index 0400844a3..fd7f8a2b4 100644 --- a/kernel/ckrm_classqueue.c +++ b/kernel/ckrm_classqueue.c @@ -27,14 +27,19 @@ #include #define cq_nr_member(cq) (cq->array.nr_active) +#define CLASSQUEUE_MASK (CLASSQUEUE_SIZE - 1) /** - * get_index - translate the logical priority to the real index in the queue + * get_node_index - + * translate the logical priority to the real index in the queue * * validate the 
position * a valid prio is [cq->base,cq->base + size -1] + * check whether node is supposed to be enqeued beyond above window and + * if so set the need_repos flag */ -static inline unsigned long get_index(struct classqueue_struct *cq, int *prio) +static inline unsigned long get_node_index(struct classqueue_struct *cq, + cq_node_t * node) { unsigned long index; int max_prio; @@ -43,22 +48,24 @@ static inline unsigned long get_index(struct classqueue_struct *cq, int *prio) return 0; max_prio = cq->base + (CLASSQUEUE_SIZE - 1); - if (*prio > max_prio) - *prio = max_prio; - if (*prio < cq->base) - *prio = cq->base; + if (unlikely(node->prio > max_prio)) { + node->real_prio = node->prio; + node->prio = max_prio; + node->need_repos = 1; + } else + node->need_repos = 0; - index = (cq->base_offset + (*prio - cq->base)) ; - if (index >= CLASSQUEUE_SIZE) - index -= CLASSQUEUE_SIZE; + if (unlikely(node->prio < cq->base)) + node->prio = cq->base; - return index; + index = (cq->base_offset + (node->prio - cq->base)) ; + return ( index & CLASSQUEUE_MASK ); // ensure its in limits } /** * initialize a class queue object */ -int classqueue_init(struct classqueue_struct *cq) +int classqueue_init(struct classqueue_struct *cq, int enabled) { int i; struct cq_prio_array *array; @@ -73,7 +80,8 @@ int classqueue_init(struct classqueue_struct *cq) array->nr_active = 0; cq->base = 0; - cq->base_offset = -1; //not valid yet + cq->base_offset = 0; + cq->enabled = enabled; return 0; } @@ -87,8 +95,8 @@ void classqueue_enqueue(struct classqueue_struct *cq, int index; //get real index - if (cq_nr_member(cq)) { - index = get_index(cq, &prio); + if (cq_nr_member(cq)) { + index = get_node_index(cq, node); } else { //the first one cq->base = prio; cq->base_offset = 0; @@ -123,8 +131,8 @@ void classqueue_update_prio(struct classqueue_struct *cq, if (! cls_in_classqueue(node)) return; - index = get_index(cq, &new_pos); node->prio = new_pos; + index = get_node_index(cq, node); //remove from the original position list_del_init(&(node->list)); @@ -137,10 +145,32 @@ void classqueue_update_prio(struct classqueue_struct *cq, node->index = index; } + +static inline void __classqueue_update_base(struct classqueue_struct *cq, + int new_base) +{ + int max_prio; + if (unlikely(new_base <= cq->base)) // base will never move back + return; + if (unlikely(!cq_nr_member(cq))) { + cq->base_offset = 0; + cq->base = new_base; // is this necessary ?? 
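+		/*
+		 * note: with no members queued no node index depends on the
+		 * old base, and classqueue_enqueue() re-anchors base and
+		 * base_offset on the next insertion anyway, so resetting
+		 * them directly here looks safe
+		 */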
+ return; + } + + max_prio = cq->base + (CLASSQUEUE_SIZE - 1); + if (unlikely(new_base > max_prio)) + new_base = max_prio; + + cq->base_offset = (cq->base_offset + (new_base - cq->base)) & CLASSQUEUE_MASK; + cq->base = new_base; +} + /** *classqueue_get_min_prio: return the priority of the last node in queue * * this function can be called without runqueue lock held + * return 0 if there's nothing in the queue */ static inline int classqueue_get_min_prio(struct classqueue_struct *cq) { @@ -171,9 +201,13 @@ static inline int classqueue_get_min_prio(struct classqueue_struct *cq) */ cq_node_t *classqueue_get_head(struct classqueue_struct *cq) { - cq_node_t *result = NULL; + cq_node_t *node; int pos; + int index; + int new_base; +search_again: + node = NULL; /* * search over the bitmap to get the first class in the queue */ @@ -183,10 +217,38 @@ cq_node_t *classqueue_get_head(struct classqueue_struct *cq) pos = find_first_bit(cq->array.bitmap, CLASSQUEUE_SIZE); if (pos < CLASSQUEUE_SIZE) { - BUG_ON(list_empty(&cq->array.queue[pos])); - result = list_entry(cq->array.queue[pos].next, cq_node_t, list); + //BUG_ON(list_empty(&cq->array.queue[pos])); + node = list_entry(cq->array.queue[pos].next, cq_node_t, list); } - return result; + + //check if the node need to be repositioned + if (likely(! node || ! node->need_repos)) + return node; + + // We need to reposition this node in the class queue + // BUG_ON(node->prio == node->real_prio); + + //remove from the original position + list_del_init(&(node->list)); + if (list_empty(&cq->array.queue[node->index])) + __clear_bit(node->index, cq->array.bitmap); + + new_base = classqueue_get_min_prio(cq); + node->prio = node->real_prio; + + if (! new_base) + new_base = node->real_prio; + else if (node->real_prio < new_base) + new_base = node->real_prio; + __classqueue_update_base(cq,new_base); + + index = get_node_index(cq, node); + //add to new positon, round robin for classes with same priority + list_add_tail(&(node->list), &cq->array.queue[index]); + __set_bit(index, cq->array.bitmap); + node->index = index; + + goto search_again; } /** @@ -198,14 +260,11 @@ void classqueue_update_base(struct classqueue_struct *cq) int new_base; if (! 
cq_nr_member(cq)) { - cq->base_offset = -1; //not defined + cq->base = 0; + cq->base_offset = 0; return; } new_base = classqueue_get_min_prio(cq); - - if (new_base > cq->base) { - cq->base_offset = get_index(cq, &new_base); - cq->base = new_base; - } + __classqueue_update_base(cq,new_base); } diff --git a/kernel/ckrm_sched.c b/kernel/ckrm_sched.c index 1ca2611dc..f16e990c1 100644 --- a/kernel/ckrm_sched.c +++ b/kernel/ckrm_sched.c @@ -20,6 +20,28 @@ LIST_HEAD(active_cpu_classes); // list of active cpu classes; anchor struct ckrm_cpu_class default_cpu_class_obj; +unsigned int ckrm_sched_mode __cacheline_aligned_in_smp = +#ifdef CONFIG_CKRM_CPU_SCHEDULE_AT_BOOT + CKRM_SCHED_MODE_ENABLED; +#else + CKRM_SCHED_MODE_DISABLED; +#endif + +static int __init ckrm_cpu_enabled_setup(char *str) +{ + ckrm_sched_mode = CKRM_SCHED_MODE_ENABLED; + return 1; +} + +static int __init ckrm_cpu_disabled_setup(char *str) +{ + ckrm_sched_mode = CKRM_SCHED_MODE_DISABLED; + return 1; +} + +__setup("ckrmcpu", ckrm_cpu_enabled_setup); +__setup("nockrmcpu",ckrm_cpu_disabled_setup); + struct ckrm_cpu_class * get_default_cpu_class(void) { return (&default_cpu_class_obj); } @@ -28,7 +50,10 @@ struct ckrm_cpu_class * get_default_cpu_class(void) { /* CVT Management */ /*******************************************************/ -static inline void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt) +//an absolute bonus of 200ms for classes when reactivated +#define INTERACTIVE_BONUS(lrq) ((200*NSEC_PER_MS)/local_class_weight(lrq)) + +static void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt) { CVT_t min_cvt; CVT_t bonus; @@ -44,47 +69,40 @@ static inline void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt) */ bonus = INTERACTIVE_BONUS(lrq); //cvt can't be negative - if (cur_cvt > bonus) + if (likely(cur_cvt > bonus)) min_cvt = cur_cvt - bonus; else min_cvt = 0; - - if (lrq->local_cvt < min_cvt) { + + if (lrq->local_cvt < min_cvt) { + // if (lrq->local_cvt < min_cvt && ! 
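/*
 * Editor's illustration (not part of the patch): a userspace model of what
 * the two __setup() hooks above provide -- scanning the kernel command line
 * for "ckrmcpu" / "nockrmcpu" and setting the scheduler mode before any
 * class exists. The enum values mirror the patch; the parsing loop is a
 * simplification of the kernel's boot-parameter handling, not its API.
 */
#include <stdio.h>
#include <string.h>

enum ckrm_sched_mode {
        CKRM_SCHED_MODE_DISABLED,
        CKRM_SCHED_MODE_ENABLED,
};

static enum ckrm_sched_mode sched_mode = CKRM_SCHED_MODE_DISABLED;

static void parse_cmdline(const char *cmdline)
{
        char buf[256];
        char *tok;

        snprintf(buf, sizeof(buf), "%s", cmdline);
        /* later options win, mirroring left-to-right parsing */
        for (tok = strtok(buf, " "); tok; tok = strtok(NULL, " ")) {
                if (strcmp(tok, "nockrmcpu") == 0)
                        sched_mode = CKRM_SCHED_MODE_DISABLED;
                else if (strcmp(tok, "ckrmcpu") == 0)
                        sched_mode = CKRM_SCHED_MODE_ENABLED;
        }
}

int main(void)
{
        parse_cmdline("root=/dev/sda1 ckrmcpu quiet");
        printf("mode=%d\n", sched_mode);   /* 1: CKRM CPU scheduling enabled */
        return 0;
}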
lrq_nr_running(lrq)) { CVT_t lost_cvt; - lost_cvt = scale_cvt(min_cvt - lrq->local_cvt,lrq); + if (unlikely(lrq->local_cvt == 0)) { + lrq->local_cvt = cur_cvt; + return; + } + lost_cvt = min_cvt - lrq->local_cvt; + lost_cvt *= local_class_weight(lrq); lrq->local_cvt = min_cvt; + BUG_ON(lost_cvt < 0); /* add what the class lost to its savings*/ - lrq->savings += lost_cvt; +#if 1 /*zhq debugging*/ + lrq->savings += lost_cvt; +#endif if (lrq->savings > MAX_SAVINGS) lrq->savings = MAX_SAVINGS; - } else if (lrq->savings) { - /* - *if a class saving and falling behind - * then start to use it saving in a leaking bucket way - */ - CVT_t savings_used; - - savings_used = scale_cvt((lrq->local_cvt - min_cvt),lrq); - if (savings_used > lrq->savings) - savings_used = lrq->savings; - - if (savings_used > SAVINGS_LEAK_SPEED) - savings_used = SAVINGS_LEAK_SPEED; - - BUG_ON(lrq->savings < savings_used); - lrq->savings -= savings_used; - unscale_cvt(savings_used,lrq); - BUG_ON(lrq->local_cvt < savings_used); - lrq->local_cvt -= savings_used; - } +#if 0 /* zhq debugging*/ + printk("lrq= %x savings: %llu lost= %llu\n",(int)lrq,lrq->savings,lost_cvt); +#endif + } } /* * return the max_cvt of all the classes */ -static inline CVT_t get_max_cvt(int this_cpu) +CVT_t get_max_cvt(int this_cpu) { struct ckrm_cpu_class *clsptr; ckrm_lrq_t * lrq; @@ -92,7 +110,6 @@ static inline CVT_t get_max_cvt(int this_cpu) max_cvt = 0; - /*update class time, at the same time get max_cvt */ list_for_each_entry(clsptr, &active_cpu_classes, links) { lrq = get_ckrm_lrq(clsptr, this_cpu); if (lrq->local_cvt > max_cvt) @@ -102,6 +119,23 @@ static inline CVT_t get_max_cvt(int this_cpu) return max_cvt; } +CVT_t get_min_cvt(int this_cpu) +{ + struct ckrm_cpu_class *clsptr; + ckrm_lrq_t * lrq; + CVT_t max_cvt; + + max_cvt = 0xFFFFFFFFFFFFFLLU; + + list_for_each_entry(clsptr, &active_cpu_classes, links) { + lrq = get_ckrm_lrq(clsptr, this_cpu); + if (lrq->local_cvt < max_cvt) + max_cvt = lrq->local_cvt; + } + + return max_cvt; +} + /** * update_class_cputime - updates cvt of inactive classes * -- an inactive class shouldn't starve others when it comes back @@ -110,7 +144,7 @@ static inline CVT_t get_max_cvt(int this_cpu) * * class_list_lock must have been acquired */ -void update_class_cputime(int this_cpu) +void update_class_cputime(int this_cpu, int idle) { struct ckrm_cpu_class *clsptr; ckrm_lrq_t * lrq; @@ -168,24 +202,45 @@ void update_class_cputime(int this_cpu) /*******************************************************/ /* PID load balancing stuff */ /*******************************************************/ -#define PID_SAMPLE_T 32 #define PID_KP 20 #define PID_KI 60 #define PID_KD 20 +/* + * runqueue load is the local_weight of all the classes on this cpu + * must be called with class_list_lock held + */ +static unsigned long ckrm_cpu_load(int cpu) +{ + struct ckrm_cpu_class *clsptr; + ckrm_lrq_t* lrq; + struct ckrm_cpu_demand_stat* l_stat; + int total_load = 0; + int load; + + list_for_each_entry(clsptr,&active_cpu_classes,links) { + lrq = get_ckrm_lrq(clsptr,cpu); + l_stat = get_cls_local_stat(clsptr,cpu); + + load = WEIGHT_TO_SHARE(lrq->local_weight); + + if (l_stat->cpu_demand < load) + load = l_stat->cpu_demand; + total_load += load; + } + return total_load; +} + + /** * sample pid load periodically */ + void ckrm_load_sample(ckrm_load_t* pid,int cpu) { long load; long err; - if (jiffies % PID_SAMPLE_T) - return; - - adjust_local_weight(); - load = ckrm_cpu_load(cpu); err = load - pid->load_p; pid->load_d = err; @@ -195,7 +250,7 
@@ void ckrm_load_sample(ckrm_load_t* pid,int cpu) pid->load_i /= 10; } -long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group) +long ckrm_get_pressure(ckrm_load_t* ckrm_load, int local_group) { long pressure; pressure = ckrm_load->load_p * PID_KP; @@ -204,3 +259,58 @@ long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group) pressure /= 100; return pressure; } + +/* + * called after a task is switched out. Update the local cvt accounting + * we need to stick with long instead of long long due to nonexistent + * 64-bit division + */ +void update_local_cvt(struct task_struct *p, unsigned long nsec) +{ + ckrm_lrq_t * lrq = get_task_lrq(p); + unsigned long cvt_inc; + + /* + * consume from savings if eshare is larger than egrt + */ + if (lrq->savings && lrq->over_weight) { + unsigned long savings_used; + + savings_used = nsec; + savings_used >>= CKRM_WEIGHT_SHIFT; + savings_used *= lrq->over_weight; + if (savings_used > lrq->savings) + savings_used = lrq->savings; + lrq->savings -= savings_used; + } + + //BUG_ON(local_class_weight(lrq) == 0); + cvt_inc = nsec / local_class_weight(lrq); + + /* + * For a certain processor, CKRM allocates CPU time propotional + * to the class's local_weight. So once a class consumed nsec, + * it will wait for X (nsec) for its next turn. + * + * X is calculated based on the following fomular + * nsec / local_weight < X / (CKRM_MAX_WEIGHT - local_weight) + * if local_weight is small, then approximated as + * nsec / local_weight < X / (CKRM_MAX_WEIGHT) + */ +#define CVT_STARVATION_LIMIT (200LL*NSEC_PER_MS) +#define CVT_STARVATION_INC_LIMIT (CVT_STARVATION_LIMIT >> CKRM_WEIGHT_SHIFT) + + if (unlikely(lrq->skewed_weight)) { + unsigned long long starvation_limit = CVT_STARVATION_INC_LIMIT; + + starvation_limit *= local_class_weight(lrq); + if (unlikely(cvt_inc > starvation_limit)) + cvt_inc = nsec / lrq->skewed_weight; + } + + /* now update the CVT accounting */ + + lrq->local_cvt += cvt_inc; + lrq->uncounted_ns += nsec; + update_class_priority(lrq); +} diff --git a/kernel/sched.c b/kernel/sched.c index 743e5bc8a..d65addbff 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -42,6 +42,8 @@ #include #include +#include +#include #ifdef CONFIG_NUMA #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) @@ -198,8 +200,6 @@ unsigned int task_timeslice(task_t *p) */ typedef struct runqueue runqueue_t; -#include -#include /* * This is the main, per-CPU runqueue data structure. 
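/*
 * Editor's illustration (not part of the patch): the core of the CVT
 * accounting in update_local_cvt() above -- after running for `nsec`, a
 * class advances its local virtual time by nsec / weight, so a class with
 * twice the weight advances half as fast and is therefore selected roughly
 * twice as often. Standalone sketch; the weight values are made up and the
 * savings/starvation refinements are omitted.
 */
#include <stdio.h>

struct lrq_model {
        unsigned long long local_cvt;
        unsigned long weight;        /* stands in for local_class_weight(lrq) */
};

static void model_update_local_cvt(struct lrq_model *lrq, unsigned long nsec)
{
        lrq->local_cvt += nsec / lrq->weight;
}

int main(void)
{
        struct lrq_model heavy = { 0, 8 }, light = { 0, 2 };
        int i;

        /* both classes run the same wall-clock time ... */
        for (i = 0; i < 4; i++) {
                model_update_local_cvt(&heavy, 1000000);
                model_update_local_cvt(&light, 1000000);
        }
        /* ... but the heavy class has accumulated 4x less virtual time */
        printf("heavy=%llu light=%llu\n", heavy.local_cvt, light.local_cvt);
        return 0;
}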
@@ -220,17 +220,21 @@ struct runqueue { unsigned long cpu_load; #endif unsigned long long nr_switches; - unsigned long expired_timestamp, nr_uninterruptible; + unsigned long nr_uninterruptible; +#ifndef CONFIG_CKRM_CPU_SCHEDULE + unsigned long expired_timestamp; + int best_expired_prio; +#endif unsigned long long timestamp_last_tick; task_t *curr, *idle; struct mm_struct *prev_mm; #ifdef CONFIG_CKRM_CPU_SCHEDULE struct classqueue_struct classqueue; ckrm_load_t ckrm_load; + ckrm_lrq_t dflt_lrq; /* local runqueue of the default class */ #else prio_array_t *active, *expired, arrays[2]; #endif - int best_expired_prio; atomic_t nr_iowait; #ifdef CONFIG_SMP @@ -308,10 +312,72 @@ static inline void rq_unlock(runqueue_t *rq) spin_unlock_irq(&rq->lock); } +static inline void idle_balance(int this_cpu, runqueue_t *this_rq); +static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq); + #ifdef CONFIG_CKRM_CPU_SCHEDULE + +#define ckrm_rq_cpu_disabled(rq) (!rq->classqueue.enabled) +#define ckrm_rq_cpu_enabled(rq) ( rq->classqueue.enabled) + +static inline void class_enqueue_task(struct task_struct *p, + prio_array_t * array) +{ + ckrm_lrq_t *lrq; + int effective_prio; + + if (ckrm_rq_cpu_disabled(task_rq(p))) + return; + + lrq = get_task_lrq(p); + // BUG_ON(lrq==NULL); + + cpu_demand_event(&p->demand_stat,CPU_DEMAND_ENQUEUE,0); + lrq->lrq_load += task_load(p); + + if ((p->prio < lrq->top_priority) && (array == lrq->active)) + set_top_priority(lrq, p->prio); + + if (! cls_in_classqueue(&lrq->classqueue_linkobj)) { + cpu_demand_event(get_task_lrq_stat(p),CPU_DEMAND_ENQUEUE,0); + effective_prio = get_effective_prio(lrq); + classqueue_enqueue(lrq->classqueue, &lrq->classqueue_linkobj, + effective_prio); + } + +} + +static inline void class_dequeue_task(struct task_struct *p, + prio_array_t * array) +{ + ckrm_lrq_t *lrq; + unsigned long load; + + if (ckrm_rq_cpu_disabled(task_rq(p))) + return; + + lrq = get_task_lrq(p); + load = task_load(p); + + // BUG_ON(lrq->lrq_load < load); + + lrq->lrq_load -= load; + + cpu_demand_event(&p->demand_stat,CPU_DEMAND_DEQUEUE,0); + + if ((array == lrq->active) && (p->prio == lrq->top_priority) + && list_empty(&(array->queue[p->prio]))) + set_top_priority(lrq,find_next_bit(array->bitmap, MAX_PRIO, + p->prio)); +} + static inline ckrm_lrq_t *rq_get_next_class(struct runqueue *rq) { - cq_node_t *node = classqueue_get_head(&rq->classqueue); + cq_node_t *node; + + if (ckrm_rq_cpu_disabled(rq)) + return &rq->dflt_lrq; + node = classqueue_get_head(&rq->classqueue); return ((node) ? class_list_entry(node) : NULL); } @@ -330,51 +396,189 @@ CVT_t get_local_cur_cvt(int cpu) return 0; } -static inline struct task_struct * rq_get_next_task(struct runqueue* rq) +static inline struct task_struct * rq_get_next_task(struct runqueue* rq, + int cpu) { prio_array_t *array; struct task_struct *next; ckrm_lrq_t *queue; int idx; - int cpu = smp_processor_id(); - // it is guaranteed be the ( rq->nr_running > 0 ) check in - // schedule that a task will be found. + if (ckrm_rq_cpu_disabled(rq)) { + /* original code from schedule(void) + * see also code in non CKRM configuration + */ + struct list_head *array_queue; + ckrm_lrq_t *lrq = get_ckrm_lrq(get_default_cpu_class(),cpu); + + if (unlikely(!rq->nr_running)) { + idle_balance(cpu, rq); + if (!rq->nr_running) { + rq->dflt_lrq.expired_timestamp = 0; + wake_sleeping_dependent(cpu, rq); + return NULL; + } + } + + array = lrq->active; + if (unlikely(!array->nr_active)) { + /* + * Switch the active and expired arrays. 
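/*
 * Editor's illustration (not part of the patch): how class_dequeue_task()
 * above keeps lrq->top_priority valid -- when the last task at the current
 * top priority leaves, the next set bit in the priority bitmap becomes the
 * new top. Standalone sketch with a trivial per-priority counter in place of
 * the real prio_array; MAX_PRIO and the bit search are simplified.
 */
#include <stdio.h>

#define MODEL_MAX_PRIO 16

struct lrq_model {
        unsigned int nr_at_prio[MODEL_MAX_PRIO]; /* stands in for the prio_array */
        int top_priority;
};

static int model_find_next_prio(struct lrq_model *lrq, int from)
{
        int p;

        for (p = from; p < MODEL_MAX_PRIO; p++)
                if (lrq->nr_at_prio[p])
                        return p;
        return MODEL_MAX_PRIO;                   /* empty: parked at MAX_PRIO */
}

static void model_enqueue(struct lrq_model *lrq, int prio)
{
        lrq->nr_at_prio[prio]++;
        if (prio < lrq->top_priority)
                lrq->top_priority = prio;
}

static void model_dequeue(struct lrq_model *lrq, int prio)
{
        lrq->nr_at_prio[prio]--;
        if (prio == lrq->top_priority && !lrq->nr_at_prio[prio])
                lrq->top_priority = model_find_next_prio(lrq, prio);
}

int main(void)
{
        struct lrq_model lrq = { .top_priority = MODEL_MAX_PRIO };

        model_enqueue(&lrq, 5);
        model_enqueue(&lrq, 9);
        model_dequeue(&lrq, 5);
        printf("top=%d\n", lrq.top_priority);    /* 9 */
        return 0;
}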
+ */ + lrq->active = lrq->expired; + lrq->expired = array; + array = lrq->active; + lrq->expired_timestamp = 0; + lrq->best_expired_prio = MAX_PRIO; + } + + idx = sched_find_first_bit(array->bitmap); + array_queue = array->queue + idx; + next = list_entry(array_queue->next, task_t, run_list); + return next; + } + /*-- CKRM SCHEDULER --*/ + retry_next_class: + /* we can't use (rq->nr_running == 0) to declare idleness + * first we have to make sure that the class runqueue is properly + * processed. This is due to two facts/requirements: + * (a) when the last task is removed form an lrq we do not remove + * the lrq from the class runqueue. As a result the lrq is + * selected again and we can perform necessary + * expired switches. + * (b) perform outstanding expired switches + * + */ + queue = rq_get_next_class(rq); - // BUG_ON( !queue ); + if (unlikely(queue == NULL)) { + idle_balance(cpu, rq); + if (!rq->nr_running) { + rq->dflt_lrq.expired_timestamp = 0; + wake_sleeping_dependent(cpu, rq); + return NULL; + } + goto retry_next_class; // try again + } array = queue->active; if (unlikely(!array->nr_active)) { queue->active = queue->expired; queue->expired = array; + array = queue->active; queue->expired_timestamp = 0; - if (queue->active->nr_active) + if (array->nr_active) set_top_priority(queue, - find_first_bit(queue->active->bitmap, MAX_PRIO)); + find_first_bit(array->bitmap,MAX_PRIO)); else { + /* since we do not dequeue a lrq when it becomes empty + * but rely on the switching mechanism, we must dequeue + * at this point + */ classqueue_dequeue(queue->classqueue, &queue->classqueue_linkobj); - cpu_demand_event(get_rq_local_stat(queue,cpu),CPU_DEMAND_DEQUEUE,0); + cpu_demand_event(get_rq_local_stat(queue,cpu), + CPU_DEMAND_DEQUEUE,0); } goto retry_next_class; } - // BUG_ON(!array->nr_active); idx = queue->top_priority; - // BUG_ON (idx == MAX_PRIO); + //BUG_ON(!array->nr_active); + //BUG_ON(idx == MAX_PRIO); + //BUG_ON(list_empty(array->queue+idx)); next = task_list_entry(array->queue[idx].next); return next; } + +static inline void ckrm_account_task(struct runqueue* rq, + struct task_struct *prev, + unsigned long long now) +{ + if ((prev != rq->idle) && ckrm_rq_cpu_enabled(rq) ) { + unsigned long long run = now - prev->timestamp; + ckrm_lrq_t * lrq = get_task_lrq(prev); + + lrq->lrq_load -= task_load(prev); + cpu_demand_event(&prev->demand_stat,CPU_DEMAND_DESCHEDULE,run); + lrq->lrq_load += task_load(prev); + + cpu_demand_event(get_task_lrq_stat(prev),CPU_DEMAND_DESCHEDULE,run); + update_local_cvt(prev, run); + } + +} + +#ifdef CONFIG_SMP +#define COND_SMP(dflt,cond) (cond) +#else +#define COND_SMP(dflt,cond) (dflt) +#endif + +static inline void ckrm_sched_tick(unsigned long j,int this_cpu, int idle, + runqueue_t *rq) +{ + /* first determine whether we have to do anything + * without grabing the global lock + */ + + int sample, update; + +#ifdef __SIMULATOR__ + if ((this_cpu == 0) && (j % 1000) == 0) { + ckrm_cpu_monitor(1); + } +#endif + + if (ckrm_rq_cpu_disabled(rq)) + return; + + update = (j % CVT_UPDATE_TICK); + sample = COND_SMP(1,(j % CPU_PID_CTRL_TICK)); + +// avoid taking the global class_list lock on every tick + if (likely(update && sample)) + return; // nothing to be done; + + read_lock(&class_list_lock); + +#ifdef CONFIG_SMP + if (sample==0) { + ckrm_load_sample(rq_ckrm_load(rq),this_cpu); + } +#endif + + if (update==0) { + classqueue_update_base(get_cpu_classqueue(this_cpu)); + update_class_cputime(this_cpu,idle); + // occasionally we need to call the weight adjustment + // 
for SMP systems + if (COND_SMP(0,(this_cpu==0))) + adjust_local_weight(); + } + + read_unlock(&class_list_lock); +} + #else /*! CONFIG_CKRM_CPU_SCHEDULE*/ -static inline struct task_struct * rq_get_next_task(struct runqueue* rq) +static inline struct task_struct * rq_get_next_task(struct runqueue* rq, + int cpu) { prio_array_t *array; struct list_head *queue; int idx; + if (unlikely(!rq->nr_running)) { + idle_balance(cpu, rq); + if (!rq->nr_running) { + rq->expired_timestamp = 0; + wake_sleeping_dependent(cpu, rq); + return NULL; + } + } array = rq->active; if (unlikely(!array->nr_active)) { /* @@ -392,11 +596,17 @@ static inline struct task_struct * rq_get_next_task(struct runqueue* rq) return list_entry(queue->next, task_t, run_list); } -static inline void class_enqueue_task(struct task_struct* p, prio_array_t *array) { } -static inline void class_dequeue_task(struct task_struct* p, prio_array_t *array) { } +static inline void class_enqueue_task(struct task_struct* p, + prio_array_t *array) { } +static inline void class_dequeue_task(struct task_struct* p, + prio_array_t *array) { } static inline void init_cpu_classes(void) { } +static inline void ckrm_sched_tick(int j,int this_cpu,int idle, void* arg) {} +static inline void ckrm_account_task(struct runqueue* rq, struct + task_struct *prev, + unsigned long long now) { } #define rq_ckrm_load(rq) NULL -static inline void ckrm_sched_tick(int j,int this_cpu,void* name) {} + #endif /* CONFIG_CKRM_CPU_SCHEDULE */ /* @@ -1533,261 +1743,129 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, return 1; } -#ifdef CONFIG_CKRM_CPU_SCHEDULE -static inline int ckrm_preferred_task(task_t *tmp,long min, long max, - int phase, enum idle_type idle) -{ - long pressure = task_load(tmp); - - if (pressure > max) - return 0; - - if ((idle == NOT_IDLE) && ! phase && (pressure <= min)) - return 0; - return 1; -} - /* - * move tasks for a specic local class - * return number of tasks pulled + * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, + * as part of a balancing operation within "domain". Returns the number of + * tasks moved. + * + * Called with both runqueues locked. */ -static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq, - runqueue_t *this_rq, - runqueue_t *busiest, - struct sched_domain *sd, - int this_cpu, - enum idle_type idle, - long* pressure_imbalance) +static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, + unsigned long max_nr_move, struct sched_domain *sd, + enum idle_type idle) { prio_array_t *array, *dst_array; struct list_head *head, *curr; + int idx, pulled = 0; task_t *tmp; - int idx; - int pulled = 0; - int phase = -1; - long pressure_min, pressure_max; - /*hzheng: magic : 90% balance is enough*/ - long balance_min = *pressure_imbalance / 10; -/* - * we don't want to migrate tasks that will reverse the balance - * or the tasks that make too small difference - */ -#define CKRM_BALANCE_MAX_RATIO 100 -#define CKRM_BALANCE_MIN_RATIO 1 - start: - phase ++; +#if CONFIG_CKRM_CPU_SCHEDULE + /* need to distinguish between the runqueues and the class + * local runqueues. + * we know we can get here only if the dflt class is present + */ + ckrm_lrq_t *l_this_rq = &this_rq->dflt_lrq; + ckrm_lrq_t *l_busiest = &busiest->dflt_lrq; +#else +#define l_busiest busiest +#define l_this_rq this_rq +#endif + + if (max_nr_move <= 0 || busiest->nr_running <= 1) + goto out; + /* * We first consider expired tasks. 
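/*
 * Editor's illustration (not part of the patch): the tick gating used by
 * ckrm_sched_tick() above -- the global class_list lock is only taken when
 * one of the periodic jobs is actually due, i.e. when jiffies is a multiple
 * of the respective period. Standalone sketch; the period values are
 * assumptions, not the patch's real CVT_UPDATE_TICK / CPU_PID_CTRL_TICK.
 */
#include <stdio.h>

#define MODEL_CVT_UPDATE_TICK   8   /* assumed period, in ticks */
#define MODEL_PID_CTRL_TICK     32  /* assumed period, in ticks */

static void model_sched_tick(unsigned long j)
{
        int update = (int)(j % MODEL_CVT_UPDATE_TICK);
        int sample = (int)(j % MODEL_PID_CTRL_TICK);

        if (update && sample)
                return;             /* common case: nothing due, no lock taken */

        /* read_lock(&class_list_lock) would be taken here */
        if (sample == 0)
                printf("tick %lu: sample per-cpu load for the PID controller\n", j);
        if (update == 0)
                printf("tick %lu: update classqueue base and inactive-class cvt\n", j);
        /* read_unlock(&class_list_lock) */
}

int main(void)
{
        unsigned long j;

        for (j = 1; j <= 64; j++)
                model_sched_tick(j);
        return 0;
}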
Those will likely not be * executed in the near future, and they are most likely to * be cache-cold, thus switching CPUs has the least effect * on them. */ - if (src_lrq->expired->nr_active) { - array = src_lrq->expired; - dst_array = dst_lrq->expired; + if (l_busiest->expired->nr_active) { + array = l_busiest->expired; + dst_array = l_this_rq->expired; } else { - array = src_lrq->active; - dst_array = dst_lrq->active; + array = l_busiest->active; + dst_array = l_this_rq->active; } - - new_array: + +new_array: /* Start searching at priority 0: */ idx = 0; - skip_bitmap: +skip_bitmap: if (!idx) idx = sched_find_first_bit(array->bitmap); else idx = find_next_bit(array->bitmap, MAX_PRIO, idx); if (idx >= MAX_PRIO) { - if (array == src_lrq->expired && src_lrq->active->nr_active) { - array = src_lrq->active; - dst_array = dst_lrq->active; + if (array == l_busiest->expired && l_busiest->active->nr_active) { + array = l_busiest->active; + dst_array = l_this_rq->active; goto new_array; } - if ((! phase) && (! pulled) && (idle != IDLE)) - goto start; //try again - else - goto out; //finished search for this lrq + goto out; } - + head = array->queue + idx; curr = head->prev; - skip_queue: +skip_queue: tmp = list_entry(curr, task_t, run_list); - + curr = curr->prev; - + if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { if (curr != head) goto skip_queue; idx++; goto skip_bitmap; } + pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); + pulled++; - pressure_min = *pressure_imbalance * CKRM_BALANCE_MIN_RATIO/100; - pressure_max = *pressure_imbalance * CKRM_BALANCE_MAX_RATIO/100; - /* - * skip the tasks that will reverse the balance too much - */ - if (ckrm_preferred_task(tmp,pressure_min,pressure_max,phase,idle)) { - *pressure_imbalance -= task_load(tmp); - pull_task(busiest, array, tmp, - this_rq, dst_array, this_cpu); - pulled++; - - if (*pressure_imbalance <= balance_min) - goto out; + /* We only want to steal up to the prescribed number of tasks. */ + if (pulled < max_nr_move) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; } - - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; - out: +out: return pulled; } -static inline long ckrm_rq_imbalance(runqueue_t *this_rq,runqueue_t *dst_rq) -{ - long imbalance; - /* - * make sure after balance, imbalance' > - imbalance/2 - * we don't want the imbalance be reversed too much - */ - imbalance = pid_get_pressure(rq_ckrm_load(dst_rq),0) - - pid_get_pressure(rq_ckrm_load(this_rq),1); - imbalance /= 2; - return imbalance; -} - /* - * try to balance the two runqueues - * - * Called with both runqueues locked. - * if move_tasks is called, it will try to move at least one task over + * find_busiest_group finds and returns the busiest CPU group within the + * domain. It calculates and returns the number of tasks which should be + * moved to restore balance via the imbalance parameter. 
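/*
 * Editor's illustration (not part of the patch): the scan order used by
 * move_tasks() above -- pull from the expired array first (those tasks will
 * not run soon and are likely cache-cold), and inside an array walk the
 * priorities from 0 upwards. Standalone sketch; the arrays are reduced to
 * per-priority counters and the bitmap search to a plain loop.
 */
#include <stdio.h>

#define MODEL_MAX_PRIO 8

struct array_model {
        int nr_at_prio[MODEL_MAX_PRIO];
        int nr_active;
};

/* take one task from the lowest-numbered (highest) priority, if any */
static int model_pull_one(struct array_model *a)
{
        int idx;

        for (idx = 0; idx < MODEL_MAX_PRIO; idx++) {
                if (a->nr_at_prio[idx]) {
                        a->nr_at_prio[idx]--;
                        a->nr_active--;
                        return idx;
                }
        }
        return -1;
}

int main(void)
{
        struct array_model expired = { { 0, 0, 2, 0, 0, 0, 0, 0 }, 2 };
        struct array_model active  = { { 1, 0, 0, 0, 0, 0, 0, 0 }, 1 };
        struct array_model *src;
        int prio, pulled = 0, max_nr_move = 2;

        while (pulled < max_nr_move) {
                src = expired.nr_active ? &expired : &active;   /* expired first */
                prio = model_pull_one(src);
                if (prio < 0)
                        break;
                printf("pulled a prio-%d task from the %s array\n",
                       prio, src == &expired ? "expired" : "active");
                pulled++;
        }
        return 0;
}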
*/ -static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, - unsigned long max_nr_move, struct sched_domain *sd, - enum idle_type idle) +static struct sched_group * +find_busiest_group(struct sched_domain *sd, int this_cpu, + unsigned long *imbalance, enum idle_type idle) { - struct ckrm_cpu_class *clsptr,*vip_cls = NULL; - ckrm_lrq_t* src_lrq,*dst_lrq; - long pressure_imbalance, pressure_imbalance_old; - int src_cpu = task_cpu(busiest->curr); - struct list_head *list; - int pulled = 0; - long imbalance; - - imbalance = ckrm_rq_imbalance(this_rq,busiest); + struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; + unsigned long max_load, avg_load, total_load, this_load, total_pwr; - if ((idle == NOT_IDLE && imbalance <= 0) || busiest->nr_running <= 1) - goto out; + max_load = this_load = total_load = total_pwr = 0; - //try to find the vip class - list_for_each_entry(clsptr,&active_cpu_classes,links) { - src_lrq = get_ckrm_lrq(clsptr,src_cpu); + do { + cpumask_t tmp; + unsigned long load; + int local_group; + int i, nr_cpus = 0; - if (! lrq_nr_running(src_lrq)) - continue; - - if (! vip_cls || cpu_class_weight(vip_cls) < cpu_class_weight(clsptr) ) - { - vip_cls = clsptr; - } - } - - /* - * do search from the most significant class - * hopefully, less tasks will be migrated this way - */ - clsptr = vip_cls; - - move_class: - if (! clsptr) - goto out; - - - src_lrq = get_ckrm_lrq(clsptr,src_cpu); - if (! lrq_nr_running(src_lrq)) - goto other_class; - - dst_lrq = get_ckrm_lrq(clsptr,this_cpu); - - //how much pressure for this class should be transferred - pressure_imbalance = src_lrq->lrq_load * imbalance/src_lrq->local_weight; - if (pulled && ! pressure_imbalance) - goto other_class; - - pressure_imbalance_old = pressure_imbalance; - - //move tasks - pulled += - ckrm_cls_move_tasks(src_lrq,dst_lrq, - this_rq, - busiest, - sd,this_cpu,idle, - &pressure_imbalance); - - /* - * hzheng: 2 is another magic number - * stop balancing if the imbalance is less than 25% of the orig - */ - if (pressure_imbalance <= (pressure_imbalance_old >> 2)) - goto out; - - //update imbalance - imbalance *= pressure_imbalance / pressure_imbalance_old; - other_class: - //who is next? - list = clsptr->links.next; - if (list == &active_cpu_classes) - list = list->next; - clsptr = list_entry(list, typeof(*clsptr), links); - if (clsptr != vip_cls) - goto move_class; - out: - return pulled; -} - -/** - * ckrm_check_balance - is load balancing necessary? 
- * return 0 if load balancing is not necessary - * otherwise return the average load of the system - * also, update nr_group - * - * heuristics: - * no load balancing if it's load is over average - * no load balancing if it's load is far more than the min - * task: - * read the status of all the runqueues - */ -static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu, - enum idle_type idle, int* nr_group) -{ - struct sched_group *group = sd->groups; - unsigned long min_load, max_load, avg_load; - unsigned long total_load, this_load, total_pwr; - - max_load = this_load = total_load = total_pwr = 0; - min_load = 0xFFFFFFFF; - *nr_group = 0; - - do { - cpumask_t tmp; - unsigned long load; - int local_group; - int i, nr_cpus = 0; + local_group = cpu_isset(this_cpu, group->cpumask); /* Tally up the load of all CPUs in the group */ + avg_load = 0; cpus_and(tmp, group->cpumask, cpu_online_map); if (unlikely(cpus_empty(tmp))) goto nextgroup; - avg_load = 0; - local_group = cpu_isset(this_cpu, group->cpumask); - for_each_cpu_mask(i, tmp) { - load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),local_group); + /* Bias balancing toward cpus of our domain */ + if (local_group) + load = target_load(i); + else + load = source_load(i); + nr_cpus++; avg_load += load; } @@ -1803,386 +1881,86 @@ static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu, if (local_group) { this_load = avg_load; + this = group; goto nextgroup; } else if (avg_load > max_load) { max_load = avg_load; - } - if (avg_load < min_load) { - min_load = avg_load; + busiest = group; } nextgroup: group = group->next; - *nr_group = *nr_group + 1; } while (group != sd->groups); - if (!max_load || this_load >= max_load) + if (!busiest || this_load >= max_load) goto out_balanced; avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; - /* hzheng: debugging: 105 is a magic number - * 100*max_load <= sd->imbalance_pct*this_load) - * should use imbalance_pct instead - */ - if (this_load > avg_load - || 100*max_load < 105*this_load - || 100*min_load < 70*this_load - ) + if (this_load >= avg_load || + 100*max_load <= sd->imbalance_pct*this_load) goto out_balanced; - return avg_load; - out_balanced: - return 0; -} - -/** - * any group that has above average load is considered busy - * find the busiest queue from any of busy group - */ -static runqueue_t * -ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu, - unsigned long avg_load, enum idle_type idle, - int nr_group) -{ - struct sched_group *group; - runqueue_t * busiest=NULL; - unsigned long rand; - - group = sd->groups; - rand = get_ckrm_rand(nr_group); - nr_group = 0; + /* + * We're trying to get all the cpus to the average_load, so we don't + * want to push ourselves above the average load, nor do we wish to + * reduce the max loaded cpu below the average load, as either of these + * actions would just result in more rebalancing later, and ping-pong + * tasks around. Thus we look for the minimum possible imbalance. + * Negative imbalances (*we* are more loaded than anyone else) will + * be counted as no imbalance for these purposes -- we can't fix that + * by pulling tasks to us. Be careful of negative numbers as they'll + * appear as very large values with unsigned longs. 
+ */ + *imbalance = min(max_load - avg_load, avg_load - this_load); - do { - unsigned long load,total_load,max_load; - cpumask_t tmp; - int i; - runqueue_t * grp_busiest; + /* How much load to actually move to equalise the imbalance */ + *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power)) + / SCHED_LOAD_SCALE; - cpus_and(tmp, group->cpumask, cpu_online_map); - if (unlikely(cpus_empty(tmp))) - goto find_nextgroup; + if (*imbalance < SCHED_LOAD_SCALE - 1) { + unsigned long pwr_now = 0, pwr_move = 0; + unsigned long tmp; - total_load = 0; - max_load = 0; - grp_busiest = NULL; - for_each_cpu_mask(i, tmp) { - load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),0); - total_load += load; - if (load > max_load) { - max_load = load; - grp_busiest = cpu_rq(i); - } + if (max_load - this_load >= SCHED_LOAD_SCALE*2) { + *imbalance = 1; + return busiest; } - total_load = (total_load * SCHED_LOAD_SCALE) / group->cpu_power; - if (total_load > avg_load) { - busiest = grp_busiest; - if (nr_group >= rand) - break; - } - find_nextgroup: - group = group->next; - nr_group ++; - } while (group != sd->groups); + /* + * OK, we don't have enough imbalance to justify moving tasks, + * however we may be able to increase total CPU power used by + * moving them. + */ - return busiest; -} + pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); + pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); + pwr_now /= SCHED_LOAD_SCALE; -/** - * load_balance - pressure based load balancing algorithm used by ckrm - */ -static int ckrm_load_balance(int this_cpu, runqueue_t *this_rq, - struct sched_domain *sd, enum idle_type idle) -{ - runqueue_t *busiest; - unsigned long avg_load; - int nr_moved,nr_group; + /* Amount of load we'd subtract */ + tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; + if (max_load > tmp) + pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, + max_load - tmp); - avg_load = ckrm_check_balance(sd, this_cpu, idle, &nr_group); - if (! avg_load) - goto out_balanced; + /* Amount of load we'd add */ + tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; + if (max_load < tmp) + tmp = max_load; + pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); + pwr_move /= SCHED_LOAD_SCALE; - busiest = ckrm_find_busy_queue(sd,this_cpu,avg_load,idle,nr_group); - if (! busiest) - goto out_balanced; - /* - * This should be "impossible", but since load - * balancing is inherently racy and statistical, - * it could happen in theory. - */ - if (unlikely(busiest == this_rq)) { - WARN_ON(1); - goto out_balanced; - } + /* Move if we gain another 8th of a CPU worth of throughput */ + if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8) + goto out_balanced; - nr_moved = 0; - if (busiest->nr_running > 1) { - /* - * Attempt to move tasks. If find_busiest_group has found - * an imbalance but busiest->nr_running <= 1, the group is - * still unbalanced. nr_moved simply stays zero, so it is - * correctly treated as an imbalance. 
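/*
 * Editor's illustration (not part of the patch): the imbalance formula in
 * find_busiest_group() above -- move only enough load to bring this cpu up
 * to the average without dragging the busiest group below it, i.e.
 * imbalance = min(max_load - avg_load, avg_load - this_load), then scale by
 * the smaller group power. Standalone arithmetic sketch; numbers are made up
 * and the low-imbalance power heuristic is omitted.
 */
#include <stdio.h>

#define MODEL_LOAD_SCALE 128UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

static unsigned long model_imbalance(unsigned long max_load,
                                     unsigned long this_load,
                                     unsigned long avg_load,
                                     unsigned long busiest_power,
                                     unsigned long this_power)
{
        unsigned long imbalance = min_ul(max_load - avg_load, avg_load - this_load);

        return (imbalance * min_ul(busiest_power, this_power)) / MODEL_LOAD_SCALE;
}

int main(void)
{
        /* busiest group at 3*scale, we are at 1*scale, average is 2*scale */
        unsigned long imb = model_imbalance(3 * MODEL_LOAD_SCALE,
                                            1 * MODEL_LOAD_SCALE,
                                            2 * MODEL_LOAD_SCALE,
                                            MODEL_LOAD_SCALE, MODEL_LOAD_SCALE);

        printf("move %lu units of load\n", imb);   /* 128: one scale unit */
        return 0;
}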
- */ - double_lock_balance(this_rq, busiest); - nr_moved = move_tasks(this_rq, this_cpu, busiest, - 0,sd, idle); - spin_unlock(&busiest->lock); - if (nr_moved) { - adjust_local_weight(); - } + *imbalance = 1; + return busiest; } - if (!nr_moved) - sd->nr_balance_failed ++; - else - sd->nr_balance_failed = 0; + /* Get rid of the scaling factor, rounding down as we divide */ + *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE; - /* We were unbalanced, so reset the balancing interval */ - sd->balance_interval = sd->min_interval; - - return nr_moved; - -out_balanced: - /* tune up the balancing interval */ - if (sd->balance_interval < sd->max_interval) - sd->balance_interval *= 2; - - return 0; -} - -/* - * this_rq->lock is already held - */ -static inline int load_balance_newidle(int this_cpu, runqueue_t *this_rq, - struct sched_domain *sd) -{ - int ret; - read_lock(&class_list_lock); - ret = ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE); - read_unlock(&class_list_lock); - return ret; -} - -static inline int load_balance(int this_cpu, runqueue_t *this_rq, - struct sched_domain *sd, enum idle_type idle) -{ - int ret; - - spin_lock(&this_rq->lock); - read_lock(&class_list_lock); - ret= ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE); - read_unlock(&class_list_lock); - spin_unlock(&this_rq->lock); - return ret; -} -#else /*! CONFIG_CKRM_CPU_SCHEDULE */ -/* - * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, - * as part of a balancing operation within "domain". Returns the number of - * tasks moved. - * - * Called with both runqueues locked. - */ -static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, - unsigned long max_nr_move, struct sched_domain *sd, - enum idle_type idle) -{ - prio_array_t *array, *dst_array; - struct list_head *head, *curr; - int idx, pulled = 0; - task_t *tmp; - - if (max_nr_move <= 0 || busiest->nr_running <= 1) - goto out; - - /* - * We first consider expired tasks. Those will likely not be - * executed in the near future, and they are most likely to - * be cache-cold, thus switching CPUs has the least effect - * on them. - */ - if (busiest->expired->nr_active) { - array = busiest->expired; - dst_array = this_rq->expired; - } else { - array = busiest->active; - dst_array = this_rq->active; - } - -new_array: - /* Start searching at priority 0: */ - idx = 0; -skip_bitmap: - if (!idx) - idx = sched_find_first_bit(array->bitmap); - else - idx = find_next_bit(array->bitmap, MAX_PRIO, idx); - if (idx >= MAX_PRIO) { - if (array == busiest->expired && busiest->active->nr_active) { - array = busiest->active; - dst_array = this_rq->active; - goto new_array; - } - goto out; - } - - head = array->queue + idx; - curr = head->prev; -skip_queue: - tmp = list_entry(curr, task_t, run_list); - - curr = curr->prev; - - if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; - } - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); - pulled++; - - /* We only want to steal up to the prescribed number of tasks. */ - if (pulled < max_nr_move) { - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; - } -out: - return pulled; -} - -/* - * find_busiest_group finds and returns the busiest CPU group within the - * domain. It calculates and returns the number of tasks which should be - * moved to restore balance via the imbalance parameter. 
- */ -static struct sched_group * -find_busiest_group(struct sched_domain *sd, int this_cpu, - unsigned long *imbalance, enum idle_type idle) -{ - struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; - unsigned long max_load, avg_load, total_load, this_load, total_pwr; - - max_load = this_load = total_load = total_pwr = 0; - - do { - cpumask_t tmp; - unsigned long load; - int local_group; - int i, nr_cpus = 0; - - local_group = cpu_isset(this_cpu, group->cpumask); - - /* Tally up the load of all CPUs in the group */ - avg_load = 0; - cpus_and(tmp, group->cpumask, cpu_online_map); - if (unlikely(cpus_empty(tmp))) - goto nextgroup; - - for_each_cpu_mask(i, tmp) { - /* Bias balancing toward cpus of our domain */ - if (local_group) - load = target_load(i); - else - load = source_load(i); - - nr_cpus++; - avg_load += load; - } - - if (!nr_cpus) - goto nextgroup; - - total_load += avg_load; - total_pwr += group->cpu_power; - - /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; - - if (local_group) { - this_load = avg_load; - this = group; - goto nextgroup; - } else if (avg_load > max_load) { - max_load = avg_load; - busiest = group; - } -nextgroup: - group = group->next; - } while (group != sd->groups); - - if (!busiest || this_load >= max_load) - goto out_balanced; - - avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; - - if (this_load >= avg_load || - 100*max_load <= sd->imbalance_pct*this_load) - goto out_balanced; - - /* - * We're trying to get all the cpus to the average_load, so we don't - * want to push ourselves above the average load, nor do we wish to - * reduce the max loaded cpu below the average load, as either of these - * actions would just result in more rebalancing later, and ping-pong - * tasks around. Thus we look for the minimum possible imbalance. - * Negative imbalances (*we* are more loaded than anyone else) will - * be counted as no imbalance for these purposes -- we can't fix that - * by pulling tasks to us. Be careful of negative numbers as they'll - * appear as very large values with unsigned longs. - */ - *imbalance = min(max_load - avg_load, avg_load - this_load); - - /* How much load to actually move to equalise the imbalance */ - *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power)) - / SCHED_LOAD_SCALE; - - if (*imbalance < SCHED_LOAD_SCALE - 1) { - unsigned long pwr_now = 0, pwr_move = 0; - unsigned long tmp; - - if (max_load - this_load >= SCHED_LOAD_SCALE*2) { - *imbalance = 1; - return busiest; - } - - /* - * OK, we don't have enough imbalance to justify moving tasks, - * however we may be able to increase total CPU power used by - * moving them. 
- */ - - pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); - pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); - pwr_now /= SCHED_LOAD_SCALE; - - /* Amount of load we'd subtract */ - tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; - if (max_load > tmp) - pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, - max_load - tmp); - - /* Amount of load we'd add */ - tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; - if (max_load < tmp) - tmp = max_load; - pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); - pwr_move /= SCHED_LOAD_SCALE; - - /* Move if we gain another 8th of a CPU worth of throughput */ - if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8) - goto out_balanced; - - *imbalance = 1; - return busiest; - } - - /* Get rid of the scaling factor, rounding down as we divide */ - *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE; - - return busiest; + return busiest; out_balanced: if (busiest && (idle == NEWLY_IDLE || @@ -2224,6 +2002,17 @@ static runqueue_t *find_busiest_queue(struct sched_group *group) * * Called with this_rq unlocked. */ + +static inline int ckrm_load_balance(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd, + enum idle_type idle) +#ifndef CONFIG_CKRM_CPU_SCHEDULE +{ + return -1; +} +#endif +; + static int load_balance(int this_cpu, runqueue_t *this_rq, struct sched_domain *sd, enum idle_type idle) { @@ -2234,6 +2023,9 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, spin_lock(&this_rq->lock); + if ((nr_moved = ckrm_load_balance(this_cpu,this_rq,sd,idle)) != -1) + goto out_balanced; + group = find_busiest_group(sd, this_cpu, &imbalance, idle); if (!group) goto out_balanced; @@ -2319,8 +2111,12 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, struct sched_group *group; runqueue_t *busiest = NULL; unsigned long imbalance; - int nr_moved = 0; + int nr_moved; + + if ((nr_moved = ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE)) != -1) + goto out; + nr_moved = 0; group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); if (!group) goto out; @@ -2340,8 +2136,6 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, out: return nr_moved; } -#endif /* CONFIG_CKRM_CPU_SCHEDULE*/ - /* * idle_balance is called by schedule() if this_cpu is about to become @@ -2447,6 +2241,8 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, unsigned long j = jiffies + CPU_OFFSET(this_cpu); struct sched_domain *sd; + ckrm_sched_tick(j,this_cpu,(idle != NOT_IDLE),this_rq); + /* Update our load */ old_load = this_rq->cpu_load; this_load = this_rq->nr_running * SCHED_LOAD_SCALE; @@ -2485,7 +2281,9 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, */ static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) { + ckrm_sched_tick(jiffies,cpu,(idle != NOT_IDLE),rq); } + static inline void idle_balance(int cpu, runqueue_t *rq) { } @@ -2522,15 +2320,19 @@ EXPORT_PER_CPU_SYMBOL(kstat); #ifndef CONFIG_CKRM_CPU_SCHEDULE #define EXPIRED_STARVING(rq) \ - ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ + ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ (jiffies - (rq)->expired_timestamp >= \ STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ ((rq)->curr->static_prio > (rq)->best_expired_prio)) #else +/* we need to scale the starvation based on weight + * classes with small weight have longer expiration starvation + */ #define EXPIRED_STARVING(rq) \ - (STARVATION_LIMIT && ((rq)->expired_timestamp && \ + ((STARVATION_LIMIT && 
((rq)->expired_timestamp && \ (jiffies - (rq)->expired_timestamp >= \ - STARVATION_LIMIT * (lrq_nr_running(rq)) + 1))) + (((STARVATION_LIMIT * (lrq_nr_running(rq)) + 1)*CKRM_MAX_WEIGHT)/rq->local_weight)))) || \ + (this_rq()->curr->static_prio > (rq)->best_expired_prio)) #endif /* @@ -2568,7 +2370,6 @@ void scheduler_tick(int user_ticks, int sys_ticks) cpustat->idle += sys_ticks; if (wake_priority_sleeper(rq)) goto out; - ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq)); rebalance_tick(cpu, rq, IDLE); return; } @@ -2609,8 +2410,11 @@ void scheduler_tick(int user_ticks, int sys_ticks) } if (!--p->time_slice) { #ifdef CONFIG_CKRM_CPU_SCHEDULE - /* Hubertus ... we can abstract this out */ - ckrm_lrq_t* rq = get_task_lrq(p); + /* we redefine RQ to be a local runqueue */ + ckrm_lrq_t* rq; + runqueue_t *cpu_rq = this_rq(); + rq = ckrm_rq_cpu_enabled(cpu_rq) ? get_task_lrq(p) + : &(cpu_rq->dflt_lrq); #endif dequeue_task(p, rq->active); set_tsk_need_resched(p); @@ -2622,8 +2426,8 @@ void scheduler_tick(int user_ticks, int sys_ticks) rq->expired_timestamp = jiffies; if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { enqueue_task(p, rq->expired); - if (p->static_prio < this_rq()->best_expired_prio) - this_rq()->best_expired_prio = p->static_prio; + if (p->static_prio < rq->best_expired_prio) + rq->best_expired_prio = p->static_prio; } else enqueue_task(p, rq->active); } else { @@ -2657,7 +2461,6 @@ void scheduler_tick(int user_ticks, int sys_ticks) out_unlock: spin_unlock(&rq->lock); out: - ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq)); rebalance_tick(cpu, rq, NOT_IDLE); } @@ -2793,19 +2596,8 @@ need_resched: spin_lock_irq(&rq->lock); -#ifdef CONFIG_CKRM_CPU_SCHEDULE - if (prev != rq->idle) { - unsigned long long run = now - prev->timestamp; - ckrm_lrq_t * lrq = get_task_lrq(prev); - - lrq->lrq_load -= task_load(prev); - cpu_demand_event(&prev->demand_stat,CPU_DEMAND_DESCHEDULE,run); - lrq->lrq_load += task_load(prev); + ckrm_account_task(rq,prev,now); - cpu_demand_event(get_task_lrq_stat(prev),CPU_DEMAND_DESCHEDULE,run); - update_local_cvt(prev, run); - } -#endif /* * if entering off of a kernel preemption go straight * to picking the next task. 
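/*
 * Editor's illustration (not part of the patch): the weight scaling added to
 * the CKRM variant of EXPIRED_STARVING above -- a class with a small weight
 * tolerates a longer expired-array starvation window before interactive
 * tasks stop being recycled into the active array, because a light class
 * gets the cpu rarely anyway. Standalone sketch with assumed constants; the
 * best_expired_prio clause is omitted.
 */
#include <stdio.h>

#define MODEL_STARVATION_LIMIT 10UL    /* ticks per runnable task, assumed */
#define MODEL_MAX_WEIGHT       128UL   /* stands in for CKRM_MAX_WEIGHT */

static int model_expired_starving(unsigned long jiffies_now,
                                  unsigned long expired_timestamp,
                                  unsigned long nr_running,
                                  unsigned long local_weight)
{
        unsigned long limit;

        if (!expired_timestamp)
                return 0;
        /* base limit grows with queue length, then is stretched for light classes */
        limit = ((MODEL_STARVATION_LIMIT * nr_running + 1) * MODEL_MAX_WEIGHT)
                / local_weight;
        return jiffies_now - expired_timestamp >= limit;
}

int main(void)
{
        /* same queue length and expired-array age, different weights */
        printf("%d\n", model_expired_starving(1000, 900, 4, 128)); /* 1: starving */
        printf("%d\n", model_expired_starving(1000, 900, 4, 32));  /* 0: not yet */
        return 0;
}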
@@ -2821,19 +2613,12 @@ need_resched: } cpu = smp_processor_id(); - if (unlikely(!rq->nr_running)) { - idle_balance(cpu, rq); - if (!rq->nr_running) { - next = rq->idle; -#ifdef CONFIG_CKRM_CPU_SCHEDULE - rq->expired_timestamp = 0; -#endif - wake_sleeping_dependent(cpu, rq); - goto switch_tasks; - } - } - next = rq_get_next_task(rq); + next = rq_get_next_task(rq,cpu); + if (unlikely(next == NULL)) { + next = rq->idle; + goto switch_tasks; + } if (dependent_sleeper(cpu, rq, next)) { next = rq->idle; @@ -4528,13 +4313,12 @@ void __init sched_init(void) rq->active = rq->arrays; rq->expired = rq->arrays + 1; + rq->best_expired_prio = MAX_PRIO; #else rq = cpu_rq(i); spin_lock_init(&rq->lock); #endif - rq->best_expired_prio = MAX_PRIO; - #ifdef CONFIG_SMP rq->sd = &sched_domain_init; rq->cpu_load = 0; @@ -4647,6 +4431,21 @@ int task_running_sys(struct task_struct *p) EXPORT_SYMBOL(task_running_sys); #endif + + +/******************************************************************** + * + * CKRM Scheduler additions + * + * (a) helper functions + * (b) load balancing code + * + * These are required here to avoid having to externalize many + * of the definitions in sched.c + * + * + ********************************************************************/ + #ifdef CONFIG_CKRM_CPU_SCHEDULE /** * return the classqueue object of a certain processor @@ -4676,4 +4475,559 @@ void _ckrm_cpu_change_class(task_t *tsk, struct ckrm_cpu_class *newcls) task_rq_unlock(rq,&flags); } + +/** + * get_min_cvt_locking - get the mininum cvt on a particular cpu under rqlock + */ + +CVT_t get_min_cvt(int cpu); + +CVT_t get_min_cvt_locking(int cpu) +{ + CVT_t cvt; + struct runqueue *rq = cpu_rq(cpu); + spin_lock(&rq->lock); + cvt = get_min_cvt(cpu); + spin_unlock(&rq->lock); + return cvt; +} + +ckrm_lrq_t *rq_get_dflt_lrq(int cpu) +{ + return &(cpu_rq(cpu)->dflt_lrq); +} + +#ifdef CONFIG_SMP + +/************** CKRM Load Balancing code ************************/ + +static inline int ckrm_preferred_task(task_t *tmp,long min, long max, + int phase, enum idle_type idle) +{ + long pressure = task_load(tmp); + + if (pressure > max) + return 0; + + if ((idle == NOT_IDLE) && ! phase && (pressure <= min)) + return 0; + return 1; +} + +/* + * move tasks for a specic local class + * return number of tasks pulled + */ +static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq, + runqueue_t *this_rq, + runqueue_t *busiest, + struct sched_domain *sd, + int this_cpu, + enum idle_type idle, + long* pressure_imbalance) +{ + prio_array_t *array, *dst_array; + struct list_head *head, *curr; + task_t *tmp; + int idx; + int pulled = 0; + int phase = -1; + long pressure_min, pressure_max; + /*hzheng: magic : 90% balance is enough*/ + long balance_min = *pressure_imbalance / 10; +/* + * we don't want to migrate tasks that will reverse the balance + * or the tasks that make too small difference + */ +#define CKRM_BALANCE_MAX_RATIO 100 +#define CKRM_BALANCE_MIN_RATIO 1 + start: + phase ++; + /* + * We first consider expired tasks. Those will likely not be + * executed in the near future, and they are most likely to + * be cache-cold, thus switching CPUs has the least effect + * on them. 
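/*
 * Editor's illustration (not part of the patch): the filter implemented by
 * ckrm_preferred_task() above -- a task is only worth migrating if its load
 * does not overshoot the remaining pressure imbalance (max), and, on the
 * first pass of a non-idle cpu, only if it is big enough to matter (min).
 * Standalone sketch; the ratios follow CKRM_BALANCE_MIN/MAX_RATIO, the
 * numbers are made up.
 */
#include <stdio.h>

static int model_preferred_task(long task_load, long min, long max,
                                int first_pass, int cpu_is_idle)
{
        if (task_load > max)                    /* would reverse the imbalance */
                return 0;
        if (!cpu_is_idle && first_pass && task_load <= min)
                return 0;                       /* too small, try bigger tasks first */
        return 1;
}

int main(void)
{
        long imbalance = 100;
        long min = imbalance * 1 / 100;         /* CKRM_BALANCE_MIN_RATIO */
        long max = imbalance * 100 / 100;       /* CKRM_BALANCE_MAX_RATIO */

        printf("%d\n", model_preferred_task(150, min, max, 1, 0)); /* 0: too big */
        printf("%d\n", model_preferred_task(1,   min, max, 1, 0)); /* 0: too small */
        printf("%d\n", model_preferred_task(1,   min, max, 0, 0)); /* 1: 2nd pass */
        printf("%d\n", model_preferred_task(40,  min, max, 1, 0)); /* 1: preferred */
        return 0;
}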
+ */ + if (src_lrq->expired->nr_active) { + array = src_lrq->expired; + dst_array = dst_lrq->expired; + } else { + array = src_lrq->active; + dst_array = dst_lrq->active; + } + + new_array: + /* Start searching at priority 0: */ + idx = 0; + skip_bitmap: + if (!idx) + idx = sched_find_first_bit(array->bitmap); + else + idx = find_next_bit(array->bitmap, MAX_PRIO, idx); + if (idx >= MAX_PRIO) { + if (array == src_lrq->expired && src_lrq->active->nr_active) { + array = src_lrq->active; + dst_array = dst_lrq->active; + goto new_array; + } + if ((! phase) && (! pulled) && (idle != IDLE)) + goto start; //try again + else + goto out; //finished search for this lrq + } + + head = array->queue + idx; + curr = head->prev; + skip_queue: + tmp = list_entry(curr, task_t, run_list); + + curr = curr->prev; + + if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } + + pressure_min = *pressure_imbalance * CKRM_BALANCE_MIN_RATIO/100; + pressure_max = *pressure_imbalance * CKRM_BALANCE_MAX_RATIO/100; + /* + * skip the tasks that will reverse the balance too much + */ + if (ckrm_preferred_task(tmp,pressure_min,pressure_max,phase,idle)) { + *pressure_imbalance -= task_load(tmp); + pull_task(busiest, array, tmp, + this_rq, dst_array, this_cpu); + pulled++; + + if (*pressure_imbalance <= balance_min) + goto out; + } + + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + out: + return pulled; +} + +static inline long ckrm_rq_imbalance(runqueue_t *this_rq,runqueue_t *dst_rq) +{ + long imbalance; + /* + * make sure after balance, imbalance' > - imbalance/2 + * we don't want the imbalance be reversed too much + */ + imbalance = ckrm_get_pressure(rq_ckrm_load(dst_rq),0) + - ckrm_get_pressure(rq_ckrm_load(this_rq),1); + imbalance /= 2; + return imbalance; +} + +/* + * try to balance the two runqueues + * + * Called with both runqueues locked. + * if move_tasks is called, it will try to move at least one task over + */ +static int ckrm_move_tasks(runqueue_t *this_rq, int this_cpu, + runqueue_t *busiest, + unsigned long max_nr_move, struct sched_domain *sd, + enum idle_type idle) +{ + struct ckrm_cpu_class *clsptr,*vip_cls = NULL; + ckrm_lrq_t* src_lrq,*dst_lrq; + long pressure_imbalance, pressure_imbalance_old; + int src_cpu = task_cpu(busiest->curr); + struct list_head *list; + int pulled = 0; + long imbalance; + + imbalance = ckrm_rq_imbalance(this_rq,busiest); + + if ((idle == NOT_IDLE && imbalance <= 0) || busiest->nr_running <= 1) + goto out; + + //try to find the vip class + list_for_each_entry(clsptr,&active_cpu_classes,links) { + src_lrq = get_ckrm_lrq(clsptr,src_cpu); + + if (! lrq_nr_running(src_lrq)) + continue; + + if (! vip_cls || cpu_class_weight(vip_cls) < cpu_class_weight(clsptr) ) + { + vip_cls = clsptr; + } + } + + /* + * do search from the most significant class + * hopefully, less tasks will be migrated this way + */ + clsptr = vip_cls; + + move_class: + if (! clsptr) + goto out; + + + src_lrq = get_ckrm_lrq(clsptr,src_cpu); + if (! lrq_nr_running(src_lrq)) + goto other_class; + + dst_lrq = get_ckrm_lrq(clsptr,this_cpu); + + //how much pressure for this class should be transferred + pressure_imbalance = (src_lrq->lrq_load * imbalance)/WEIGHT_TO_SHARE(src_lrq->local_weight); + if (pulled && ! 
pressure_imbalance) + goto other_class; + + pressure_imbalance_old = pressure_imbalance; + + //move tasks + pulled += + ckrm_cls_move_tasks(src_lrq,dst_lrq, + this_rq, + busiest, + sd,this_cpu,idle, + &pressure_imbalance); + + /* + * hzheng: 2 is another magic number + * stop balancing if the imbalance is less than 25% of the orig + */ + if (pressure_imbalance <= (pressure_imbalance_old >> 2)) + goto out; + + //update imbalance + imbalance *= pressure_imbalance / pressure_imbalance_old; + other_class: + //who is next? + list = clsptr->links.next; + if (list == &active_cpu_classes) + list = list->next; + clsptr = list_entry(list, typeof(*clsptr), links); + if (clsptr != vip_cls) + goto move_class; + out: + return pulled; +} + +/** + * ckrm_check_balance - is load balancing necessary? + * return 0 if load balancing is not necessary + * otherwise return the average load of the system + * also, update nr_group + * + * heuristics: + * no load balancing if it's load is over average + * no load balancing if it's load is far more than the min + * task: + * read the status of all the runqueues + */ +static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu, + enum idle_type idle, int* nr_group) +{ + struct sched_group *group = sd->groups; + unsigned long min_load, max_load, avg_load; + unsigned long total_load, this_load, total_pwr; + + max_load = this_load = total_load = total_pwr = 0; + min_load = 0xFFFFFFFF; + *nr_group = 0; + + do { + cpumask_t tmp; + unsigned long load; + int local_group; + int i, nr_cpus = 0; + + /* Tally up the load of all CPUs in the group */ + cpus_and(tmp, group->cpumask, cpu_online_map); + if (unlikely(cpus_empty(tmp))) + goto nextgroup; + + avg_load = 0; + local_group = cpu_isset(this_cpu, group->cpumask); + + for_each_cpu_mask(i, tmp) { + load = ckrm_get_pressure(rq_ckrm_load(cpu_rq(i)),local_group); + nr_cpus++; + avg_load += load; + } + + if (!nr_cpus) + goto nextgroup; + + total_load += avg_load; + total_pwr += group->cpu_power; + + /* Adjust by relative CPU power of the group */ + avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + + if (local_group) { + this_load = avg_load; + goto nextgroup; + } else if (avg_load > max_load) { + max_load = avg_load; + } + if (avg_load < min_load) { + min_load = avg_load; + } +nextgroup: + group = group->next; + *nr_group = *nr_group + 1; + } while (group != sd->groups); + + if (!max_load || this_load >= max_load) + goto out_balanced; + + avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; + + /* hzheng: debugging: 105 is a magic number + * 100*max_load <= sd->imbalance_pct*this_load) + * should use imbalance_pct instead + */ + if (this_load > avg_load + || 100*max_load < 105*this_load + || 100*min_load < 70*this_load + ) + goto out_balanced; + + return avg_load; + out_balanced: + return 0; +} + +/** + * any group that has above average load is considered busy + * find the busiest queue from any of busy group + */ +static runqueue_t * +ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu, + unsigned long avg_load, enum idle_type idle, + int nr_group) +{ + struct sched_group *group; + runqueue_t * busiest=NULL; + unsigned long rand; + + group = sd->groups; + rand = get_ckrm_rand(nr_group); + nr_group = 0; + + do { + unsigned long load,total_load,max_load; + cpumask_t tmp; + int i; + runqueue_t * grp_busiest; + + cpus_and(tmp, group->cpumask, cpu_online_map); + if (unlikely(cpus_empty(tmp))) + goto find_nextgroup; + + total_load = 0; + max_load = 0; + grp_busiest = NULL; + 
for_each_cpu_mask(i, tmp) { + load = ckrm_get_pressure(rq_ckrm_load(cpu_rq(i)),0); + total_load += load; + if (load > max_load) { + max_load = load; + grp_busiest = cpu_rq(i); + } + } + + total_load = (total_load * SCHED_LOAD_SCALE) / group->cpu_power; + if (total_load > avg_load) { + busiest = grp_busiest; + if (nr_group >= rand) + break; + } + find_nextgroup: + group = group->next; + nr_group ++; + } while (group != sd->groups); + + return busiest; +} + +/** + * load_balance - pressure based load balancing algorithm used by ckrm + */ +static int ckrm_load_balance_locked(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd, + enum idle_type idle) +{ + runqueue_t *busiest; + unsigned long avg_load; + int nr_moved,nr_group; + + avg_load = ckrm_check_balance(sd, this_cpu, idle, &nr_group); + if (! avg_load) + goto out_balanced; + + busiest = ckrm_find_busy_queue(sd,this_cpu,avg_load,idle,nr_group); + if (! busiest) + goto out_balanced; + /* + * This should be "impossible", but since load + * balancing is inherently racy and statistical, + * it could happen in theory. + */ + if (unlikely(busiest == this_rq)) { + WARN_ON(1); + goto out_balanced; + } + + nr_moved = 0; + if (busiest->nr_running > 1) { + /* + * Attempt to move tasks. If find_busiest_group has found + * an imbalance but busiest->nr_running <= 1, the group is + * still unbalanced. nr_moved simply stays zero, so it is + * correctly treated as an imbalance. + */ + double_lock_balance(this_rq, busiest); + nr_moved = ckrm_move_tasks(this_rq, this_cpu, busiest, + 0,sd, idle); + spin_unlock(&busiest->lock); + if (nr_moved) { + adjust_local_weight(); + } + } + + if (!nr_moved) + sd->nr_balance_failed ++; + else + sd->nr_balance_failed = 0; + + /* We were unbalanced, so reset the balancing interval */ + sd->balance_interval = sd->min_interval; + + return nr_moved; + +out_balanced: + /* tune up the balancing interval */ + if (sd->balance_interval < sd->max_interval) + sd->balance_interval *= 2; + + return 0; +} + +static inline int ckrm_load_balance(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd, + enum idle_type idle) +{ + int ret; + + if (ckrm_rq_cpu_disabled(this_rq)) + return -1; + //spin_lock(&this_rq->lock); + read_lock(&class_list_lock); + ret = ckrm_load_balance_locked(this_cpu,this_rq,sd,idle); + // ret = ckrm_load_balance_locked(this_cpu,this_rq,sd,NEWLY_IDLE); + read_unlock(&class_list_lock); + //spin_unlock(&this_rq->lock); + return ret; +} + +#endif // CONFIG_SMP + + +void ckrm_cpu_class_queue_update(int on) +{ + /* This is called when the mode changes from disabled + * to enabled (on=1) or vice versa (on=0). + * we make sure that all classqueues on all cpus + * either have the default class enqueued (on=1) or + * all classes dequeued (on=0). + * if not done a race condition will persist + * when flipping the ckrm_sched_mode. + * Otherwise will lead to more complicated code + * in rq_get_next_task, where we despite knowing of + * runnable tasks can not find an enqueued class. 
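/*
 * Editor's illustration (not part of the patch): how ckrm_move_tasks() above
 * splits the runqueue imbalance across classes -- each class is asked to
 * transfer pressure proportional to its own load relative to its share, and
 * the walk starts at the highest-weight ("vip") class so fewer tasks have to
 * move. Standalone arithmetic sketch; WEIGHT_TO_SHARE() and all numbers are
 * simplified assumptions.
 */
#include <stdio.h>

struct cls_model {
        const char *name;
        unsigned long lrq_load;     /* load of this class on the busiest cpu */
        unsigned long local_share;  /* stands in for WEIGHT_TO_SHARE(local_weight) */
};

static long model_class_pressure(const struct cls_model *cls, long rq_imbalance)
{
        return (long)(cls->lrq_load * rq_imbalance) / (long)cls->local_share;
}

int main(void)
{
        /* classes sorted so the highest-weight one is visited first */
        struct cls_model classes[] = {
                { "vip",   800, 80 },
                { "other", 100, 20 },
        };
        long imbalance = 10;    /* pressure difference between the two runqueues */
        int i;

        for (i = 0; i < 2; i++)
                printf("%s: move %ld units of pressure\n", classes[i].name,
                       model_class_pressure(&classes[i], imbalance));
        /* vip: 100, other: 50 -- more heavily loaded classes shed more pressure */
        return 0;
}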
+ */ + + int i; + runqueue_t *rq; + ckrm_lrq_t *lrq; + struct ckrm_cpu_class *clsptr; + + if (on) { + BUG_ON(ckrm_cpu_enabled()); + for_each_cpu(i) { + rq = cpu_rq(i); + BUG_ON(ckrm_rq_cpu_enabled(rq)); + lrq = &rq->dflt_lrq; + spin_lock(&rq->lock); + + BUG_ON(cls_in_classqueue(&lrq->classqueue_linkobj)); + + classqueue_init(&rq->classqueue,1); + lrq->top_priority = find_first_bit(lrq->active->bitmap, + MAX_PRIO), + classqueue_enqueue(lrq->classqueue, + &lrq->classqueue_linkobj, 0); + spin_unlock(&rq->lock); +#if 0 + printk("UPDATE(%d) run=%lu:%d:%d %d:%d->%d\n", i, + rq->nr_running,lrq->active->nr_active, + lrq->expired->nr_active, + find_first_bit(lrq->active->bitmap,MAX_PRIO), + find_first_bit(lrq->expired->bitmap,MAX_PRIO), + lrq->top_priority); #endif + } + } else { + for_each_cpu(i) { + rq = cpu_rq(i); + spin_lock(&rq->lock); + + /* walk through all classes and make sure they + * are not enqueued + */ + write_lock(&class_list_lock); + list_for_each_entry(clsptr,&active_cpu_classes,links) { + lrq = get_ckrm_lrq(clsptr,i); + BUG_ON((lrq != &rq->dflt_lrq) && lrq_nr_running(lrq)); // must be empty + if (cls_in_classqueue(&lrq->classqueue_linkobj)) + classqueue_dequeue(lrq->classqueue, + &lrq->classqueue_linkobj); + } + rq->classqueue.enabled = 0; + write_unlock(&class_list_lock); + spin_unlock(&rq->lock); + } + } +} + +/* + * callback when a class is getting deleted + * need to remove it from the class runqueue. see (class_queue_update) + */ + +void ckrm_cpu_class_queue_delete_sync(struct ckrm_cpu_class *clsptr) +{ + int i; + + for_each_cpu(i) { + runqueue_t *rq = cpu_rq(i); + ckrm_lrq_t *lrq = get_ckrm_lrq(clsptr,i); + + spin_lock(&rq->lock); + write_lock(&class_list_lock); + BUG_ON(lrq_nr_running(lrq)); // must be empty + if (cls_in_classqueue(&lrq->classqueue_linkobj)) + classqueue_dequeue(lrq->classqueue, + &lrq->classqueue_linkobj); + write_unlock(&class_list_lock); + spin_unlock(&rq->lock); + } +} + +#endif // CONFIG_CKRM_CPU_SCHEDULE -- 2.47.0