From: Marc Fiuczynski Date: Tue, 28 Sep 2004 06:14:31 +0000 (+0000) Subject: Merge in the version 7 cpu controller from CKRM. X-Git-Tag: before-enable-kexec-patch~60 X-Git-Url: http://git.onelab.eu/?a=commitdiff_plain;h=c7089c9ff74690b4a1387b120eb5710f3510249f;p=linux-2.6.git Merge in the version 7 cpu controller from CKRM. - Had to shuffle the runqueue data structure from sched.h to sched.c. - Fixed an arithmetic fault that would occur early in boot due to an uninitialized CKRM cpu controller data structure. Odd that the CKRM folks didn't hit this bug, too. --- diff --git a/include/linux/ckrm_classqueue.h b/include/linux/ckrm_classqueue.h index 1bdf9b775..a825336cb 100644 --- a/include/linux/ckrm_classqueue.h +++ b/include/linux/ckrm_classqueue.h @@ -116,7 +116,7 @@ void classqueue_update_prio(struct classqueue_struct *cq, cq_node_t * node, int cq_node_t *classqueue_get_head(struct classqueue_struct *cq); /*update the base priority of the classqueue*/ -void classqueue_update_base(struct classqueue_struct *cq, int new_base); +void classqueue_update_base(struct classqueue_struct *cq); /** * class_compare_prio: compare the priority of this two nodes diff --git a/include/linux/ckrm_sched.h b/include/linux/ckrm_sched.h index 9d82214fb..b7e6b30d0 100644 --- a/include/linux/ckrm_sched.h +++ b/include/linux/ckrm_sched.h @@ -15,30 +15,34 @@ #ifndef _CKRM_SCHED_H #define _CKRM_SCHED_H -#define CC_BUG_ON_DO(cond,action) do { if (cond) action; BUG_ON(cond); } while(0) -#define CC_BUG_ON(cond) BUG_ON(cond) - #include #include #include -//update every second -#define CVT_UPDATE_TICK (1*HZ/1 ?: 1) -#define CLASS_BONUS_RATE 22 // shift from ns to increase class bonus -#define PRIORITY_BONUS_RATE 0 // ?? Hubertus - #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) + struct prio_array { - int nr_active; + unsigned int nr_active; unsigned long bitmap[BITMAP_SIZE]; struct list_head queue[MAX_PRIO]; }; -struct ckrm_local_runqueue { +#ifdef CONFIG_CKRM_CPU_SCHEDULE +#define rq_active(p,rq) (get_task_lrq(p)->active) +#define rq_expired(p,rq) (get_task_lrq(p)->expired) +int __init init_ckrm_sched_res(void); +#else +#define rq_active(p,rq) (rq->active) +#define rq_expired(p,rq) (rq->expired) +static inline void init_ckrm_sched_res(void) {} +static inline int ckrm_cpu_monitor_init(void) {return 0;} +#endif //CONFIG_CKRM_CPU_SCHEDULE + +#ifdef CONFIG_CKRM_CPU_SCHEDULE +struct ckrm_runqueue { cq_node_t classqueue_linkobj; /*links in classqueue */ struct ckrm_cpu_class *cpu_class; // class it belongs to struct classqueue_struct *classqueue; // classqueue it belongs tow - CVT_t uncounted_cvt; unsigned long long uncounted_ns; prio_array_t *active, *expired, arrays[2]; @@ -55,19 +59,27 @@ struct ckrm_local_runqueue { * updated on enqueue, dequeue */ int top_priority; - CVT_t local_cvt; // snapshot of local_cvt, update on every loadbalance + CVT_t local_cvt; + + unsigned long lrq_load; + int local_weight; + + + /* + * unused CPU time accumulated while thoe class + * is inactive goes to savings + * + * initialized to be 0 + * a class can't accumulate more than SAVING_THRESHOLD of savings + * savings are kept in normalized form (like cvt) + * so when task share change the savings should be scaled accordingly + */ + unsigned long long savings; + unsigned long magic; //for debugging }; -/** - * @last_sleep: the last time it sleeps, last_sleep = 0 when not sleeping - */ -struct ckrm_cpu_class_local_stat { - unsigned long long run; - unsigned long long total; - unsigned long long last_sleep; - unsigned long 
cpu_demand; /*estimated cpu demand */ -}; +typedef struct ckrm_runqueue ckrm_lrq_t; /** * ckrm_cpu_class_stat - cpu usage statistics maintained for each class @@ -78,22 +90,35 @@ struct ckrm_cpu_class_stat { unsigned long long total_ns; /*how much nano-secs it has consumed */ - struct ckrm_cpu_class_local_stat local_stats[NR_CPUS]; - unsigned long cpu_demand; + struct ckrm_cpu_demand_stat local_stats[NR_CPUS]; + + /* + * + */ + unsigned long max_demand; /* the maximun a class can consume */ + int egrt,megrt; /*effective guarantee*/ + int ehl,mehl; /*effective hard limit, my effective hard limit*/ - /*temp stat used by cpu monitor */ - int effective_guarantee; - int effective_limit; - int glut; //true or false /* - * effective_share: for both default class and its children - * self_effective_share: just for the default class + * eshare: for both default class and its children + * meshare: just for the default class */ - int effective_share; - int self_effective_share; + int eshare; + int meshare; }; -typedef struct ckrm_cpu_class_stat ckrm_stat_t; +#define CKRM_CPU_CLASS_MAGIC 0x7af2abe3 + +#define USAGE_SAMPLE_FREQ HZ //sample every 1 seconds +#define NS_PER_SAMPLE (USAGE_SAMPLE_FREQ*(NSEC_PER_SEC/HZ)) +#define USAGE_WINDOW_SIZE 60 //keep the last 60 sample + +struct ckrm_usage { + unsigned long samples[USAGE_WINDOW_SIZE]; //record usages + unsigned long sample_pointer; //pointer for the sliding window + unsigned long long last_ns; //ns for last sample + long long last_sample_jiffies; //in number of jiffies +}; /* * manages the class status @@ -104,72 +129,221 @@ struct ckrm_cpu_class { struct ckrm_core_class *parent; struct ckrm_shares shares; spinlock_t cnt_lock; // always grab parent's lock first and then child's - CVT_t global_cvt; // total cummulative virtual time struct ckrm_cpu_class_stat stat; struct list_head links; // for linking up in cpu classes - struct ckrm_local_runqueue local_queues[NR_CPUS]; // runqueues + ckrm_lrq_t local_queues[NR_CPUS]; // runqueues + struct ckrm_usage usage; + unsigned long magic; //for debugging }; -#if CONFIG_CKRM_CPU_SCHEDULE -#define rq_active(p,rq) (get_task_class_queue(p)->active) -#define rq_expired(p,rq) (get_task_class_queue(p)->expired) -#else -#define rq_active(p,rq) (rq->active) -#define rq_expired(p,rq) (rq->expired) -#endif +#define cpu_class_weight(cls) (cls->stat.meshare) +#define local_class_weight(lrq) (lrq->local_weight) -//#define cpu_class_weight(cls) (cls->shares.my_guarantee) -#define cpu_class_weight(cls) (cls->stat.self_effective_share) - -#define bpt_queue(cpu) (& (cpu_rq(cpu)->classqueue) ) -CVT_t get_min_cvt(int cpu); +static inline int valid_cpu_class(struct ckrm_cpu_class * cls) +{ + return (cls && cls->magic == CKRM_CPU_CLASS_MAGIC); +} struct classqueue_struct *get_cpu_classqueue(int cpu); +struct ckrm_cpu_class * get_default_cpu_class(void); + + +static inline void ckrm_usage_init(struct ckrm_usage* usage) +{ + int i; + + for (i=0; i < USAGE_WINDOW_SIZE; i++) + usage->samples[i] = 0; + usage->sample_pointer = 0; + usage->last_ns = 0; + usage->last_sample_jiffies = 0; +} + +/* + * this function can be called at any frequency + * it's self-contained + */ +static inline void ckrm_sample_usage(struct ckrm_cpu_class* clsptr) +{ + struct ckrm_usage* usage = &clsptr->usage; + unsigned long long cur_sample; + int duration = jiffies - usage->last_sample_jiffies; + + //jiffies wasn't start from 0 + //so it need to be properly handled + if (unlikely(!usage->last_sample_jiffies)) + usage->last_sample_jiffies = jiffies; + + //called 
too frequenctly + if (duration < USAGE_SAMPLE_FREQ) + return; + + usage->last_sample_jiffies = jiffies; + + cur_sample = clsptr->stat.total_ns - usage->last_ns; + usage->last_ns = clsptr->stat.total_ns; + + //scale it based on the sample duration + cur_sample *= ((USAGE_SAMPLE_FREQ<< 15)/duration); + cur_sample >>= 15; + usage->samples[usage->sample_pointer] = cur_sample; + // printk("sample = %llu jiffies=%lu \n",cur_sample, jiffies); + + usage->sample_pointer ++; + if (usage->sample_pointer >= USAGE_WINDOW_SIZE) + usage->sample_pointer = 0; +} + +//duration is specified in number of jiffies +//return the usage in percentage +static inline int get_ckrm_usage(struct ckrm_cpu_class* clsptr, int duration) +{ + int nr_samples = duration/USAGE_SAMPLE_FREQ?:1; + struct ckrm_usage* usage = &clsptr->usage; + unsigned long long total = 0; + int i, idx; + + if (nr_samples > USAGE_WINDOW_SIZE) + nr_samples = USAGE_WINDOW_SIZE; + + idx = usage->sample_pointer; + for (i = 0; i< nr_samples; i++) { + if (! idx) + idx = USAGE_WINDOW_SIZE; + idx --; + total += usage->samples[idx]; + } + total *= 100; + do_div(total,nr_samples); + do_div(total,NS_PER_SAMPLE); + do_div(total,cpus_weight(cpu_online_map)); + return total; +} -extern struct ckrm_cpu_class default_cpu_class_obj; -#define default_cpu_class (&default_cpu_class_obj) -#define local_queue_nr_running(local_queue) \ - (local_queue->active->nr_active + local_queue->expired->nr_active) +#define lrq_nr_running(lrq) \ + (lrq->active->nr_active + lrq->expired->nr_active) -static inline struct ckrm_local_runqueue * -get_ckrm_local_runqueue(struct ckrm_cpu_class*cls, int cpu) +static inline ckrm_lrq_t * +get_ckrm_lrq(struct ckrm_cpu_class*cls, int cpu) { return &(cls->local_queues[cpu]); } -static inline struct ckrm_local_runqueue *get_task_class_queue(struct task_struct *p) +static inline ckrm_lrq_t *get_task_lrq(struct task_struct *p) { return &(p->cpu_class->local_queues[task_cpu(p)]); } #define task_list_entry(list) list_entry(list,struct task_struct,run_list) -#define class_list_entry(list) list_entry(list,struct ckrm_local_runqueue,classqueue_linkobj) +#define class_list_entry(list) list_entry(list,struct ckrm_runqueue,classqueue_linkobj) /* some additional interfaces exported from sched.c */ struct runqueue; -void dequeue_task(struct task_struct *p, prio_array_t * array); -void enqueue_task(struct task_struct *p, prio_array_t * array); -struct runqueue *task_rq_lock(task_t * p, unsigned long *flags); -void task_rq_unlock(struct runqueue *rq, unsigned long *flags); -extern spinlock_t cvt_lock; extern rwlock_t class_list_lock; extern struct list_head active_cpu_classes; +unsigned int task_timeslice(task_t *p); +void _ckrm_cpu_change_class(task_t *task, struct ckrm_cpu_class *newcls); -/*functions exported by ckrm_cpu_class.c*/ -int __init init_ckrm_sched_res(void); void init_cpu_classes(void); +void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares); +void ckrm_cpu_change_class(void *task, void *old, void *new); + -/*functions exported by ckrm_cpu_monitor.c*/ -void ckrm_cpu_monitor(void); -void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat); #define CPU_DEMAND_ENQUEUE 0 #define CPU_DEMAND_DEQUEUE 1 #define CPU_DEMAND_DESCHEDULE 2 -void cpu_demand_event(struct ckrm_cpu_class_local_stat* local_stat, int event, unsigned long long len); +#define CPU_DEMAND_INIT 3 + +/*functions exported by ckrm_cpu_monitor.c*/ +void ckrm_cpu_monitor(void); +int ckrm_cpu_monitor_init(void); +void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat); +void 
cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len); +void adjust_local_weight(void); + +#define get_task_lrq_stat(p) (&(p)->cpu_class->stat.local_stats[task_cpu(p)]) +#define get_cls_local_stat(cls,cpu) (&(cls)->stat.local_stats[cpu]) +#define get_rq_local_stat(lrq,cpu) (get_cls_local_stat((lrq)->cpu_class,cpu)) + +/******************************************************************** + * Parameters that determine how quickly CVT's progress and how + * priority can impact a LRQ's runqueue position. See also + * get_effective_prio(). These parameters need to adjusted + * in accordance to the following example and understanding. + * + * CLASS_QUANTIZER: + * + * A class with 5% share, can execute 50M nsecs / per sec ~ 2^28. + * It's share will be set to 512 = 2^9. The globl CLASSQUEUE_SIZE is set to 2^7. + * With CLASS_QUANTIZER=16, the local_cvt of this class will increase + * by 2^28/2^9 = 2^19 = 512K. + * Setting CLASS_QUANTIZER to 16, 2^(19-16) = 8 slots / per second. + * A class with 5% shares, will cover 80 slots / per second. + * + * PRIORITY_QUANTIZER: + * + * How much can top priorities of class impact slot bonus. + * There are 40 nice priorities. "2" will allow upto 10 slots improvement + * in the RQ thus for 50% class it can perform ~1sec starvation. + * + *******************************************************************/ + +#define CLASS_QUANTIZER 16 //shift from ns to increase class bonus +#define PRIORITY_QUANTIZER 2 //controls how much a high prio task can borrow + +#define CKRM_SHARE_ACCURACY 10 +#define NSEC_PER_MS 1000000 +#define NSEC_PER_JIFFIES (NSEC_PER_SEC/HZ) + + +#define MAX_SAVINGS_ABSOLUTE (10LLU*NSEC_PER_SEC) // 10 seconds + +#define CVT_UPDATE_TICK ((HZ/2)?:1) + +// ABSOLUTE_CKRM_TUNING determines whether classes can make up +// lost time in absolute time or in relative values + +#define ABSOLUTE_CKRM_TUNING // preferred due to more predictable behavior + +#ifdef ABSOLUTE_CKRM_TUNING + +#define MAX_SAVINGS MAX_SAVINGS_ABSOLUTE +//an absolute bonus of 200ms for classes when reactivated +#define INTERACTIVE_BONUS(lrq) ((200*NSEC_PER_MS)/local_class_weight(lrq)) +#define SAVINGS_LEAK_SPEED (CVT_UPDATE_TICK/10*NSEC_PER_JIFFIES) + +#define scale_cvt(val,lrq) ((val)*local_class_weight(lrq)) +#define unscale_cvt(val,lrq) (do_div(val,local_class_weight(lrq))) + +#else + +#define MAX_SAVINGS (MAX_SAVINGS_ABSOLUTE >> CKRM_SHARE_ACCURACY) +/* + * to improve system responsiveness + * an inactive class is put a little bit ahead of the current class when it wakes up + * the amount is set in normalized termis to simplify the calculation + * for class with 100% share, it can be 2s ahead + * while for class with 10% share, it can be 200ms ahead + */ +#define INTERACTIVE_BONUS(lrq) (2*NSEC_PER_MS) + +/* + * normalized savings can't be more than MAX_NORMALIZED_SAVINGS + * based on the current configuration + * this means that a class with share 100% will accumulate 10s at most + * while a class with 1% of the share can only accumulate 100ms + */ + +//a class with share 100% can get 100ms every 500ms +//while a class with share 10% can only get 10ms every 500ms +#define SAVINGS_LEAK_SPEED ((CVT_UPDATE_TICK/5*NSEC_PER_JIFFIES) >> CKRM_SHARE_ACCURACY) + +#define scale_cvt(val,lrq) (val) +#define unscale_cvt(val,lrq) (val) + +#endif -#define get_task_local_stat(p) (&(p)->cpu_class->stat.local_stats[task_cpu(p)]) -#define get_rq_local_stat(lrq,cpu) (&(lrq)->cpu_class->stat.local_stats[cpu]) /** * get_effective_prio: return the effective priority 
of a class local queue @@ -181,18 +355,18 @@ void cpu_demand_event(struct ckrm_cpu_class_local_stat* local_stat, int event, u * currently, prio increases by 1 if either: top_priority increase by one * or, local_cvt increases by 4ms */ -static inline int get_effective_prio(struct ckrm_local_runqueue * lcq) +static inline int get_effective_prio(ckrm_lrq_t * lrq) { int prio; - // cumulative usage - prio = lcq->local_cvt >> CLASS_BONUS_RATE; - // queue urgency - prio += lcq->top_priority >> PRIORITY_BONUS_RATE; + prio = lrq->local_cvt >> CLASS_QUANTIZER; // cumulative usage + prio += lrq->top_priority >> PRIORITY_QUANTIZER; // queue urgency return prio; } +CVT_t get_local_cur_cvt(int cpu); + /** * update_class_priority: * @@ -206,9 +380,8 @@ static inline int get_effective_prio(struct ckrm_local_runqueue * lcq) * -- rq_get_next_task (queue switch) * -- update_local_cvt * -- schedule - * -- update_global_cvt */ -static inline void update_class_priority(struct ckrm_local_runqueue *local_rq) +static inline void update_class_priority(ckrm_lrq_t *local_rq) { int effective_prio = get_effective_prio(local_rq); classqueue_update_prio(local_rq->classqueue, @@ -220,42 +393,80 @@ static inline void update_class_priority(struct ckrm_local_runqueue *local_rq) * set the new top priority and reposition the queue * called when: task enqueue/dequeue and queue switch */ -static inline void set_top_priority(struct ckrm_local_runqueue *class_queue, +static inline void set_top_priority(ckrm_lrq_t *lrq, int new_priority) { - class_queue->top_priority = new_priority; - update_class_priority(class_queue); + lrq->top_priority = new_priority; + update_class_priority(lrq); +} + +/* + * task_load: how much load this task counts + */ +static inline unsigned long task_load(struct task_struct* p) +{ + return (task_timeslice(p) * p->demand_stat.cpu_demand); +} + +/* + * runqueue load is the local_weight of all the classes on this cpu + * must be called with class_list_lock held + */ +static inline unsigned long ckrm_cpu_load(int cpu) +{ + struct ckrm_cpu_class *clsptr; + ckrm_lrq_t* lrq; + struct ckrm_cpu_demand_stat* l_stat; + int total_load = 0; + int load; + + list_for_each_entry(clsptr,&active_cpu_classes,links) { + lrq = get_ckrm_lrq(clsptr,cpu); + l_stat = get_cls_local_stat(clsptr,cpu); + load = lrq->local_weight; + if (l_stat->cpu_demand < load) + load = l_stat->cpu_demand; + total_load += load; + } + return total_load; } static inline void class_enqueue_task(struct task_struct *p, prio_array_t * array) { - struct ckrm_local_runqueue *queue; + ckrm_lrq_t *lrq; int effective_prio; - queue = get_task_class_queue(p); + lrq = get_task_lrq(p); + + cpu_demand_event(&p->demand_stat,CPU_DEMAND_ENQUEUE,0); + lrq->lrq_load += task_load(p); - if (! cls_in_classqueue(&queue->classqueue_linkobj)) { - cpu_demand_event(get_task_local_stat(p),CPU_DEMAND_ENQUEUE,0); - /*make sure the cvt of this class is up to date*/ - queue->local_cvt = get_min_cvt(task_cpu(p)); - effective_prio = get_effective_prio(queue); - classqueue_enqueue(queue->classqueue, &queue->classqueue_linkobj, effective_prio); + if ((p->prio < lrq->top_priority) && (array == lrq->active)) + set_top_priority(lrq, p->prio); + + if (! 
cls_in_classqueue(&lrq->classqueue_linkobj)) { + cpu_demand_event(get_task_lrq_stat(p),CPU_DEMAND_ENQUEUE,0); + effective_prio = get_effective_prio(lrq); + classqueue_enqueue(lrq->classqueue, &lrq->classqueue_linkobj, effective_prio); } - - if ((p->prio < queue->top_priority) && (array == queue->active)) - set_top_priority(queue, p->prio); } static inline void class_dequeue_task(struct task_struct *p, prio_array_t * array) { - struct ckrm_local_runqueue *queue = get_task_class_queue(p); + ckrm_lrq_t *lrq = get_task_lrq(p); + unsigned long load = task_load(p); - if ((array == queue->active) && (p->prio == queue->top_priority) + BUG_ON(lrq->lrq_load < load); + lrq->lrq_load -= load; + + cpu_demand_event(&p->demand_stat,CPU_DEMAND_DEQUEUE,0); + + if ((array == lrq->active) && (p->prio == lrq->top_priority) && list_empty(&(array->queue[p->prio]))) - set_top_priority(queue, + set_top_priority(lrq, find_next_bit(array->bitmap, MAX_PRIO, p->prio)); } @@ -266,32 +477,82 @@ static inline void class_dequeue_task(struct task_struct *p, */ static inline void update_local_cvt(struct task_struct *p, unsigned long nsec) { - struct ckrm_local_runqueue *class_queue = get_task_class_queue(p); - struct ckrm_cpu_class *cls = class_queue->cpu_class; + ckrm_lrq_t * lrq = get_task_lrq(p); + + unsigned long cvt_inc = nsec / local_class_weight(lrq); + + lrq->local_cvt += cvt_inc; + lrq->uncounted_ns += nsec; - unsigned long cvt_inc = nsec / cpu_class_weight(cls); + update_class_priority(lrq); +} - class_queue->local_cvt += cvt_inc; - class_queue->uncounted_cvt += cvt_inc; +static inline int class_preempts_curr(struct task_struct * p, struct task_struct* curr) +{ + struct cq_node_struct* node1 = &(get_task_lrq(p)->classqueue_linkobj); + struct cq_node_struct* node2 = &(get_task_lrq(curr)->classqueue_linkobj); - class_queue->uncounted_ns += nsec; - update_class_priority(class_queue); + return (class_compare_prio(node1,node2) < 0); } /* - * called during loadbalancing - * to charge the class with locally accumulated cvt + * return a random value with range [0, (val-1)] */ -void update_global_cvts(int this_cpu); +static inline int get_ckrm_rand(unsigned long val) +{ + int rand; + static int last_rand[NR_CPUS]; + int cpu = smp_processor_id(); + + rand = last_rand[cpu]; + rand ++; + if (rand >= val) + rand = 0; + + last_rand[cpu] = rand; + return rand; +} -/** - * - */ -static inline int class_preempts_curr(struct task_struct * p, struct task_struct* curr) +void update_class_cputime(int this_cpu); + +/**********************************************/ +/* PID_LOAD_BALANCING */ +/**********************************************/ +struct ckrm_load_struct { + unsigned long load_p; /*propotional*/ + unsigned long load_i; /*integral */ + long load_d; /*derivative */ +}; + +typedef struct ckrm_load_struct ckrm_load_t; + +static inline void ckrm_load_init(ckrm_load_t* ckrm_load) { + ckrm_load->load_p = 0; + ckrm_load->load_i = 0; + ckrm_load->load_d = 0; +} + +void ckrm_load_sample(ckrm_load_t* ckrm_load,int cpu); +long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group); +#define rq_ckrm_load(rq) (&((rq)->ckrm_load)) + +static inline void ckrm_sched_tick(unsigned long j,int this_cpu,struct ckrm_load_struct* ckrm_load) { - struct cq_node_struct* node1 = &(get_task_class_queue(p)->classqueue_linkobj); - struct cq_node_struct* node2 = &(get_task_class_queue(curr)->classqueue_linkobj); + read_lock(&class_list_lock); + +#ifdef CONFIG_SMP + ckrm_load_sample(ckrm_load,this_cpu); +#endif - return (class_compare_prio(node1,node2) < 
0); + if (! (j % CVT_UPDATE_TICK)) { + // printk("ckrm_sched j=%lu\n",j); + classqueue_update_base(get_cpu_classqueue(this_cpu)); + update_class_cputime(this_cpu); + } + + read_unlock(&class_list_lock); } + +#endif //CONFIG_CKRM_CPU_SCHEDULE + #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 98f7a1eba..dd5005295 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -94,7 +94,7 @@ extern unsigned long avenrun[]; /* Load averages */ extern int nr_threads; extern int last_pid; DECLARE_PER_CPU(unsigned long, process_counts); -DECLARE_PER_CPU(struct runqueue, runqueues); +// DECLARE_PER_CPU(struct runqueue, runqueues); -- removed after ckrm cpu v7 merge extern int nr_processes(void); extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); @@ -429,6 +429,25 @@ int set_current_groups(struct group_info *group_info); struct audit_context; /* See audit.c */ struct mempolicy; +#ifdef CONFIG_CKRM_CPU_SCHEDULE +/** + * ckrm_cpu_demand_stat - used to track the cpu demand of a task/class + * @run: how much time it has been running since the counter started + * @total: total time since the counter started + * @last_sleep: the last time it sleeps, last_sleep = 0 when not sleeping + * @recalc_interval: how often do we recalculate the cpu_demand + * @cpu_demand: moving average of run/total + */ +struct ckrm_cpu_demand_stat { + unsigned long long run; + unsigned long long total; + unsigned long long last_sleep; + unsigned long long recalc_interval; + unsigned long cpu_demand; /*estimated cpu demand */ +}; +#endif + + struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ struct thread_info *thread_info; @@ -528,7 +547,6 @@ struct task_struct { /* signal handlers */ struct signal_struct *signal; struct sighand_struct *sighand; - sigset_t blocked, real_blocked; struct sigpending pending; @@ -594,7 +612,9 @@ struct task_struct { struct list_head taskclass_link; #ifdef CONFIG_CKRM_CPU_SCHEDULE struct ckrm_cpu_class *cpu_class; -#endif + //track cpu demand of this task + struct ckrm_cpu_demand_stat demand_stat; +#endif //CONFIG_CKRM_CPU_SCHEDULE #endif // CONFIG_CKRM_TYPE_TASKCLASS #ifdef CONFIG_CKRM_RES_MEM struct list_head mm_peers; // list of tasks using same mm_struct @@ -781,83 +801,6 @@ extern int idle_cpu(int cpu); void yield(void); -/* - * These are the runqueue data structures: - */ -typedef struct runqueue runqueue_t; - -#ifdef CONFIG_CKRM_CPU_SCHEDULE -#include -#endif - -#ifdef CONFIG_CKRM_CPU_SCHEDULE - -/** - * if belong to different class, compare class priority - * otherwise compare task priority - */ -#define TASK_PREEMPTS_CURR(p, rq) \ - (((p)->cpu_class != (rq)->curr->cpu_class) && ((rq)->curr != (rq)->idle))? class_preempts_curr((p),(rq)->curr) : ((p)->prio < (rq)->curr->prio) -#else -#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) -struct prio_array { - unsigned int nr_active; - unsigned long bitmap[BITMAP_SIZE]; - struct list_head queue[MAX_PRIO]; -}; -#define rq_active(p,rq) (rq->active) -#define rq_expired(p,rq) (rq->expired) -#define ckrm_rebalance_tick(j,this_cpu) do {} while (0) -#define TASK_PREEMPTS_CURR(p, rq) \ - ((p)->prio < (rq)->curr->prio) -#endif - -/* - * This is the main, per-CPU runqueue data structure. - * - * Locking rule: those places that want to lock multiple runqueues - * (such as the load balancing or the thread migration code), lock - * acquire operations must be ordered by ascending &runqueue. 
- */ -struct runqueue { - spinlock_t lock; - - /* - * nr_running and cpu_load should be in the same cacheline because - * remote CPUs use both these fields when doing load calculation. - */ - unsigned long nr_running; -#if defined(CONFIG_SMP) - unsigned long cpu_load; -#endif - unsigned long long nr_switches, nr_preempt; - unsigned long expired_timestamp, nr_uninterruptible; - unsigned long long timestamp_last_tick; - task_t *curr, *idle; - struct mm_struct *prev_mm; -#ifdef CONFIG_CKRM_CPU_SCHEDULE - unsigned long ckrm_cpu_load; - struct classqueue_struct classqueue; -#else - prio_array_t *active, *expired, arrays[2]; -#endif - int best_expired_prio; - atomic_t nr_iowait; - -#ifdef CONFIG_SMP - struct sched_domain *sd; - - /* For active balancing */ - int active_balance; - int push_cpu; - - task_t *migration_thread; - struct list_head migration_queue; -#endif - struct list_head hold_queue; - int idle_tokens; -}; - /* * The default (Linux) execution domain. */ @@ -894,6 +837,7 @@ static inline struct user_struct *get_uid(struct user_struct *u) atomic_inc(&u->__count); return u; } + extern void free_uid(struct user_struct *); extern void switch_uid(struct user_struct *); @@ -999,6 +943,7 @@ static inline int capable(int cap) } #endif + /* * Routines for handling mm_structs */ @@ -1132,7 +1077,7 @@ static inline struct mm_struct * get_task_mm(struct task_struct * task) return mm; } - + /* set thread flags in other task's structures * - see asm/thread_info.h for TIF_xxxx flags available */ diff --git a/init/Kconfig b/init/Kconfig index da1b24f0e..e63697a39 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -175,21 +175,12 @@ config CKRM_RES_NUMTASKS config CKRM_CPU_SCHEDULE bool "CKRM CPU scheduler" depends on CKRM_TYPE_TASKCLASS - default m + default y help Use CKRM CPU scheduler instead of Linux Scheduler Say N if unsure, Y to use the feature. -config CKRM_CPU_MONITOR - bool "CKRM CPU Resoure Monitor" - depends on CKRM_CPU_SCHEDULE - default m - help - Monitor CPU Resource Usage of the classes - - Say N if unsure, Y to use the feature. - config CKRM_RES_BLKIO tristate " Disk I/O Resource Controller" depends on CKRM_TYPE_TASKCLASS && IOSCHED_CFQ diff --git a/init/main.c b/init/main.c index e93d25685..6416eab8d 100644 --- a/init/main.c +++ b/init/main.c @@ -55,6 +55,7 @@ int __init init_ckrm_sched_res(void); #else #define init_ckrm_sched_res() ((void)0) #endif +//#include /* * This is one of the first .c files built. Error out early @@ -476,6 +477,7 @@ asmlinkage void __init start_kernel(void) * printk() and can access its per-cpu storage. */ smp_prepare_boot_cpu(); + /* * Set up the scheduler prior starting any interrupts (such as the * timer interrupt). Full topology setup happens at smp_init() @@ -695,7 +697,9 @@ static int init(void * unused) * firmware files. 
*/ populate_rootfs(); + do_basic_setup(); + init_ckrm_sched_res(); sched_init_smp(); diff --git a/kernel/Makefile b/kernel/Makefile index 905f3c59d..ec5001052 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -27,12 +27,9 @@ obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_IKCONFIG_PROC) += configs.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o -obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_classqueue.o -obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_sched.o +obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_classqueue.o ckrm_sched.o obj-$(CONFIG_AUDIT) += audit.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o -obj-$(CONFIG_KGDB) += kgdbstub.o - ifneq ($(CONFIG_IA64),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/ckrm/Makefile b/kernel/ckrm/Makefile index 32b576b9b..b32530977 100644 --- a/kernel/ckrm/Makefile +++ b/kernel/ckrm/Makefile @@ -9,6 +9,5 @@ endif obj-$(CONFIG_CKRM_RES_NUMTASKS) += ckrm_numtasks.o obj-$(CONFIG_CKRM_TYPE_SOCKETCLASS) += ckrm_sockc.o obj-$(CONFIG_CKRM_RES_LISTENAQ) += ckrm_laq.o - obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_cpu_class.o - obj-$(CONFIG_CKRM_CPU_MONITOR) += ckrm_cpu_monitor.o - obj-$(CONFIG_CKRM_RES_MEM) += ckrm_mem.o + obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_cpu_class.o ckrm_cpu_monitor.o + obj-$(CONFIG_CKRM_RES_MEM) += ckrm_mem.o diff --git a/kernel/ckrm/ckrm_cpu_class.c b/kernel/ckrm/ckrm_cpu_class.c index 0ded7f3c6..ad45380ee 100644 --- a/kernel/ckrm/ckrm_cpu_class.c +++ b/kernel/ckrm/ckrm_cpu_class.c @@ -23,17 +23,32 @@ #include #include - struct ckrm_res_ctlr cpu_rcbs; +/** + * insert_cpu_class - insert a class to active_cpu_class list + * + * insert the class in decreasing order of class weight + */ +static inline void insert_cpu_class(struct ckrm_cpu_class *cls) +{ + list_add(&cls->links,&active_cpu_classes); +} + /* * initialize a class object and its local queues */ - static void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares) +void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares) { int i,j,k; prio_array_t *array; - struct ckrm_local_runqueue* queue; + ckrm_lrq_t* queue; + + cls->shares = *shares; + cls->cnt_lock = SPIN_LOCK_UNLOCKED; + ckrm_cpu_stat_init(&cls->stat); + ckrm_usage_init(&cls->usage); + cls->magic = CKRM_CPU_CLASS_MAGIC; for (i = 0 ; i < NR_CPUS ; i++) { queue = &cls->local_queues[i]; @@ -58,34 +73,37 @@ struct ckrm_res_ctlr cpu_rcbs; queue->top_priority = MAX_PRIO; cq_node_init(&queue->classqueue_linkobj); queue->local_cvt = 0; - queue->uncounted_cvt = 0; + queue->lrq_load = 0; + queue->local_weight = cpu_class_weight(cls); queue->uncounted_ns = 0; + queue->savings = 0; queue->magic = 0x43FF43D7; } - cls->shares = *shares; - cls->global_cvt = 0; - cls->cnt_lock = SPIN_LOCK_UNLOCKED; - ckrm_cpu_stat_init(&cls->stat); - // add to class list write_lock(&class_list_lock); - list_add(&cls->links,&active_cpu_classes); + insert_cpu_class(cls); write_unlock(&class_list_lock); } static inline void set_default_share(ckrm_shares_t *shares) { shares->my_guarantee = 0; - shares->my_limit = CKRM_SHARE_DFLT_MAX_LIMIT; shares->total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - shares->max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; shares->unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - shares->cur_max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; + shares->my_limit = CKRM_SHARE_DFLT_MAX_LIMIT; + shares->max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; + shares->cur_max_limit = 0; } -struct ckrm_cpu_class * ckrm_get_cpu_class(struct ckrm_core_class *core) { - return ckrm_get_res_class(core, 
cpu_rcbs.resid, struct ckrm_cpu_class); +struct ckrm_cpu_class * ckrm_get_cpu_class(struct ckrm_core_class *core) +{ + struct ckrm_cpu_class * cls; + cls = ckrm_get_res_class(core, cpu_rcbs.resid, struct ckrm_cpu_class); + if (valid_cpu_class(cls)) + return cls; + else + return NULL; } @@ -94,7 +112,7 @@ void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class struct ckrm_cpu_class *cls; if (! parent) /*root class*/ - cls = default_cpu_class; + cls = get_default_cpu_class(); else cls = (struct ckrm_cpu_class *) kmalloc(sizeof(struct ckrm_cpu_class),GFP_ATOMIC); @@ -113,7 +131,7 @@ void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class cls->parent = parent; } } else - printk("alloc_cpu_class failed GFP_ATOMIC\n"); + printk(KERN_ERR"alloc_cpu_class failed\n"); return cls; } @@ -132,7 +150,7 @@ static void ckrm_free_cpu_class(void *my_res) return; /*the default class can't be freed*/ - if (cls == default_cpu_class) + if (cls == get_default_cpu_class()) return; // Assuming there will be no children when this function is called @@ -187,7 +205,16 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share) parres = NULL; } + /* + * hzheng: CKRM_SHARE_DONTCARE should be handled + */ + if (new_share->my_guarantee == CKRM_SHARE_DONTCARE) + new_share->my_guarantee = 0; + rc = set_shares(new_share, cur, par); + if (cur->my_limit == CKRM_SHARE_DONTCARE) + cur->my_limit = cur->max_limit; + spin_unlock(&cls->cnt_lock); if (cls->parent) { @@ -196,9 +223,6 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share) return rc; } -/* - * translate the global_CVT to ticks - */ static int ckrm_cpu_get_share(void *my_res, struct ckrm_shares *shares) { @@ -213,35 +237,42 @@ static int ckrm_cpu_get_share(void *my_res, int ckrm_cpu_get_stats(void *my_res, struct seq_file * sfile) { struct ckrm_cpu_class *cls = my_res; + struct ckrm_cpu_class_stat* stat = &cls->stat; + ckrm_lrq_t* lrq; + int i; if (!cls) return -EINVAL; seq_printf(sfile, "-------- CPU Class Status Start---------\n"); - seq_printf(sfile, " gua= %d limit= %d\n", + seq_printf(sfile, "Share:\n\tgrt= %d limit= %d total_grt= %d max_limit= %d\n", cls->shares.my_guarantee, - cls->shares.my_limit); - seq_printf(sfile, " total_gua= %d limit= %d\n", + cls->shares.my_limit, cls->shares.total_guarantee, cls->shares.max_limit); - seq_printf(sfile, " used_gua= %d cur_limit= %d\n", + seq_printf(sfile, "\tunused_grt= %d cur_max_limit= %d\n", cls->shares.unused_guarantee, cls->shares.cur_max_limit); - seq_printf(sfile, " Share= %d\n",cpu_class_weight(cls)); - seq_printf(sfile, " cvt= %llu\n",cls->local_queues[0].local_cvt); - seq_printf(sfile, " total_ns= %llu\n",cls->stat.total_ns); - seq_printf(sfile, " prio= %d\n",cls->local_queues[0].classqueue_linkobj.prio); - seq_printf(sfile, " index= %d\n",cls->local_queues[0].classqueue_linkobj.index); - seq_printf(sfile, " run= %llu\n",cls->stat.local_stats[0].run); - seq_printf(sfile, " total= %llu\n",cls->stat.local_stats[0].total); - seq_printf(sfile, " cpu_demand= %lu\n",cls->stat.cpu_demand); - - seq_printf(sfile, " effective_guarantee= %d\n",cls->stat.effective_guarantee); - seq_printf(sfile, " effective_limit= %d\n",cls->stat.effective_limit); - seq_printf(sfile, " effective_share= %d\n",cls->stat.effective_share); - seq_printf(sfile, "-------- CPU Class Status END ---------\n"); + seq_printf(sfile, "Effective:\n\tegrt= %d\n",stat->egrt); + seq_printf(sfile, "\tmegrt= %d\n",stat->megrt); + seq_printf(sfile, "\tehl= %d\n",stat->ehl); + 
seq_printf(sfile, "\tmehl= %d\n",stat->mehl); + seq_printf(sfile, "\teshare= %d\n",stat->eshare); + seq_printf(sfile, "\tmeshare= %d\n",cpu_class_weight(cls)); + seq_printf(sfile, "\tmax_demand= %lu\n",stat->max_demand); + seq_printf(sfile, "\ttotal_ns= %llu\n",stat->total_ns); + seq_printf(sfile, "\tusage(2,10,60)= %d %d %d\n", + get_ckrm_usage(cls,2*HZ), + get_ckrm_usage(cls,10*HZ), + get_ckrm_usage(cls,60*HZ) + ); + for_each_online_cpu(i) { + lrq = get_ckrm_lrq(cls,i); + seq_printf(sfile, "\tlrq %d demand= %lu weight= %d lrq_load= %lu cvt= %llu sav=%lu\n",i,stat->local_stats[i].cpu_demand,local_class_weight(lrq),lrq->lrq_load,lrq->local_cvt,lrq->savings); + } + seq_printf(sfile, "-------- CPU Class Status END ---------\n"); return 0; } @@ -249,28 +280,16 @@ int ckrm_cpu_get_stats(void *my_res, struct seq_file * sfile) /* * task will remain in the same cpu but on a different local runqueue */ -static void ckrm_cpu_change_class(void *task, void *old, void *new) +void ckrm_cpu_change_class(void *task, void *old, void *new) { struct task_struct *tsk = task; struct ckrm_cpu_class *newcls = new; - unsigned long flags; - struct runqueue *rq; - prio_array_t *array; /*sanity checking*/ if (!task || ! old || !new) return; - rq = task_rq_lock(tsk,&flags); - array = tsk->array; - if (array) { - dequeue_task(tsk,array); - tsk->cpu_class = newcls; - enqueue_task(tsk,rq_active(tsk,rq)); - } else { - tsk->cpu_class = newcls; - } - task_rq_unlock(rq,&flags); + _ckrm_cpu_change_class(tsk,newcls); } /*dummy function, not used*/ @@ -297,7 +316,7 @@ static int ckrm_cpu_set_config(void *my_res, const char *cfgstr) } struct ckrm_res_ctlr cpu_rcbs = { - .res_name = "CKRM CPU Class", + .res_name = "cpu", .res_hdepth = 1, .resid = -1, .res_alloc = ckrm_alloc_cpu_class, @@ -339,10 +358,11 @@ void init_cpu_classes(void) //init classqueues for each processor for (i=0; i < NR_CPUS; i++) classqueue_init(get_cpu_classqueue(i)); -/* - * hzheng: initialize the default cpu class - * required for E14 since ckrm_init is called after sched_init - */ + + /* + * hzheng: initialize the default cpu class + * required for E14/E15 since ckrm_init is called after sched_init + */ ckrm_alloc_cpu_class(NULL,NULL); } diff --git a/kernel/ckrm/ckrm_cpu_monitor.c b/kernel/ckrm/ckrm_cpu_monitor.c index 674ee6e50..c83c83fca 100644 --- a/kernel/ckrm/ckrm_cpu_monitor.c +++ b/kernel/ckrm/ckrm_cpu_monitor.c @@ -28,36 +28,84 @@ #include #include -#define CPU_MONITOR_INTERVAL (4*HZ) /*how often do we adjust the shares*/ -#define CKRM_SHARE_ACCURACY 7 +#define CPU_MONITOR_INTERVAL (HZ) /*how often do we adjust the shares*/ #define CKRM_SHARE_MAX (1<shares.my_limit; +} + +static inline int get_mysoft_limit(struct ckrm_cpu_class *cls) +{ + return cls->shares.total_guarantee; +} + +static inline int get_hard_limit(struct ckrm_cpu_class *cls) +{ + return cls->shares.total_guarantee; +} + +static inline int get_myhard_limit(struct ckrm_cpu_class *cls) +{ + return cls->shares.total_guarantee; +} + + +static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, int type) +{ + unsigned long long now = sched_clock(); + + local_stat->run = 0; + local_stat->total = 0; + local_stat->last_sleep = now; + switch (type) { + case CPU_DEMAND_TP_CLASS: + local_stat->recalc_interval = CPU_DEMAND_CLASS_RECALC; + local_stat->cpu_demand = 0; + break; + case CPU_DEMAND_TP_TASK: + local_stat->recalc_interval = CPU_DEMAND_TASK_RECALC; + //for task, the init cpu_demand is copied from its parent + break; + default: + BUG(); + } +} void 
ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat) { int i; - struct ckrm_cpu_class_local_stat* local_stat; - unsigned long long now = sched_clock(); stat->stat_lock = SPIN_LOCK_UNLOCKED; stat->total_ns = 0; - stat->cpu_demand = 0; + stat->max_demand = 0; for (i=0; i< NR_CPUS; i++) { - local_stat = &stat->local_stats[i]; - local_stat->run = 0; - local_stat->total = 0; - local_stat->last_sleep = now; - local_stat->cpu_demand = 0; + cpu_demand_stat_init(&stat->local_stats[i],CPU_DEMAND_TP_CLASS); } - stat->effective_guarantee = 0; - stat->effective_limit = 0; - stat->glut = 0; - stat->effective_share = 100; - stat->self_effective_share = 100; + stat->egrt = 0; + stat->megrt = 0; + stat->ehl = CKRM_SHARE_MAX; /*default: no limit*/ + stat->mehl = CKRM_SHARE_MAX; /*default: no limit */ + + stat->eshare = CKRM_SHARE_MAX; + stat->meshare = CKRM_SHARE_MAX; } + /**********************************************/ /* cpu demand */ /**********************************************/ @@ -77,52 +125,42 @@ void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat) */ /** - * update_cpu_demand - update a state change + * update_cpu_demand_stat - * - * should be called whenever the state of a local queue changes + * should be called whenever the state of a task/task local queue changes * -- when deschedule : report how much run * -- when enqueue: report how much sleep * - * to deal with excessive long run/sleep state - * -- whenever the the ckrm_cpu_monitor is called, check if the class is in sleep state, if yes, then update sleep record + * how often should we recalculate the cpu demand + * the number is in ns */ -#define CKRM_CPU_DEMAND_RUN 0 -#define CKRM_CPU_DEMAND_SLEEP 1 -//how often should we recalculate the cpu demand, in ns -#define CPU_DEMAND_CAL_THRESHOLD (1000000000LL) -static inline void update_local_cpu_demand(struct ckrm_cpu_class_local_stat* local_stat,int state, unsigned long long len) +static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat,int state, unsigned long long len) { local_stat->total += len; if (state == CKRM_CPU_DEMAND_RUN) local_stat->run += len; - if (local_stat->total >= CPU_DEMAND_CAL_THRESHOLD) { + if (local_stat->total >= local_stat->recalc_interval) { local_stat->total >>= CKRM_SHARE_ACCURACY; - if (local_stat->total > 0xFFFFFFFF) - local_stat->total = 0xFFFFFFFF; + if (unlikely(local_stat->run > 0xFFFFFFFF)) + local_stat->run = 0xFFFFFFFF; + if (local_stat->total > 0xFFFFFFFF) + local_stat->total = 0xFFFFFFFF; + do_div(local_stat->run,(unsigned long)local_stat->total); - local_stat->cpu_demand +=local_stat->run; - local_stat->cpu_demand >>= 1; + + if (local_stat->total > 0xFFFFFFFF) //happens after very long sleep + local_stat->cpu_demand = local_stat->run; + else { + local_stat->cpu_demand += local_stat->run; + local_stat->cpu_demand >>= 1; + } local_stat->total = 0; local_stat->run = 0; } } -static inline void cpu_demand_update_run(struct ckrm_cpu_class_local_stat* local_stat, unsigned long long len) -{ - update_local_cpu_demand(local_stat,CKRM_CPU_DEMAND_RUN,len); -} - -static inline void cpu_demand_update_sleep(struct ckrm_cpu_class_local_stat* local_stat, unsigned long long len) -{ - update_local_cpu_demand(local_stat,CKRM_CPU_DEMAND_SLEEP,len); -} - -#define CPU_DEMAND_ENQUEUE 0 -#define CPU_DEMAND_DEQUEUE 1 -#define CPU_DEMAND_DESCHEDULE 2 - /** * cpu_demand_event - and cpu_demand event occured * @event: one of the following three events: @@ -131,19 +169,24 @@ static inline void cpu_demand_update_sleep(struct ckrm_cpu_class_local_stat* loc * 
CPU_DEMAND_DESCHEDULE: one task belong a certain local class deschedule * @len: valid only for CPU_DEMAND_DESCHEDULE, how long the task has been run */ -void cpu_demand_event(struct ckrm_cpu_class_local_stat* local_stat, int event, unsigned long long len) +void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len) { switch (event) { case CPU_DEMAND_ENQUEUE: len = sched_clock() - local_stat->last_sleep; local_stat->last_sleep = 0; - cpu_demand_update_sleep(local_stat,len); + update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,len); break; case CPU_DEMAND_DEQUEUE: - local_stat->last_sleep = sched_clock(); + if (! local_stat->last_sleep) { + local_stat->last_sleep = sched_clock(); + } break; case CPU_DEMAND_DESCHEDULE: - cpu_demand_update_run(local_stat,len); + update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_RUN,len); + break; + case CPU_DEMAND_INIT: //for task init only + cpu_demand_stat_init(local_stat,CPU_DEMAND_TP_TASK); break; default: BUG(); @@ -152,18 +195,19 @@ void cpu_demand_event(struct ckrm_cpu_class_local_stat* local_stat, int event, u /** * check all the class local queue - * if local queueu is not in runqueue, then it's in sleep state - * if compare to last sleep, + * + * to deal with excessive long run/sleep state + * -- whenever the the ckrm_cpu_monitor is called, check if the class is in sleep state, if yes, then update sleep record */ static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu) { - struct ckrm_cpu_class_local_stat * local_stat = &stat->local_stats[cpu]; + struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu]; unsigned long long sleep,now; if (local_stat->last_sleep) { now = sched_clock(); sleep = now - local_stat->last_sleep; local_stat->last_sleep = now; - cpu_demand_update_sleep(local_stat,sleep); + update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep); } } @@ -172,51 +216,72 @@ static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int * * self_cpu_demand = sum(cpu demand of all local queues) */ -static unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat - *stat) +static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat) { int cpu_demand = 0; int i; + int cpuonline = 0; for_each_online_cpu(i) { cpu_demand_check_sleep(stat,i); cpu_demand += stat->local_stats[i].cpu_demand; + cpuonline ++; } - if (cpu_demand > CKRM_SHARE_MAX) - cpu_demand = CKRM_SHARE_MAX; - return cpu_demand; + return (cpu_demand/cpuonline); } /* - * update effective cpu demand for each class - * assume the root_core->parent == NULL + * my max demand = min(cpu_demand, my effective hard limit) */ -static void update_cpu_demand(struct ckrm_core_class *root_core) +static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat) +{ + unsigned long mmax_demand = get_self_cpu_demand(stat); + if (mmax_demand > stat->mehl) + mmax_demand = stat->mehl; + + return mmax_demand; +} + +/** + * update_max_demand: update effective cpu demand for each class + * return -1 on error + * + * Assume: the root_core->parent == NULL + */ +static int update_max_demand(struct ckrm_core_class *root_core) { struct ckrm_core_class *cur_core, *child_core; - struct ckrm_cpu_class *cls; + struct ckrm_cpu_class *cls,*c_cls; + int ret = -1; cur_core = root_core; child_core = NULL; - /* - * iterate the tree - * update cpu_demand of each node - */ - repeat: - if (!cur_core) - return; + + repeat: + if (!cur_core) { //normal exit + ret = 0; + goto out; + } cls = 
ckrm_get_cpu_class(cur_core); + if (! cls) //invalid c_cls, abort + goto out; + if (!child_core) //first child - cls->stat.cpu_demand = get_self_cpu_demand(&cls->stat); + cls->stat.max_demand = get_mmax_demand(&cls->stat); else { - cls->stat.cpu_demand += - ckrm_get_cpu_class(child_core)->stat.cpu_demand; - if (cls->stat.cpu_demand > CKRM_SHARE_MAX) - cls->stat.cpu_demand = CKRM_SHARE_MAX; + c_cls = ckrm_get_cpu_class(child_core); + if (c_cls) + cls->stat.max_demand += c_cls->stat.max_demand; + else //invalid c_cls, abort + goto out; } + //check class hard limit + if (cls->stat.max_demand > cls->stat.ehl) + cls->stat.max_demand = cls->stat.ehl; + //next child child_core = ckrm_get_next_child(cur_core, child_core); if (child_core) { @@ -229,78 +294,116 @@ static void update_cpu_demand(struct ckrm_core_class *root_core) cur_core = child_core->hnode.parent; } goto repeat; + out: + return ret; } /**********************************************/ /* effective guarantee & limit */ /**********************************************/ -static inline void set_effective_share(struct ckrm_cpu_class_stat *stat, +static inline void set_eshare(struct ckrm_cpu_class_stat *stat, int new_share) { if (!new_share) new_share = 1; - stat->effective_share = new_share; + + BUG_ON(new_share < 0); + stat->eshare = new_share; } -static inline void set_self_effective_share(struct ckrm_cpu_class_stat *stat, +static inline void set_meshare(struct ckrm_cpu_class_stat *stat, int new_share) { if (!new_share) new_share = 1; - stat->self_effective_share = new_share; + + BUG_ON(new_share < 0); + stat->meshare = new_share; } -static inline void update_child_effective(struct ckrm_core_class *parent) +/** + *update_child_effective - update egrt, ehl, mehl for all children of parent + *@parent: the parent node + *return -1 if anything wrong + * + */ +static int update_child_effective(struct ckrm_core_class *parent) { struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent); - struct ckrm_core_class *child_core = ckrm_get_next_child(parent, NULL); + struct ckrm_core_class *child_core; + int ret = -1; + + if (! p_cls) + return ret; + child_core = ckrm_get_next_child(parent, NULL); while (child_core) { struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core); + if (! 
c_cls) + return ret; - c_cls->stat.effective_guarantee = - p_cls->stat.effective_guarantee * + c_cls->stat.egrt = + p_cls->stat.egrt * c_cls->shares.my_guarantee / p_cls->shares.total_guarantee; - c_cls->stat.effective_limit = - p_cls->stat.effective_guarantee * c_cls->shares.my_limit / - p_cls->shares.total_guarantee; + + c_cls->stat.megrt = c_cls->stat.egrt * c_cls->shares.unused_guarantee + / c_cls->shares.total_guarantee; + + c_cls->stat.ehl = + p_cls->stat.ehl * + get_hard_limit(c_cls) / p_cls->shares.total_guarantee; + + c_cls->stat.mehl = + c_cls->stat.ehl * + get_myhard_limit(c_cls) / c_cls->shares.total_guarantee; child_core = ckrm_get_next_child(parent, child_core); }; - + return 0; } -/* - * update effective guarantee and effective limit - * -- effective share = parent->effective->share * share/parent->total_share - * -- effective limit = parent->effective->share * limit/parent->total_share +/** + * update_effectives: update egrt, ehl, mehl for the whole tree * should be called only when class structure changed + * + * return -1 if anything wrong happened (eg: the structure changed during the process) */ -static void update_effective_guarantee_limit(struct ckrm_core_class *root_core) +static int update_effectives(struct ckrm_core_class *root_core) { - struct ckrm_core_class *cur_core, *child_core = NULL; + struct ckrm_core_class *cur_core, *child_core; struct ckrm_cpu_class *cls; + int ret = -1; cur_core = root_core; + child_core = NULL; cls = ckrm_get_cpu_class(cur_core); - cls->stat.effective_guarantee = CKRM_SHARE_MAX; - cls->stat.effective_limit = cls->stat.effective_guarantee; - repeat: + //initialize the effectives for root + cls->stat.egrt = CKRM_SHARE_MAX; /*egrt of the root is always 100% */ + cls->stat.megrt = cls->stat.egrt * cls->shares.unused_guarantee + / cls->shares.total_guarantee; + cls->stat.ehl = CKRM_SHARE_MAX * get_hard_limit(cls) + / cls->shares.total_guarantee; + cls->stat.mehl = cls->stat.ehl * get_myhard_limit(cls) + / cls->shares.total_guarantee; + + repeat: //check exit if (!cur_core) - return; + return 0; //visit this node - update_child_effective(cur_core); + if (update_child_effective(cur_core) < 0) + return ret; //invalid cur_core node + //next child child_core = ckrm_get_next_child(cur_core, child_core); + if (child_core) { - //go down + //go down to the next hier cur_core = child_core; child_core = NULL; - goto repeat; - } else { //no more child, go back + } else { //no more child, go back child_core = cur_core; cur_core = child_core->hnode.parent; } @@ -312,12 +415,12 @@ static void update_effective_guarantee_limit(struct ckrm_core_class *root_core) /**********************************************/ /* - * surplus = my_effective_share - demand + * surplus = egrt - demand * if surplus < 0, surplus = 0 */ static inline int get_node_surplus(struct ckrm_cpu_class *cls) { - int surplus = cls->stat.effective_guarantee - cls->stat.cpu_demand; + int surplus = cls->stat.egrt - cls->stat.max_demand; if (surplus < 0) surplus = 0; @@ -325,122 +428,199 @@ static inline int get_node_surplus(struct ckrm_cpu_class *cls) return surplus; } -/* - * consume the surplus +static inline int get_my_node_surplus(struct ckrm_cpu_class *cls) +{ + int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat); + + if (surplus < 0) + surplus = 0; + + return surplus; +} + +/** + * node_surplus_consume: consume the surplus + * @ckeck_sl: if check_sl is set, then check soft_limit + * @total_grt: total guarantee * return how much consumed - * set glut when necessary + * return -1 on 
error + * + * implements all the CKRM Scheduling Requirement + * update total_grt if necessary */ -static inline int node_surplus_consume(int old_surplus, +static inline int node_surplus_consume(int surplus, struct ckrm_core_class *child_core, - struct ckrm_cpu_class *p_cls) + struct ckrm_cpu_class *p_cls, + int check_sl + ) { int consumed = 0; int inc_limit; + int glut = 1; struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core); + int total_grt = p_cls->shares.total_guarantee; - if (c_cls->stat.glut) + BUG_ON(surplus < 0); + + if (! c_cls || ! total_grt) goto out; - //check demand - if (c_cls->stat.effective_share >= c_cls->stat.cpu_demand) { - c_cls->stat.glut = 1; + /*can't consume more than demand or hard limit*/ + if (c_cls->stat.eshare >= c_cls->stat.max_demand) goto out; - } consumed = - old_surplus * c_cls->shares.my_guarantee / - p_cls->shares.total_guarantee; + surplus * c_cls->shares.my_guarantee / total_grt; - //check limit - inc_limit = c_cls->stat.effective_limit - c_cls->stat.effective_share; - if (inc_limit <= consumed) { - c_cls->stat.glut = 1; - consumed = inc_limit; + if (! consumed) //no more share + goto out; + + //hard limit and demand limit + inc_limit = c_cls->stat.max_demand - c_cls->stat.eshare; + + if (check_sl) { + int esl = p_cls->stat.eshare * get_soft_limit(c_cls) + /p_cls->shares.total_guarantee; + if (esl < c_cls->stat.max_demand) + inc_limit = esl - c_cls->stat.eshare; } - c_cls->stat.effective_share += consumed; - out: + + if (consumed > inc_limit) + consumed = inc_limit; + else + glut = 0; + + BUG_ON(consumed < 0); + set_eshare(&c_cls->stat,c_cls->stat.eshare + consumed); + BUG_ON(c_cls->stat.eshare < 0); + + out: return consumed; } -/* - * re-allocate the shares for all the childs under this node +/** + * alloc_surplus_node: re-allocate the shares for children under parent + * @parent: parent node + * return the remaining surplus + * * task: * 1. get total surplus * 2. allocate surplus * 3. set the effective_share of each node */ -static void alloc_surplus_node(struct ckrm_core_class *parent) +static int alloc_surplus_node(struct ckrm_core_class *parent) { - int total_surplus = 0, old_surplus = 0; + int total_surplus , old_surplus; struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent); struct ckrm_core_class *child_core = NULL; int self_share; + int check_sl; + int ret = -1; + + if (! p_cls) + return ret; + + total_surplus = get_my_node_surplus(p_cls); /* - * calculate surplus - * total_surplus = sum(child_surplus) - * reset glut flag * initialize effective_share */ do { child_core = ckrm_get_next_child(parent, child_core); if (child_core) { - struct ckrm_cpu_class *c_cls = - ckrm_get_cpu_class(child_core); - ckrm_stat_t *stat = &c_cls->stat; + struct ckrm_cpu_class *c_cls; + + c_cls = ckrm_get_cpu_class(child_core); + if (! c_cls) + return ret; total_surplus += get_node_surplus(c_cls); - stat->glut = 0; - set_effective_share(stat, stat->effective_guarantee); + + set_eshare(&c_cls->stat, c_cls->stat.egrt); } } while (child_core); - /*distribute the surplus */ + if (! 
total_surplus) + goto realloc_out; + + /* distribute the surplus */ child_core = NULL; + check_sl = 1; + old_surplus = 0; do { - if (!child_core) //keep the surplus of last round + if (!child_core) {//start a new round + + //ok, everybody reached the soft limit + if (old_surplus == total_surplus) + check_sl = 0; old_surplus = total_surplus; + } child_core = ckrm_get_next_child(parent, child_core); - if (child_core) { - total_surplus -= - node_surplus_consume(old_surplus, child_core, - p_cls); + if (child_core) { + int consumed = 0; + consumed -= + node_surplus_consume(old_surplus, child_core, + p_cls,check_sl); + if (consumed >= 0) + total_surplus -= consumed; + else + return ret; } //start a new round if something is allocated in the last round - } while (child_core || (total_surplus != old_surplus)); + } while (child_core || check_sl || total_surplus != old_surplus); - //any remaining surplus goes to the default class - self_share = p_cls->stat.effective_share * + realloc_out: + /*how much for itself*/ + self_share = p_cls->stat.eshare * p_cls->shares.unused_guarantee / p_cls->shares.total_guarantee; - self_share += total_surplus; - set_self_effective_share(&p_cls->stat, self_share); + if (self_share < p_cls->stat.max_demand) { + /*any remaining surplus goes to the default class*/ + self_share += total_surplus; + if (self_share > p_cls->stat.max_demand) + self_share = p_cls->stat.max_demand; + } + + set_meshare(&p_cls->stat, self_share); + return 0; } /** * alloc_surplus - reallocate unused shares * * class A's usused share should be allocated to its siblings + * the re-allocation goes downward from the top */ -static void alloc_surplus(struct ckrm_core_class *root_core) +static int alloc_surplus(struct ckrm_core_class *root_core) { - struct ckrm_core_class *cur_core, *child_core = NULL; + struct ckrm_core_class *cur_core, *child_core; struct ckrm_cpu_class *cls; + int ret = -1; + /*initialize*/ cur_core = root_core; + child_core = NULL; cls = ckrm_get_cpu_class(cur_core); - cls->stat.glut = 0; - set_effective_share(&cls->stat, cls->stat.effective_guarantee); + + //set root eshare + set_eshare(&cls->stat, cls->stat.egrt); + + /*the ckrm idle tasks get all what's remaining*/ + /*hzheng: uncomment the following like for hard limit support */ + // update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand); + repeat: //check exit if (!cur_core) - return; + return 0; //visit this node - alloc_surplus_node(cur_core); + if ( alloc_surplus_node(cur_core) < 0 ) + return ret; + //next child child_core = ckrm_get_next_child(cur_core, child_core); if (child_core) { @@ -455,6 +635,199 @@ static void alloc_surplus(struct ckrm_core_class *root_core) goto repeat; } +/**********************************************/ +/* CKRM Idle Tasks */ +/**********************************************/ +struct ckrm_cpu_class ckrm_idle_class_obj, *ckrm_idle_class; +struct task_struct* ckrm_idle_tasks[NR_CPUS]; + +/*how many ckrm idle tasks should I wakeup*/ +static inline int get_nr_idle(unsigned long surplus) +{ + int cpu_online = cpus_weight(cpu_online_map); + int nr_idle = 0; + + nr_idle = surplus * cpu_online; + nr_idle >>= CKRM_SHARE_ACCURACY; + + if (surplus) + nr_idle ++; + + if (nr_idle > cpu_online) + nr_idle = cpu_online; + + return nr_idle; +} + +/** + * update_ckrm_idle: update the status of the idle class according to the new surplus + * surplus: new system surplus + * + * Task: + * -- update share of the idle class + * -- wakeup idle tasks according to surplus + */ +void update_ckrm_idle(unsigned long surplus) 
+{ + int nr_idle = get_nr_idle(surplus); + int i; + struct task_struct* idle_task; + + set_eshare(&ckrm_idle_class->stat,surplus); + set_meshare(&ckrm_idle_class->stat,surplus); + /*wake up nr_idle idle tasks*/ + for_each_online_cpu(i) { + idle_task = ckrm_idle_tasks[i]; + if (unlikely(idle_task->cpu_class != ckrm_idle_class)) { + ckrm_cpu_change_class(idle_task, + idle_task->cpu_class, + ckrm_idle_class); + } + if (! idle_task) + continue; + if (i < nr_idle) { + //activate it + wake_up_process(idle_task); + } else { + //deactivate it + idle_task->state = TASK_INTERRUPTIBLE; + set_tsk_need_resched(idle_task); + } + } +} + +static int ckrm_cpu_idled(void *nothing) +{ + set_user_nice(current,19); + daemonize("ckrm_idle_task"); + + //deactivate it, it will be waked up by ckrm_cpu_monitor + current->state = TASK_INTERRUPTIBLE; + schedule(); + + /*similar to cpu_idle */ + while (1) { + while (!need_resched()) { + ckrm_cpu_monitor(); + if (current_cpu_data.hlt_works_ok) { + local_irq_disable(); + if (!need_resched()) { + set_tsk_need_resched(current); + safe_halt(); + } else + local_irq_enable(); + } + } + schedule(); + } + return 0; +} + +/** + * ckrm_start_ckrm_idle: + * create the ckrm_idle_class and starts the idle tasks + * + */ +void ckrm_start_ckrm_idle(void) +{ + int i; + int ret; + ckrm_shares_t shares; + + ckrm_idle_class = &ckrm_idle_class_obj; + memset(ckrm_idle_class,0,sizeof(shares)); + /*don't care about the shares */ + init_cpu_class(ckrm_idle_class,&shares); + printk(KERN_INFO"ckrm idle class %x created\n",(int)ckrm_idle_class); + + for_each_online_cpu(i) { + ret = kernel_thread(ckrm_cpu_idled, 0, CLONE_KERNEL); + + /*warn on error, but the system should still work without it*/ + if (ret < 0) + printk(KERN_ERR"Warn: can't start ckrm idle tasks\n"); + else { + ckrm_idle_tasks[i] = find_task_by_pid(ret); + if (!ckrm_idle_tasks[i]) + printk(KERN_ERR"Warn: can't find ckrm idle tasks %d\n",ret); + } + } +} + +/**********************************************/ +/* Local Weight */ +/**********************************************/ +/** + * adjust_class_local_weight: adjust the local weight for each cpu + * + * lrq->weight = lpr->pressure * class->weight / total_pressure + */ +static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online) +{ + unsigned long total_pressure = 0; + ckrm_lrq_t* lrq; + int i; + unsigned long class_weight; + unsigned long long lw; + + //get total pressure + for_each_online_cpu(i) { + lrq = get_ckrm_lrq(clsptr,i); + total_pressure += lrq->lrq_load; + } + + if (! total_pressure) + return; + + class_weight = cpu_class_weight(clsptr) * cpu_online; + + /* + * update weight for each cpu, minimun is 1 + */ + for_each_online_cpu(i) { + lrq = get_ckrm_lrq(clsptr,i); + if (! lrq->lrq_load) + /*give idle class a high share to boost interactiveness */ + lw = cpu_class_weight(clsptr); + else { + lw = lrq->lrq_load * class_weight; + do_div(lw,total_pressure); + if (!lw) + lw = 1; + else if (lw > CKRM_SHARE_MAX) + lw = CKRM_SHARE_MAX; + } + + lrq->local_weight = lw; + } +} + +/* + * assume called with class_list_lock read lock held + */ +void adjust_local_weight(void) +{ + static spinlock_t lock = SPIN_LOCK_UNLOCKED; + struct ckrm_cpu_class *clsptr; + int cpu_online; + + //do nothing if someone already holding the lock + if (! 
spin_trylock(&lock)) + return; + + cpu_online = cpus_weight(cpu_online_map); + + //class status: demand, share,total_ns prio, index + list_for_each_entry(clsptr,&active_cpu_classes,links) { + adjust_lrq_weight(clsptr,cpu_online); + } + + spin_unlock(&lock); +} + +/**********************************************/ +/* Main */ +/**********************************************/ /** *ckrm_cpu_monitor - adjust relative shares of the classes based on their progress * @@ -464,13 +837,43 @@ static void alloc_surplus(struct ckrm_core_class *root_core) */ void ckrm_cpu_monitor(void) { - struct ckrm_core_class *root_core = default_cpu_class->core; + static spinlock_t lock = SPIN_LOCK_UNLOCKED; + static unsigned long long last_check = 0; + struct ckrm_core_class *root_core = get_default_cpu_class()->core; + unsigned long long now; +#define MIN_CPU_MONITOR_INTERVAL 100000000UL + if (!root_core) return; - update_effective_guarantee_limit(root_core); - update_cpu_demand(root_core); - alloc_surplus(root_core); + //do nothing if someone already holding the lock + if (! spin_trylock(&lock)) + return; + + read_lock(&class_list_lock); + + now = sched_clock(); + + //consecutive check should be at least 100ms apart + if (now - last_check < MIN_CPU_MONITOR_INTERVAL) { + goto outunlock; + } + last_check = now; + + if (update_effectives(root_core) != 0) + goto outunlock; + + if (update_max_demand(root_core) != 0) + goto outunlock; + + if (alloc_surplus(root_core) != 0) + goto outunlock; + + adjust_local_weight(); + + outunlock: + read_unlock(&class_list_lock); + spin_unlock(&lock); } /*****************************************************/ @@ -481,14 +884,11 @@ static int thread_exit = 0; static int ckrm_cpu_monitord(void *nothing) { - wait_queue_head_t wait; - - init_waitqueue_head(&wait); - daemonize("ckrm_cpu_ctrld"); for (;;) { /*sleep for sometime before next try*/ - interruptible_sleep_on_timeout(&wait, CPU_MONITOR_INTERVAL); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(CPU_MONITOR_INTERVAL); ckrm_cpu_monitor(); if (thread_exit) { break; @@ -510,15 +910,14 @@ void ckrm_start_monitor(void) void ckrm_kill_monitor(void) { - wait_queue_head_t wait; int interval = HZ; - init_waitqueue_head(&wait); printk("killing process %d\n", cpu_monitor_pid); if (cpu_monitor_pid > 0) { thread_exit = 1; while (thread_exit != 2) { - interruptible_sleep_on_timeout(&wait, interval); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(CPU_MONITOR_INTERVAL); } } } @@ -526,6 +925,8 @@ void ckrm_kill_monitor(void) int ckrm_cpu_monitor_init(void) { ckrm_start_monitor(); + /*hzheng: uncomment the following like for hard limit support */ + // ckrm_start_ckrm_idle(); return 0; } diff --git a/kernel/ckrm_classqueue.c b/kernel/ckrm_classqueue.c index 1929aaf4e..0400844a3 100644 --- a/kernel/ckrm_classqueue.c +++ b/kernel/ckrm_classqueue.c @@ -133,12 +133,16 @@ void classqueue_update_prio(struct classqueue_struct *cq, //add to new positon, round robin for classes with same priority list_add_tail(&(node->list), &cq->array.queue[index]); - __set_bit(index, cq->array.bitmap); - + __set_bit(index, cq->array.bitmap); node->index = index; } -cq_node_t *classqueue_get_head(struct classqueue_struct *cq) +/** + *classqueue_get_min_prio: return the priority of the last node in queue + * + * this function can be called without runqueue lock held + */ +static inline int classqueue_get_min_prio(struct classqueue_struct *cq) { cq_node_t *result = NULL; int pos; @@ -147,9 +151,36 @@ cq_node_t *classqueue_get_head(struct classqueue_struct 
*cq) * search over the bitmap to get the first class in the queue */ pos = find_next_bit(cq->array.bitmap, CLASSQUEUE_SIZE, cq->base_offset); - if (pos >= CLASSQUEUE_SIZE) { //do circular search from the beginning + //do circular search from the beginning + if (pos >= CLASSQUEUE_SIZE) pos = find_first_bit(cq->array.bitmap, CLASSQUEUE_SIZE); + + if (pos < CLASSQUEUE_SIZE) { + result = list_entry(cq->array.queue[pos].next, cq_node_t, list); + if (list_empty(&cq->array.queue[pos])) + result = NULL; } + if (result) + return result->prio; + else + return 0; +} + +/** + * this function must be called with runqueue lock held + */ +cq_node_t *classqueue_get_head(struct classqueue_struct *cq) +{ + cq_node_t *result = NULL; + int pos; + + /* + * search over the bitmap to get the first class in the queue + */ + pos = find_next_bit(cq->array.bitmap, CLASSQUEUE_SIZE, cq->base_offset); + //do circular search from the beginning + if (pos >= CLASSQUEUE_SIZE) + pos = find_first_bit(cq->array.bitmap, CLASSQUEUE_SIZE); if (pos < CLASSQUEUE_SIZE) { BUG_ON(list_empty(&cq->array.queue[pos])); @@ -162,15 +193,17 @@ cq_node_t *classqueue_get_head(struct classqueue_struct *cq) * Moving the end of queue forward * the new_base here is logical, we need to translate to the abosule position */ -void classqueue_update_base(struct classqueue_struct *cq, int new_base) +void classqueue_update_base(struct classqueue_struct *cq) { - if (!cq_nr_member(cq)) { + int new_base; + + if (! cq_nr_member(cq)) { cq->base_offset = -1; //not defined return; } - // assert(new_base >= cq->base); - + new_base = classqueue_get_min_prio(cq); + if (new_base > cq->base) { cq->base_offset = get_index(cq, &new_base); cq->base = new_base; diff --git a/kernel/ckrm_sched.c b/kernel/ckrm_sched.c index ba716d4c5..9c653a3b6 100644 --- a/kernel/ckrm_sched.c +++ b/kernel/ckrm_sched.c @@ -15,57 +15,192 @@ #include #include +rwlock_t class_list_lock = RW_LOCK_UNLOCKED; +LIST_HEAD(active_cpu_classes); // list of active cpu classes; anchor + +struct ckrm_cpu_class default_cpu_class_obj; + +struct ckrm_cpu_class * get_default_cpu_class(void) { + return (&default_cpu_class_obj); +} + /*******************************************************/ /* CVT Management */ /*******************************************************/ -#define CVT_WINDOW_SIZE (CLASSQUEUE_SIZE << CLASS_BONUS_RATE) -static CVT_t max_CVT = CVT_WINDOW_SIZE; -/* - * Also ensure that the classes global cvt is upgraded to the - * minimum CVT in the system, as a class might not have run for a while - */ -static void update_global_cvt(struct ckrm_cpu_class *cpu_class, int cpu) +static inline void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt) { - struct ckrm_local_runqueue *class_queue = - get_ckrm_local_runqueue(cpu_class, cpu); CVT_t min_cvt; - CVT_t local_cvt_old = class_queue->local_cvt; + CVT_t bonus; - spin_lock(&cvt_lock); - if (class_queue->uncounted_cvt) { - cpu_class->global_cvt += class_queue->uncounted_cvt; - class_queue->uncounted_cvt = 0; - } - min_cvt = max_CVT - CVT_WINDOW_SIZE; - if (cpu_class->global_cvt < min_cvt) - cpu_class->global_cvt = min_cvt; - else if (cpu_class->global_cvt > max_CVT) - max_CVT = cpu_class->global_cvt; - -/* update local cvt from global cvt*/ -#if 0 - class_queue->local_cvt = cpu_class->global_cvt; -#endif - spin_unlock(&cvt_lock); - - if (class_queue->local_cvt != local_cvt_old) - update_class_priority(class_queue); + //just a safty measure + if (unlikely(! 
cur_cvt)) + return; + + /* + * Always leaving a small bonus for inactive classes + * allows them to compete for cycles immediately when the become + * active. This should improve interactive behavior + */ + bonus = INTERACTIVE_BONUS(lrq); + //cvt can't be negative + if (cur_cvt > bonus) + min_cvt = cur_cvt - bonus; + else + min_cvt = 0; + + if (lrq->local_cvt < min_cvt) { + CVT_t lost_cvt; + + lost_cvt = scale_cvt(min_cvt - lrq->local_cvt,lrq); + lrq->local_cvt = min_cvt; + + /* add what the class lost to its savings*/ + lrq->savings += lost_cvt; + if (lrq->savings > MAX_SAVINGS) + lrq->savings = MAX_SAVINGS; + } else if (lrq->savings) { + /* + *if a class saving and falling behind + * then start to use it saving in a leaking bucket way + */ + CVT_t savings_used; + + savings_used = scale_cvt((lrq->local_cvt - min_cvt),lrq); + if (savings_used > lrq->savings) + savings_used = lrq->savings; + + if (savings_used > SAVINGS_LEAK_SPEED) + savings_used = SAVINGS_LEAK_SPEED; + + BUG_ON(lrq->savings < savings_used); + lrq->savings -= savings_used; + unscale_cvt(savings_used,lrq); + BUG_ON(lrq->local_cvt < savings_used); + // lrq->local_cvt -= savings_used; + } } /* + * return the max_cvt of all the classes + */ +static inline CVT_t get_max_cvt(int this_cpu) +{ + struct ckrm_cpu_class *clsptr; + ckrm_lrq_t * lrq; + CVT_t max_cvt; + + max_cvt = 0; + + /*update class time, at the same time get max_cvt */ + list_for_each_entry(clsptr, &active_cpu_classes, links) { + lrq = get_ckrm_lrq(clsptr, this_cpu); + if (lrq->local_cvt > max_cvt) + max_cvt = lrq->local_cvt; + } + + return max_cvt; +} + +/** + * update_class_cputime - updates cvt of inactive classes + * -- an inactive class shouldn't starve others when it comes back + * -- the cpu time it lost when it's inactive should be accumulated + * -- its accumulated saving should be compensated (in a leaky bucket fashion) + * * class_list_lock must have been acquired */ -void update_global_cvts(int this_cpu) +void update_class_cputime(int this_cpu) { struct ckrm_cpu_class *clsptr; - struct ckrm_local_runqueue *class_queue; + ckrm_lrq_t * lrq; + CVT_t cur_cvt; + + /* + * a class's local_cvt must not be significantly smaller than min_cvt + * of active classes otherwise, it will starve other classes when it + * is reactivated. + * + * Hence we keep all local_cvt's within a range of the min_cvt off + * all active classes (approximated by the local_cvt of the currently + * running class) and account for how many cycles where thus taken + * from an inactive class building a savings (not to exceed a few seconds) + * for a class to gradually make up upon reactivation, without + * starvation of other classes. + * + */ + cur_cvt = get_local_cur_cvt(this_cpu); - /*for each class*/ + /* + * cur_cvt == 0 means the system is now idle + * in this case, we use max_cvt as cur_cvt + * max_cvt roughly represents the cvt of the class + * that has just finished running + * + * fairness wouldn't be a problem since we account for whatever lost in savings + * if the system is not busy, the system responsiveness is not a problem. + * still fine if the sytem is busy, but happened to be idle at this certain point + * since bias toward interactive classes (class priority) is a more important way to improve system responsiveness + */ + if (unlikely(! 
cur_cvt)) { + cur_cvt = get_max_cvt(this_cpu); + //return; + } + + /* + * - check the local cvt of all the classes + * - update total_ns received by the class + * - do a usage sampling for the whole class + */ list_for_each_entry(clsptr, &active_cpu_classes, links) { - update_global_cvt(clsptr, this_cpu); - class_queue = get_ckrm_local_runqueue(clsptr, this_cpu); - clsptr->stat.total_ns += class_queue->uncounted_ns; - class_queue->uncounted_ns = 0; + lrq = get_ckrm_lrq(clsptr, this_cpu); + + spin_lock(&clsptr->stat.stat_lock); + clsptr->stat.total_ns += lrq->uncounted_ns; + ckrm_sample_usage(clsptr); + spin_unlock(&clsptr->stat.stat_lock); + lrq->uncounted_ns = 0; + + check_inactive_class(lrq,cur_cvt); } } + +/*******************************************************/ +/* PID load balancing stuff */ +/*******************************************************/ +#define PID_SAMPLE_T 32 +#define PID_KP 20 +#define PID_KI 60 +#define PID_KD 20 + +/** + * sample pid load periodically + */ +void ckrm_load_sample(ckrm_load_t* pid,int cpu) +{ + long load; + long err; + + if (jiffies % PID_SAMPLE_T) + return; + + adjust_local_weight(); + + load = ckrm_cpu_load(cpu); + err = load - pid->load_p; + pid->load_d = err; + pid->load_p = load; + pid->load_i *= 9; + pid->load_i += load; + pid->load_i /= 10; +} + +long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group) +{ + long pressure; + pressure = ckrm_load->load_p * PID_KP; + pressure += ckrm_load->load_i * PID_KI; + pressure += ckrm_load->load_d * PID_KD; + pressure /= 100; + return pressure; +} diff --git a/kernel/sched.c b/kernel/sched.c index 74a53bf05..947dda24e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -163,6 +163,21 @@ EXPORT_SYMBOL(dump_oncpu); #define LOW_CREDIT(p) \ ((p)->interactive_credit < -CREDIT_LIMIT) +#ifdef CONFIG_CKRM_CPU_SCHEDULE +/* + * if belong to different class, compare class priority + * otherwise compare task priority + */ +#define TASK_PREEMPTS_CURR(p, rq) \ + ( ((p)->cpu_class != (rq)->curr->cpu_class) \ + && ((rq)->curr != (rq)->idle) && ((p) != (rq)->idle )) \ + ? class_preempts_curr((p),(rq)->curr) \ + : ((p)->prio < (rq)->curr->prio) +#else +#define TASK_PREEMPTS_CURR(p, rq) \ + ((p)->prio < (rq)->curr->prio) +#endif + /* * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ] * to time slice values. @@ -178,14 +193,71 @@ EXPORT_SYMBOL(dump_oncpu); ((MAX_TIMESLICE - MIN_TIMESLICE) * \ (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1))) -static unsigned int task_timeslice(task_t *p) +unsigned int task_timeslice(task_t *p) { return BASE_TIMESLICE(p); } #define task_hot(p, now, sd) ((now) - (p)->timestamp < (sd)->cache_hot_time) -DEFINE_PER_CPU(struct runqueue, runqueues); +/* + * These are the runqueue data structures: + */ + +typedef struct runqueue runqueue_t; +#include +#include + +/* + * This is the main, per-CPU runqueue data structure. + * + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the thread migration code), lock + * acquire operations must be ordered by ascending &runqueue. + */ +struct runqueue { + spinlock_t lock; + + /* + * nr_running and cpu_load should be in the same cacheline because + * remote CPUs use both these fields when doing load calculation. 
+ */ + unsigned long nr_running; +#if defined(CONFIG_SMP) + unsigned long cpu_load; +#endif + unsigned long long nr_switches, nr_preempt; + unsigned long expired_timestamp, nr_uninterruptible; + unsigned long long timestamp_last_tick; + task_t *curr, *idle; + struct mm_struct *prev_mm; +#ifdef CONFIG_CKRM_CPU_SCHEDULE + struct classqueue_struct classqueue; + ckrm_load_t ckrm_load; +#else + prio_array_t *active, *expired, arrays[2]; +#endif + int best_expired_prio; + atomic_t nr_iowait; + +#ifdef CONFIG_SMP + struct sched_domain *sd; + + /* For active balancing */ + int active_balance; + int push_cpu; + + task_t *migration_thread; + struct list_head migration_queue; +#endif + +#ifdef CONFIG_VSERVER_HARDCPU + struct list_head hold_queue; + int idle_tokens; +#endif +}; + +static DEFINE_PER_CPU(struct runqueue, runqueues); #define for_each_domain(cpu, domain) \ for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) @@ -204,95 +276,86 @@ DEFINE_PER_CPU(struct runqueue, runqueues); # define task_running(rq, p) ((rq)->curr == (p)) #endif -#ifdef CONFIG_CKRM_CPU_SCHEDULE -#include -spinlock_t cvt_lock = SPIN_LOCK_UNLOCKED; -rwlock_t class_list_lock = RW_LOCK_UNLOCKED; -LIST_HEAD(active_cpu_classes); // list of active cpu classes; anchor -struct ckrm_cpu_class default_cpu_class_obj; - /* - * the minimum CVT allowed is the base_cvt - * otherwise, it will starve others + * task_rq_lock - lock the runqueue a given task resides on and disable + * interrupts. Note the ordering: we can safely lookup the task_rq without + * explicitly disabling preemption. */ -CVT_t get_min_cvt(int cpu) +static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) { - cq_node_t *node; - struct ckrm_local_runqueue * lrq; - CVT_t min_cvt; + struct runqueue *rq; - node = classqueue_get_head(bpt_queue(cpu)); - lrq = (node) ? class_list_entry(node) : NULL; - - if (lrq) - min_cvt = lrq->local_cvt; - else - min_cvt = 0; - - return min_cvt; +repeat_lock_task: + local_irq_save(*flags); + rq = task_rq(p); + spin_lock(&rq->lock); + if (unlikely(rq != task_rq(p))) { + spin_unlock_irqrestore(&rq->lock, *flags); + goto repeat_lock_task; + } + return rq; +} + +static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) +{ + spin_unlock_irqrestore(&rq->lock, *flags); } /* - * update the classueue base for all the runqueues - * TODO: we can only update half of the min_base to solve the movebackward issue + * rq_lock - lock a given runqueue and disable interrupts. */ -static inline void check_update_class_base(int this_cpu) { - unsigned long min_base = 0xFFFFFFFF; - cq_node_t *node; - int i; +static runqueue_t *this_rq_lock(void) +{ + runqueue_t *rq; - if (! cpu_online(this_cpu)) return; + local_irq_disable(); + rq = this_rq(); + spin_lock(&rq->lock); - /* - * find the min_base across all the processors - */ - for_each_online_cpu(i) { - /* - * I should change it to directly use bpt->base - */ - node = classqueue_get_head(bpt_queue(i)); - if (node && node->prio < min_base) { - min_base = node->prio; - } - } - if (min_base != 0xFFFFFFFF) - classqueue_update_base(bpt_queue(this_cpu),min_base); + return rq; } -static inline void ckrm_rebalance_tick(int j,int this_cpu) +static inline void rq_unlock(runqueue_t *rq) { -#ifdef CONFIG_CKRM_CPU_SCHEDULE - read_lock(&class_list_lock); - if (!(j % CVT_UPDATE_TICK)) - update_global_cvts(this_cpu); - -#define CKRM_BASE_UPDATE_RATE 400 - if (! 
(jiffies % CKRM_BASE_UPDATE_RATE)) - check_update_class_base(this_cpu); - - read_unlock(&class_list_lock); -#endif + spin_unlock_irq(&rq->lock); } -static inline struct ckrm_local_runqueue *rq_get_next_class(struct runqueue *rq) +#ifdef CONFIG_CKRM_CPU_SCHEDULE +static inline ckrm_lrq_t *rq_get_next_class(struct runqueue *rq) { cq_node_t *node = classqueue_get_head(&rq->classqueue); return ((node) ? class_list_entry(node) : NULL); } +/* + * return the cvt of the current running class + * if no current running class, return 0 + * assume cpu is valid (cpu_online(cpu) == 1) + */ +CVT_t get_local_cur_cvt(int cpu) +{ + ckrm_lrq_t * lrq = rq_get_next_class(cpu_rq(cpu)); + + if (lrq) + return lrq->local_cvt; + else + return 0; +} + static inline struct task_struct * rq_get_next_task(struct runqueue* rq) { prio_array_t *array; struct task_struct *next; - struct ckrm_local_runqueue *queue; + ckrm_lrq_t *queue; + int idx; int cpu = smp_processor_id(); next = rq->idle; retry_next_class: if ((queue = rq_get_next_class(rq))) { - array = queue->active; //check switch active/expired queue - if (unlikely(!queue->active->nr_active)) { + array = queue->active; + if (unlikely(!array->nr_active)) { queue->active = queue->expired; queue->expired = array; queue->expired_timestamp = 0; @@ -305,20 +368,20 @@ static inline struct task_struct * rq_get_next_task(struct runqueue* rq) &queue->classqueue_linkobj); cpu_demand_event(get_rq_local_stat(queue,cpu),CPU_DEMAND_DEQUEUE,0); } - goto retry_next_class; } - BUG_ON(!queue->active->nr_active); - next = task_list_entry(array->queue[queue->top_priority].next); + BUG_ON(!array->nr_active); + + idx = queue->top_priority; + if (queue->top_priority == MAX_PRIO) { + BUG_ON(1); + } + + next = task_list_entry(array->queue[idx].next); } return next; } - -static inline void rq_load_inc(runqueue_t *rq, struct task_struct *p) { rq->ckrm_cpu_load += cpu_class_weight(p->cpu_class); } -static inline void rq_load_dec(runqueue_t *rq, struct task_struct *p) { rq->ckrm_cpu_load -= cpu_class_weight(p->cpu_class); } - -#else /*CONFIG_CKRM_CPU_SCHEDULE*/ - +#else /*! CONFIG_CKRM_CPU_SCHEDULE*/ static inline struct task_struct * rq_get_next_task(struct runqueue* rq) { prio_array_t *array; @@ -345,59 +408,14 @@ static inline struct task_struct * rq_get_next_task(struct runqueue* rq) static inline void class_enqueue_task(struct task_struct* p, prio_array_t *array) { } static inline void class_dequeue_task(struct task_struct* p, prio_array_t *array) { } static inline void init_cpu_classes(void) { } -static inline void rq_load_inc(runqueue_t *rq, struct task_struct *p) { } -static inline void rq_load_dec(runqueue_t *rq, struct task_struct *p) { } +#define rq_ckrm_load(rq) NULL +static inline void ckrm_sched_tick(int j,int this_cpu,void* name) {} #endif /* CONFIG_CKRM_CPU_SCHEDULE */ - -/* - * task_rq_lock - lock the runqueue a given task resides on and disable - * interrupts. Note the ordering: we can safely lookup the task_rq without - * explicitly disabling preemption. - */ -runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) -{ - struct runqueue *rq; - -repeat_lock_task: - local_irq_save(*flags); - rq = task_rq(p); - spin_lock(&rq->lock); - if (unlikely(rq != task_rq(p))) { - spin_unlock_irqrestore(&rq->lock, *flags); - goto repeat_lock_task; - } - return rq; -} - -void task_rq_unlock(runqueue_t *rq, unsigned long *flags) -{ - spin_unlock_irqrestore(&rq->lock, *flags); -} - -/* - * rq_lock - lock a given runqueue and disable interrupts. 
- */ -static runqueue_t *this_rq_lock(void) -{ - runqueue_t *rq; - - local_irq_disable(); - rq = this_rq(); - spin_lock(&rq->lock); - - return rq; -} - -static inline void rq_unlock(runqueue_t *rq) -{ - spin_unlock_irq(&rq->lock); -} - /* * Adding/removing a task to/from a priority array: */ -void dequeue_task(struct task_struct *p, prio_array_t *array) +static void dequeue_task(struct task_struct *p, prio_array_t *array) { BUG_ON(! array); array->nr_active--; @@ -407,7 +425,7 @@ void dequeue_task(struct task_struct *p, prio_array_t *array) class_dequeue_task(p,array); } -void enqueue_task(struct task_struct *p, prio_array_t *array) +static void enqueue_task(struct task_struct *p, prio_array_t *array) { list_add_tail(&p->run_list, array->queue + p->prio); __set_bit(p->prio, array->bitmap); @@ -471,7 +489,6 @@ static inline void __activate_task(task_t *p, runqueue_t *rq) { enqueue_task(p, rq_active(p,rq)); rq->nr_running++; - rq_load_inc(rq,p); } /* @@ -481,7 +498,6 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq) { enqueue_task_head(p, rq_active(p,rq)); rq->nr_running++; - rq_load_inc(rq,p); } static void recalc_task_prio(task_t *p, unsigned long long now) @@ -613,7 +629,6 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) static void deactivate_task(struct task_struct *p, runqueue_t *rq) { rq->nr_running--; - rq_load_dec(rq,p); if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible++; dequeue_task(p, p->array); @@ -987,6 +1002,10 @@ void fastcall sched_fork(task_t *p) INIT_LIST_HEAD(&p->run_list); p->array = NULL; spin_lock_init(&p->switch_lock); +#ifdef CONFIG_CKRM_CPU_SCHEDULE + cpu_demand_event(&p->demand_stat,CPU_DEMAND_INIT,0); +#endif + #ifdef CONFIG_PREEMPT /* * During context-switch we hold precisely one spinlock, which @@ -1062,7 +1081,7 @@ void fastcall wake_up_forked_process(task_t * p) p->array = current->array; p->array->nr_active++; rq->nr_running++; - rq_load_inc(rq,p); + class_enqueue_task(p,p->array); } task_rq_unlock(rq, &flags); } @@ -1395,7 +1414,7 @@ lock_again: p->array = current->array; p->array->nr_active++; rq->nr_running++; - rq_load_inc(rq,p); + class_enqueue_task(p,p->array); } } else { /* Not the local CPU - must adjust timestamp */ @@ -1500,13 +1519,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, { dequeue_task(p, src_array); src_rq->nr_running--; - rq_load_dec(src_rq,p); - set_task_cpu(p, this_cpu); this_rq->nr_running++; - rq_load_inc(this_rq,p); enqueue_task(p, this_array); - p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + this_rq->timestamp_last_tick; /* @@ -1546,133 +1561,61 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, } #ifdef CONFIG_CKRM_CPU_SCHEDULE - -struct ckrm_cpu_class *find_unbalanced_class(int busiest_cpu, int this_cpu, unsigned long *cls_imbalance) +static inline int ckrm_preferred_task(task_t *tmp,long min, long max, + int phase, enum idle_type idle) { - struct ckrm_cpu_class *most_unbalanced_class = NULL; - struct ckrm_cpu_class *clsptr; - int max_unbalance = 0; - - list_for_each_entry(clsptr,&active_cpu_classes,links) { - struct ckrm_local_runqueue *this_lrq = get_ckrm_local_runqueue(clsptr,this_cpu); - struct ckrm_local_runqueue *busiest_lrq = get_ckrm_local_runqueue(clsptr,busiest_cpu); - int unbalance_degree; - - unbalance_degree = (local_queue_nr_running(busiest_lrq) - local_queue_nr_running(this_lrq)) * cpu_class_weight(clsptr); - if (unbalance_degree >= *cls_imbalance) - continue; // already looked at this class + long pressure = 
task_load(tmp); + + if (pressure > max) + return 0; - if (unbalance_degree > max_unbalance) { - max_unbalance = unbalance_degree; - most_unbalanced_class = clsptr; - } - } - *cls_imbalance = max_unbalance; - return most_unbalanced_class; + if ((idle == NOT_IDLE) && ! phase && (pressure <= min)) + return 0; + return 1; } - /* - * find_busiest_queue - find the busiest runqueue among the cpus in cpumask. + * move tasks for a specic local class + * return number of tasks pulled */ -static int find_busiest_cpu(runqueue_t *this_rq, int this_cpu, int idle, - int *imbalance) +static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq, + runqueue_t *this_rq, + runqueue_t *busiest, + struct sched_domain *sd, + int this_cpu, + enum idle_type idle, + long* pressure_imbalance) { - int cpu_load, load, max_load, i, busiest_cpu; - runqueue_t *busiest, *rq_src; - - - /*Hubertus ... the concept of nr_running is replace with cpu_load */ - cpu_load = this_rq->ckrm_cpu_load; - - busiest = NULL; - busiest_cpu = -1; - - max_load = -1; - for_each_online_cpu(i) { - rq_src = cpu_rq(i); - load = rq_src->ckrm_cpu_load; - - if ((load > max_load) && (rq_src != this_rq)) { - busiest = rq_src; - busiest_cpu = i; - max_load = load; - } - } - - if (likely(!busiest)) - goto out; - - *imbalance = max_load - cpu_load; - - /* It needs an at least ~25% imbalance to trigger balancing. */ - if (!idle && ((*imbalance)*4 < max_load)) { - busiest = NULL; - goto out; - } - - double_lock_balance(this_rq, busiest); - /* - * Make sure nothing changed since we checked the - * runqueue length. - */ - if (busiest->ckrm_cpu_load <= cpu_load) { - spin_unlock(&busiest->lock); - busiest = NULL; - } -out: - return (busiest ? busiest_cpu : -1); -} - -static int load_balance(int this_cpu, runqueue_t *this_rq, - struct sched_domain *sd, enum idle_type idle) -{ - int imbalance, idx; - int busiest_cpu; - runqueue_t *busiest; - prio_array_t *array; + prio_array_t *array, *dst_array; struct list_head *head, *curr; task_t *tmp; - struct ckrm_local_runqueue * busiest_local_queue; - struct ckrm_cpu_class *clsptr; - int weight; - unsigned long cls_imbalance; // so we can retry other classes - - // need to update global CVT based on local accumulated CVTs - read_lock(&class_list_lock); - busiest_cpu = find_busiest_cpu(this_rq, this_cpu, idle, &imbalance); - if (busiest_cpu == -1) - goto out; - - busiest = cpu_rq(busiest_cpu); - - /* - * We only want to steal a number of tasks equal to 1/2 the imbalance, - * otherwise we'll just shift the imbalance to the new queue: - */ - imbalance /= 2; - - /* now find class on that runqueue with largest inbalance */ - cls_imbalance = 0xFFFFFFFF; - - retry_other_class: - clsptr = find_unbalanced_class(busiest_cpu, this_cpu, &cls_imbalance); - if (!clsptr) - goto out_unlock; - - busiest_local_queue = get_ckrm_local_runqueue(clsptr,busiest_cpu); - weight = cpu_class_weight(clsptr); - + int idx; + int pulled = 0; + int phase = -1; + long pressure_min, pressure_max; + /*hzheng: magic : 90% balance is enough*/ + long balance_min = *pressure_imbalance / 10; +/* + * we don't want to migrate tasks that will reverse the balance + * or the tasks that make too small difference + */ +#define CKRM_BALANCE_MAX_RATIO 100 +#define CKRM_BALANCE_MIN_RATIO 1 + start: + phase ++; /* * We first consider expired tasks. Those will likely not be * executed in the near future, and they are most likely to * be cache-cold, thus switching CPUs has the least effect * on them. 
*/ - if (busiest_local_queue->expired->nr_active) - array = busiest_local_queue->expired; - else - array = busiest_local_queue->active; + if (src_lrq->expired->nr_active) { + array = src_lrq->expired; + dst_array = dst_lrq->expired; + } else { + array = src_lrq->active; + dst_array = dst_lrq->active; + } new_array: /* Start searching at priority 0: */ @@ -1683,11 +1626,15 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, else idx = find_next_bit(array->bitmap, MAX_PRIO, idx); if (idx >= MAX_PRIO) { - if (array == busiest_local_queue->expired && busiest_local_queue->active->nr_active) { - array = busiest_local_queue->active; + if (array == src_lrq->expired && src_lrq->active->nr_active) { + array = src_lrq->active; + dst_array = dst_lrq->active; goto new_array; } - goto retry_other_class; + if ((! phase) && (! pulled) && (idle != IDLE)) + goto start; //try again + else + goto out; //finished search for this lrq } head = array->queue + idx; @@ -1697,42 +1644,365 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, curr = curr->prev; - if (!can_migrate_task(tmp, busiest, this_cpu, sd,idle)) { + if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { if (curr != head) goto skip_queue; idx++; goto skip_bitmap; } - pull_task(busiest, array, tmp, this_rq, rq_active(tmp,this_rq),this_cpu); + + pressure_min = *pressure_imbalance * CKRM_BALANCE_MIN_RATIO/100; + pressure_max = *pressure_imbalance * CKRM_BALANCE_MAX_RATIO/100; /* - * tmp BUG FIX: hzheng - * load balancing can make the busiest local queue empty - * thus it should be removed from bpt + * skip the tasks that will reverse the balance too much */ - if (! local_queue_nr_running(busiest_local_queue)) { - classqueue_dequeue(busiest_local_queue->classqueue,&busiest_local_queue->classqueue_linkobj); - cpu_demand_event(get_rq_local_stat(busiest_local_queue,busiest_cpu),CPU_DEMAND_DEQUEUE,0); + if (ckrm_preferred_task(tmp,pressure_min,pressure_max,phase,idle)) { + *pressure_imbalance -= task_load(tmp); + pull_task(busiest, array, tmp, + this_rq, dst_array, this_cpu); + pulled++; + + if (*pressure_imbalance <= balance_min) + goto out; } + + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + out: + return pulled; +} - imbalance -= weight; - if (!idle && (imbalance>0)) { - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; +static inline long ckrm_rq_imbalance(runqueue_t *this_rq,runqueue_t *dst_rq) +{ + long imbalance; + /* + * make sure after balance, imbalance' > - imbalance/2 + * we don't want the imbalance be reversed too much + */ + imbalance = pid_get_pressure(rq_ckrm_load(dst_rq),0) + - pid_get_pressure(rq_ckrm_load(this_rq),1); + imbalance /= 2; + return imbalance; +} + +/* + * try to balance the two runqueues + * + * Called with both runqueues locked. 
+ * if move_tasks is called, it will try to move at least one task over + */ +static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, + unsigned long max_nr_move, struct sched_domain *sd, + enum idle_type idle) +{ + struct ckrm_cpu_class *clsptr,*vip_cls = NULL; + ckrm_lrq_t* src_lrq,*dst_lrq; + long pressure_imbalance, pressure_imbalance_old; + int src_cpu = task_cpu(busiest->curr); + struct list_head *list; + int pulled = 0; + long imbalance; + + imbalance = ckrm_rq_imbalance(this_rq,busiest); + + if ((idle == NOT_IDLE && imbalance <= 0) || busiest->nr_running <= 1) + goto out; + + //try to find the vip class + list_for_each_entry(clsptr,&active_cpu_classes,links) { + src_lrq = get_ckrm_lrq(clsptr,src_cpu); + + if (! lrq_nr_running(src_lrq)) + continue; + + if (! vip_cls || cpu_class_weight(vip_cls) < cpu_class_weight(clsptr) ) + { + vip_cls = clsptr; + } } - out_unlock: - spin_unlock(&busiest->lock); + + /* + * do search from the most significant class + * hopefully, less tasks will be migrated this way + */ + clsptr = vip_cls; + + move_class: + if (! clsptr) + goto out; + + + src_lrq = get_ckrm_lrq(clsptr,src_cpu); + if (! lrq_nr_running(src_lrq)) + goto other_class; + + dst_lrq = get_ckrm_lrq(clsptr,this_cpu); + + //how much pressure for this class should be transferred + pressure_imbalance = src_lrq->lrq_load * imbalance/src_lrq->local_weight; + if (pulled && ! pressure_imbalance) + goto other_class; + + pressure_imbalance_old = pressure_imbalance; + + //move tasks + pulled += + ckrm_cls_move_tasks(src_lrq,dst_lrq, + this_rq, + busiest, + sd,this_cpu,idle, + &pressure_imbalance); + + /* + * hzheng: 2 is another magic number + * stop balancing if the imbalance is less than 25% of the orig + */ + if (pressure_imbalance <= (pressure_imbalance_old >> 2)) + goto out; + + //update imbalance + imbalance *= pressure_imbalance / pressure_imbalance_old; + other_class: + //who is next? + list = clsptr->links.next; + if (list == &active_cpu_classes) + list = list->next; + clsptr = list_entry(list, typeof(*clsptr), links); + if (clsptr != vip_cls) + goto move_class; out: - read_unlock(&class_list_lock); + return pulled; +} + +/** + * ckrm_check_balance - is load balancing necessary? 
+ * return 0 if load balancing is not necessary + * otherwise return the average load of the system + * also, update nr_group + * + * heuristics: + * no load balancing if it's load is over average + * no load balancing if it's load is far more than the min + * task: + * read the status of all the runqueues + */ +static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu, + enum idle_type idle, int* nr_group) +{ + struct sched_group *group = sd->groups; + unsigned long min_load, max_load, avg_load; + unsigned long total_load, this_load, total_pwr; + + max_load = this_load = total_load = total_pwr = 0; + min_load = 0xFFFFFFFF; + *nr_group = 0; + + do { + cpumask_t tmp; + unsigned long load; + int local_group; + int i, nr_cpus = 0; + + /* Tally up the load of all CPUs in the group */ + cpus_and(tmp, group->cpumask, cpu_online_map); + if (unlikely(cpus_empty(tmp))) + goto nextgroup; + + avg_load = 0; + local_group = cpu_isset(this_cpu, group->cpumask); + + for_each_cpu_mask(i, tmp) { + load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),local_group); + nr_cpus++; + avg_load += load; + } + + if (!nr_cpus) + goto nextgroup; + + total_load += avg_load; + total_pwr += group->cpu_power; + + /* Adjust by relative CPU power of the group */ + avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + + if (local_group) { + this_load = avg_load; + goto nextgroup; + } else if (avg_load > max_load) { + max_load = avg_load; + } + if (avg_load < min_load) { + min_load = avg_load; + } +nextgroup: + group = group->next; + *nr_group = *nr_group + 1; + } while (group != sd->groups); + + if (!max_load || this_load >= max_load) + goto out_balanced; + + avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; + + /* hzheng: debugging: 105 is a magic number + * 100*max_load <= sd->imbalance_pct*this_load) + * should use imbalance_pct instead + */ + if (this_load > avg_load + || 100*max_load < 105*this_load + || 100*min_load < 70*this_load + ) + goto out_balanced; + + return avg_load; + out_balanced: return 0; } +/** + * any group that has above average load is considered busy + * find the busiest queue from any of busy group + */ +static runqueue_t * +ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu, + unsigned long avg_load, enum idle_type idle, + int nr_group) +{ + struct sched_group *group; + runqueue_t * busiest=NULL; + unsigned long rand; + + group = sd->groups; + rand = get_ckrm_rand(nr_group); + nr_group = 0; + + do { + unsigned long load,total_load,max_load; + cpumask_t tmp; + int i; + runqueue_t * grp_busiest; + + cpus_and(tmp, group->cpumask, cpu_online_map); + if (unlikely(cpus_empty(tmp))) + goto find_nextgroup; -static inline void idle_balance(int this_cpu, runqueue_t *this_rq) + total_load = 0; + max_load = 0; + grp_busiest = NULL; + for_each_cpu_mask(i, tmp) { + load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),0); + total_load += load; + if (load > max_load) { + max_load = load; + grp_busiest = cpu_rq(i); + } + } + + total_load = (total_load * SCHED_LOAD_SCALE) / group->cpu_power; + if (total_load > avg_load) { + busiest = grp_busiest; + if (nr_group >= rand) + break; + } + find_nextgroup: + group = group->next; + nr_group ++; + } while (group != sd->groups); + + return busiest; +} + +/** + * load_balance - pressure based load balancing algorithm used by ckrm + */ +static int ckrm_load_balance(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd, enum idle_type idle) { + runqueue_t *busiest; + unsigned long avg_load; + int nr_moved,nr_group; + + avg_load = 
ckrm_check_balance(sd, this_cpu, idle, &nr_group); + if (! avg_load) + goto out_balanced; + + busiest = ckrm_find_busy_queue(sd,this_cpu,avg_load,idle,nr_group); + if (! busiest) + goto out_balanced; + /* + * This should be "impossible", but since load + * balancing is inherently racy and statistical, + * it could happen in theory. + */ + if (unlikely(busiest == this_rq)) { + WARN_ON(1); + goto out_balanced; + } + + nr_moved = 0; + if (busiest->nr_running > 1) { + /* + * Attempt to move tasks. If find_busiest_group has found + * an imbalance but busiest->nr_running <= 1, the group is + * still unbalanced. nr_moved simply stays zero, so it is + * correctly treated as an imbalance. + */ + double_lock_balance(this_rq, busiest); + nr_moved = move_tasks(this_rq, this_cpu, busiest, + 0,sd, idle); + spin_unlock(&busiest->lock); + if (nr_moved) { + adjust_local_weight(); + } + } + + if (!nr_moved) + sd->nr_balance_failed ++; + else + sd->nr_balance_failed = 0; + + /* We were unbalanced, so reset the balancing interval */ + sd->balance_interval = sd->min_interval; + + return nr_moved; + +out_balanced: + /* tune up the balancing interval */ + if (sd->balance_interval < sd->max_interval) + sd->balance_interval *= 2; + + return 0; } -#else /* CONFIG_CKRM_CPU_SCHEDULE */ + +/* + * this_rq->lock is already held + */ +static inline int load_balance_newidle(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd) +{ + int ret; + read_lock(&class_list_lock); + ret = ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE); + read_unlock(&class_list_lock); + return ret; +} + +static inline int load_balance(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd, enum idle_type idle) +{ + int ret; + + spin_lock(&this_rq->lock); + read_lock(&class_list_lock); + ret= ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE); + read_unlock(&class_list_lock); + spin_unlock(&this_rq->lock); + return ret; +} +#else /*! CONFIG_CKRM_CPU_SCHEDULE */ /* * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, * as part of a balancing operation within "domain". Returns the number of @@ -2097,6 +2367,8 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, out: return nr_moved; } +#endif /* CONFIG_CKRM_CPU_SCHEDULE*/ + /* * idle_balance is called by schedule() if this_cpu is about to become @@ -2182,7 +2454,6 @@ next_group: group = group->next; } while (group != sd->groups); } -#endif /* CONFIG_CKRM_CPU_SCHEDULE*/ /* * rebalance_tick will get called every timer tick, on every CPU. 
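
The ckrm_load_balance() path above keys its decisions off pid_get_pressure(), which combines the three load terms sampled in ckrm_load_sample() (kernel/ckrm_sched.c earlier in this patch) with the weights PID_KP=20, PID_KI=60, PID_KD=20. The stand-alone sketch below only illustrates that combination; struct pid_load, sample(), pressure() and the load numbers are made-up names and values, not part of the patch. With the integral term carrying 60 of the 100 points, the pressure presumably tracks a smoothed average rather than a momentary spike, which fits a balancer that should not thrash on short load bursts.

#include <stdio.h>

struct pid_load { long p, i, d; };	/* mirrors load_p/load_i/load_d */

static void sample(struct pid_load *l, long load)
{
	l->d = load - l->p;		/* derivative: change since last sample */
	l->p = load;			/* proportional: latest load */
	l->i = (l->i * 9 + load) / 10;	/* integral: exponential average, as in ckrm_load_sample() */
}

static long pressure(const struct pid_load *l)
{
	/* same weighting as pid_get_pressure(): KP=20, KI=60, KD=20 */
	return (l->p * 20 + l->i * 60 + l->d * 20) / 100;
}

int main(void)
{
	struct pid_load l = { 0, 0, 0 };
	long loads[] = { 100, 100, 400, 100 };	/* a brief load spike */
	int k;

	for (k = 0; k < 4; k++) {
		sample(&l, loads[k]);
		printf("load=%ld pressure=%ld\n", loads[k], pressure(&l));
	}
	return 0;
}
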
@@ -2203,8 +2474,6 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, unsigned long j = jiffies + CPU_OFFSET(this_cpu); struct sched_domain *sd; - ckrm_rebalance_tick(j,this_cpu); - /* Update our load */ old_load = this_rq->cpu_load; this_load = this_rq->nr_running * SCHED_LOAD_SCALE; @@ -2243,9 +2512,7 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, */ static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) { - ckrm_rebalance_tick(jiffies,cpu); } - static inline void idle_balance(int cpu, runqueue_t *rq) { } @@ -2267,7 +2534,6 @@ static inline int wake_priority_sleeper(runqueue_t *rq) } DEFINE_PER_CPU(struct kernel_stat, kstat) = { { 0 } }; - EXPORT_PER_CPU_SYMBOL(kstat); /* @@ -2291,7 +2557,7 @@ EXPORT_PER_CPU_SYMBOL(kstat); #define EXPIRED_STARVING(rq) \ (STARVATION_LIMIT && ((rq)->expired_timestamp && \ (jiffies - (rq)->expired_timestamp >= \ - STARVATION_LIMIT * (local_queue_nr_running(rq)) + 1))) + STARVATION_LIMIT * (lrq_nr_running(rq)) + 1))) #endif /* @@ -2323,8 +2589,10 @@ void scheduler_tick(int user_ticks, int sys_ticks) } if (p == rq->idle) { +#ifdef CONFIG_VSERVER_HARDCPU if (!--rq->idle_tokens && !list_empty(&rq->hold_queue)) set_need_resched(); +#endif if (atomic_read(&rq->nr_iowait) > 0) cpustat->iowait += sys_ticks; @@ -2332,6 +2600,7 @@ void scheduler_tick(int user_ticks, int sys_ticks) cpustat->idle += sys_ticks; if (wake_priority_sleeper(rq)) goto out; + ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq)); rebalance_tick(cpu, rq, IDLE); return; } @@ -2373,7 +2642,7 @@ void scheduler_tick(int user_ticks, int sys_ticks) if (vx_need_resched(p)) { #ifdef CONFIG_CKRM_CPU_SCHEDULE /* Hubertus ... we can abstract this out */ - struct ckrm_local_runqueue* rq = get_task_class_queue(p); + ckrm_lrq_t* rq = get_task_lrq(p); #endif dequeue_task(p, rq->active); set_tsk_need_resched(p); @@ -2420,6 +2689,7 @@ void scheduler_tick(int user_ticks, int sys_ticks) out_unlock: spin_unlock(&rq->lock); out: + ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq)); rebalance_tick(cpu, rq, NOT_IDLE); } @@ -2569,6 +2839,19 @@ need_resched: spin_lock_irq(&rq->lock); +#ifdef CONFIG_CKRM_CPU_SCHEDULE + if (prev != rq->idle) { + unsigned long long run = now - prev->timestamp; + ckrm_lrq_t * lrq = get_task_lrq(prev); + + lrq->lrq_load -= task_load(prev); + cpu_demand_event(&prev->demand_stat,CPU_DEMAND_DESCHEDULE,run); + lrq->lrq_load += task_load(prev); + + cpu_demand_event(get_task_lrq_stat(prev),CPU_DEMAND_DESCHEDULE,run); + update_local_cvt(prev, run); + } +#endif /* * if entering off of a kernel preemption go straight * to picking the next task. 
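
The block this hunk adds near the top of schedule() removes the outgoing task's load contribution from its local runqueue, fires a CPU_DEMAND_DESCHEDULE event, and then adds the contribution back. The sketch below spells that pattern out under one assumption the hunk itself does not show, namely that task_load() is derived from the per-task demand statistic, so the contribution changes across the cpu_demand_event() call. Every name here (fake_task, contribution, the constants) is illustrative, not the kernel's.

#include <stdio.h>

struct fake_task {
	unsigned long demand;	/* stands in for p->demand_stat */
	unsigned long weight;
};

static unsigned long contribution(const struct fake_task *t)
{
	return t->demand * t->weight;	/* stand-in for task_load(p) */
}

int main(void)
{
	struct fake_task t = { .demand = 4, .weight = 3 };
	unsigned long lrq_load = 100 + contribution(&t);

	lrq_load -= contribution(&t);	/* drop the stale contribution */
	t.demand = 7;			/* stand-in for cpu_demand_event(..., CPU_DEMAND_DESCHEDULE, run) */
	lrq_load += contribution(&t);	/* re-add with the fresh demand */

	printf("lrq_load is now %lu\n", lrq_load);
	return 0;
}
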
@@ -2617,17 +2900,14 @@ pick_next: #endif if (unlikely(!rq->nr_running)) { idle_balance(cpu, rq); - if (!rq->nr_running) { - next = rq->idle; - rq->expired_timestamp = 0; - wake_sleeping_dependent(cpu, rq); - goto switch_tasks; - } } next = rq_get_next_task(rq); - if (next == rq->idle) + if (next == rq->idle) { + rq->expired_timestamp = 0; + wake_sleeping_dependent(cpu, rq); goto switch_tasks; + } if (dependent_sleeper(cpu, rq, next)) { next = rq->idle; @@ -2669,14 +2949,6 @@ switch_tasks: rq->nr_preempt++; RCU_qsctr(task_cpu(prev))++; -#ifdef CONFIG_CKRM_CPU_SCHEDULE - if (prev != rq->idle) { - unsigned long long run = now - prev->timestamp; - cpu_demand_event(get_task_local_stat(prev),CPU_DEMAND_DESCHEDULE,run); - update_local_cvt(prev, run); - } -#endif - prev->sleep_avg -= run_time; if ((long)prev->sleep_avg <= 0) { prev->sleep_avg = 0; @@ -2719,7 +2991,6 @@ switch_tasks: } EXPORT_SYMBOL(schedule); - #ifdef CONFIG_PREEMPT /* * this is is the entry point to schedule() from in-kernel preemption @@ -3820,7 +4091,6 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (!cpu_isset(dest_cpu, p->cpus_allowed)) goto out; - set_task_cpu(p, dest_cpu); if (p->array) { /* * Sync timestamp with rq_dest's before activating. @@ -3831,10 +4101,12 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) p->timestamp = p->timestamp - rq_src->timestamp_last_tick + rq_dest->timestamp_last_tick; deactivate_task(p, rq_src); + set_task_cpu(p, dest_cpu); activate_task(p, rq_dest, 0); if (TASK_PREEMPTS_CURR(p, rq_dest)) resched_task(rq_dest->curr); - } + } else + set_task_cpu(p, dest_cpu); out: double_rq_unlock(rq_src, rq_dest); @@ -3869,9 +4141,7 @@ static int migration_thread(void * data) } if (rq->active_balance) { -#ifndef CONFIG_CKRM_CPU_SCHEDULE active_load_balance(rq, cpu); -#endif rq->active_balance = 0; } @@ -4346,9 +4616,6 @@ void __init sched_init(void) { runqueue_t *rq; int i; -#ifndef CONFIG_CKRM_CPU_SCHEDULE - int j, k; -#endif #ifdef CONFIG_SMP /* Set up an initial dummy domain for early boot */ @@ -4367,46 +4634,50 @@ void __init sched_init(void) sched_group_init.next = &sched_group_init; sched_group_init.cpu_power = SCHED_LOAD_SCALE; #endif - init_cpu_classes(); for (i = 0; i < NR_CPUS; i++) { #ifndef CONFIG_CKRM_CPU_SCHEDULE + int j, k; prio_array_t *array; -#endif + rq = cpu_rq(i); spin_lock_init(&rq->lock); -#ifndef CONFIG_CKRM_CPU_SCHEDULE + for (j = 0; j < 2; j++) { + array = rq->arrays + j; + for (k = 0; k < MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(MAX_PRIO, array->bitmap); + } + rq->active = rq->arrays; rq->expired = rq->arrays + 1; #else - rq->ckrm_cpu_load = 0; + rq = cpu_rq(i); + spin_lock_init(&rq->lock); #endif + rq->best_expired_prio = MAX_PRIO; #ifdef CONFIG_SMP rq->sd = &sched_domain_init; rq->cpu_load = 0; +#ifdef CONFIG_CKRM_CPU_SCHEDULE + ckrm_load_init(rq_ckrm_load(rq)); +#endif rq->active_balance = 0; rq->push_cpu = 0; rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); #endif +#ifdef CONFIG_VSERVER_HARDCPU INIT_LIST_HEAD(&rq->hold_queue); - atomic_set(&rq->nr_iowait, 0); - -#ifndef CONFIG_CKRM_CPU_SCHEDULE - for (j = 0; j < 2; j++) { - array = rq->arrays + j; - for (k = 0; k < MAX_PRIO; k++) { - INIT_LIST_HEAD(array->queue + k); - __clear_bit(k, array->bitmap); - } - // delimiter for bitsearch - __set_bit(MAX_PRIO, array->bitmap); - } #endif + atomic_set(&rq->nr_iowait, 0); } /* @@ -4418,7 +4689,8 @@ void __init 
sched_init(void) rq->idle = current; set_task_cpu(current, smp_processor_id()); #ifdef CONFIG_CKRM_CPU_SCHEDULE - current->cpu_class = default_cpu_class; + cpu_demand_event(&(current)->demand_stat,CPU_DEMAND_INIT,0); + current->cpu_class = get_default_cpu_class(); current->array = NULL; #endif wake_up_forked_process(current); @@ -4512,10 +4784,30 @@ EXPORT_SYMBOL(task_running_sys); #ifdef CONFIG_CKRM_CPU_SCHEDULE /** * return the classqueue object of a certain processor - * Note: not supposed to be used in performance sensitive functions */ struct classqueue_struct * get_cpu_classqueue(int cpu) { return (& (cpu_rq(cpu)->classqueue) ); } + +/** + * _ckrm_cpu_change_class - change the class of a task + */ +void _ckrm_cpu_change_class(task_t *tsk, struct ckrm_cpu_class *newcls) +{ + prio_array_t *array; + struct runqueue *rq; + unsigned long flags; + + rq = task_rq_lock(tsk,&flags); + array = tsk->array; + if (array) { + dequeue_task(tsk,array); + tsk->cpu_class = newcls; + enqueue_task(tsk,rq_active(tsk,rq)); + } else + tsk->cpu_class = newcls; + + task_rq_unlock(rq,&flags); +} #endif
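
As a closing worked example, the numbers below run the adjust_lrq_weight() formula from earlier in this patch: an lrq's local weight is lrq_load * (class_weight * ncpu) / total_pressure, clamped to at least 1 (and to CKRM_SHARE_MAX above, not reached here), while an idle lrq keeps the full class weight. The class weight of 512 and the per-cpu loads are made up; the point is that the resulting per-cpu weights (768 and 256) split class_weight * ncpu in proportion to where the class's load actually sits.

#include <stdio.h>

int main(void)
{
	const unsigned long class_weight = 512;		/* cpu_class_weight(cls), illustrative */
	const unsigned long lrq_load[2] = { 300, 100 };	/* per-cpu load of this class, illustrative */
	unsigned long total = lrq_load[0] + lrq_load[1];
	int ncpu = 2, i;

	for (i = 0; i < ncpu; i++) {
		unsigned long lw;

		if (!lrq_load[i])
			lw = class_weight;	/* idle lrq keeps the full class weight */
		else {
			lw = lrq_load[i] * class_weight * ncpu / total;
			if (!lw)
				lw = 1;		/* minimum weight is 1 */
		}
		printf("cpu%d local_weight = %lu\n", i, lw);
	}
	return 0;
}
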