From 2405d9c7d438e26dbf782e26edda03daaab279ac Mon Sep 17 00:00:00 2001 From: Planet-Lab Support Date: Fri, 16 Jul 2004 20:14:56 +0000 Subject: [PATCH] This commit was manufactured by cvs2svn to create branch 'ckrm'. --- include/linux/ckrm_classqueue.h | 129 ++++++++ include/linux/ckrm_sched.h | 297 +++++++++++++++++ kernel/ckrm/ckrm_cpu_class.c | 350 +++++++++++++++++++++ kernel/ckrm/ckrm_cpu_monitor.c | 542 ++++++++++++++++++++++++++++++++ kernel/ckrm_classqueue.c | 178 +++++++++++ kernel/ckrm_sched.c | 71 +++++ 6 files changed, 1567 insertions(+) create mode 100644 include/linux/ckrm_classqueue.h create mode 100644 include/linux/ckrm_sched.h create mode 100644 kernel/ckrm/ckrm_cpu_class.c create mode 100644 kernel/ckrm/ckrm_cpu_monitor.c create mode 100644 kernel/ckrm_classqueue.c create mode 100644 kernel/ckrm_sched.c diff --git a/include/linux/ckrm_classqueue.h b/include/linux/ckrm_classqueue.h new file mode 100644 index 000000000..1bdf9b775 --- /dev/null +++ b/include/linux/ckrm_classqueue.h @@ -0,0 +1,129 @@ +/* include/linux/ckrm_classqueue.h : cpu control for CKRM + * + * Copyright (C) Haoqiang Zheng, IBM Corp. 2003 + * (C) Hubertus Franke, IBM Corp. 2003 + * + * Circular queue functionality for CKRM cpu controller + * + * Latest version, more details at http://ckrm.sf.net + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + */ + +/* Changes + * + * Aug 28, 2003 + * Created. + * July 07, 2004 + * clean up, add comments + * + */ + +#ifndef _CKRM_CLASSQUEUE_H +#define _CKRM_CLASSQUEUE_H + +#include + +#define CLASSQUEUE_SIZE 128 +#define CQ_BITMAP_SIZE ((((CLASSQUEUE_SIZE+1+7)/8)+sizeof(long)-1)/sizeof(long)) + +/** + * struct cq_prio_array: duplicates prio_array defined in sched.c + * + * I duplicate this data structure to make ckrm_classqueue implementation more modular + */ +struct cq_prio_array { + int nr_active; + unsigned long bitmap[CQ_BITMAP_SIZE]; + struct list_head queue[CLASSQUEUE_SIZE]; +}; + +/** + * struct classqueue_struct - a runqueue of class local runqueues + * @array: priority array + * @base: base priority + * @base_offset: index in array for the base + * + * classqueue can be thought of as runqueue of classes (instead of runqueue of tasks) + * as task runqueue, each processor has a classqueue + * a class enters the classqueue when the first task in this class local runqueue shows up + * a class enters the classqueue when the last task in the local runqueue leaves + * class local runqueues are ordered based their priority + * + * status: + * hzheng: is 32bit base long enough? 
+ */ +struct classqueue_struct { + struct cq_prio_array array; + unsigned long base; + unsigned long base_offset; +}; + +/** + * struct cq_node_struct - the link object between class local runqueue and classqueue + * @list: links the class local runqueue to classqueue + * @prio: class priority, which is caculated based on it's progress (cvt) and urgency (top_priority) + * @index: real index into the classqueue array, calculated based on priority + * + * NOTE: make sure list is empty when it's not in classqueue + */ +struct cq_node_struct { + struct list_head list; + int prio; + int index; +}; +typedef struct cq_node_struct cq_node_t; + +typedef unsigned long long CVT_t; // cummulative virtual time + +static inline void cq_node_init(cq_node_t * node) +{ + node->prio = 0; + node->index = -1; + INIT_LIST_HEAD(&node->list); +} + +/*if the class is in classqueue*/ +static inline int cls_in_classqueue(cq_node_t * node) +{ + return !list_empty(&node->list); +} + +/*initialize the data structure*/ +int classqueue_init(struct classqueue_struct *cq); + +/*add the class to classqueue*/ +void classqueue_enqueue(struct classqueue_struct *cq, cq_node_t * node, int prio); + +/** + * classqueue_dequeue - remove the class from classqueue + * + * internal: + * called when the last task is removed from the queue + * checked on load balancing and schedule + * hzheng: why don't I call it on class_dequeue_task? + */ +void classqueue_dequeue(struct classqueue_struct *cq, cq_node_t * node); + +/*change the position of the class in classqueue*/ +void classqueue_update_prio(struct classqueue_struct *cq, cq_node_t * node, int new_prio); + +/*return the first class in classqueue*/ +cq_node_t *classqueue_get_head(struct classqueue_struct *cq); + +/*update the base priority of the classqueue*/ +void classqueue_update_base(struct classqueue_struct *cq, int new_base); + +/** + * class_compare_prio: compare the priority of this two nodes + */ +static inline int class_compare_prio(struct cq_node_struct* node1, struct cq_node_struct* node2) +{ + return ( node1->prio - node2->prio); +} + +#endif diff --git a/include/linux/ckrm_sched.h b/include/linux/ckrm_sched.h new file mode 100644 index 000000000..9d82214fb --- /dev/null +++ b/include/linux/ckrm_sched.h @@ -0,0 +1,297 @@ +/* include/linux/ckrm_sched.h - Supports CKRM scheduling + * + * Copyright (C) Haoqiang Zheng, IBM Corp. 2004 + * Copyright (C) Hubertus Franke, IBM Corp. 2004 + * + * Latest version, more details at http://ckrm.sf.net + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + */ + +#ifndef _CKRM_SCHED_H +#define _CKRM_SCHED_H + +#define CC_BUG_ON_DO(cond,action) do { if (cond) action; BUG_ON(cond); } while(0) +#define CC_BUG_ON(cond) BUG_ON(cond) + +#include +#include +#include + +//update every second +#define CVT_UPDATE_TICK (1*HZ/1 ?: 1) +#define CLASS_BONUS_RATE 22 // shift from ns to increase class bonus +#define PRIORITY_BONUS_RATE 0 // ?? 
Hubertus + +#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) +struct prio_array { + int nr_active; + unsigned long bitmap[BITMAP_SIZE]; + struct list_head queue[MAX_PRIO]; +}; + +struct ckrm_local_runqueue { + cq_node_t classqueue_linkobj; /*links in classqueue */ + struct ckrm_cpu_class *cpu_class; // class it belongs to + struct classqueue_struct *classqueue; // classqueue it belongs tow + CVT_t uncounted_cvt; + unsigned long long uncounted_ns; + + prio_array_t *active, *expired, arrays[2]; + /* + set to 0 on init, become null or array switch + set to jiffies whenever an non-interactive job expires + reset to jiffies if expires + */ + unsigned long expired_timestamp; + + /* + * highest priority of tasks in active + * initialized to be MAX_PRIO + * updated on enqueue, dequeue + */ + int top_priority; + CVT_t local_cvt; // snapshot of local_cvt, update on every loadbalance + unsigned long magic; //for debugging +}; + +/** + * @last_sleep: the last time it sleeps, last_sleep = 0 when not sleeping + */ +struct ckrm_cpu_class_local_stat { + unsigned long long run; + unsigned long long total; + unsigned long long last_sleep; + unsigned long cpu_demand; /*estimated cpu demand */ +}; + +/** + * ckrm_cpu_class_stat - cpu usage statistics maintained for each class + * + */ +struct ckrm_cpu_class_stat { + spinlock_t stat_lock; + + unsigned long long total_ns; /*how much nano-secs it has consumed */ + + struct ckrm_cpu_class_local_stat local_stats[NR_CPUS]; + unsigned long cpu_demand; + + /*temp stat used by cpu monitor */ + int effective_guarantee; + int effective_limit; + int glut; //true or false + /* + * effective_share: for both default class and its children + * self_effective_share: just for the default class + */ + int effective_share; + int self_effective_share; +}; + +typedef struct ckrm_cpu_class_stat ckrm_stat_t; + +/* + * manages the class status + * there should be only one instance of this object for each class in the whole system + */ +struct ckrm_cpu_class { + struct ckrm_core_class *core; + struct ckrm_core_class *parent; + struct ckrm_shares shares; + spinlock_t cnt_lock; // always grab parent's lock first and then child's + CVT_t global_cvt; // total cummulative virtual time + struct ckrm_cpu_class_stat stat; + struct list_head links; // for linking up in cpu classes + struct ckrm_local_runqueue local_queues[NR_CPUS]; // runqueues +}; + +#if CONFIG_CKRM_CPU_SCHEDULE +#define rq_active(p,rq) (get_task_class_queue(p)->active) +#define rq_expired(p,rq) (get_task_class_queue(p)->expired) +#else +#define rq_active(p,rq) (rq->active) +#define rq_expired(p,rq) (rq->expired) +#endif + +//#define cpu_class_weight(cls) (cls->shares.my_guarantee) +#define cpu_class_weight(cls) (cls->stat.self_effective_share) + +#define bpt_queue(cpu) (& (cpu_rq(cpu)->classqueue) ) +CVT_t get_min_cvt(int cpu); + +struct classqueue_struct *get_cpu_classqueue(int cpu); + +extern struct ckrm_cpu_class default_cpu_class_obj; +#define default_cpu_class (&default_cpu_class_obj) + +#define local_queue_nr_running(local_queue) \ + (local_queue->active->nr_active + local_queue->expired->nr_active) + +static inline struct ckrm_local_runqueue * +get_ckrm_local_runqueue(struct ckrm_cpu_class*cls, int cpu) +{ + return &(cls->local_queues[cpu]); +} + +static inline struct ckrm_local_runqueue *get_task_class_queue(struct task_struct *p) +{ + return &(p->cpu_class->local_queues[task_cpu(p)]); +} + +#define task_list_entry(list) list_entry(list,struct task_struct,run_list) +#define 
class_list_entry(list) list_entry(list,struct ckrm_local_runqueue,classqueue_linkobj) + +/* some additional interfaces exported from sched.c */ +struct runqueue; +void dequeue_task(struct task_struct *p, prio_array_t * array); +void enqueue_task(struct task_struct *p, prio_array_t * array); +struct runqueue *task_rq_lock(task_t * p, unsigned long *flags); +void task_rq_unlock(struct runqueue *rq, unsigned long *flags); +extern spinlock_t cvt_lock; +extern rwlock_t class_list_lock; +extern struct list_head active_cpu_classes; + +/*functions exported by ckrm_cpu_class.c*/ +int __init init_ckrm_sched_res(void); +void init_cpu_classes(void); + +/*functions exported by ckrm_cpu_monitor.c*/ +void ckrm_cpu_monitor(void); +void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat); +#define CPU_DEMAND_ENQUEUE 0 +#define CPU_DEMAND_DEQUEUE 1 +#define CPU_DEMAND_DESCHEDULE 2 +void cpu_demand_event(struct ckrm_cpu_class_local_stat* local_stat, int event, unsigned long long len); + +#define get_task_local_stat(p) (&(p)->cpu_class->stat.local_stats[task_cpu(p)]) +#define get_rq_local_stat(lrq,cpu) (&(lrq)->cpu_class->stat.local_stats[cpu]) + +/** + * get_effective_prio: return the effective priority of a class local queue + * + * class priority = progress * a + urgency * b + * progress = queue cvt + * urgency = queue top priority + * a and b are scaling factors + * currently, prio increases by 1 if either: top_priority increase by one + * or, local_cvt increases by 4ms + */ +static inline int get_effective_prio(struct ckrm_local_runqueue * lcq) +{ + int prio; + + // cumulative usage + prio = lcq->local_cvt >> CLASS_BONUS_RATE; + // queue urgency + prio += lcq->top_priority >> PRIORITY_BONUS_RATE; + + return prio; +} + +/** + * update_class_priority: + * + * called whenever cvt or top_priority changes + * + * internal: (calling structure) + * update_class_priority + * -- set_top_priority + * -- class_enqueue_task + * -- class_dequeue_task + * -- rq_get_next_task (queue switch) + * -- update_local_cvt + * -- schedule + * -- update_global_cvt + */ +static inline void update_class_priority(struct ckrm_local_runqueue *local_rq) +{ + int effective_prio = get_effective_prio(local_rq); + classqueue_update_prio(local_rq->classqueue, + &local_rq->classqueue_linkobj, + effective_prio); +} + +/* + * set the new top priority and reposition the queue + * called when: task enqueue/dequeue and queue switch + */ +static inline void set_top_priority(struct ckrm_local_runqueue *class_queue, + int new_priority) +{ + class_queue->top_priority = new_priority; + update_class_priority(class_queue); +} + +static inline void class_enqueue_task(struct task_struct *p, + prio_array_t * array) +{ + struct ckrm_local_runqueue *queue; + int effective_prio; + + queue = get_task_class_queue(p); + + if (! 
cls_in_classqueue(&queue->classqueue_linkobj)) { + cpu_demand_event(get_task_local_stat(p),CPU_DEMAND_ENQUEUE,0); + /*make sure the cvt of this class is up to date*/ + queue->local_cvt = get_min_cvt(task_cpu(p)); + effective_prio = get_effective_prio(queue); + classqueue_enqueue(queue->classqueue, &queue->classqueue_linkobj, effective_prio); + } + + if ((p->prio < queue->top_priority) && (array == queue->active)) + set_top_priority(queue, p->prio); + +} + +static inline void class_dequeue_task(struct task_struct *p, + prio_array_t * array) +{ + struct ckrm_local_runqueue *queue = get_task_class_queue(p); + + if ((array == queue->active) && (p->prio == queue->top_priority) + && list_empty(&(array->queue[p->prio]))) + set_top_priority(queue, + find_next_bit(array->bitmap, MAX_PRIO, + p->prio)); +} + +/* + * called after a task is switched out. Update the local cvt accounting + * we need to stick with long instead of long long due to nonexistent 64-bit division + */ +static inline void update_local_cvt(struct task_struct *p, unsigned long nsec) +{ + struct ckrm_local_runqueue *class_queue = get_task_class_queue(p); + struct ckrm_cpu_class *cls = class_queue->cpu_class; + + unsigned long cvt_inc = nsec / cpu_class_weight(cls); + + class_queue->local_cvt += cvt_inc; + class_queue->uncounted_cvt += cvt_inc; + + class_queue->uncounted_ns += nsec; + update_class_priority(class_queue); +} + +/* + * called during loadbalancing + * to charge the class with locally accumulated cvt + */ +void update_global_cvts(int this_cpu); + +/** + * + */ +static inline int class_preempts_curr(struct task_struct * p, struct task_struct* curr) +{ + struct cq_node_struct* node1 = &(get_task_class_queue(p)->classqueue_linkobj); + struct cq_node_struct* node2 = &(get_task_class_queue(curr)->classqueue_linkobj); + + return (class_compare_prio(node1,node2) < 0); +} +#endif diff --git a/kernel/ckrm/ckrm_cpu_class.c b/kernel/ckrm/ckrm_cpu_class.c new file mode 100644 index 000000000..0ded7f3c6 --- /dev/null +++ b/kernel/ckrm/ckrm_cpu_class.c @@ -0,0 +1,350 @@ +/* kernel/ckrm/ckrm_cpu_class.c - CPU Class resource controller for CKRM + * + * Copyright (C) Haoqiang Zheng, IBM Corp. 2004 + * (C) Hubertus Franke, IBM Corp. 2004 + * + * Latest version, more details at http://ckrm.sf.net + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +struct ckrm_res_ctlr cpu_rcbs; + +/* + * initialize a class object and its local queues + */ + static void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares) +{ + int i,j,k; + prio_array_t *array; + struct ckrm_local_runqueue* queue; + + for (i = 0 ; i < NR_CPUS ; i++) { + queue = &cls->local_queues[i]; + queue->active = queue->arrays; + queue->expired = queue->arrays+1; + + for (j = 0; j < 2; j++) { + array = queue->arrays + j; + for (k = 0; k < MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(MAX_PRIO, array->bitmap); + array->nr_active = 0; + } + + queue->expired_timestamp = 0; + + queue->cpu_class = cls; + queue->classqueue = get_cpu_classqueue(i); + queue->top_priority = MAX_PRIO; + cq_node_init(&queue->classqueue_linkobj); + queue->local_cvt = 0; + queue->uncounted_cvt = 0; + queue->uncounted_ns = 0; + queue->magic = 0x43FF43D7; + } + + cls->shares = *shares; + cls->global_cvt = 0; + cls->cnt_lock = SPIN_LOCK_UNLOCKED; + ckrm_cpu_stat_init(&cls->stat); + + // add to class list + write_lock(&class_list_lock); + list_add(&cls->links,&active_cpu_classes); + write_unlock(&class_list_lock); +} + +static inline void set_default_share(ckrm_shares_t *shares) +{ + shares->my_guarantee = 0; + shares->my_limit = CKRM_SHARE_DFLT_MAX_LIMIT; + shares->total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; + shares->max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; + shares->unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; + shares->cur_max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; +} + +struct ckrm_cpu_class * ckrm_get_cpu_class(struct ckrm_core_class *core) { + return ckrm_get_res_class(core, cpu_rcbs.resid, struct ckrm_cpu_class); +} + + +void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class *parent) +{ + struct ckrm_cpu_class *cls; + + if (! parent) /*root class*/ + cls = default_cpu_class; + else + cls = (struct ckrm_cpu_class *) kmalloc(sizeof(struct ckrm_cpu_class),GFP_ATOMIC); + + if (cls) { + ckrm_shares_t shares; + if ((! 
parent) && (core)) { + /* + * the default class is already initialized + * so only update the core structure + */ + cls->core = core; + } else { + set_default_share(&shares); + init_cpu_class(cls,&shares); + cls->core = core; + cls->parent = parent; + } + } else + printk("alloc_cpu_class failed GFP_ATOMIC\n"); + + return cls; +} + +/* + * hzheng: this is not a stable implementation + * need to check race condition issue here + */ +static void ckrm_free_cpu_class(void *my_res) +{ + struct ckrm_cpu_class *cls = my_res, *parres, *childres; + ckrm_core_class_t *child = NULL; + int maxlimit; + + if (!cls) + return; + + /*the default class can't be freed*/ + if (cls == default_cpu_class) + return; + + // Assuming there will be no children when this function is called + parres = ckrm_get_cpu_class(cls->parent); + + // return child's limit/guarantee to parent node + spin_lock(&parres->cnt_lock); + child_guarantee_changed(&parres->shares, cls->shares.my_guarantee, 0); + // run thru parent's children and get the new max_limit of the parent + ckrm_lock_hier(parres->core); + maxlimit = 0; + while ((child = ckrm_get_next_child(parres->core, child)) != NULL) { + childres = ckrm_get_cpu_class(child); + if (maxlimit < childres->shares.my_limit) { + maxlimit = childres->shares.my_limit; + } + } + ckrm_unlock_hier(parres->core); + if (parres->shares.cur_max_limit < maxlimit) { + parres->shares.cur_max_limit = maxlimit; + } + + spin_unlock(&parres->cnt_lock); + + write_lock(&class_list_lock); + list_del(&cls->links); + write_unlock(&class_list_lock); + + kfree(cls); +} + +/* + * the system will adjust to the new share automatically + */ +int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share) +{ + struct ckrm_cpu_class *parres, *cls = my_res; + struct ckrm_shares *cur = &cls->shares, *par; + int rc = -EINVAL; + + if (!cls) + return rc; + + if (cls->parent) { + parres = ckrm_get_cpu_class(cls->parent); + spin_lock(&parres->cnt_lock); + spin_lock(&cls->cnt_lock); + par = &parres->shares; + } else { + spin_lock(&cls->cnt_lock); + par = NULL; + parres = NULL; + } + + rc = set_shares(new_share, cur, par); + + spin_unlock(&cls->cnt_lock); + if (cls->parent) { + spin_unlock(&parres->cnt_lock); + } + return rc; +} + +/* + * translate the global_CVT to ticks + */ +static int ckrm_cpu_get_share(void *my_res, + struct ckrm_shares *shares) +{ + struct ckrm_cpu_class *cls = my_res; + + if (!cls) + return -EINVAL; + *shares = cls->shares; + return 0; +} + +int ckrm_cpu_get_stats(void *my_res, struct seq_file * sfile) +{ + struct ckrm_cpu_class *cls = my_res; + + if (!cls) + return -EINVAL; + + seq_printf(sfile, "-------- CPU Class Status Start---------\n"); + seq_printf(sfile, " gua= %d limit= %d\n", + cls->shares.my_guarantee, + cls->shares.my_limit); + seq_printf(sfile, " total_gua= %d limit= %d\n", + cls->shares.total_guarantee, + cls->shares.max_limit); + seq_printf(sfile, " used_gua= %d cur_limit= %d\n", + cls->shares.unused_guarantee, + cls->shares.cur_max_limit); + + seq_printf(sfile, " Share= %d\n",cpu_class_weight(cls)); + seq_printf(sfile, " cvt= %llu\n",cls->local_queues[0].local_cvt); + seq_printf(sfile, " total_ns= %llu\n",cls->stat.total_ns); + seq_printf(sfile, " prio= %d\n",cls->local_queues[0].classqueue_linkobj.prio); + seq_printf(sfile, " index= %d\n",cls->local_queues[0].classqueue_linkobj.index); + seq_printf(sfile, " run= %llu\n",cls->stat.local_stats[0].run); + seq_printf(sfile, " total= %llu\n",cls->stat.local_stats[0].total); + seq_printf(sfile, " cpu_demand= 
%lu\n",cls->stat.cpu_demand); + + seq_printf(sfile, " effective_guarantee= %d\n",cls->stat.effective_guarantee); + seq_printf(sfile, " effective_limit= %d\n",cls->stat.effective_limit); + seq_printf(sfile, " effective_share= %d\n",cls->stat.effective_share); + seq_printf(sfile, "-------- CPU Class Status END ---------\n"); + + + return 0; +} + +/* + * task will remain in the same cpu but on a different local runqueue + */ +static void ckrm_cpu_change_class(void *task, void *old, void *new) +{ + struct task_struct *tsk = task; + struct ckrm_cpu_class *newcls = new; + unsigned long flags; + struct runqueue *rq; + prio_array_t *array; + + /*sanity checking*/ + if (!task || ! old || !new) + return; + + rq = task_rq_lock(tsk,&flags); + array = tsk->array; + if (array) { + dequeue_task(tsk,array); + tsk->cpu_class = newcls; + enqueue_task(tsk,rq_active(tsk,rq)); + } else { + tsk->cpu_class = newcls; + } + task_rq_unlock(rq,&flags); +} + +/*dummy function, not used*/ +static int ckrm_cpu_show_config(void *my_res, struct seq_file *sfile) +{ + struct ckrm_cpu_class *cls = my_res; + + if (!cls) + return -EINVAL; + + seq_printf(sfile, "cls=%s,parameter=somevalue\n","ckrm_cpu class"); + return 0; +} + +/*dummy function, not used*/ +static int ckrm_cpu_set_config(void *my_res, const char *cfgstr) +{ + struct ckrm_cpu_class *cls = my_res; + + if (!cls) + return -EINVAL; + printk("ckrm_cpu config='%s'\n",cfgstr); + return 0; +} + +struct ckrm_res_ctlr cpu_rcbs = { + .res_name = "CKRM CPU Class", + .res_hdepth = 1, + .resid = -1, + .res_alloc = ckrm_alloc_cpu_class, + .res_free = ckrm_free_cpu_class, + .set_share_values = ckrm_cpu_set_share, + .get_share_values = ckrm_cpu_get_share, + .get_stats = ckrm_cpu_get_stats, + .show_config = ckrm_cpu_show_config, + .set_config = ckrm_cpu_set_config, + .change_resclass = ckrm_cpu_change_class, +}; + +int __init init_ckrm_sched_res(void) +{ + struct ckrm_classtype *clstype; + int resid = cpu_rcbs.resid; + + clstype = ckrm_find_classtype_by_name("taskclass"); + if (clstype == NULL) { + printk(KERN_INFO" Unknown ckrm classtype"); + return -ENOENT; + } + + if (resid == -1) { /*not registered */ + resid = ckrm_register_res_ctlr(clstype,&cpu_rcbs); + printk("........init_ckrm_sched_res , resid= %d\n",resid); + } + return 0; +} + +/* + * initialize the class structure + * add the default class: class 0 + */ +void init_cpu_classes(void) +{ + int i; + + //init classqueues for each processor + for (i=0; i < NR_CPUS; i++) + classqueue_init(get_cpu_classqueue(i)); +/* + * hzheng: initialize the default cpu class + * required for E14 since ckrm_init is called after sched_init + */ + ckrm_alloc_cpu_class(NULL,NULL); +} + + +EXPORT_SYMBOL(ckrm_get_cpu_class); diff --git a/kernel/ckrm/ckrm_cpu_monitor.c b/kernel/ckrm/ckrm_cpu_monitor.c new file mode 100644 index 000000000..674ee6e50 --- /dev/null +++ b/kernel/ckrm/ckrm_cpu_monitor.c @@ -0,0 +1,542 @@ +/* ckrm_cpu_monitor.c - Hierarchical CKRM CPU Resource Monitor + * + * Copyright (C) Haoqiang Zheng, IBM Corp. 2004 + * (C) Hubertus Franke, IBM Corp. 2004 + * + * Latest version, more details at http://ckrm.sf.net + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ *
+ */
+
+/* Changes
+ *
+ * 23 June 2004: Created
+ *
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define CPU_MONITOR_INTERVAL (4*HZ) /*how often do we adjust the shares*/
+#define CKRM_SHARE_ACCURACY 7
+#define CKRM_SHARE_MAX (1<<CKRM_SHARE_ACCURACY)
+
+void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat)
+{
+	int i;
+	struct ckrm_cpu_class_local_stat *local_stat;
+	unsigned long long now = sched_clock();
+
+	stat->stat_lock = SPIN_LOCK_UNLOCKED;
+	stat->total_ns = 0;
+	stat->cpu_demand = 0;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		local_stat = &stat->local_stats[i];
+		local_stat->run = 0;
+		local_stat->total = 0;
+		local_stat->last_sleep = now;
+		local_stat->cpu_demand = 0;
+	}
+
+	stat->effective_guarantee = 0;
+	stat->effective_limit = 0;
+	stat->glut = 0;
+	stat->effective_share = 100;
+	stat->self_effective_share = 100;
+}
+/**********************************************/
+/* cpu demand */
+/**********************************************/
+
+/*
+ * How CPU demand is calculated:
+ * consider the class local runqueue (clr) first
+ * at any time, a clr can be in one of the following three states
+ * -- run: a task belonging to this class is running on this cpu
+ * -- wait: at least one of its tasks is runnable, but the class is not running
+ * -- sleep: none of the tasks of this class is runnable
+ *
+ * cpu_demand(t1,t2) = r(t1,t2)/(r(t1,t2)+s(t1,t2))
+ *
+ * the cpu_demand of a class =
+ * sum of the cpu_demand of all its class local runqueues
+ */
+
+/**
+ * update_cpu_demand - record a state change
+ *
+ * should be called whenever the state of a local queue changes
+ * -- on deschedule: report how long the class ran
+ * -- on enqueue: report how long it slept
+ *
+ * to deal with excessively long run/sleep states
+ * -- whenever ckrm_cpu_monitor is called, check if the class is in the sleep
+ *    state; if so, update the sleep record
+ */
+#define CKRM_CPU_DEMAND_RUN 0
+#define CKRM_CPU_DEMAND_SLEEP 1
+//how often should we recalculate the cpu demand, in ns
+#define CPU_DEMAND_CAL_THRESHOLD (1000000000LL)
+static inline void update_local_cpu_demand(struct ckrm_cpu_class_local_stat *local_stat, int state, unsigned long long len)
+{
+	local_stat->total += len;
+	if (state == CKRM_CPU_DEMAND_RUN)
+		local_stat->run += len;
+
+	if (local_stat->total >= CPU_DEMAND_CAL_THRESHOLD) {
+		local_stat->total >>= CKRM_SHARE_ACCURACY;
+		if (local_stat->total > 0xFFFFFFFF)
+			local_stat->total = 0xFFFFFFFF;
+
+		do_div(local_stat->run, (unsigned long)local_stat->total);
+		local_stat->cpu_demand += local_stat->run;
+		local_stat->cpu_demand >>= 1;
+		local_stat->total = 0;
+		local_stat->run = 0;
+	}
+}
+
+static inline void cpu_demand_update_run(struct ckrm_cpu_class_local_stat *local_stat, unsigned long long len)
+{
+	update_local_cpu_demand(local_stat, CKRM_CPU_DEMAND_RUN, len);
+}
+
+static inline void cpu_demand_update_sleep(struct ckrm_cpu_class_local_stat *local_stat, unsigned long long len)
+{
+	update_local_cpu_demand(local_stat, CKRM_CPU_DEMAND_SLEEP, len);
+}
+
+#define CPU_DEMAND_ENQUEUE 0
+#define CPU_DEMAND_DEQUEUE 1
+#define CPU_DEMAND_DESCHEDULE 2
+
+/**
+ * cpu_demand_event - a cpu_demand event occurred
+ * @event: one of the following three events:
+ *    CPU_DEMAND_ENQUEUE: local class enqueue
+ *    CPU_DEMAND_DEQUEUE: local class dequeue
+ *    CPU_DEMAND_DESCHEDULE: a task belonging to this local class was descheduled
+ * @len: valid only for CPU_DEMAND_DESCHEDULE, how long the task has run
+ */
+void cpu_demand_event(struct ckrm_cpu_class_local_stat *local_stat, int event, unsigned long long len)
+{
+	switch (event) {
+	case CPU_DEMAND_ENQUEUE:
+		len = sched_clock() - local_stat->last_sleep;
+		local_stat->last_sleep = 0;
+
cpu_demand_update_sleep(local_stat,len); + break; + case CPU_DEMAND_DEQUEUE: + local_stat->last_sleep = sched_clock(); + break; + case CPU_DEMAND_DESCHEDULE: + cpu_demand_update_run(local_stat,len); + break; + default: + BUG(); + } +} + +/** + * check all the class local queue + * if local queueu is not in runqueue, then it's in sleep state + * if compare to last sleep, + */ +static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu) +{ + struct ckrm_cpu_class_local_stat * local_stat = &stat->local_stats[cpu]; + unsigned long long sleep,now; + if (local_stat->last_sleep) { + now = sched_clock(); + sleep = now - local_stat->last_sleep; + local_stat->last_sleep = now; + cpu_demand_update_sleep(local_stat,sleep); + } +} + +/** + *get_self_cpu_demand - get cpu demand of the class itself (excluding children) + * + * self_cpu_demand = sum(cpu demand of all local queues) + */ +static unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat + *stat) +{ + int cpu_demand = 0; + int i; + + for_each_online_cpu(i) { + cpu_demand_check_sleep(stat,i); + cpu_demand += stat->local_stats[i].cpu_demand; + } + + if (cpu_demand > CKRM_SHARE_MAX) + cpu_demand = CKRM_SHARE_MAX; + return cpu_demand; +} + +/* + * update effective cpu demand for each class + * assume the root_core->parent == NULL + */ +static void update_cpu_demand(struct ckrm_core_class *root_core) +{ + struct ckrm_core_class *cur_core, *child_core; + struct ckrm_cpu_class *cls; + + cur_core = root_core; + child_core = NULL; + /* + * iterate the tree + * update cpu_demand of each node + */ + repeat: + if (!cur_core) + return; + + cls = ckrm_get_cpu_class(cur_core); + if (!child_core) //first child + cls->stat.cpu_demand = get_self_cpu_demand(&cls->stat); + else { + cls->stat.cpu_demand += + ckrm_get_cpu_class(child_core)->stat.cpu_demand; + if (cls->stat.cpu_demand > CKRM_SHARE_MAX) + cls->stat.cpu_demand = CKRM_SHARE_MAX; + } + + //next child + child_core = ckrm_get_next_child(cur_core, child_core); + if (child_core) { + //go down + cur_core = child_core; + child_core = NULL; + goto repeat; + } else { //no more child, go back + child_core = cur_core; + cur_core = child_core->hnode.parent; + } + goto repeat; +} + +/**********************************************/ +/* effective guarantee & limit */ +/**********************************************/ +static inline void set_effective_share(struct ckrm_cpu_class_stat *stat, + int new_share) +{ + if (!new_share) + new_share = 1; + stat->effective_share = new_share; +} + +static inline void set_self_effective_share(struct ckrm_cpu_class_stat *stat, + int new_share) +{ + if (!new_share) + new_share = 1; + stat->self_effective_share = new_share; +} + +static inline void update_child_effective(struct ckrm_core_class *parent) +{ + struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent); + struct ckrm_core_class *child_core = ckrm_get_next_child(parent, NULL); + + while (child_core) { + struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core); + + c_cls->stat.effective_guarantee = + p_cls->stat.effective_guarantee * + c_cls->shares.my_guarantee / p_cls->shares.total_guarantee; + c_cls->stat.effective_limit = + p_cls->stat.effective_guarantee * c_cls->shares.my_limit / + p_cls->shares.total_guarantee; + + child_core = ckrm_get_next_child(parent, child_core); + }; + +} + +/* + * update effective guarantee and effective limit + * -- effective share = parent->effective->share * share/parent->total_share + * -- effective limit = parent->effective->share * 
limit/parent->total_share + * should be called only when class structure changed + */ +static void update_effective_guarantee_limit(struct ckrm_core_class *root_core) +{ + struct ckrm_core_class *cur_core, *child_core = NULL; + struct ckrm_cpu_class *cls; + + cur_core = root_core; + cls = ckrm_get_cpu_class(cur_core); + cls->stat.effective_guarantee = CKRM_SHARE_MAX; + cls->stat.effective_limit = cls->stat.effective_guarantee; + + repeat: + //check exit + if (!cur_core) + return; + + //visit this node + update_child_effective(cur_core); + //next child + child_core = ckrm_get_next_child(cur_core, child_core); + if (child_core) { + //go down + cur_core = child_core; + child_core = NULL; + goto repeat; + } else { //no more child, go back + child_core = cur_core; + cur_core = child_core->hnode.parent; + } + goto repeat; +} + +/**********************************************/ +/* surplus allocation */ +/**********************************************/ + +/* + * surplus = my_effective_share - demand + * if surplus < 0, surplus = 0 + */ +static inline int get_node_surplus(struct ckrm_cpu_class *cls) +{ + int surplus = cls->stat.effective_guarantee - cls->stat.cpu_demand; + + if (surplus < 0) + surplus = 0; + + return surplus; +} + +/* + * consume the surplus + * return how much consumed + * set glut when necessary + */ +static inline int node_surplus_consume(int old_surplus, + struct ckrm_core_class *child_core, + struct ckrm_cpu_class *p_cls) +{ + int consumed = 0; + int inc_limit; + + struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core); + + if (c_cls->stat.glut) + goto out; + + //check demand + if (c_cls->stat.effective_share >= c_cls->stat.cpu_demand) { + c_cls->stat.glut = 1; + goto out; + } + + consumed = + old_surplus * c_cls->shares.my_guarantee / + p_cls->shares.total_guarantee; + + //check limit + inc_limit = c_cls->stat.effective_limit - c_cls->stat.effective_share; + if (inc_limit <= consumed) { + c_cls->stat.glut = 1; + consumed = inc_limit; + } + + c_cls->stat.effective_share += consumed; + out: + return consumed; +} + +/* + * re-allocate the shares for all the childs under this node + * task: + * 1. get total surplus + * 2. allocate surplus + * 3. 
set the effective_share of each node + */ +static void alloc_surplus_node(struct ckrm_core_class *parent) +{ + int total_surplus = 0, old_surplus = 0; + struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent); + struct ckrm_core_class *child_core = NULL; + int self_share; + + /* + * calculate surplus + * total_surplus = sum(child_surplus) + * reset glut flag + * initialize effective_share + */ + do { + child_core = ckrm_get_next_child(parent, child_core); + if (child_core) { + struct ckrm_cpu_class *c_cls = + ckrm_get_cpu_class(child_core); + ckrm_stat_t *stat = &c_cls->stat; + + total_surplus += get_node_surplus(c_cls); + stat->glut = 0; + set_effective_share(stat, stat->effective_guarantee); + } + } while (child_core); + + /*distribute the surplus */ + child_core = NULL; + do { + if (!child_core) //keep the surplus of last round + old_surplus = total_surplus; + + child_core = ckrm_get_next_child(parent, child_core); + if (child_core) { + total_surplus -= + node_surplus_consume(old_surplus, child_core, + p_cls); + } + //start a new round if something is allocated in the last round + } while (child_core || (total_surplus != old_surplus)); + + //any remaining surplus goes to the default class + self_share = p_cls->stat.effective_share * + p_cls->shares.unused_guarantee / p_cls->shares.total_guarantee; + self_share += total_surplus; + + set_self_effective_share(&p_cls->stat, self_share); +} + +/** + * alloc_surplus - reallocate unused shares + * + * class A's usused share should be allocated to its siblings + */ +static void alloc_surplus(struct ckrm_core_class *root_core) +{ + struct ckrm_core_class *cur_core, *child_core = NULL; + struct ckrm_cpu_class *cls; + + cur_core = root_core; + cls = ckrm_get_cpu_class(cur_core); + cls->stat.glut = 0; + set_effective_share(&cls->stat, cls->stat.effective_guarantee); + repeat: + //check exit + if (!cur_core) + return; + + //visit this node + alloc_surplus_node(cur_core); + //next child + child_core = ckrm_get_next_child(cur_core, child_core); + if (child_core) { + //go down + cur_core = child_core; + child_core = NULL; + goto repeat; + } else { //no more child, go back + child_core = cur_core; + cur_core = child_core->hnode.parent; + } + goto repeat; +} + +/** + *ckrm_cpu_monitor - adjust relative shares of the classes based on their progress + * + * this function is called every CPU_MONITOR_INTERVAL + * it computes the cpu demand of each class + * and re-allocate the un-used shares to other classes + */ +void ckrm_cpu_monitor(void) +{ + struct ckrm_core_class *root_core = default_cpu_class->core; + if (!root_core) + return; + + update_effective_guarantee_limit(root_core); + update_cpu_demand(root_core); + alloc_surplus(root_core); +} + +/*****************************************************/ +/* Supporting Functions */ +/*****************************************************/ +static pid_t cpu_monitor_pid = -1; +static int thread_exit = 0; + +static int ckrm_cpu_monitord(void *nothing) +{ + wait_queue_head_t wait; + + init_waitqueue_head(&wait); + + daemonize("ckrm_cpu_ctrld"); + for (;;) { + /*sleep for sometime before next try*/ + interruptible_sleep_on_timeout(&wait, CPU_MONITOR_INTERVAL); + ckrm_cpu_monitor(); + if (thread_exit) { + break; + } + } + cpu_monitor_pid = -1; + thread_exit = 2; + printk("cpu_monitord exit\n"); + return 0; +} + +void ckrm_start_monitor(void) +{ + cpu_monitor_pid = kernel_thread(ckrm_cpu_monitord, 0, CLONE_KERNEL); + if (cpu_monitor_pid < 0) { + printk("ckrm_cpu_monitord for failed\n"); + } +} + +void 
ckrm_kill_monitor(void) +{ + wait_queue_head_t wait; + int interval = HZ; + init_waitqueue_head(&wait); + + printk("killing process %d\n", cpu_monitor_pid); + if (cpu_monitor_pid > 0) { + thread_exit = 1; + while (thread_exit != 2) { + interruptible_sleep_on_timeout(&wait, interval); + } + } +} + +int ckrm_cpu_monitor_init(void) +{ + ckrm_start_monitor(); + return 0; +} + +void ckrm_cpu_monitor_exit(void) +{ + ckrm_kill_monitor(); +} + +module_init(ckrm_cpu_monitor_init); +module_exit(ckrm_cpu_monitor_exit); + +MODULE_AUTHOR("Haoqiang Zheng "); +MODULE_DESCRIPTION("Hierarchical CKRM CPU Resource Monitor"); +MODULE_LICENSE("GPL"); diff --git a/kernel/ckrm_classqueue.c b/kernel/ckrm_classqueue.c new file mode 100644 index 000000000..1929aaf4e --- /dev/null +++ b/kernel/ckrm_classqueue.c @@ -0,0 +1,178 @@ +/* kernel/ckrm_classqueue.c : implements the class queue + * + * Copyright (C) Haoqiang Zheng, IBM Corp. 2003 + * (C) Hubertus Franke, IBM Corp. 2003 + * + * Class queue functionality for CKRM cpu controller + * + * Latest version, more details at http://ckrm.sf.net + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + */ + +/* Changes + * + * Aug 28, 2003 + * Created. + * July 08, 2004 + * classqueue now has a fixed size + * major clean up + * function/structure names are changed to more intuitive ones + */ +#include +#include + +#define cq_nr_member(cq) (cq->array.nr_active) + +/** + * get_index - translate the logical priority to the real index in the queue + * + * validate the position + * a valid prio is [cq->base,cq->base + size -1] + */ +static inline unsigned long get_index(struct classqueue_struct *cq, int *prio) +{ + unsigned long index; + int max_prio; + + if (!cq_nr_member(cq)) + return 0; + + max_prio = cq->base + (CLASSQUEUE_SIZE - 1); + if (*prio > max_prio) + *prio = max_prio; + if (*prio < cq->base) + *prio = cq->base; + + index = (cq->base_offset + (*prio - cq->base)) ; + if (index >= CLASSQUEUE_SIZE) + index -= CLASSQUEUE_SIZE; + + return index; +} + +/** + * initialize a class queue object + */ +int classqueue_init(struct classqueue_struct *cq) +{ + int i; + struct cq_prio_array *array; + + array = &cq->array; + for (i = 0; i < CLASSQUEUE_SIZE; i++) { + INIT_LIST_HEAD(array->queue + i); + __clear_bit(i, array->bitmap); + } + // delimiter for bitsearch + __set_bit(CLASSQUEUE_SIZE, array->bitmap); + array->nr_active = 0; + + cq->base = 0; + cq->base_offset = -1; //not valid yet + + return 0; +} + +/** + *classqueue_enqueue - add the class to classqueue based on its prio + */ +void classqueue_enqueue(struct classqueue_struct *cq, + cq_node_t * node, int prio) +{ + int index; + + //get real index + if (cq_nr_member(cq)) { + index = get_index(cq, &prio); + } else { //the first one + cq->base = prio; + cq->base_offset = 0; + index = 0; + } + + //add to the queue + list_add(&(node->list), &cq->array.queue[index]); + __set_bit(index, cq->array.bitmap); + cq->array.nr_active++; + + node->index = index; + node->prio = prio; +} + +void classqueue_dequeue(struct classqueue_struct *cq, cq_node_t * node) +{ + //delete from queue + list_del_init(&(node->list)); + cq->array.nr_active--; + + //check clear the bitmap + if (list_empty(&cq->array.queue[node->index])) + __clear_bit(node->index, cq->array.bitmap); +} + +void classqueue_update_prio(struct classqueue_struct *cq, + 
cq_node_t * node, int new_pos) +{ + int index; + + if (! cls_in_classqueue(node)) + return; + + index = get_index(cq, &new_pos); + node->prio = new_pos; + + //remove from the original position + list_del_init(&(node->list)); + if (list_empty(&cq->array.queue[node->index])) + __clear_bit(node->index, cq->array.bitmap); + + //add to new positon, round robin for classes with same priority + list_add_tail(&(node->list), &cq->array.queue[index]); + __set_bit(index, cq->array.bitmap); + + node->index = index; +} + +cq_node_t *classqueue_get_head(struct classqueue_struct *cq) +{ + cq_node_t *result = NULL; + int pos; + + /* + * search over the bitmap to get the first class in the queue + */ + pos = find_next_bit(cq->array.bitmap, CLASSQUEUE_SIZE, cq->base_offset); + if (pos >= CLASSQUEUE_SIZE) { //do circular search from the beginning + pos = find_first_bit(cq->array.bitmap, CLASSQUEUE_SIZE); + } + + if (pos < CLASSQUEUE_SIZE) { + BUG_ON(list_empty(&cq->array.queue[pos])); + result = list_entry(cq->array.queue[pos].next, cq_node_t, list); + } + return result; +} + +/** + * Moving the end of queue forward + * the new_base here is logical, we need to translate to the abosule position + */ +void classqueue_update_base(struct classqueue_struct *cq, int new_base) +{ + if (!cq_nr_member(cq)) { + cq->base_offset = -1; //not defined + return; + } + + // assert(new_base >= cq->base); + + if (new_base > cq->base) { + cq->base_offset = get_index(cq, &new_base); + cq->base = new_base; + } +} diff --git a/kernel/ckrm_sched.c b/kernel/ckrm_sched.c new file mode 100644 index 000000000..ba716d4c5 --- /dev/null +++ b/kernel/ckrm_sched.c @@ -0,0 +1,71 @@ +/* kernel/ckrm_sched.c - Supporting functions for ckrm scheduling + * + * Copyright (C) Haoqiang Zheng, IBM Corp. 2004 + * (C) Hubertus Franke, IBM Corp. 2004 + * + * Latest version, more details at http://ckrm.sf.net + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + */ +#include +#include +#include + +/*******************************************************/ +/* CVT Management */ +/*******************************************************/ +#define CVT_WINDOW_SIZE (CLASSQUEUE_SIZE << CLASS_BONUS_RATE) +static CVT_t max_CVT = CVT_WINDOW_SIZE; + +/* + * Also ensure that the classes global cvt is upgraded to the + * minimum CVT in the system, as a class might not have run for a while + */ +static void update_global_cvt(struct ckrm_cpu_class *cpu_class, int cpu) +{ + struct ckrm_local_runqueue *class_queue = + get_ckrm_local_runqueue(cpu_class, cpu); + CVT_t min_cvt; + CVT_t local_cvt_old = class_queue->local_cvt; + + spin_lock(&cvt_lock); + if (class_queue->uncounted_cvt) { + cpu_class->global_cvt += class_queue->uncounted_cvt; + class_queue->uncounted_cvt = 0; + } + min_cvt = max_CVT - CVT_WINDOW_SIZE; + if (cpu_class->global_cvt < min_cvt) + cpu_class->global_cvt = min_cvt; + else if (cpu_class->global_cvt > max_CVT) + max_CVT = cpu_class->global_cvt; + +/* update local cvt from global cvt*/ +#if 0 + class_queue->local_cvt = cpu_class->global_cvt; +#endif + spin_unlock(&cvt_lock); + + if (class_queue->local_cvt != local_cvt_old) + update_class_priority(class_queue); +} + +/* + * class_list_lock must have been acquired + */ +void update_global_cvts(int this_cpu) +{ + struct ckrm_cpu_class *clsptr; + struct ckrm_local_runqueue *class_queue; + + /*for each class*/ + list_for_each_entry(clsptr, &active_cpu_classes, links) { + update_global_cvt(clsptr, this_cpu); + class_queue = get_ckrm_local_runqueue(clsptr, this_cpu); + clsptr->stat.total_ns += class_queue->uncounted_ns; + class_queue->uncounted_ns = 0; + } +} -- 2.43.0
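The stand-alone sketches below illustrate the scheduler arithmetic introduced by this patch; they are not part of the commit, and any name not quoted from the patch is invented for the example. This first one restates the window clamp in update_global_cvt() from kernel/ckrm_sched.c: a class that has slept for a long time is pulled up to max_CVT - CVT_WINDOW_SIZE so its stale virtual time cannot starve everyone else, while a class that has run ahead pushes max_CVT forward. The cvt_lock, the uncounted_cvt transfer and the static max_CVT variable are omitted, and clamp_global_cvt() is a made-up helper name.

/* User-space model of the global-CVT catch-up clamp (illustrative only). */
#include <stdio.h>

typedef unsigned long long CVT_t;

#define CLASSQUEUE_SIZE  128                 /* same constants as the patch */
#define CLASS_BONUS_RATE 22
#define CVT_WINDOW_SIZE  ((CVT_t)CLASSQUEUE_SIZE << CLASS_BONUS_RATE)

/* Pull a long-sleeping class up to the bottom of the CVT window; let a class
 * that ran ahead raise the window's top. */
static CVT_t clamp_global_cvt(CVT_t global_cvt, CVT_t *max_cvt)
{
	CVT_t min_cvt = *max_cvt - CVT_WINDOW_SIZE;

	if (global_cvt < min_cvt)
		global_cvt = min_cvt;
	else if (global_cvt > *max_cvt)
		*max_cvt = global_cvt;
	return global_cvt;
}

int main(void)
{
	CVT_t max_cvt = 10 * CVT_WINDOW_SIZE;
	CVT_t idle_class = 2 * CVT_WINDOW_SIZE;  /* slept for a long time */

	/* prints the window bottom: the idle class is caught up, not rewarded */
	printf("global_cvt -> %llu\n", clamp_global_cvt(idle_class, &max_cvt));
	return 0;
}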
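get_effective_prio() in include/linux/ckrm_sched.h ranks a class local runqueue by prio = (local_cvt >> CLASS_BONUS_RATE) + (top_priority >> PRIORITY_BONUS_RATE). With CLASS_BONUS_RATE = 22 and PRIORITY_BONUS_RATE = 0 as defined in that header, one priority step corresponds to 2^22 ns, about 4.2 ms of weighted CPU time, which is the "4ms" mentioned in the function's comment. A throwaway restatement of the arithmetic, not part of the patch:

/* Effective-priority arithmetic with the patch's scaling constants. */
#include <stdio.h>

#define CLASS_BONUS_RATE    22  /* shift from ns of cvt to priority units */
#define PRIORITY_BONUS_RATE 0   /* top_priority is used as-is */

static int effective_prio(unsigned long long local_cvt, int top_priority)
{
	int prio = (int)(local_cvt >> CLASS_BONUS_RATE);  /* cumulative usage */
	prio += top_priority >> PRIORITY_BONUS_RATE;      /* queue urgency */
	return prio;
}

int main(void)
{
	/* 8.4 ms of weighted cvt adds 2 steps on top of a top_priority of 115 */
	printf("prio = %d\n", effective_prio(8400000ULL, 115));
	return 0;
}

Lower values win: classqueue_get_head() picks the lowest occupied slot, so a class that has consumed less weighted CPU time, or holds a more urgent task, is scheduled first.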
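The priority-to-slot mapping used by get_index() in kernel/ckrm_classqueue.c is the part of the circular queue that is easiest to misread, so here is a small restatement (illustrative, not from the patch): a priority is first clamped into [base, base + CLASSQUEUE_SIZE - 1] and then placed on the 128-slot ring relative to base_offset. The struct and function names below are mine, and the empty-queue special case (the patch returns index 0 when the queue has no members) is dropped for brevity.

/* Stand-alone model of the classqueue priority -> ring-slot mapping. */
#include <stdio.h>

#define CLASSQUEUE_SIZE 128

struct ring {
	int base;         /* lowest logical priority the window can hold */
	int base_offset;  /* ring slot that 'base' currently maps to */
};

/* Clamp prio into the window, then translate it to a slot index. */
static unsigned int ring_index(const struct ring *cq, int *prio)
{
	int max_prio = cq->base + (CLASSQUEUE_SIZE - 1);
	unsigned int index;

	if (*prio > max_prio)
		*prio = max_prio;
	if (*prio < cq->base)
		*prio = cq->base;

	index = cq->base_offset + (*prio - cq->base);
	if (index >= CLASSQUEUE_SIZE)
		index -= CLASSQUEUE_SIZE;     /* wrap around the ring */
	return index;
}

int main(void)
{
	struct ring cq = { .base = 100, .base_offset = 120 };
	int prio = 110;  /* 10 above base -> slot (120 + 10) mod 128 = 2 */

	printf("prio %d -> slot %u\n", prio, ring_index(&cq, &prio));
	return 0;
}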
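update_local_cpu_demand() in kernel/ckrm/ckrm_cpu_monitor.c closes a measurement window roughly once per second of run+sleep time and folds the window's run/total ratio, rescaled to 0..CKRM_SHARE_MAX, into an exponential average. The sketch below is a user-space model of that arithmetic, not kernel code: it reuses CKRM_SHARE_ACCURACY = 7 and CPU_DEMAND_CAL_THRESHOLD = 1 s from the patch, but replaces do_div() with plain 64-bit division and drops the 32-bit clamp on the divisor.

/* User-space model of the per-cpu demand estimate (illustrative only). */
#include <stdio.h>

#define CKRM_SHARE_ACCURACY      7
#define CKRM_SHARE_MAX           (1 << CKRM_SHARE_ACCURACY)   /* 128 */
#define CPU_DEMAND_CAL_THRESHOLD 1000000000ULL                /* ~1 s in ns */

struct demand_stat {
	unsigned long long run;    /* ns spent running in the current window */
	unsigned long long total;  /* ns of run + sleep in the current window */
	unsigned long cpu_demand;  /* running average, 0..CKRM_SHARE_MAX */
};

/* Fold one run or sleep interval into the window; close the window at ~1 s. */
static void demand_update(struct demand_stat *s, int running, unsigned long long len)
{
	s->total += len;
	if (running)
		s->run += len;

	if (s->total >= CPU_DEMAND_CAL_THRESHOLD) {
		/* sample = run / (total / 128): the ratio scaled to 0..128 */
		unsigned long sample =
			(unsigned long)(s->run / (s->total >> CKRM_SHARE_ACCURACY));
		/* exponential average: new = (old + sample) / 2 */
		s->cpu_demand = (s->cpu_demand + sample) >> 1;
		s->total = 0;
		s->run = 0;
	}
}

int main(void)
{
	struct demand_stat s = { 0, 0, 0 };

	demand_update(&s, 1, 600000000ULL);   /* 600 ms running */
	demand_update(&s, 0, 400000000ULL);   /* 400 ms sleeping: closes the window */
	/* the window sample is ~60% of 128 (76); averaged with the initial 0 -> 38 */
	printf("cpu_demand = %lu of %d\n", s.cpu_demand, CKRM_SHARE_MAX);
	return 0;
}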
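update_child_effective() in ckrm_cpu_monitor.c pushes guarantees down the class tree with effective_guarantee(child) = effective_guarantee(parent) * my_guarantee / total_guarantee, and alloc_surplus_node() later redistributes whatever a child does not demand to its siblings (and ultimately to the parent's default class) using the same my_guarantee / total_guarantee ratios, capped by each child's effective limit. The sketch below models only the first, deterministic split for one parent and two children; the flat array and the helper name are invented, only the formula comes from the patch.

/* Model of the effective-guarantee split across one parent's children. */
#include <stdio.h>

#define CKRM_SHARE_MAX 128  /* scale used for the root class in the patch */

struct cls {
	int my_guarantee;         /* share requested under the parent */
	int effective_guarantee;  /* absolute share after propagation */
};

static void split_guarantee(int parent_effective, int total_guarantee,
                            struct cls *children, int n)
{
	for (int i = 0; i < n; i++)
		children[i].effective_guarantee =
			parent_effective * children[i].my_guarantee / total_guarantee;
}

int main(void)
{
	/* root owns the whole scale (128); children asked for 30 and 50 out of 100 */
	struct cls kids[2] = { { .my_guarantee = 30 }, { .my_guarantee = 50 } };

	split_guarantee(CKRM_SHARE_MAX, 100, kids, 2);
	printf("child0 = %d, child1 = %d of %d\n",
	       kids[0].effective_guarantee, kids[1].effective_guarantee,
	       CKRM_SHARE_MAX);
	return 0;
}

The remaining 20 units of guarantee (the unused_guarantee) plus any surplus the children do not demand end up in the parent's self_effective_share, which is what cpu_class_weight() uses when scaling a task's consumed nanoseconds into CVT.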