This commit was manufactured by cvs2svn to create branch 'ckrm'.
author    Planet-Lab Support <support@planet-lab.org>
Fri, 16 Jul 2004 20:14:56 +0000 (20:14 +0000)
committer Planet-Lab Support <support@planet-lab.org>
Fri, 16 Jul 2004 20:14:56 +0000 (20:14 +0000)
include/linux/ckrm_classqueue.h [new file with mode: 0644]
include/linux/ckrm_sched.h [new file with mode: 0644]
kernel/ckrm/ckrm_cpu_class.c [new file with mode: 0644]
kernel/ckrm/ckrm_cpu_monitor.c [new file with mode: 0644]
kernel/ckrm_classqueue.c [new file with mode: 0644]
kernel/ckrm_sched.c [new file with mode: 0644]

diff --git a/include/linux/ckrm_classqueue.h b/include/linux/ckrm_classqueue.h
new file mode 100644 (file)
index 0000000..1bdf9b7
--- /dev/null
@@ -0,0 +1,129 @@
+/* include/linux/ckrm_classqueue.h : cpu control for CKRM
+ *
+ * Copyright (C) Haoqiang Zheng, IBM Corp. 2003
+ *           (C) Hubertus Franke, IBM Corp. 2003
+ * 
+ * Circular queue functionality for CKRM cpu controller
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/* Changes
+ *
+ * Aug 28, 2003
+ *        Created.
+ * July 07, 2004
+ *   clean up, add comments     
+ *   
+ */
+
+#ifndef _CKRM_CLASSQUEUE_H
+#define _CKRM_CLASSQUEUE_H
+
+#include <linux/list.h>
+
+#define CLASSQUEUE_SIZE 128
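+/* bitmap sized for CLASSQUEUE_SIZE bits plus one delimiter bit (used by bitsearch), rounded up to whole longs */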
+#define CQ_BITMAP_SIZE ((((CLASSQUEUE_SIZE+1+7)/8)+sizeof(long)-1)/sizeof(long))
+
+/**
+ * struct cq_prio_array: duplicates prio_array defined in sched.c 
+ *
+ * The data structure is duplicated here to keep the ckrm_classqueue implementation modular
+ */
+struct cq_prio_array {
+       int nr_active;
+       unsigned long bitmap[CQ_BITMAP_SIZE];
+       struct list_head queue[CLASSQUEUE_SIZE];
+};
+
+/**
+ * struct classqueue_struct - a runqueue of class local runqueues
+ * @array: priority array
+ * @base: base priority
+ * @base_offset: index in array for the base
+ *
+ * the classqueue can be thought of as a runqueue of classes (instead of a runqueue of tasks)
+ * like the task runqueue, each processor has its own classqueue
+ * a class enters the classqueue when the first task in its class local runqueue shows up
+ * a class leaves the classqueue when the last task in its local runqueue leaves
+ * class local runqueues are ordered based on their priority
+ *
+ * status:
+ *   hzheng: is 32bit base long enough?
+ */
+struct classqueue_struct {
+       struct cq_prio_array array;
+       unsigned long base;
+       unsigned long base_offset;
+};
+
+/** 
+ * struct cq_node_struct - the link object between class local runqueue and classqueue
+ * @list: links the class local runqueue to classqueue
+ * @prio: class priority, calculated based on its progress (cvt) and urgency (top_priority)
+ * @index: real index into the classqueue array, calculated based on priority
+ *
+ * NOTE: make sure list is empty when it's not in classqueue
+ */
+struct cq_node_struct {
+       struct list_head list;
+       int prio;
+       int index;
+};
+typedef struct cq_node_struct cq_node_t;
+
+typedef unsigned long long CVT_t;      // cumulative virtual time
+
+static inline void cq_node_init(cq_node_t * node)
+{
+       node->prio = 0;
+       node->index = -1;
+       INIT_LIST_HEAD(&node->list);
+}
+
+/* return whether the class is in the classqueue */
+static inline int cls_in_classqueue(cq_node_t * node)
+{
+       return !list_empty(&node->list);
+}
+
+/*initialize the data structure*/
+int classqueue_init(struct classqueue_struct *cq);
+
+/*add the class to classqueue*/
+void classqueue_enqueue(struct classqueue_struct *cq, cq_node_t * node, int prio);
+
+/**
+ * classqueue_dequeue - remove the class from classqueue
+ * 
+ * internal:
+ *   called when the last task is removed from the queue
+ *   checked on load balancing and schedule
+ *   hzheng: why don't I call it on class_dequeue_task?
+ */
+void classqueue_dequeue(struct classqueue_struct *cq, cq_node_t * node);
+
+/*change the position of the class in classqueue*/
+void classqueue_update_prio(struct classqueue_struct *cq, cq_node_t * node, int new_prio);
+
+/*return the first class in classqueue*/
+cq_node_t *classqueue_get_head(struct classqueue_struct *cq);
+
+/*update the base priority of the classqueue*/
+void classqueue_update_base(struct classqueue_struct *cq, int new_base);
+
+/**
+ * class_compare_prio: compare the priority of these two nodes
+ */
+static inline int class_compare_prio(struct cq_node_struct* node1, struct cq_node_struct* node2)
+{
+       return ( node1->prio - node2->prio);
+}
+
+#endif
diff --git a/include/linux/ckrm_sched.h b/include/linux/ckrm_sched.h
new file mode 100644 (file)
index 0000000..9d82214
--- /dev/null
@@ -0,0 +1,297 @@
+/* include/linux/ckrm_sched.h - Supports CKRM scheduling
+ *
+ * Copyright (C) Haoqiang Zheng,  IBM Corp. 2004
+ * Copyright (C) Hubertus Franke,  IBM Corp. 2004
+ * 
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#ifndef _CKRM_SCHED_H
+#define _CKRM_SCHED_H
+
+#define CC_BUG_ON_DO(cond,action)  do { if (cond)  action; BUG_ON(cond); } while(0)
+#define CC_BUG_ON(cond)            BUG_ON(cond)
+
+#include <linux/sched.h>
+#include <linux/ckrm_rc.h>
+#include <linux/ckrm_classqueue.h>
+
+//update every second
+#define CVT_UPDATE_TICK     (1*HZ/1 ?: 1)
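+/* the GNU "x ?: y" extension above guards against a zero interval, falling back to one tick */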
+#define CLASS_BONUS_RATE 22    // shift from ns to increase class bonus
+#define PRIORITY_BONUS_RATE 0  // ??  Hubertus
+
+#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
+struct prio_array {
+       int nr_active;
+       unsigned long bitmap[BITMAP_SIZE];
+       struct list_head queue[MAX_PRIO];
+};
+
+struct ckrm_local_runqueue {
+       cq_node_t classqueue_linkobj;   /*links in classqueue */
+       struct ckrm_cpu_class *cpu_class;       // class it belongs to
+       struct classqueue_struct *classqueue;   // classqueue it belongs to
+       CVT_t uncounted_cvt;
+       unsigned long long uncounted_ns;
+
+       prio_array_t *active, *expired, arrays[2];
+       /*
+          set to 0 on init and on array switch
+          set to jiffies whenever a non-interactive job expires
+          reset to jiffies on each subsequent expiration
+        */
+       unsigned long expired_timestamp;
+
+       /* 
+        * highest priority of tasks in active
+        * initialized to be MAX_PRIO
+        * updated on enqueue, dequeue
+        */
+       int top_priority;
+       CVT_t local_cvt;        // snapshot of local cvt, updated on every load balance
+       unsigned long magic;    //for debugging
+};
+
+/**
+ * @last_sleep: the last time the class went to sleep; last_sleep = 0 when it is not sleeping
+ */
+struct ckrm_cpu_class_local_stat {
+       unsigned long long run;
+       unsigned long long total;
+       unsigned long long last_sleep;
+       unsigned long cpu_demand; /*estimated cpu demand */
+};
+
+/**
+ * ckrm_cpu_class_stat - cpu usage statistics maintained for each class
+ * 
+ */
+struct ckrm_cpu_class_stat {
+       spinlock_t stat_lock;
+
+       unsigned long long total_ns;    /*how much nano-secs it has consumed */
+
+       struct ckrm_cpu_class_local_stat local_stats[NR_CPUS];
+       unsigned long cpu_demand;
+
+       /*temp stat used by cpu monitor */
+       int effective_guarantee;
+       int effective_limit;
+       int glut;               //true or false
+       /*
+        * effective_share: for both default class and its children
+        * self_effective_share: just for the default class
+        */
+       int effective_share;
+       int self_effective_share;
+};
+
+typedef struct ckrm_cpu_class_stat ckrm_stat_t;
+
+/*
+ * manages the class status
+ * there should be only one instance of this object for each class in the whole system  
+ */
+struct ckrm_cpu_class {
+       struct ckrm_core_class *core;
+       struct ckrm_core_class *parent;
+       struct ckrm_shares shares;
+       spinlock_t cnt_lock;    // always grab parent's lock first and then child's
+       CVT_t global_cvt;       // total cumulative virtual time
+       struct ckrm_cpu_class_stat stat;
+       struct list_head links; // for linking up in cpu classes
+       struct ckrm_local_runqueue local_queues[NR_CPUS];       // runqueues 
+};
+
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+#define rq_active(p,rq)   (get_task_class_queue(p)->active)
+#define rq_expired(p,rq)  (get_task_class_queue(p)->expired)
+#else
+#define rq_active(p,rq)   (rq->active)
+#define rq_expired(p,rq)  (rq->expired)
+#endif
+
+//#define cpu_class_weight(cls) (cls->shares.my_guarantee)
+#define cpu_class_weight(cls) (cls->stat.self_effective_share)
+
+#define bpt_queue(cpu) (& (cpu_rq(cpu)->classqueue) )
+CVT_t get_min_cvt(int cpu);
+
+struct classqueue_struct *get_cpu_classqueue(int cpu);
+
+extern struct ckrm_cpu_class default_cpu_class_obj;
+#define default_cpu_class (&default_cpu_class_obj)
+
+#define local_queue_nr_running(local_queue) \
+             (local_queue->active->nr_active + local_queue->expired->nr_active)
+
+static inline struct ckrm_local_runqueue *
+get_ckrm_local_runqueue(struct ckrm_cpu_class*cls, int cpu)
+{
+       return &(cls->local_queues[cpu]);
+}
+
+static inline struct ckrm_local_runqueue *get_task_class_queue(struct task_struct *p)
+{
+       return &(p->cpu_class->local_queues[task_cpu(p)]);
+}
+
+#define task_list_entry(list)  list_entry(list,struct task_struct,run_list)
+#define class_list_entry(list) list_entry(list,struct ckrm_local_runqueue,classqueue_linkobj)
+
+/* some additional interfaces exported from sched.c */
+struct runqueue;
+void dequeue_task(struct task_struct *p, prio_array_t * array);
+void enqueue_task(struct task_struct *p, prio_array_t * array);
+struct runqueue *task_rq_lock(task_t * p, unsigned long *flags);
+void task_rq_unlock(struct runqueue *rq, unsigned long *flags);
+extern spinlock_t cvt_lock;
+extern rwlock_t class_list_lock;
+extern struct list_head active_cpu_classes;
+
+/*functions exported by ckrm_cpu_class.c*/
+int __init init_ckrm_sched_res(void);
+void init_cpu_classes(void);
+
+/*functions exported by ckrm_cpu_monitor.c*/
+void ckrm_cpu_monitor(void);
+void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat);
+#define CPU_DEMAND_ENQUEUE 0
+#define CPU_DEMAND_DEQUEUE 1
+#define CPU_DEMAND_DESCHEDULE 2
+void cpu_demand_event(struct ckrm_cpu_class_local_stat* local_stat, int event, unsigned long long len);
+
+#define get_task_local_stat(p) (&(p)->cpu_class->stat.local_stats[task_cpu(p)])
+#define get_rq_local_stat(lrq,cpu) (&(lrq)->cpu_class->stat.local_stats[cpu])
+
+/**
+ * get_effective_prio: return the effective priority of a class local queue
+ *
+ * class priority = progress * a + urgency * b
+ * progress = queue cvt
+ * urgency = queue top priority
+ * a and b are scaling factors  
+ * currently, prio increases by 1 if either: top_priority increases by one
+ *                                   or, local_cvt increases by 4ms
+ */
+static inline int get_effective_prio(struct ckrm_local_runqueue * lcq)
+{
+       int prio;
+
+       // cumulative usage
+       prio = lcq->local_cvt >> CLASS_BONUS_RATE;
+       // queue urgency
+       prio += lcq->top_priority >> PRIORITY_BONUS_RATE;
+
+       return prio;
+}
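+/*
+ * Illustrative example: with CLASS_BONUS_RATE = 22 and PRIORITY_BONUS_RATE = 0,
+ * a queue with local_cvt = 8,400,000 ns and top_priority = 100 gets
+ * prio = (8400000 >> 22) + (100 >> 0) = 2 + 100 = 102; accumulating roughly
+ * another 4ms (2^22 ns) of cvt raises prio by one, the same effect as
+ * top_priority rising by one.
+ */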
+
+/** 
+ * update_class_priority:
+ * 
+ * called whenever cvt or top_priority changes
+ *
+ * internal: (calling structure)
+ * update_class_priority
+ *   -- set_top_priority
+ *      -- class_enqueue_task
+ *      -- class_dequeue_task
+ *      -- rq_get_next_task (queue switch)
+ *   -- update_local_cvt
+ *      -- schedule
+ *   -- update_global_cvt
+ */
+static inline void update_class_priority(struct ckrm_local_runqueue *local_rq)
+{
+       int effective_prio = get_effective_prio(local_rq);
+       classqueue_update_prio(local_rq->classqueue,
+                              &local_rq->classqueue_linkobj,
+                              effective_prio);
+}
+
+/*
+ *  set the new top priority and reposition the queue
+ *  called when: task enqueue/dequeue and queue switch
+ */
+static inline void set_top_priority(struct ckrm_local_runqueue *class_queue,
+                                   int new_priority)
+{
+       class_queue->top_priority = new_priority;
+       update_class_priority(class_queue);
+}
+
+static inline void class_enqueue_task(struct task_struct *p,
+                                     prio_array_t * array)
+{
+       struct ckrm_local_runqueue *queue;
+       int effective_prio;
+
+       queue = get_task_class_queue(p);
+
+       if (! cls_in_classqueue(&queue->classqueue_linkobj)) {
+               cpu_demand_event(get_task_local_stat(p),CPU_DEMAND_ENQUEUE,0);
+               /*make sure the cvt of this class is up to date*/
+               queue->local_cvt = get_min_cvt(task_cpu(p));
+               effective_prio = get_effective_prio(queue);
+               classqueue_enqueue(queue->classqueue, &queue->classqueue_linkobj, effective_prio);
+       } 
+       
+       if ((p->prio < queue->top_priority) && (array == queue->active))
+               set_top_priority(queue, p->prio);       
+
+}
+
+static inline void class_dequeue_task(struct task_struct *p,
+                                     prio_array_t * array)
+{
+       struct ckrm_local_runqueue *queue = get_task_class_queue(p);
+
+       if ((array == queue->active) && (p->prio == queue->top_priority)
+           && list_empty(&(array->queue[p->prio])))
+               set_top_priority(queue,
+                                find_next_bit(array->bitmap, MAX_PRIO,
+                                              p->prio));
+}
+
+/*
+ *  called after a task is switched out. Update the local cvt accounting 
+ *  we stick with long instead of long long because 64-bit division is not available here
+ */
+static inline void update_local_cvt(struct task_struct *p, unsigned long nsec)
+{
+       struct ckrm_local_runqueue *class_queue = get_task_class_queue(p);
+       struct ckrm_cpu_class *cls = class_queue->cpu_class;
+
+       unsigned long cvt_inc = nsec / cpu_class_weight(cls);
+
+       class_queue->local_cvt += cvt_inc;
+       class_queue->uncounted_cvt += cvt_inc;
+
+       class_queue->uncounted_ns += nsec;
+       update_class_priority(class_queue);
+}
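+/*
+ * Illustrative example: a class with self_effective_share = 100 that runs for
+ * 1,000,000 ns advances its local_cvt by 1000000 / 100 = 10000, while a class
+ * with share 50 running for the same time advances by 20000 -- a class with a
+ * smaller share accumulates virtual time faster, so its priority value grows sooner.
+ */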
+
+/*
+ * called during loadbalancing 
+ * to charge the class with locally accumulated cvt
+ */
+void update_global_cvts(int this_cpu);
+
+/**
+ * class_preempts_curr - return true if p's class has a higher priority than curr's class
+ */
+static inline int class_preempts_curr(struct task_struct * p, struct task_struct* curr)
+{
+       struct cq_node_struct* node1 = &(get_task_class_queue(p)->classqueue_linkobj);
+       struct cq_node_struct* node2 = &(get_task_class_queue(curr)->classqueue_linkobj);
+
+       return (class_compare_prio(node1,node2) < 0);
+}
+#endif
diff --git a/kernel/ckrm/ckrm_cpu_class.c b/kernel/ckrm/ckrm_cpu_class.c
new file mode 100644 (file)
index 0000000..0ded7f3
--- /dev/null
@@ -0,0 +1,350 @@
+/* kernel/ckrm/ckrm_cpu_class.c - CPU Class resource controller for CKRM
+ *
+ * Copyright (C) Haoqiang Zheng,  IBM Corp. 2004
+ *           (C) Hubertus Franke, IBM Corp. 2004
+ * 
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <asm/errno.h>
+#include <linux/sched.h>
+#include <linux/ckrm.h>
+#include <linux/ckrm_rc.h>
+#include <linux/ckrm_tc.h>
+#include <linux/ckrm_sched.h>
+#include <linux/ckrm_classqueue.h>
+#include <linux/seq_file.h>
+
+
+struct ckrm_res_ctlr cpu_rcbs;
+
+/*
+ *  initialize a class object and its local queues
+ */
+static void init_cpu_class(struct ckrm_cpu_class *cls, ckrm_shares_t *shares)
+{
+       int i,j,k;      
+       prio_array_t *array;    
+       struct ckrm_local_runqueue* queue;
+
+       for (i = 0 ; i < NR_CPUS ; i++) {
+               queue = &cls->local_queues[i];
+               queue->active  = queue->arrays;
+               queue->expired = queue->arrays+1;
+               
+               for (j = 0; j < 2; j++) {
+                       array = queue->arrays + j;
+                       for (k = 0; k < MAX_PRIO; k++) {
+                               INIT_LIST_HEAD(array->queue + k);
+                               __clear_bit(k, array->bitmap);
+                       }
+                       // delimiter for bitsearch
+                       __set_bit(MAX_PRIO, array->bitmap);
+                       array->nr_active = 0;
+               }
+
+               queue->expired_timestamp = 0;
+               
+               queue->cpu_class = cls;
+               queue->classqueue = get_cpu_classqueue(i);
+               queue->top_priority = MAX_PRIO;
+               cq_node_init(&queue->classqueue_linkobj);
+               queue->local_cvt = 0;
+               queue->uncounted_cvt = 0;
+               queue->uncounted_ns = 0;
+               queue->magic = 0x43FF43D7;
+       }
+
+       cls->shares = *shares;
+       cls->global_cvt = 0;
+       cls->cnt_lock = SPIN_LOCK_UNLOCKED;
+       ckrm_cpu_stat_init(&cls->stat);
+
+       // add to class list
+       write_lock(&class_list_lock);
+       list_add(&cls->links,&active_cpu_classes);
+       write_unlock(&class_list_lock);
+}
+
+static inline void set_default_share(ckrm_shares_t *shares)
+{
+       shares->my_guarantee     = 0;
+       shares->my_limit         = CKRM_SHARE_DFLT_MAX_LIMIT;
+       shares->total_guarantee  = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
+       shares->max_limit        = CKRM_SHARE_DFLT_MAX_LIMIT;
+       shares->unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
+       shares->cur_max_limit    = CKRM_SHARE_DFLT_MAX_LIMIT;
+}
+
+struct ckrm_cpu_class * ckrm_get_cpu_class(struct ckrm_core_class *core) {
+       return ckrm_get_res_class(core, cpu_rcbs.resid, struct ckrm_cpu_class);
+}
+
+
+void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class *parent) 
+{              
+       struct ckrm_cpu_class *cls;
+
+       if (! parent) /*root class*/
+               cls =  default_cpu_class;
+       else
+               cls = (struct ckrm_cpu_class *) kmalloc(sizeof(struct ckrm_cpu_class),GFP_ATOMIC);
+
+       if (cls) {
+               ckrm_shares_t shares;           
+               if ((! parent) && (core)) { 
+                       /*
+                        * the default class is already initialized
+                        * so only update the core structure
+                        */
+                       cls->core = core;                       
+               } else {
+                       set_default_share(&shares);
+                       init_cpu_class(cls,&shares);
+                       cls->core = core;
+                       cls->parent = parent;
+               }
+       } else
+               printk("alloc_cpu_class failed GFP_ATOMIC\n");
+
+       return cls;
+}              
+
+/*
+ * hzheng: this is not a stable implementation
+ *         need to check race condition issue here
+ */            
+static void ckrm_free_cpu_class(void *my_res) 
+{                      
+       struct ckrm_cpu_class *cls = my_res, *parres, *childres;
+       ckrm_core_class_t *child = NULL;
+       int maxlimit;
+
+       if (!cls) 
+               return;
+
+       /*the default class can't be freed*/
+       if (cls == default_cpu_class) 
+               return;
+
+       // Assuming there will be no children when this function is called
+       parres = ckrm_get_cpu_class(cls->parent);
+
+       // return child's limit/guarantee to parent node
+       spin_lock(&parres->cnt_lock);
+       child_guarantee_changed(&parres->shares, cls->shares.my_guarantee, 0);
+       // run thru parent's children and get the new max_limit of the parent
+       ckrm_lock_hier(parres->core);
+       maxlimit = 0;
+       while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
+               childres = ckrm_get_cpu_class(child);
+               if (maxlimit < childres->shares.my_limit) {
+                       maxlimit = childres->shares.my_limit;
+               }
+       }
+       ckrm_unlock_hier(parres->core);
+       if (parres->shares.cur_max_limit < maxlimit) {
+               parres->shares.cur_max_limit = maxlimit;
+       }
+
+       spin_unlock(&parres->cnt_lock);
+
+       write_lock(&class_list_lock);
+       list_del(&cls->links);
+       write_unlock(&class_list_lock);
+
+       kfree(cls);
+}                              
+
+/*
+ *  the system will adjust to the new share automatically  
+ */                    
+int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share) 
+{      
+        struct ckrm_cpu_class *parres, *cls = my_res;
+        struct ckrm_shares *cur = &cls->shares, *par;
+        int rc = -EINVAL;
+
+        if (!cls) 
+                return rc;
+
+        if (cls->parent) {
+                parres = ckrm_get_cpu_class(cls->parent);
+                spin_lock(&parres->cnt_lock);
+                spin_lock(&cls->cnt_lock);
+                par = &parres->shares;
+        } else {
+                spin_lock(&cls->cnt_lock);
+                par = NULL;
+                parres = NULL;
+        }
+
+       rc = set_shares(new_share, cur, par);
+
+       spin_unlock(&cls->cnt_lock);
+       if (cls->parent) {
+               spin_unlock(&parres->cnt_lock);
+       }
+       return rc;
+}                                                      
+                       
+/*
+ * return the current share settings of the class
+ */
+static int ckrm_cpu_get_share(void *my_res,
+                             struct ckrm_shares *shares)
+{                      
+       struct ckrm_cpu_class *cls = my_res;
+
+       if (!cls) 
+               return -EINVAL;
+       *shares = cls->shares;
+       return 0;
+}                              
+
+int ckrm_cpu_get_stats(void *my_res, struct seq_file * sfile)
+{
+       struct ckrm_cpu_class *cls = my_res;
+
+       if (!cls) 
+               return -EINVAL;
+
+       seq_printf(sfile, "-------- CPU Class Status Start---------\n");
+       seq_printf(sfile, "  gua= %d limit= %d\n",
+                  cls->shares.my_guarantee,
+                  cls->shares.my_limit);
+       seq_printf(sfile, "  total_gua= %d limit= %d\n",
+                  cls->shares.total_guarantee,
+                  cls->shares.max_limit);
+       seq_printf(sfile, "  unused_gua= %d cur_limit= %d\n",
+                  cls->shares.unused_guarantee,
+                  cls->shares.cur_max_limit);
+
+       seq_printf(sfile, "  Share= %d\n",cpu_class_weight(cls));
+       seq_printf(sfile, "  cvt= %llu\n",cls->local_queues[0].local_cvt);
+       seq_printf(sfile, "  total_ns= %llu\n",cls->stat.total_ns);
+       seq_printf(sfile, "  prio= %d\n",cls->local_queues[0].classqueue_linkobj.prio);
+       seq_printf(sfile, "  index= %d\n",cls->local_queues[0].classqueue_linkobj.index);
+       seq_printf(sfile, "  run= %llu\n",cls->stat.local_stats[0].run);
+       seq_printf(sfile, "  total= %llu\n",cls->stat.local_stats[0].total);
+       seq_printf(sfile, "  cpu_demand= %lu\n",cls->stat.cpu_demand);
+
+       seq_printf(sfile, "  effective_guarantee= %d\n",cls->stat.effective_guarantee);
+       seq_printf(sfile, "  effective_limit= %d\n",cls->stat.effective_limit);
+       seq_printf(sfile, "  effective_share= %d\n",cls->stat.effective_share);
+       seq_printf(sfile, "-------- CPU Class Status END ---------\n");
+
+
+       return 0;
+}
+
+/*
+ * the task will remain on the same cpu but move to a different local runqueue
+ */
+static void ckrm_cpu_change_class(void *task, void *old, void *new)
+{              
+       struct task_struct *tsk = task;                    
+       struct ckrm_cpu_class *newcls = new;
+       unsigned long flags;
+       struct runqueue *rq;
+       prio_array_t *array;
+
+       /*sanity checking*/
+       if (!task || ! old || !new)
+               return; 
+
+       rq = task_rq_lock(tsk,&flags); 
+       array = tsk->array;
+       if (array) {
+               dequeue_task(tsk,array);
+               tsk->cpu_class = newcls;
+               enqueue_task(tsk,rq_active(tsk,rq));
+       } else {
+               tsk->cpu_class = newcls;
+       }
+       task_rq_unlock(rq,&flags);
+}                                                      
+
+/*dummy function, not used*/
+static int ckrm_cpu_show_config(void *my_res, struct seq_file *sfile)
+{
+       struct ckrm_cpu_class *cls = my_res;
+
+       if (!cls) 
+               return -EINVAL;
+
+       seq_printf(sfile, "cls=%s,parameter=somevalue\n","ckrm_cpu class");
+       return 0;
+}
+
+/*dummy function, not used*/
+static int ckrm_cpu_set_config(void *my_res, const char *cfgstr)
+{
+       struct ckrm_cpu_class *cls = my_res;
+
+       if (!cls) 
+               return -EINVAL;
+       printk("ckrm_cpu config='%s'\n",cfgstr);
+       return 0;
+}
+       
+struct ckrm_res_ctlr cpu_rcbs = {
+       .res_name          = "CKRM CPU Class",
+       .res_hdepth        = 1,
+       .resid             = -1,
+       .res_alloc         = ckrm_alloc_cpu_class,
+       .res_free          = ckrm_free_cpu_class,
+       .set_share_values  = ckrm_cpu_set_share,
+       .get_share_values  = ckrm_cpu_get_share,
+       .get_stats         = ckrm_cpu_get_stats,
+       .show_config       = ckrm_cpu_show_config,
+       .set_config        = ckrm_cpu_set_config,
+       .change_resclass   = ckrm_cpu_change_class,
+};
+
+int __init init_ckrm_sched_res(void)
+{
+       struct ckrm_classtype *clstype;
+       int resid = cpu_rcbs.resid;
+
+       clstype = ckrm_find_classtype_by_name("taskclass");
+       if (clstype == NULL) {
+               printk(KERN_INFO "Unknown ckrm classtype <taskclass>\n");
+               return -ENOENT;
+       }
+
+       if (resid == -1) { /*not registered */
+               resid = ckrm_register_res_ctlr(clstype,&cpu_rcbs);
+               printk("........init_ckrm_sched_res , resid= %d\n",resid);
+       }
+       return 0;
+}
+
+/*
+ * initialize the class structure
+ * add the default class: class 0
+ */
+void init_cpu_classes(void) 
+{
+       int i;
+
+       //init classqueues for each processor
+       for (i=0; i < NR_CPUS; i++)
+               classqueue_init(get_cpu_classqueue(i)); 
+/*
+ * hzheng: initialize the default cpu class
+ *         required for E14 since ckrm_init is called after sched_init
+ */
+       ckrm_alloc_cpu_class(NULL,NULL);
+}
+
+
+EXPORT_SYMBOL(ckrm_get_cpu_class);
diff --git a/kernel/ckrm/ckrm_cpu_monitor.c b/kernel/ckrm/ckrm_cpu_monitor.c
new file mode 100644 (file)
index 0000000..674ee6e
--- /dev/null
@@ -0,0 +1,542 @@
+/* ckrm_cpu_monitor.c - Hierarchical CKRM CPU Resource Monitor
+ *
+ * Copyright (C) Haoqiang Zheng,  IBM Corp. 2004
+ *           (C) Hubertus Franke, IBM Corp. 2004
+ * 
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/* Changes
+ * 
+ * 23 June 2004: Created
+ * 
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <asm/errno.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/ckrm.h>
+#include <linux/ckrm_rc.h>
+#include <linux/ckrm_tc.h>
+#include <asm/div64.h>
+#include <linux/ckrm_sched.h>
+
+#define CPU_MONITOR_INTERVAL (4*HZ) /*how often do we adjust the shares*/
+#define CKRM_SHARE_ACCURACY 7
+#define CKRM_SHARE_MAX (1<<CKRM_SHARE_ACCURACY)
+
+extern struct ckrm_cpu_class *ckrm_get_cpu_class(struct ckrm_core_class *core);
+
+void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat)
+{
+       int i;
+       struct ckrm_cpu_class_local_stat* local_stat;
+       unsigned long long now = sched_clock();
+
+       stat->stat_lock = SPIN_LOCK_UNLOCKED;
+       stat->total_ns = 0;
+       stat->cpu_demand = 0;
+
+       for (i=0; i< NR_CPUS; i++) {
+               local_stat = &stat->local_stats[i];
+               local_stat->run = 0;
+               local_stat->total = 0;
+               local_stat->last_sleep = now;
+               local_stat->cpu_demand = 0;             
+       }
+
+       stat->effective_guarantee = 0;
+       stat->effective_limit = 0;
+       stat->glut = 0;
+       stat->effective_share = 100;
+       stat->self_effective_share = 100;
+}
+/**********************************************/
+/*          cpu demand                        */
+/**********************************************/
+
+/*
+ * How CPU demand is calculated:
+ * consider class local runqueue (clr) first
+ * at any time, a clr can be in one of the following three states
+ * -- run: a task belonging to this class is running on this cpu
+ * -- wait: at least one of its tasks is runnable, but the class is not running
+ * -- sleep: none of the tasks of this class is runnable
+ *
+ * cpu_demand(t1,t2) = r(t1,t2)/(r(t1,t2)+s(t1,t2))
+ * 
+ * the cpu_demand of a class = 
+ *    sum of cpu_demand of all the class local runqueues
+ */
+
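+/*
+ * Worked example (illustrative numbers): over a one-second window a class local
+ * runqueue spends 250ms in the run state and 750ms sleeping, so its cpu_demand
+ * is 0.25 / (0.25 + 0.75) = 25% of a cpu; the code below keeps this ratio
+ * scaled to CKRM_SHARE_MAX (128), i.e. roughly 32.
+ */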
+/**
+ * update_cpu_demand - record a state change
+ *
+ * should be called whenever the state of a local queue changes
+ * -- on deschedule: report how long the class has run
+ * -- on enqueue: report how long the class has slept
+ *
+ * to deal with excessively long run/sleep states
+ * -- whenever ckrm_cpu_monitor is called, check if the class is in the sleep state; if so, update the sleep record
+ */
+#define CKRM_CPU_DEMAND_RUN 0
+#define CKRM_CPU_DEMAND_SLEEP 1
+//how often should we recalculate the cpu demand, in ns
+#define CPU_DEMAND_CAL_THRESHOLD (1000000000LL)
+static inline void update_local_cpu_demand(struct ckrm_cpu_class_local_stat* local_stat,int state, unsigned long long len)
+{      
+       local_stat->total += len;
+       if (state == CKRM_CPU_DEMAND_RUN)
+               local_stat->run += len;
+
+       if (local_stat->total >= CPU_DEMAND_CAL_THRESHOLD) {
+               local_stat->total >>= CKRM_SHARE_ACCURACY;
+               if (local_stat->total > 0xFFFFFFFF)
+                       local_stat->total = 0xFFFFFFFF;
+
+               do_div(local_stat->run,(unsigned long)local_stat->total);
+               local_stat->cpu_demand +=local_stat->run;
+               local_stat->cpu_demand >>= 1;
+               local_stat->total = 0;
+               local_stat->run = 0;
+       }
+}
+
+static inline void cpu_demand_update_run(struct ckrm_cpu_class_local_stat* local_stat, unsigned long long len)
+{
+       update_local_cpu_demand(local_stat,CKRM_CPU_DEMAND_RUN,len);
+}
+
+static inline void cpu_demand_update_sleep(struct ckrm_cpu_class_local_stat* local_stat, unsigned long long len)
+{
+       update_local_cpu_demand(local_stat,CKRM_CPU_DEMAND_SLEEP,len);
+}
+
+#define CPU_DEMAND_ENQUEUE 0
+#define CPU_DEMAND_DEQUEUE 1
+#define CPU_DEMAND_DESCHEDULE 2
+
+/**
+ * cpu_demand_event - a cpu_demand event occurred
+ * @event: one of the following three events:
+ *   CPU_DEMAND_ENQUEUE: local class enqueue
+ *   CPU_DEMAND_DEQUEUE: local class dequeue
+ *   CPU_DEMAND_DESCHEDULE: a task belonging to this local class was descheduled
+ * @len: valid only for CPU_DEMAND_DESCHEDULE; how long the task has run
+ */
+void cpu_demand_event(struct ckrm_cpu_class_local_stat* local_stat, int event, unsigned long long len) 
+{      
+       switch (event) {
+       case CPU_DEMAND_ENQUEUE: 
+               len = sched_clock() - local_stat->last_sleep;
+               local_stat->last_sleep = 0;
+               cpu_demand_update_sleep(local_stat,len);
+               break;
+       case CPU_DEMAND_DEQUEUE:
+               local_stat->last_sleep = sched_clock();
+               break;
+       case CPU_DEMAND_DESCHEDULE:
+               cpu_demand_update_run(local_stat,len);          
+               break;
+       default:
+               BUG();
+       }
+}
+
+/** 
+ * cpu_demand_check_sleep - update the sleep record of a class local queue
+ *
+ * if the local queue is not in the runqueue, it is in the sleep state;
+ * the time elapsed since last_sleep is accounted as sleep time
+ */
+static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu)
+{
+       struct ckrm_cpu_class_local_stat * local_stat = &stat->local_stats[cpu];
+       unsigned long long sleep,now;
+       if (local_stat->last_sleep) {
+               now = sched_clock();
+               sleep = now - local_stat->last_sleep;
+               local_stat->last_sleep = now;
+               cpu_demand_update_sleep(local_stat,sleep);
+       }
+}
+
+/**
+ *get_self_cpu_demand - get cpu demand of the class itself (excluding children)
+ *
+ * self_cpu_demand = sum(cpu demand of all local queues) 
+ */
+static unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat
+                                               *stat)
+{
+       int cpu_demand = 0;
+       int i;
+
+       for_each_online_cpu(i) {
+               cpu_demand_check_sleep(stat,i);
+               cpu_demand += stat->local_stats[i].cpu_demand;
+       }
+
+       if (cpu_demand > CKRM_SHARE_MAX)
+               cpu_demand = CKRM_SHARE_MAX;
+       return cpu_demand;
+}
+
+/*
+ * update effective cpu demand for each class
+ * assume the root_core->parent == NULL
+ */
+static void update_cpu_demand(struct ckrm_core_class *root_core)
+{
+       struct ckrm_core_class *cur_core, *child_core;
+       struct ckrm_cpu_class *cls;
+
+       cur_core = root_core;
+       child_core = NULL;
+       /*
+        * iterate the tree
+        * update cpu_demand of each node
+        */
+      repeat:
+       if (!cur_core)
+               return;
+
+       cls = ckrm_get_cpu_class(cur_core);
+       if (!child_core)        //first child
+               cls->stat.cpu_demand = get_self_cpu_demand(&cls->stat);
+       else {
+               cls->stat.cpu_demand +=
+                   ckrm_get_cpu_class(child_core)->stat.cpu_demand;
+               if (cls->stat.cpu_demand > CKRM_SHARE_MAX)
+                       cls->stat.cpu_demand = CKRM_SHARE_MAX;
+       }
+
+       //next child
+       child_core = ckrm_get_next_child(cur_core, child_core);
+       if (child_core) {
+               //go down
+               cur_core = child_core;
+               child_core = NULL;
+               goto repeat;
+       } else {                //no more child, go back
+               child_core = cur_core;
+               cur_core = child_core->hnode.parent;
+       }
+       goto repeat;
+}
+
+/**********************************************/
+/*          effective guarantee & limit       */
+/**********************************************/
+static inline void set_effective_share(struct ckrm_cpu_class_stat *stat,
+                                      int new_share)
+{
+       if (!new_share)
+               new_share = 1;
+       stat->effective_share = new_share;
+}
+
+static inline void set_self_effective_share(struct ckrm_cpu_class_stat *stat,
+                                           int new_share)
+{
+       if (!new_share)
+               new_share = 1;
+       stat->self_effective_share = new_share;
+}
+
+static inline void update_child_effective(struct ckrm_core_class *parent)
+{
+       struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent);
+       struct ckrm_core_class *child_core = ckrm_get_next_child(parent, NULL);
+
+       while (child_core) {
+               struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core);
+
+               c_cls->stat.effective_guarantee =
+                   p_cls->stat.effective_guarantee *
+                   c_cls->shares.my_guarantee / p_cls->shares.total_guarantee;
+               c_cls->stat.effective_limit =
+                   p_cls->stat.effective_guarantee * c_cls->shares.my_limit /
+                   p_cls->shares.total_guarantee;
+
+               child_core = ckrm_get_next_child(parent, child_core);
+       };
+
+}
+
+/*
+ * update effective guarantee and effective limit
+ * -- effective_guarantee = parent->effective_guarantee * my_guarantee / parent->total_guarantee
+ * -- effective_limit     = parent->effective_guarantee * my_limit / parent->total_guarantee
+ * should be called only when the class structure has changed
+ */
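+/*
+ * Illustrative example: the root class starts with effective_guarantee =
+ * CKRM_SHARE_MAX (128); a child holding my_guarantee = 30 out of a
+ * total_guarantee of 100 then gets effective_guarantee = 128 * 30 / 100 = 38,
+ * and with my_limit = 50 it gets effective_limit = 128 * 50 / 100 = 64.
+ */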
+static void update_effective_guarantee_limit(struct ckrm_core_class *root_core)
+{
+       struct ckrm_core_class *cur_core, *child_core = NULL;
+       struct ckrm_cpu_class *cls;
+
+       cur_core = root_core;
+       cls = ckrm_get_cpu_class(cur_core);
+       cls->stat.effective_guarantee = CKRM_SHARE_MAX;
+       cls->stat.effective_limit = cls->stat.effective_guarantee;
+
+      repeat:
+       //check exit
+       if (!cur_core)
+               return;
+
+       //visit this node
+       update_child_effective(cur_core);
+       //next child
+       child_core = ckrm_get_next_child(cur_core, child_core);
+       if (child_core) {
+               //go down
+               cur_core = child_core;
+               child_core = NULL;
+               goto repeat;
+       } else {                //no more child, go back
+               child_core = cur_core;
+               cur_core = child_core->hnode.parent;
+       }
+       goto repeat;
+}
+
+/**********************************************/
+/*          surplus allocation                */
+/**********************************************/
+
+/*
+ * surplus = effective_guarantee - cpu_demand
+ * if surplus < 0, surplus = 0
+ */
+static inline int get_node_surplus(struct ckrm_cpu_class *cls)
+{
+       int surplus = cls->stat.effective_guarantee - cls->stat.cpu_demand;
+
+       if (surplus < 0)
+               surplus = 0;
+
+       return surplus;
+}
+
+/*
+ * consume the surplus
+ * return how much consumed
+ * set glut when necessary
+ */
+static inline int node_surplus_consume(int old_surplus,
+                                      struct ckrm_core_class *child_core,
+                                      struct ckrm_cpu_class *p_cls)
+{
+       int consumed = 0;
+       int inc_limit;
+
+       struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core);
+
+       if (c_cls->stat.glut)
+               goto out;
+
+       //check demand
+       if (c_cls->stat.effective_share >= c_cls->stat.cpu_demand) {
+               c_cls->stat.glut = 1;
+               goto out;
+       }
+
+       consumed =
+           old_surplus * c_cls->shares.my_guarantee /
+           p_cls->shares.total_guarantee;
+
+       //check limit
+       inc_limit = c_cls->stat.effective_limit - c_cls->stat.effective_share;
+       if (inc_limit <= consumed) {
+               c_cls->stat.glut = 1;
+               consumed = inc_limit;
+       }
+
+       c_cls->stat.effective_share += consumed;
+      out:
+       return consumed;
+}
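+/*
+ * Illustrative example: if the parent has 40 units of surplus and a child holds
+ * my_guarantee = 25 out of total_guarantee = 100, that child is offered
+ * 40 * 25 / 100 = 10 units; if its effective_limit is only 6 above its current
+ * effective_share it consumes 6, sets glut, and the remaining 4 units stay in
+ * the pool for the following rounds.
+ */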
+
+/*
+ * re-allocate the shares for all the children under this node
+ * steps:
+ *  1. get total surplus
+ *  2. allocate surplus
+ *  3. set the effective_share of each node
+ */
+static void alloc_surplus_node(struct ckrm_core_class *parent)
+{
+       int total_surplus = 0, old_surplus = 0;
+       struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent);
+       struct ckrm_core_class *child_core = NULL;
+       int self_share;
+
+       /*
+        * calculate surplus 
+        * total_surplus = sum(child_surplus)
+        * reset glut flag
+        * initialize effective_share
+        */
+       do {
+               child_core = ckrm_get_next_child(parent, child_core);
+               if (child_core) {
+                       struct ckrm_cpu_class *c_cls =
+                           ckrm_get_cpu_class(child_core);
+                       ckrm_stat_t *stat = &c_cls->stat;
+
+                       total_surplus += get_node_surplus(c_cls);
+                       stat->glut = 0;
+                       set_effective_share(stat, stat->effective_guarantee);
+               }
+       } while (child_core);
+
+       /*distribute the surplus */
+       child_core = NULL;
+       do {
+               if (!child_core)        //keep the surplus of last round
+                       old_surplus = total_surplus;
+
+               child_core = ckrm_get_next_child(parent, child_core);
+               if (child_core) {
+                       total_surplus -=
+                           node_surplus_consume(old_surplus, child_core,
+                                                p_cls);
+               }
+               //start a new round if something is allocated in the last round
+       } while (child_core || (total_surplus != old_surplus));
+
+       //any remaining surplus goes to the default class
+       self_share = p_cls->stat.effective_share *
+           p_cls->shares.unused_guarantee / p_cls->shares.total_guarantee;
+       self_share += total_surplus;
+
+       set_self_effective_share(&p_cls->stat, self_share);
+}
+
+/**
+ * alloc_surplus - reallocate unused shares
+ *
+ * class A's unused share should be allocated to its siblings
+ */
+static void alloc_surplus(struct ckrm_core_class *root_core)
+{
+       struct ckrm_core_class *cur_core, *child_core = NULL;
+       struct ckrm_cpu_class *cls;
+
+       cur_core = root_core;
+       cls = ckrm_get_cpu_class(cur_core);
+       cls->stat.glut = 0;
+       set_effective_share(&cls->stat, cls->stat.effective_guarantee);
+      repeat:
+       //check exit
+       if (!cur_core)
+               return;
+
+       //visit this node
+       alloc_surplus_node(cur_core);
+       //next child
+       child_core = ckrm_get_next_child(cur_core, child_core);
+       if (child_core) {
+               //go down
+               cur_core = child_core;
+               child_core = NULL;
+               goto repeat;
+       } else {                //no more child, go back
+               child_core = cur_core;
+               cur_core = child_core->hnode.parent;
+       }
+       goto repeat;
+}
+
+/**
+ * ckrm_cpu_monitor - adjust relative shares of the classes based on their progress
+ *
+ * this function is called every CPU_MONITOR_INTERVAL
+ * it computes the cpu demand of each class
+ * and re-allocates the unused shares to other classes
+ */
+void ckrm_cpu_monitor(void)
+{
+       struct ckrm_core_class *root_core = default_cpu_class->core;
+       if (!root_core)
+               return;
+
+       update_effective_guarantee_limit(root_core);
+       update_cpu_demand(root_core);
+       alloc_surplus(root_core);
+}
+
+/*****************************************************/
+/*            Supporting Functions                   */
+/*****************************************************/
+static pid_t cpu_monitor_pid = -1;
+static int thread_exit = 0;
+
+static int ckrm_cpu_monitord(void *nothing)
+{
+       wait_queue_head_t wait;
+
+       init_waitqueue_head(&wait);
+
+       daemonize("ckrm_cpu_ctrld");
+       for (;;) {
+               /* sleep for some time before the next try */
+               interruptible_sleep_on_timeout(&wait, CPU_MONITOR_INTERVAL);
+               ckrm_cpu_monitor();
+               if (thread_exit) {
+                       break;
+               }
+       }
+       cpu_monitor_pid = -1;
+       thread_exit = 2;
+       printk("cpu_monitord exit\n");
+       return 0;
+}
+
+void ckrm_start_monitor(void)
+{
+       cpu_monitor_pid = kernel_thread(ckrm_cpu_monitord, 0, CLONE_KERNEL);
+       if (cpu_monitor_pid < 0) {
+               printk("ckrm_cpu_monitord creation failed\n");
+       }
+}
+
+void ckrm_kill_monitor(void)
+{
+       wait_queue_head_t wait;
+       int interval = HZ;
+       init_waitqueue_head(&wait);
+
+       printk("killing process %d\n", cpu_monitor_pid);
+       if (cpu_monitor_pid > 0) {
+               thread_exit = 1;
+               while (thread_exit != 2) {
+                       interruptible_sleep_on_timeout(&wait, interval);
+               }
+       }
+}
+
+int ckrm_cpu_monitor_init(void)
+{
+       ckrm_start_monitor();
+       return 0;
+}
+
+void ckrm_cpu_monitor_exit(void)
+{
+       ckrm_kill_monitor();
+}
+
+module_init(ckrm_cpu_monitor_init);
+module_exit(ckrm_cpu_monitor_exit);
+
+MODULE_AUTHOR("Haoqiang Zheng <hzheng@cs.columbia.edu>");
+MODULE_DESCRIPTION("Hierarchical CKRM CPU Resource Monitor");
+MODULE_LICENSE("GPL");
diff --git a/kernel/ckrm_classqueue.c b/kernel/ckrm_classqueue.c
new file mode 100644 (file)
index 0000000..1929aaf
--- /dev/null
@@ -0,0 +1,178 @@
+/* kernel/ckrm_classqueue.c : implements the class queue
+ *
+ * Copyright (C) Haoqiang Zheng, IBM Corp. 2003
+ *           (C) Hubertus Franke, IBM Corp. 2003
+ *
+ * Class queue functionality for CKRM cpu controller
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/* Changes
+ *
+ * Aug 28, 2003
+ *        Created.
+ * July 08, 2004
+ *        classqueue now has a fixed size
+ *        major clean up
+ *        function/structure names are changed to more intuitive ones
+ */
+#include <linux/sched.h>
+#include <linux/ckrm_classqueue.h>
+
+#define cq_nr_member(cq) (cq->array.nr_active)
+
+/**
+ * get_index - translate the logical priority to the real index in the queue
+ * 
+ * validate the position
+ * a valid prio is [cq->base,cq->base + size -1]
+ */
+static inline unsigned long get_index(struct classqueue_struct *cq, int *prio)
+{
+       unsigned long index;
+       int max_prio;
+
+       if (!cq_nr_member(cq))
+               return 0;
+
+       max_prio = cq->base + (CLASSQUEUE_SIZE - 1);
+       if (*prio > max_prio)
+               *prio = max_prio;
+       if (*prio < cq->base)
+               *prio = cq->base;
+
+       index = (cq->base_offset + (*prio - cq->base));
+       if (index >= CLASSQUEUE_SIZE)
+               index -= CLASSQUEUE_SIZE;
+
+       return index;
+}
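+/*
+ * Illustrative example: with CLASSQUEUE_SIZE = 128, base = 200 and
+ * base_offset = 72, a priority of 260 maps to index 72 + (260 - 200) = 132,
+ * which wraps around to 132 - 128 = 4; priorities outside [200, 327] are first
+ * clamped into that range.
+ */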
+
+/**
+ * initialize a class queue object
+ */
+int classqueue_init(struct classqueue_struct *cq)
+{
+       int i;
+       struct cq_prio_array *array;
+
+       array = &cq->array;
+       for (i = 0; i < CLASSQUEUE_SIZE; i++) {
+               INIT_LIST_HEAD(array->queue + i);
+               __clear_bit(i, array->bitmap);
+       }
+       // delimiter for bitsearch
+       __set_bit(CLASSQUEUE_SIZE, array->bitmap);
+       array->nr_active = 0;
+
+       cq->base = 0;
+       cq->base_offset = -1;   //not valid yet
+
+       return 0;
+}
+
+/**
+ *classqueue_enqueue - add the class to classqueue based on its prio
+ */
+void classqueue_enqueue(struct classqueue_struct *cq,
+                       cq_node_t * node, int prio)
+{
+       int index;
+
+       //get real index
+       if (cq_nr_member(cq)) {
+               index = get_index(cq, &prio);
+       } else {                //the first one
+               cq->base = prio;
+               cq->base_offset = 0;
+               index = 0;
+       }
+
+       //add to the queue
+       list_add(&(node->list), &cq->array.queue[index]);
+       __set_bit(index, cq->array.bitmap);
+       cq->array.nr_active++;
+
+       node->index = index;
+       node->prio = prio;
+}
+
+void classqueue_dequeue(struct classqueue_struct *cq, cq_node_t * node)
+{
+       //delete from queue
+       list_del_init(&(node->list));
+       cq->array.nr_active--;
+
+       //clear the bitmap bit if the list is now empty
+       if (list_empty(&cq->array.queue[node->index]))
+               __clear_bit(node->index, cq->array.bitmap);
+}
+
+void classqueue_update_prio(struct classqueue_struct *cq,
+                           cq_node_t * node, int new_pos)
+{
+       int index;
+
+       if (! cls_in_classqueue(node)) 
+               return;
+
+       index = get_index(cq, &new_pos);
+       node->prio = new_pos;
+
+       //remove from the original position
+       list_del_init(&(node->list));
+       if (list_empty(&cq->array.queue[node->index]))
+               __clear_bit(node->index, cq->array.bitmap);
+       
+       //add to the new position, round robin for classes with the same priority
+       list_add_tail(&(node->list), &cq->array.queue[index]);
+       __set_bit(index, cq->array.bitmap);
+       
+       node->index = index;
+}
+
+cq_node_t *classqueue_get_head(struct classqueue_struct *cq)
+{
+       cq_node_t *result = NULL;
+       int pos;
+
+       /* 
+        * search over the bitmap to get the first class in the queue
+        */
+       pos = find_next_bit(cq->array.bitmap, CLASSQUEUE_SIZE, cq->base_offset);
+       if (pos >= CLASSQUEUE_SIZE) {   //do circular search from the beginning
+               pos = find_first_bit(cq->array.bitmap, CLASSQUEUE_SIZE);
+       }
+
+       if (pos < CLASSQUEUE_SIZE) {
+               BUG_ON(list_empty(&cq->array.queue[pos]));
+               result = list_entry(cq->array.queue[pos].next, cq_node_t, list);
+       }
+       return result;
+}
+
+/**
+ * classqueue_update_base - move the base of the queue forward
+ * the new_base here is logical; we need to translate it to the absolute position
+ */
+void classqueue_update_base(struct classqueue_struct *cq, int new_base)
+{
+       if (!cq_nr_member(cq)) {
+               cq->base_offset = -1;   //not defined
+               return;
+       }
+
+       //      assert(new_base >= cq->base);
+
+       if (new_base > cq->base) {
+               cq->base_offset = get_index(cq, &new_base);
+               cq->base = new_base;
+       }
+}
diff --git a/kernel/ckrm_sched.c b/kernel/ckrm_sched.c
new file mode 100644 (file)
index 0000000..ba716d4
--- /dev/null
@@ -0,0 +1,71 @@
+/* kernel/ckrm_sched.c - Supporting functions for ckrm scheduling
+ *
+ * Copyright (C) Haoqiang Zheng,  IBM Corp. 2004
+ *           (C) Hubertus Franke, IBM Corp. 2004
+ * 
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/ckrm_sched.h>
+
+/*******************************************************/
+/*                CVT Management                       */
+/*******************************************************/
+#define CVT_WINDOW_SIZE (CLASSQUEUE_SIZE << CLASS_BONUS_RATE)
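+/* the window spans CLASSQUEUE_SIZE (128) slots of 2^CLASS_BONUS_RATE cvt units each, i.e. 2^29 units in total */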
+static CVT_t max_CVT = CVT_WINDOW_SIZE;
+
+/*
+ *  Also ensure that the class's global cvt is brought up to the
+ * minimum CVT in the system, as a class might not have run for a while
+ */
+static void update_global_cvt(struct ckrm_cpu_class *cpu_class, int cpu)
+{
+       struct ckrm_local_runqueue *class_queue =
+           get_ckrm_local_runqueue(cpu_class, cpu);
+       CVT_t min_cvt;
+       CVT_t local_cvt_old = class_queue->local_cvt;
+
+       spin_lock(&cvt_lock);
+       if (class_queue->uncounted_cvt) {
+               cpu_class->global_cvt += class_queue->uncounted_cvt;
+               class_queue->uncounted_cvt = 0;
+       }
+       min_cvt = max_CVT - CVT_WINDOW_SIZE;
+       if (cpu_class->global_cvt < min_cvt)
+               cpu_class->global_cvt = min_cvt;
+       else  if (cpu_class->global_cvt > max_CVT)
+               max_CVT = cpu_class->global_cvt;
+
+/* update local cvt from global cvt*/
+#if 0
+       class_queue->local_cvt = cpu_class->global_cvt;
+#endif
+       spin_unlock(&cvt_lock);
+
+       if (class_queue->local_cvt != local_cvt_old)
+               update_class_priority(class_queue);
+}
+
+/*
+ * class_list_lock must have been acquired 
+ */
+void update_global_cvts(int this_cpu)
+{
+       struct ckrm_cpu_class *clsptr;
+       struct ckrm_local_runqueue *class_queue;
+
+       /*for each class*/
+       list_for_each_entry(clsptr, &active_cpu_classes, links) {
+               update_global_cvt(clsptr, this_cpu);
+               class_queue = get_ckrm_local_runqueue(clsptr, this_cpu);
+               clsptr->stat.total_ns += class_queue->uncounted_ns;
+               class_queue->uncounted_ns = 0;
+       }
+}