author     Marc Fiuczynski <mef@cs.princeton.edu>   Tue, 11 Jan 2005 04:56:07 +0000 (04:56 +0000)
committer  Marc Fiuczynski <mef@cs.princeton.edu>   Tue, 11 Jan 2005 04:56:07 +0000 (04:56 +0000)

There is a bug in the CKRM CPU scheduler.  This has been reported to the
folks at IBM.  I am going to back out of the scheduler changes until I
am certain that the scheduler works as well as what we have now (ideally
better).

14 files changed:
configs/kernel-2.6.8-i686-planetlab.config
include/linux/ckrm_ce.h
include/linux/ckrm_classqueue.h
include/linux/ckrm_rc.h
include/linux/ckrm_sched.h
include/linux/ckrm_tc.h
include/linux/sched.h
kernel/ckrm/ckrm.c
kernel/ckrm/ckrm_cpu_class.c
kernel/ckrm/ckrm_cpu_monitor.c
kernel/ckrm/rbce/rbcemod.c
kernel/ckrm_classqueue.c
kernel/ckrm_sched.c
kernel/sched.c

diff --git a/configs/kernel-2.6.8-i686-planetlab.config b/configs/kernel-2.6.8-i686-planetlab.config
index baabedb..ffa265f 100644
@@ -32,7 +32,7 @@ CONFIG_CKRM_RES_NUMTASKS=y
 CONFIG_CKRM_CPU_SCHEDULE=y
 # CONFIG_CKRM_RES_BLKIO is not set
 # CONFIG_CKRM_RES_MEM is not set
-CONFIG_CKRM_CPU_SCHEDULE_AT_BOOT=y
+# CONFIG_CKRM_CPU_SCHEDULE_AT_BOOT is not set
 # CONFIG_CKRM_TYPE_SOCKETCLASS is not set
 CONFIG_CKRM_RBCE=y
 CONFIG_SYSCTL=y
diff --git a/include/linux/ckrm_ce.h b/include/linux/ckrm_ce.h
index 0bde15d..f4e91e9 100644
@@ -9,10 +9,13 @@
  *
  * Latest version, more details at http://ckrm.sf.net
  * 
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  *
  */
 
 
 #ifdef CONFIG_CKRM
 
-#include "ckrm.h"  // getting the event names
+#include <linux/ckrm.h>                // getting the event names
 
 /* Action parameters identifying the cause of a task<->class notify callback 
- * these can perculate up to user daemon consuming records send by the classification
- * engine
+ * these can perculate up to user daemon consuming records send by the 
+ * classification engine
  */
 
 #ifdef __KERNEL__
 
-typedef void* (*ce_classify_fct_t)(enum ckrm_event event, void *obj, ... );   
-typedef void  (*ce_notify_fct_t)  (enum ckrm_event event, void *classobj, void *obj);
+typedef void *(*ce_classify_fct_t) (enum ckrm_event event, void *obj, ...);
+typedef void (*ce_notify_fct_t) (enum ckrm_event event, void *classobj,
+                                void *obj);
 
 typedef struct ckrm_eng_callback {
        /* general state information */
-       int  always_callback;  /* set if CE should always be called back regardless of numclasses */
+       int always_callback;    /* set if CE should always be called back 
+                                  regardless of numclasses */
+
+
+
 
        /* callbacks which are called without holding locks */
 
-       unsigned long c_interest;         /* set of classification events CE is interested in */
-       ce_classify_fct_t   classify;     /* generic classify */
+       unsigned long c_interest;       /* set of classification events of 
+                                          interest to CE 
+                                       */
+
+       /* generic classify */
+       ce_classify_fct_t classify;     
+       /* class added */
+       void (*class_add) (const char *name, void *core, int classtype);
+       /* class deleted */
+       void (*class_delete) (const char *name, void *core, int classtype);
+
 
-       void   (*class_add)   (const char *name, void *core); /* class added */
-       void   (*class_delete)(const char *name, void *core); /* class deleted */
+       /* callbacks which are called while holding task_lock(tsk) */
 
-       /* callback which are called while holding task_lock(tsk) */
-       unsigned long n_interest;         /* set of notification events CE is interested in */
-       ce_notify_fct_t     notify;       /* notify on class switch */
+       unsigned long n_interest;       /* set of notification events of 
+                                          interest to CE 
+                                       */
+       /* notify on class switch */
+       ce_notify_fct_t notify; 
 
 } ckrm_eng_callback_t;
 
 struct inode;
-struct dentry; 
+struct dentry;
 
 typedef struct rbce_eng_callback {
-       int (*mkdir)(struct inode *, struct dentry *, int); // mkdir
-       int (*rmdir)(struct inode *, struct dentry *); // rmdir
+       int (*mkdir) (struct inode *, struct dentry *, int);    // mkdir
+       int (*rmdir) (struct inode *, struct dentry *); // rmdir
+       int (*mnt) (void);
+       int (*umnt) (void);
 } rbce_eng_callback_t;
 
-extern int ckrm_register_engine  (const char *name, ckrm_eng_callback_t *);
+extern int ckrm_register_engine(const char *name, ckrm_eng_callback_t *);
 extern int ckrm_unregister_engine(const char *name);
 
 extern void *ckrm_classobj(char *, int *classtype);
-extern int get_exe_path_name(struct task_struct *t, char *filename, int max_size);
+extern int get_exe_path_name(struct task_struct *t, char *filename,
+                            int max_size);
 
 extern int rcfs_register_engine(rbce_eng_callback_t *);
 extern int rcfs_unregister_engine(rbce_eng_callback_t *);
@@ -84,8 +105,8 @@ extern void ckrm_core_grab(void *);
 extern void ckrm_core_drop(void *);
 #endif
 
-#endif // CONFIG_CKRM
+#endif                         // CONFIG_CKRM
 
-#endif // __KERNEL__
+#endif                         // __KERNEL__
 
-#endif // _LINUX_CKRM_CE_H
+#endif                         // _LINUX_CKRM_CE_H
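
The callback table above is how a classification engine (CE) hooks into CKRM: the engine fills in a ckrm_eng_callback_t and hands it to ckrm_register_engine().  Below is a minimal sketch of such a registration, assuming a kernel-module build context; the function names, the interest masks, and the "taskclass" registration string are illustrative assumptions only and are not taken from this patch (the in-tree consumer of this interface is the RBCE module, kernel/ckrm/rbce/rbcemod.c in the file list above).

#include <linux/ckrm.h>
#include <linux/ckrm_ce.h>

/* hypothetical engine callbacks matching the typedefs above */
static void *my_classify(enum ckrm_event event, void *obj, ...)
{
	/* return the core class 'obj' should be placed in, or NULL for no change */
	return NULL;
}

static void my_notify(enum ckrm_event event, void *classobj, void *obj)
{
	/* invoked with task_lock(tsk) held after a class switch */
}

static ckrm_eng_callback_t my_callbacks = {
	.always_callback = 0,       /* only call back once classes exist */
	.c_interest      = ~0UL,    /* all classification events (illustrative) */
	.classify        = my_classify,
	.n_interest      = ~0UL,    /* all notification events (illustrative) */
	.notify          = my_notify,
};

static int my_engine_attach(void)
{
	/* the name argument is assumed here to identify the classtype the
	 * engine serves ("taskclass", see ckrm_tc.h) */
	return ckrm_register_engine("taskclass", &my_callbacks);
}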
diff --git a/include/linux/ckrm_classqueue.h b/include/linux/ckrm_classqueue.h
index a02794d..1453f5e 100644
  * July 07, 2004
  *   clean up, add comments
  *
- *
- * Overview:
- * ---------
- *
- * Please read Documentation/ckrm/cpu_sched for a general overview of
- * how the O(1) CKRM scheduler.
- *
- * ckrm_classqueue.h provides the definition to maintain the 
- * per cpu class runqueue.
- *   
  */
 
 #ifndef _CKRM_CLASSQUEUE_H
 
 #include <linux/list.h>
 
-#warning mef: is classqueue_size big enough for PlanetLab
-#define CLASSQUEUE_SIZE_SHIFT  7
-#define CLASSQUEUE_SIZE ( 1 << CLASSQUEUE_SIZE_SHIFT )
+#define CLASSQUEUE_SIZE 1024   // acb: changed from 128
+//#define CLASSQUEUE_SIZE 128
 #define CQ_BITMAP_SIZE ((((CLASSQUEUE_SIZE+1+7)/8)+sizeof(long)-1)/sizeof(long))
 
 /**
  * struct cq_prio_array: duplicates prio_array defined in sched.c 
+ *
+ * I duplicate this data structure to make ckrm_classqueue implementation more modular
  */
 struct cq_prio_array {
        int nr_active;
@@ -58,50 +49,42 @@ struct cq_prio_array {
  * @base: base priority
  * @base_offset: index in array for the base
  *
- * classqueue can be thought of as runqueue of lrq's (per cpu object of
- * a CKRM class as task runqueue (instead of runqueue of tasks)
- * - a class's local lrq is enqueued into the local classqueue when a
- *   first task is enqueued lrq.
- * - a class's local lrq is removed from the local classqueue when the 
- *   last task is dequeued from the lrq.
- * - lrq's are ordered based on their priority (determined elsewhere)
- *   ( CKRM: caculated based on it's progress (cvt) and urgency (top_priority)
+ * classqueue can be thought of as runqueue of classes (instead of runqueue of tasks)
+ * as task runqueue, each processor has a classqueue
+ * a class enters the classqueue when the first task in this class local runqueue shows up
+ * a class enters the classqueue when the last task in the local runqueue leaves
+ * class local runqueues are ordered based their priority
+ *
+ * status:
+ *   hzheng: is 32bit base long enough?
  */
-
 struct classqueue_struct {
-       int enabled;                   // support dynamic on/off
+       struct cq_prio_array array;
        unsigned long base;
        unsigned long base_offset;
-       struct cq_prio_array array;
 };
 
 /** 
- * struct cq_node_struct:
- * - the link object between class local runqueue and classqueue
+ * struct cq_node_struct - the link object between class local runqueue and classqueue
  * @list: links the class local runqueue to classqueue
- * @prio: class priority
+ * @prio: class priority, which is caculated based on it's progress (cvt) and urgency (top_priority)
  * @index: real index into the classqueue array, calculated based on priority
+ *
+ * NOTE: make sure list is empty when it's not in classqueue
  */
 struct cq_node_struct {
        struct list_head list;
        int prio;
        int index;
-       /*
-        * set when the class jump out of the class queue window
-        * class with this value set should be repositioned whenever classqueue slides window
-        * real_prio is valid when need_repos is set
-        */
-       int real_prio;
-       int need_repos; 
 };
 typedef struct cq_node_struct cq_node_t;
 
+typedef unsigned long long CVT_t;      // cummulative virtual time
+
 static inline void cq_node_init(cq_node_t * node)
 {
        node->prio = 0;
        node->index = -1;
-       node->real_prio = 0;
-       node->need_repos = 0;
        INIT_LIST_HEAD(&node->list);
 }
 
@@ -112,18 +95,23 @@ static inline int cls_in_classqueue(cq_node_t * node)
 }
 
 /*initialize the data structure*/
-int classqueue_init(struct classqueue_struct *cq, int enabled);
+int classqueue_init(struct classqueue_struct *cq);
 
-/*add the class to classqueue at given priority */
-void classqueue_enqueue(struct classqueue_struct *cq, 
-                       cq_node_t * node, int prio);
+/*add the class to classqueue*/
+void classqueue_enqueue(struct classqueue_struct *cq, cq_node_t * node, int prio);
 
-/*remove the class from classqueue */
+/**
+ * classqueue_dequeue - remove the class from classqueue
+ * 
+ * internal:
+ *   called when the last task is removed from the queue
+ *   checked on load balancing and schedule
+ *   hzheng: why don't I call it on class_dequeue_task?
+ */
 void classqueue_dequeue(struct classqueue_struct *cq, cq_node_t * node);
 
 /*change the position of the class in classqueue*/
-void classqueue_update_prio(struct classqueue_struct *cq, 
-                           cq_node_t * node, int new_prio);
+void classqueue_update_prio(struct classqueue_struct *cq, cq_node_t * node, int new_prio);
 
 /*return the first class in classqueue*/
 cq_node_t *classqueue_get_head(struct classqueue_struct *cq);
@@ -134,8 +122,7 @@ void classqueue_update_base(struct classqueue_struct *cq);
 /**
  * class_compare_prio: compare the priority of this two nodes
  */
-static inline int class_compare_prio(struct cq_node_struct* node1, 
-                                    struct cq_node_struct* node2)
+static inline int class_compare_prio(struct cq_node_struct* node1, struct cq_node_struct* node2)
 {
        return ( node1->prio - node2->prio);
 }
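
To make the classqueue comments above concrete, here is a small standalone model of the idea: a circular array of priority buckets with a sliding base, where a class's bucket index is derived from its priority relative to that base.  The names and the index arithmetic below are simplifications of my own; the kernel's real implementation lives in kernel/ckrm_classqueue.c (changed by this commit but not shown here).

#include <stdio.h>

#define CQ_SIZE 8                      /* the kernel uses CLASSQUEUE_SIZE */

struct cq_model {
	int base;                      /* priority mapped to bucket base_offset */
	int base_offset;               /* index of the bucket holding 'base'    */
	int count[CQ_SIZE];            /* how many classes sit in each bucket   */
};

/* map a priority to a bucket, clamping anything beyond the window */
static int cq_index(struct cq_model *cq, int prio)
{
	int off = prio - cq->base;
	if (off < 0)
		off = 0;
	if (off >= CQ_SIZE)
		off = CQ_SIZE - 1;
	return (cq->base_offset + off) % CQ_SIZE;
}

static void cq_enqueue(struct cq_model *cq, int prio)
{
	cq->count[cq_index(cq, prio)]++;
}

/* return the bucket of the highest-priority (lowest-index) class, or -1 */
static int cq_get_head(struct cq_model *cq)
{
	int i;
	for (i = 0; i < CQ_SIZE; i++) {
		int idx = (cq->base_offset + i) % CQ_SIZE;
		if (cq->count[idx])
			return idx;
	}
	return -1;
}

int main(void)
{
	struct cq_model cq = { .base = 100, .base_offset = 0 };
	cq_enqueue(&cq, 103);          /* a class with priority 103          */
	cq_enqueue(&cq, 101);          /* higher priority (smaller value)    */
	printf("head bucket: %d\n", cq_get_head(&cq));  /* bucket of prio 101 */
	return 0;
}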
diff --git a/include/linux/ckrm_rc.h b/include/linux/ckrm_rc.h
index e514f1c..1bf2d07 100644
 
 #include <linux/list.h>
 #include <linux/ckrm.h>
-#include <linux/ckrm_ce.h>    
+#include <linux/ckrm_ce.h>
 #include <linux/seq_file.h>
 
-
 /* maximum number of class types */
-#define CKRM_MAX_CLASSTYPES         32       
+#define CKRM_MAX_CLASSTYPES         32
 /* maximum classtype name length */
-#define CKRM_MAX_CLASSTYPE_NAME     32       
+#define CKRM_MAX_CLASSTYPE_NAME     32
 
 /* maximum resource controllers per classtype */
-#define CKRM_MAX_RES_CTLRS           8     
+#define CKRM_MAX_RES_CTLRS           8
 /* maximum resource controller name length */
-#define CKRM_MAX_RES_NAME          128       
-
+#define CKRM_MAX_RES_NAME          128
 
 struct ckrm_core_class;
 struct ckrm_classtype;
 
-/********************************************************************************
+/*****************************************************************************
  * Share specifications
- *******************************************************************************/
+ *****************************************************************************/
 
 typedef struct ckrm_shares {
        int my_guarantee;
        int my_limit;
        int total_guarantee;
        int max_limit;
-       int unused_guarantee;  // not used as parameters
-       int cur_max_limit;     // not used as parameters
+       int unused_guarantee;   // not used as parameters
+       int cur_max_limit;      // not used as parameters
 } ckrm_shares_t;
 
-#define CKRM_SHARE_UNCHANGED     (-1)  // value to indicate no change
-#define CKRM_SHARE_DONTCARE      (-2)  // value to indicate don't care.
-#define CKRM_SHARE_DFLT_TOTAL_GUARANTEE (100) // Start off with these values
-#define CKRM_SHARE_DFLT_MAX_LIMIT     (100) // to simplify set_res_shares logic
-
+#define CKRM_SHARE_UNCHANGED     (-1)  
+#define CKRM_SHARE_DONTCARE      (-2)  
+#define CKRM_SHARE_DFLT_TOTAL_GUARANTEE (100) 
+#define CKRM_SHARE_DFLT_MAX_LIMIT     (100)  
 
-/********************************************************************************
+/******************************************************************************
  * RESOURCE CONTROLLERS
- *******************************************************************************/
+ *****************************************************************************/
 
 /* resource controller callback structure */
 
 typedef struct ckrm_res_ctlr {
        char res_name[CKRM_MAX_RES_NAME];
-       int  res_hdepth;                  // maximum hierarchy
-       int  resid;                       // (for now) same as the enum resid
-       struct ckrm_classtype *classtype; // classtype owning this resource controller
+       int res_hdepth;         // maximum hierarchy
+       int resid;              // (for now) same as the enum resid
+       struct ckrm_classtype *classtype;    // classtype owning this res ctlr
 
        /* allocate/free new resource class object for resource controller */
-       void *(*res_alloc)  (struct ckrm_core_class *this, struct ckrm_core_class *parent);
-       void  (*res_free)   (void *);
+       void *(*res_alloc) (struct ckrm_core_class * this,
+                           struct ckrm_core_class * parent);
+       void (*res_free) (void *);
 
        /* set/get limits/guarantees for a resource controller class */
-       int  (*set_share_values) (void* , struct ckrm_shares *shares);
-       int  (*get_share_values) (void* , struct ckrm_shares *shares);
+       int (*set_share_values) (void *, struct ckrm_shares * shares);
+       int (*get_share_values) (void *, struct ckrm_shares * shares);
 
        /* statistics and configuration access */
-       int  (*get_stats)    (void* , struct seq_file *);
-       int  (*reset_stats)  (void *);
-       int  (*show_config)  (void* , struct seq_file *);
-       int  (*set_config)   (void* , const char *cfgstr);
+       int (*get_stats) (void *, struct seq_file *);
+       int (*reset_stats) (void *);
+       int (*show_config) (void *, struct seq_file *);
+       int (*set_config) (void *, const char *cfgstr);
 
-       void (*change_resclass)(void *, void *, void *);
+       void (*change_resclass) (void *, void *, void *);
 
 } ckrm_res_ctlr_t;
 
-/***************************************************************************************
+/******************************************************************************
  * CKRM_CLASSTYPE
  *
- *   A <struct ckrm_classtype> object describes a dimension for CKRM to classify 
- *   along. I needs to provide methods to create and manipulate class objects in
- *   this dimension
- ***************************************************************************************/
+ * A <struct ckrm_classtype> object describes a dimension for CKRM to classify 
+ * along. Need to provide methods to create and manipulate class objects in
+ * this dimension
+ *****************************************************************************/
 
 /* list of predefined class types, we always recognize */
 #define CKRM_CLASSTYPE_TASK_CLASS    0
-#define CKRM_CLASSTYPE_SOCKET_CLASS 1
-#define CKRM_RESV_CLASSTYPES         2  /* always +1 of last known type */
+#define CKRM_CLASSTYPE_SOCKET_CLASS  1
+#define CKRM_RESV_CLASSTYPES         2 /* always +1 of last known type */
 
 #define CKRM_MAX_TYPENAME_LEN       32
 
-
 typedef struct ckrm_classtype {
-       /* Hubertus:   Rearrange slots so that they are more cache friendly during access */
+       /* Hubertus:   Rearrange slots later for cache friendliness */
 
        /* resource controllers */
-       spinlock_t        res_ctlrs_lock;        /* protect data below (other than atomics) */
-       int               max_res_ctlrs;         /* maximum number of resource controller allowed */
-       int               max_resid;             /* maximum resid used                      */
-       int               resid_reserved;        /* maximum number of reserved controllers  */
-       long              bit_res_ctlrs;         /* bitmap of resource ID used              */
-       atomic_t          nr_resusers[CKRM_MAX_RES_CTLRS];
-       ckrm_res_ctlr_t*  res_ctlrs[CKRM_MAX_RES_CTLRS];
+       spinlock_t res_ctlrs_lock;  // protect res ctlr related data
+       int max_res_ctlrs;          // max number of res ctlrs allowed 
+       int max_resid;              // max resid used                      
+       int resid_reserved;         // max number of reserved controllers  
+       long bit_res_ctlrs;         // bitmap of resource ID used              
+       atomic_t nr_resusers[CKRM_MAX_RES_CTLRS];
+       ckrm_res_ctlr_t *res_ctlrs[CKRM_MAX_RES_CTLRS];
+
 
        /* state about my classes */
 
-       struct ckrm_core_class   *default_class; // pointer to default class
-       struct list_head          classes;       // listhead to link up all classes of this classtype
-       int                       num_classes;    // how many classes do exist
+       struct ckrm_core_class *default_class;  
+       struct list_head classes;  // link all classes of this classtype
+       int num_classes;         
 
        /* state about my ce interaction */
-       int                       ce_regd;       // Has a CE been registered for this classtype
-       int                       ce_cb_active;  // are callbacks active
-       atomic_t                  ce_nr_users;   // how many transient calls active
-       struct ckrm_eng_callback  ce_callbacks;  // callback engine
-
-       // Begin classtype-rcfs private data. No rcfs/fs specific types used. 
-       int               mfidx;             // Index into genmfdesc array used to initialize
-                                            // mfdesc and mfcount 
-       void              *mfdesc;           // Array of descriptors of root and magic files
-       int               mfcount;           // length of above array 
-       void              *rootde;           // root dentry created by rcfs
-       // End rcfs private data 
-
-       char name[CKRM_MAX_TYPENAME_LEN];    // currently same as mfdesc[0]->name but could be different
-       int  typeID;                           /* unique TypeID                         */
-       int  maxdepth;                         /* maximum depth supported               */
+       atomic_t ce_regd;               // if CE registered
+       int ce_cb_active;       // if Callbacks active
+       atomic_t ce_nr_users;   // number of active transient calls 
+       struct ckrm_eng_callback ce_callbacks;  // callback engine
+
+       // Begin classtype-rcfs private data. No rcfs/fs specific types used. 
+       int mfidx;              // Index into genmfdesc array used to initialize
+       void *mfdesc;           // Array of descriptors of root and magic files
+       int mfcount;            // length of above array 
+       void *rootde;           // root dentry created by rcfs
+       // End rcfs private data 
+
+       char name[CKRM_MAX_TYPENAME_LEN]; // currently same as mfdesc[0]->name 
+                                         // but could be different
+       int typeID;             // unique TypeID
+       int maxdepth;           // maximum depth supported
 
        /* functions to be called on any class type by external API's */
-       struct ckrm_core_class*  (*alloc)(struct ckrm_core_class *parent, const char *name);   /* alloc class instance */
-       int                      (*free) (struct ckrm_core_class *cls);                        /* free  class instance */
-       
-       int                      (*show_members)(struct ckrm_core_class *, struct seq_file *);
-       int                      (*show_stats)  (struct ckrm_core_class *, struct seq_file *);
-       int                      (*show_config) (struct ckrm_core_class *, struct seq_file *);
-       int                      (*show_shares) (struct ckrm_core_class *, struct seq_file *);
-
-       int                      (*reset_stats) (struct ckrm_core_class *, const char *resname, 
-                                                const char *);
-       int                      (*set_config)  (struct ckrm_core_class *, const char *resname,
-                                                const char *cfgstr);
-       int                      (*set_shares)  (struct ckrm_core_class *, const char *resname,
-                                                struct ckrm_shares *shares);
-       int                      (*forced_reclassify)(struct ckrm_core_class *, const char *);
-
-  
+
+       struct ckrm_core_class *(*alloc) (struct ckrm_core_class * parent, 
+                                         const char *name);    
+       int (*free) (struct ckrm_core_class * cls);     
+       int (*show_members) (struct ckrm_core_class *, struct seq_file *);
+       int (*show_stats) (struct ckrm_core_class *, struct seq_file *);
+       int (*show_config) (struct ckrm_core_class *, struct seq_file *);
+       int (*show_shares) (struct ckrm_core_class *, struct seq_file *);
+
+       int (*reset_stats) (struct ckrm_core_class *, const char *resname,
+                           const char *);
+       int (*set_config) (struct ckrm_core_class *, const char *resname,
+                          const char *cfgstr);
+       int (*set_shares) (struct ckrm_core_class *, const char *resname,
+                          struct ckrm_shares * shares);
+       int (*forced_reclassify) (struct ckrm_core_class *, const char *);
+
        /* functions to be called on a class type by ckrm internals */
-       void                     (*add_resctrl)(struct ckrm_core_class *, int resid);     // class initialization for new RC
+
+       /* class initialization for new RC */
+       void (*add_resctrl) (struct ckrm_core_class *, int resid);      
+
 } ckrm_classtype_t;
 
-/******************************************************************************************
+/******************************************************************************
  * CKRM CORE CLASS
  *      common part to any class structure (i.e. instance of a classtype)
- ******************************************************************************************/
+ ******************************************************************************/
 
 /* basic definition of a hierarchy that is to be used by the the CORE classes
  * and can be used by the resource class objects
@@ -186,24 +186,28 @@ typedef struct ckrm_classtype {
 #define CKRM_CORE_MAGIC                0xBADCAFFE
 
 typedef struct ckrm_hnode {
-        struct ckrm_core_class *parent;
-       struct list_head   siblings; /* linked list of siblings */
-       struct list_head   children; /* anchor for children     */
+       struct ckrm_core_class *parent;
+       struct list_head siblings;      
+       struct list_head children;      
 } ckrm_hnode_t;
 
 typedef struct ckrm_core_class {
-       struct ckrm_classtype *classtype; // what type does this core class belong to
-        void* res_class[CKRM_MAX_RES_CTLRS];                 // pointer to array of resource classes
-       spinlock_t class_lock;             // to protect the list and the array above
-       struct list_head objlist;         // generic list for any object list to be maintained by class
-       struct list_head clslist;         // to link up all classes in a single list type wrt to type
-       struct dentry  *dentry;           // dentry of inode in the RCFS
+       struct ckrm_classtype *classtype;       
+       void *res_class[CKRM_MAX_RES_CTLRS];    // resource classes 
+       spinlock_t class_lock;                  // protects list,array above 
+
+       
+       struct list_head objlist;               // generic object list 
+       struct list_head clslist;               // peer classtype classes
+       struct dentry *dentry;                  // dentry of inode in the RCFS
        int magic;
-       struct ckrm_hnode  hnode;    // hierarchy
-       rwlock_t hnode_rwlock; // rw_clock protecting the hnode above.
+
+       struct ckrm_hnode hnode;                // hierarchy
+       rwlock_t hnode_rwlock;                  // protects hnode above.
        atomic_t refcnt;
        const char *name;
-       int delayed;                      // core deletion delayed because of race conditions
+       int delayed;                            // core deletion delayed 
+                                               // because of race conditions
 } ckrm_core_class_t;
 
 /* type coerce between derived class types and ckrm core class type */
@@ -215,59 +219,76 @@ typedef struct ckrm_core_class {
 /* what type is a class of ISA */
 #define class_isa(clsptr)          (class_core(clsptr)->classtype)
 
-
-/******************************************************************************************
+/******************************************************************************
  * OTHER
- ******************************************************************************************/
+ ******************************************************************************/
 
-#define ckrm_get_res_class(rescls,resid,type)   ((type*)((rescls)->res_class[resid]))
+#define ckrm_get_res_class(rescls, resid, type) \
+       ((type*) (((resid != -1) && ((rescls) != NULL) \
+                          && ((rescls) != (void *)-1)) ? \
+        ((struct ckrm_core_class *)(rescls))->res_class[resid] : NULL))
 
-extern int ckrm_register_res_ctlr   (struct ckrm_classtype *, ckrm_res_ctlr_t *);
-extern int ckrm_unregister_res_ctlr (ckrm_res_ctlr_t *);
+
+extern int ckrm_register_res_ctlr(struct ckrm_classtype *, ckrm_res_ctlr_t *);
+extern int ckrm_unregister_res_ctlr(ckrm_res_ctlr_t *);
 
 extern int ckrm_validate_and_grab_core(struct ckrm_core_class *core);
-extern int ckrm_init_core_class(struct ckrm_classtype  *clstype,struct ckrm_core_class *dcore,
-                               struct ckrm_core_class *parent, const char *name);
-extern int ckrm_release_core_class(struct ckrm_core_class *);   // Hubertus .. can disappear after cls del debugging
-extern struct ckrm_res_ctlr *ckrm_resctlr_lookup(struct ckrm_classtype *type, const char *resname);
+extern int ckrm_init_core_class(struct ckrm_classtype *clstype,
+                               struct ckrm_core_class *dcore,
+                               struct ckrm_core_class *parent,
+                               const char *name);
+extern int ckrm_release_core_class(struct ckrm_core_class *);  
+// Hubertus .. can disappear after cls del debugging
+extern struct ckrm_res_ctlr *ckrm_resctlr_lookup(struct ckrm_classtype *type,
+                                                const char *resname);
 
 #if 0
 
-// Hubertus ... need to straighten out all these I don't think we will even call thsie ore are we 
+// Hubertus ... need to straighten out all these I don't think we will even 
+// call this or are we 
 
 /* interface to the RCFS filesystem */
-extern struct ckrm_core_class *ckrm_alloc_core_class(struct ckrm_core_class *, const char *, int);
+extern struct ckrm_core_class *ckrm_alloc_core_class(struct ckrm_core_class *,
+                                                    const char *, int);
 
 // Reclassify the given pid to the given core class by force
 extern void ckrm_forced_reclassify_pid(int, struct ckrm_core_class *);
 
 // Reclassify the given net_struct  to the given core class by force
-extern void ckrm_forced_reclassify_laq(struct ckrm_net_struct *, 
-               struct ckrm_core_class *);
+extern void ckrm_forced_reclassify_laq(struct ckrm_net_struct *,
+                                      struct ckrm_core_class *);
 
 #endif
 
 extern void ckrm_lock_hier(struct ckrm_core_class *);
 extern void ckrm_unlock_hier(struct ckrm_core_class *);
-extern struct ckrm_core_class * ckrm_get_next_child(struct ckrm_core_class *,
-                           struct ckrm_core_class *);
+extern struct ckrm_core_class *ckrm_get_next_child(struct ckrm_core_class *,
+                                                  struct ckrm_core_class *);
 
 extern void child_guarantee_changed(struct ckrm_shares *, int, int);
 extern void child_maxlimit_changed(struct ckrm_shares *, int);
-extern int  set_shares(struct ckrm_shares *, struct ckrm_shares *, struct ckrm_shares *);
+extern int set_shares(struct ckrm_shares *, struct ckrm_shares *,
+                     struct ckrm_shares *);
 
 /* classtype registration and lookup */
-extern int ckrm_register_classtype  (struct ckrm_classtype *clstype);
+extern int ckrm_register_classtype(struct ckrm_classtype *clstype);
 extern int ckrm_unregister_classtype(struct ckrm_classtype *clstype);
-extern struct ckrm_classtype* ckrm_find_classtype_by_name(const char *name);
+extern struct ckrm_classtype *ckrm_find_classtype_by_name(const char *name);
 
 /* default functions that can be used in classtypes's function table */
-extern int ckrm_class_show_shares(struct ckrm_core_class *core, struct seq_file *seq);
-extern int ckrm_class_show_stats(struct ckrm_core_class *core, struct seq_file *seq);
-extern int ckrm_class_show_config(struct ckrm_core_class *core, struct seq_file *seq);
-extern int ckrm_class_set_config(struct ckrm_core_class *core, const char *resname, const char *cfgstr);
-extern int ckrm_class_set_shares(struct ckrm_core_class *core, const char *resname, struct ckrm_shares *shares);
-extern int ckrm_class_reset_stats(struct ckrm_core_class *core, const char *resname, const char *unused);
+extern int ckrm_class_show_shares(struct ckrm_core_class *core,
+                                 struct seq_file *seq);
+extern int ckrm_class_show_stats(struct ckrm_core_class *core,
+                                struct seq_file *seq);
+extern int ckrm_class_show_config(struct ckrm_core_class *core,
+                                 struct seq_file *seq);
+extern int ckrm_class_set_config(struct ckrm_core_class *core,
+                                const char *resname, const char *cfgstr);
+extern int ckrm_class_set_shares(struct ckrm_core_class *core,
+                                const char *resname,
+                                struct ckrm_shares *shares);
+extern int ckrm_class_reset_stats(struct ckrm_core_class *core,
+                                 const char *resname, const char *unused);
 
 #if 0
 extern void ckrm_ns_hold(struct ckrm_net_struct *);
@@ -275,21 +296,21 @@ extern void ckrm_ns_put(struct ckrm_net_struct *);
 extern void *ckrm_set_rootcore_byname(char *, void *);
 #endif
 
-static inline void ckrm_core_grab(struct ckrm_core_class *core)  
-{ 
-       if (core) atomic_inc(&core->refcnt);
+static inline void ckrm_core_grab(struct ckrm_core_class *core)
+{
+       if (core)
+               atomic_inc(&core->refcnt);
 }
 
-static inline void ckrm_core_drop(struct ckrm_core_class *core) 
-{ 
+static inline void ckrm_core_drop(struct ckrm_core_class *core)
+{
        // only make definition available in this context
-       extern void ckrm_free_core_class(struct ckrm_core_class *core);   
+       extern void ckrm_free_core_class(struct ckrm_core_class *core);
        if (core && (atomic_dec_and_test(&core->refcnt)))
-           ckrm_free_core_class(core);
+               ckrm_free_core_class(core);
 }
 
-static inline unsigned int
-ckrm_is_core_valid(ckrm_core_class_t *core)
+static inline unsigned int ckrm_is_core_valid(ckrm_core_class_t * core)
 {
        return (core && (core->magic == CKRM_CORE_MAGIC));
 }
@@ -299,14 +320,16 @@ ckrm_is_core_valid(ckrm_core_class_t *core)
 //                               ckrm_res_ctrl   *ctlr,
 //                               void            *robj,
 //                               int              bmap)
-#define forall_class_resobjs(cls,rcbs,robj,bmap)                                                                       \
-       for ( bmap=((cls->classtype)->bit_res_ctlrs) ;                                                                  \
-            ({ int rid; ((rid=ffs(bmap)-1) >= 0) &&                                                                    \
-                        (bmap&=~(1<<rid),((rcbs=cls->classtype->res_ctlrs[rid]) && (robj=cls->res_class[rid]))); }) ;  \
+#define forall_class_resobjs(cls,rcbs,robj,bmap)                       \
+       for ( bmap=((cls->classtype)->bit_res_ctlrs) ;                  \
+            ({ int rid; ((rid=ffs(bmap)-1) >= 0) &&                    \
+                        (bmap &= ~(1<<rid),                            \
+                               ((rcbs=cls->classtype->res_ctlrs[rid])  \
+                                && (robj=cls->res_class[rid]))); });   \
            )
 
-extern struct ckrm_classtype* ckrm_classtypes[]; /* should provide a different interface */
-
+extern struct ckrm_classtype *ckrm_classtypes[];       
+/* should provide a different interface */
 
 /*-----------------------------------------------------------------------------
  * CKRM event callback specification for the classtypes or resource controllers 
@@ -317,51 +340,61 @@ extern struct ckrm_classtype* ckrm_classtypes[]; /* should provide a different i
  *-----------------------------------------------------------------------------*/
 
 struct ckrm_event_spec {
-       enum ckrm_event     ev;
+       enum ckrm_event ev;
        struct ckrm_hook_cb cb;
 };
-#define CKRM_EVENT_SPEC(EV,FCT) { CKRM_EVENT_##EV, { (ckrm_event_cb)FCT, NULL } }
+#define CKRM_EVENT_SPEC(EV,FCT) { CKRM_EVENT_##EV, \
+                                       { (ckrm_event_cb)FCT, NULL } }
 
 int ckrm_register_event_set(struct ckrm_event_spec especs[]);
 int ckrm_unregister_event_set(struct ckrm_event_spec especs[]);
 int ckrm_register_event_cb(enum ckrm_event ev, struct ckrm_hook_cb *cb);
 int ckrm_unregister_event_cb(enum ckrm_event ev, struct ckrm_hook_cb *cb);
 
-/******************************************************************************************
+/******************************************************************************
  * CE Invocation interface
- ******************************************************************************************/
+ ******************************************************************************/
 
 #define ce_protect(ctype)      (atomic_inc(&((ctype)->ce_nr_users)))
 #define ce_release(ctype)      (atomic_dec(&((ctype)->ce_nr_users)))
 
 // CE Classification callbacks with 
 
-#define CE_CLASSIFY_NORET(ctype, event, objs_to_classify...)                                   \
-do {                                                                                           \
-       if ((ctype)->ce_cb_active && (test_bit(event,&(ctype)->ce_callbacks.c_interest)))       \
-               (*(ctype)->ce_callbacks.classify)(event, objs_to_classify);                     \
+#define CE_CLASSIFY_NORET(ctype, event, objs_to_classify...)           \
+do {                                                                   \
+       if ((ctype)->ce_cb_active                                       \
+           && (test_bit(event,&(ctype)->ce_callbacks.c_interest)))     \
+               (*(ctype)->ce_callbacks.classify)(event,                \
+                                                 objs_to_classify);    \
 } while (0)
 
-#define CE_CLASSIFY_RET(ret, ctype, event, objs_to_classify...)                                        \
-do {                                                                                           \
-       if ((ctype)->ce_cb_active && (test_bit(event,&(ctype)->ce_callbacks.c_interest)))       \
-               ret = (*(ctype)->ce_callbacks.classify)(event, objs_to_classify);               \
+#define CE_CLASSIFY_RET(ret, ctype, event, objs_to_classify...)                \
+do {                                                                   \
+       if ((ctype)->ce_cb_active                                       \
+           && (test_bit(event,&(ctype)->ce_callbacks.c_interest)))     \
+               ret = (*(ctype)->ce_callbacks.classify)(event,          \
+                                                       objs_to_classify);\
 } while (0)
 
-#define CE_NOTIFY(ctype, event, cls, objs_to_classify)                                         \
-do {                                                                                           \
-       if ((ctype)->ce_cb_active && (test_bit(event,&(ctype)->ce_callbacks.n_interest)))       \
-               (*(ctype)->ce_callbacks.notify)(event,cls,objs_to_classify);                    \
+#define CE_NOTIFY(ctype, event, cls, objs_to_classify)                 \
+do {                                                                   \
+       if ((ctype)->ce_cb_active                                       \
+           && (test_bit(event,&(ctype)->ce_callbacks.n_interest)))     \
+               (*(ctype)->ce_callbacks.notify)(event,                  \
+                                               cls,objs_to_classify);  \
 } while (0)
 
+/***************
+ * RCFS related 
+ ***************/
 
-#endif // CONFIG_CKRM
-
-#endif // __KERNEL__
-
-#endif // _LINUX_CKRM_RC_H
-
+/* vars needed by other modules/core */
 
+extern int rcfs_mounted;
+extern int rcfs_engine_regd;
 
+#endif                         // CONFIG_CKRM
 
+#endif                         // __KERNEL__
 
+#endif                         // _LINUX_CKRM_RC_H
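
The share fields in ckrm_shares above are relative quantities: a class's my_guarantee is expressed against its parent's total_guarantee (which defaults to CKRM_SHARE_DFLT_TOTAL_GUARANTEE, i.e. 100).  As a rough illustration of how such relative settings compose down a hierarchy -- the helper and the numbers below are hypothetical, and the kernel's own arithmetic is in set_shares() and the individual controllers:

#include <stdio.h>

struct shares_model {
	int my_guarantee;              /* this class's slice of its parent     */
	int total_guarantee;           /* denominator its own children divide  */
};

/* absolute fraction of a class = product of ratios along the path from root */
static double absolute_fraction(const struct shares_model *path, int depth)
{
	double frac = 1.0;
	int i;

	/* path[0] is the root; each level divides by its parent's total */
	for (i = 1; i <= depth; i++)
		frac *= (double)path[i].my_guarantee /
			(double)path[i - 1].total_guarantee;
	return frac;
}

int main(void)
{
	/* a child takes 50 of the root's 100, a grandchild 20 of the child's 100 */
	struct shares_model path[] = {
		{ .my_guarantee = 100, .total_guarantee = 100 },  /* root       */
		{ .my_guarantee =  50, .total_guarantee = 100 },  /* child      */
		{ .my_guarantee =  20, .total_guarantee = 100 },  /* grandchild */
	};

	printf("grandchild gets %.0f%% of the CPU guarantee\n",
	       100.0 * absolute_fraction(path, 2));   /* 50/100 * 20/100 = 10% */
	return 0;
}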
diff --git a/include/linux/ckrm_sched.h b/include/linux/ckrm_sched.h
index dc00aea..088e06c 100644
@@ -3,6 +3,8 @@
  * Copyright (C) Haoqiang Zheng,  IBM Corp. 2004
  * Copyright (C) Hubertus Franke,  IBM Corp. 2004
  * 
+ * Latest version, more details at http://ckrm.sf.net
+ * 
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
  *
  */
 
-/*
- * Overview:
- * ---------
- *
- * Please read Documentation/ckrm/cpu_sched for a general overview of
- * how the O(1) CKRM scheduler.
- *
- * ckrm_sched.h provides the definition for the per class local runqueue.
- *
- */
-   
 #ifndef _CKRM_SCHED_H
 #define _CKRM_SCHED_H
 
@@ -36,31 +27,18 @@ struct prio_array {
        struct list_head queue[MAX_PRIO];
 };
 
-
-#ifndef CONFIG_CKRM_CPU_SCHEDULE
-
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+#define rq_active(p,rq)   (get_task_lrq(p)->active)
+#define rq_expired(p,rq)  (get_task_lrq(p)->expired)
+int __init init_ckrm_sched_res(void);
+#else
 #define rq_active(p,rq)   (rq->active)
 #define rq_expired(p,rq)  (rq->expired)
 static inline void init_ckrm_sched_res(void) {}
 static inline int ckrm_cpu_monitor_init(void) {return 0;}
+#endif //CONFIG_CKRM_CPU_SCHEDULE
 
-#else
-
-#define rq_active(p,rq)   (get_task_lrq(p)->active)
-#define rq_expired(p,rq)  (get_task_lrq(p)->expired)
-
-enum ckrm_sched_mode {
-       CKRM_SCHED_MODE_DISABLED, /* always use default linux scheduling     */
-                                 /* effectively disables the ckrm scheduler */
-       CKRM_SCHED_MODE_ENABLED  /* always uses ckrm scheduling behavior    */
-};
-
-extern unsigned int ckrm_sched_mode;     /* true internal sched_mode (DIS/EN ABLED) */
-
-int __init init_ckrm_sched_res(void);
-
-typedef unsigned long long CVT_t;      // cummulative virtual time
-
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
 struct ckrm_runqueue {
        cq_node_t classqueue_linkobj;   /*links in classqueue */
        struct ckrm_cpu_class *cpu_class;       // class it belongs to
@@ -74,7 +52,6 @@ struct ckrm_runqueue {
           reset to jiffies if expires
         */
        unsigned long expired_timestamp;
-        int best_expired_prio;
 
        /* 
         * highest priority of tasks in active
@@ -85,38 +62,23 @@ struct ckrm_runqueue {
        CVT_t local_cvt;
 
        unsigned long lrq_load;
-
-       /* Three different weights are distinguished:
-        * local_weight, skewed_weight, over_weight:
-        *
-        * - local_weight:  main weight to drive CVT progression
-        * - over_weight:   weight to reduce savings when over its guarantee
-        * - skewed_weight: weight to use when local_weight to small
-        *                  avoids starvation problems.
-        */
        int local_weight;   
-       int over_weight;
-       int skewed_weight;
+
 
        /*
-        * unused CPU time accumulated while the class 
+        * unused CPU time accumulated while thoe class 
         * is inactive goes to savings
         * 
         * initialized to be 0
         * a class can't accumulate more than SAVING_THRESHOLD of savings
         */
-       CVT_t savings;
+       unsigned long long savings;
 
        unsigned long magic;    //for debugging
-} ____cacheline_aligned_in_smp;
-
-#define CKRM_LRQ_MAGIC (0xACDC0702)
+};
 
 typedef struct ckrm_runqueue ckrm_lrq_t;
 
-#define ckrm_cpu_disabled() (ckrm_sched_mode == CKRM_SCHED_MODE_DISABLED)   
-#define ckrm_cpu_enabled()  (ckrm_sched_mode == CKRM_SCHED_MODE_ENABLED)   
-
 /**
  * ckrm_cpu_class_stat - cpu usage statistics maintained for each class
  * 
@@ -141,31 +103,24 @@ struct ckrm_cpu_class_stat {
         */
        int eshare;
        int meshare;
-
-       /* a boolean indicates if the class has savings or not */
-       int has_savings; 
-
-       /*
-        * a temporary value used by reorder_surplus_queue 
-        */
-       int demand_per_share;
 };
 
 #define CKRM_CPU_CLASS_MAGIC 0x7af2abe3
 
-#define USAGE_SAMPLE_FREQ  (HZ)  //sample every 1 seconds
-#define USAGE_MAX_HISTORY  (60)  // keep the last 60 usage samples
+#define USAGE_SAMPLE_FREQ HZ  //sample every 1 seconds
 #define NS_PER_SAMPLE      (USAGE_SAMPLE_FREQ*(NSEC_PER_SEC/HZ))
+#define USAGE_WINDOW_SIZE 60  //keep the last 60 sample
 
 struct ckrm_usage {
-       unsigned long samples[USAGE_MAX_HISTORY]; //record usages 
+       unsigned long samples[USAGE_WINDOW_SIZE]; //record usages 
        unsigned long sample_pointer;  // pointer for the sliding window
        unsigned long long last_ns;    // ns for last sample
        long long last_sample_jiffies; // in number of jiffies
 };
 
 /*
- * CPU controller object allocated for each CLASS
+ * manages the class status
+ * there should be only one instance of this object for each class in the whole system  
  */
 struct ckrm_cpu_class {
        struct ckrm_core_class *core;
@@ -174,16 +129,12 @@ struct ckrm_cpu_class {
        spinlock_t cnt_lock;    // always grab parent's lock first and then child's
        struct ckrm_cpu_class_stat stat;
        struct list_head links; // for linking up in cpu classes
-       struct list_head surplus_queue; //used for surplus allocation
-       ckrm_lrq_t* local_queues[NR_CPUS];      // runqueues 
+       ckrm_lrq_t local_queues[NR_CPUS];       // runqueues 
        struct ckrm_usage usage;
        unsigned long magic;    //for debugging
-#ifdef __SIMULATOR__
-       int class_id;
-#endif
 };
 
-#define cpu_class_weight(cls)   (SHARE_TO_WEIGHT(cls->stat.meshare))
+#define cpu_class_weight(cls) (cls->stat.meshare)
 #define local_class_weight(lrq) (lrq->local_weight)
 
 static inline int valid_cpu_class(struct ckrm_cpu_class * cls)
@@ -199,7 +150,7 @@ static inline void ckrm_usage_init(struct ckrm_usage* usage)
 {
        int i;
 
-       for (i=0; i < USAGE_MAX_HISTORY; i++)
+       for (i=0; i < USAGE_WINDOW_SIZE; i++)
                usage->samples[i] = 0;
        usage->sample_pointer = 0;
        usage->last_ns = 0;
@@ -237,21 +188,49 @@ static inline void ckrm_sample_usage(struct ckrm_cpu_class* clsptr)
        //      printk("sample = %llu jiffies=%lu \n",cur_sample, jiffies);
 
        usage->sample_pointer ++;
-       if (usage->sample_pointer >= USAGE_MAX_HISTORY)
+       if (usage->sample_pointer >= USAGE_WINDOW_SIZE)
                usage->sample_pointer = 0;
 }
 
+//duration is specified in number of jiffies
+//return the usage in percentage
+static inline int get_ckrm_usage(struct ckrm_cpu_class* clsptr, int duration)
+{
+       int nr_samples = duration/USAGE_SAMPLE_FREQ?:1;
+       struct ckrm_usage* usage = &clsptr->usage;
+       unsigned long long total = 0;
+       int i, idx;
+
+       if (nr_samples > USAGE_WINDOW_SIZE)
+               nr_samples = USAGE_WINDOW_SIZE;
+
+       idx = usage->sample_pointer;    
+       for (i = 0; i< nr_samples; i++) {
+               if (! idx)
+                       idx = USAGE_WINDOW_SIZE;
+               idx --;
+               total += usage->samples[idx];
+       }
+        total *= 100;
+        do_div(total,nr_samples);
+        do_div(total,NS_PER_SAMPLE);
+       do_div(total,cpus_weight(cpu_online_map));
+        return total;
+}
+
+
 #define lrq_nr_running(lrq) \
              (lrq->active->nr_active + lrq->expired->nr_active)
 
-static inline ckrm_lrq_t *get_ckrm_lrq(struct ckrm_cpu_class*cls, int cpu)
+static inline ckrm_lrq_t *
+get_ckrm_lrq(struct ckrm_cpu_class*cls, int cpu)
 {
-       return cls->local_queues[cpu];
+       return &(cls->local_queues[cpu]);
 }
 
 static inline ckrm_lrq_t *get_task_lrq(struct task_struct *p)
 {
-       return p->cpu_class->local_queues[task_cpu(p)];
+       return &(p->cpu_class->local_queues[task_cpu(p)]);
 }
 
 #define task_list_entry(list)  list_entry(list,struct task_struct,run_list)
@@ -274,10 +253,9 @@ void ckrm_cpu_change_class(void *task, void *old, void *new);
 #define CPU_DEMAND_INIT 3
 
 /*functions exported by ckrm_cpu_monitor.c*/
-int update_effectives(void);
 void ckrm_cpu_monitor(int check_min);
 int ckrm_cpu_monitor_init(void);
-void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat, int eshares);
+void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat);
 void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len);
 void adjust_local_weight(void);
 
@@ -311,54 +289,62 @@ void adjust_local_weight(void);
  *
  *******************************************************************/
 
-/*
- * The class priority is biasd toward classes with high priority tasks. 
- * But we need to prevent this bias from starving other classes.
- * If a class has nice value of -20, how much it can starve the default class?
- * priority bonus =  (120-100) >> PRIORITY_QUANTIZER, 
- * if PRIORITY_QUANTIZER = 2, then it's 5 steps ahead
- * A class without bonus thus can't get to run until: 
- * bonus * CKRM_MAX_WEIGHT * CVT_INC_PERSHARE = (120-100) >> PRIORITY_QUANTIZER
- *  (1 << CKRM_WEIGHT_SHIFT)
- *  (1 << CLASS_QUANTIZER) 
-*/
-
-/* 
- * CKRM_WEIGHT_SHIFT and CLASS_QUANTIZER control how much a class with 
- * high priority task can starve a normal priority class, so it should
- * be constant CLASS_QUANTIZER should not be too small otherwise we 
- * don't have enough bins in the classqueue.
- * The ideal value of CLASS_QUANTIZER is 20, but a little smaller is acceptable
- */
+#define CLASS_QUANTIZER 16     //shift from ns to increase class bonus
+#define PRIORITY_QUANTIZER 2   //controls how much a high prio task can borrow
 
-#define CLASS_QUANTIZER     (18)// shift from ns to increase class bonus
-#define PRIORITY_QUANTIZER  (2) // how much a high prio task can borrow
-#define CKRM_WEIGHT_SHIFT   (8) // 1/2^x == finest weight granularity
-#define CKRM_MAX_WEIGHT     (1<<CKRM_WEIGHT_SHIFT)  // - " -
+#define CKRM_SHARE_ACCURACY 13
+#define NSEC_PER_MS 1000000
+#define NSEC_PER_JIFFIES (NSEC_PER_SEC/HZ)
 
-/* SHARES:
- * shares are set in a hierarchical path. Since specified share settings 
- * of a class (c) are relative to the parent (p) and its totals
- * the shares can get very small, dependent on how many classes are 
- * specified.
- */
+
+#define MAX_SAVINGS_ABSOLUTE (10LLU*NSEC_PER_SEC)  // 10 seconds
  
-#define CKRM_SHARE_SHIFT (13)  
-#define CKRM_SHARE_MAX   (1 << CKRM_SHARE_SHIFT)
+#define CVT_UPDATE_TICK     ((HZ/2)?:1)
 
-#define SHARE_TO_WEIGHT(x) ((x) >> (CKRM_SHARE_SHIFT - CKRM_WEIGHT_SHIFT))
-#define WEIGHT_TO_SHARE(x) ((x) << (CKRM_SHARE_SHIFT - CKRM_WEIGHT_SHIFT))
+// ABSOLUTE_CKRM_TUNING determines whether classes can make up
+// lost time in absolute time or in relative values
 
-/* Other constants */
+#define ABSOLUTE_CKRM_TUNING         // preferred due to more predictable behavior
 
-#define NSEC_PER_MS          (1000000)
-#define NSEC_PER_JIFFIES     (NSEC_PER_SEC/HZ)
+#ifdef ABSOLUTE_CKRM_TUNING
 
-#define MAX_SAVINGS_ABSOLUTE (4LLU*NSEC_PER_SEC)  // 4 seconds
-#define CVT_UPDATE_TICK      ((HZ/2)?:1)
 #define MAX_SAVINGS          MAX_SAVINGS_ABSOLUTE
+//an absolute bonus of 200ms for classes when reactivated
+#define INTERACTIVE_BONUS(lrq) ((200*NSEC_PER_MS)/local_class_weight(lrq))
 #define SAVINGS_LEAK_SPEED   (CVT_UPDATE_TICK/10*NSEC_PER_JIFFIES)
 
+#define scale_cvt(val,lrq)   ((val)*local_class_weight(lrq))
+#define unscale_cvt(val,lrq) (do_div(val,local_class_weight(lrq)))
+
+#else
+
+#define MAX_SAVINGS (MAX_SAVINGS_ABSOLUTE >> CKRM_SHARE_ACCURACY) 
+/*
+ * to improve system responsiveness
+ * an inactive class is put a little bit ahead of the current class when it wakes up
+ * the amount is set in normalized term to simplify the calculation
+ * for class with 100% share, it can be 2s ahead
+ * while for class with 10% share, it can be 200ms ahead
+ */
+#define INTERACTIVE_BONUS(lrq) (2*NSEC_PER_MS)  
+
+/*
+ * normalized savings can't be more than MAX_NORMALIZED_SAVINGS
+ * based on the current configuration
+ * this means that a class with share 100% will accumulate 10s at most
+ * while a class with 1% of the share can only accumulate 100ms
+ */
+
+//a class with share 100% can get 100ms every 500ms
+//while a class with share 10% can only get 10ms every 500ms
+#define SAVINGS_LEAK_SPEED ((CVT_UPDATE_TICK/5*NSEC_PER_JIFFIES) >> CKRM_SHARE_ACCURACY)
+
+#define scale_cvt(val,lrq)   (val)
+#define unscale_cvt(val,lrq) (val)
+
+#endif
+
+
 /**
  * get_effective_prio: return the effective priority of a class local queue
  *
@@ -374,7 +360,6 @@ static inline int get_effective_prio(ckrm_lrq_t * lrq)
        int prio;
 
        prio = lrq->local_cvt >> CLASS_QUANTIZER;  // cumulative usage
-#define URGENCY_SUPPORT 1
 #ifndef URGENCY_SUPPORT
 #warning "ACB removing urgency calculation from get_effective_prio"
 #else
@@ -428,10 +413,83 @@ static inline unsigned long task_load(struct task_struct* p)
 }
 
 /*
- * moved to ckrm_sched.c
- * but may need to make it static inline to improve performance
+ * runqueue load is the local_weight of all the classes on this cpu
+ * must be called with class_list_lock held
  */
-void update_local_cvt(struct task_struct *p, unsigned long nsec);
+static inline unsigned long ckrm_cpu_load(int cpu)
+{
+       struct ckrm_cpu_class *clsptr;
+       ckrm_lrq_t* lrq;
+       struct ckrm_cpu_demand_stat* l_stat;
+       int total_load = 0;
+       int load;
+
+       list_for_each_entry(clsptr,&active_cpu_classes,links) {
+               lrq =  get_ckrm_lrq(clsptr,cpu);
+               l_stat = get_cls_local_stat(clsptr,cpu);
+               load = lrq->local_weight;
+               if (l_stat->cpu_demand < load)
+                       load = l_stat->cpu_demand;
+               total_load += load;
+       }       
+       return total_load;
+}
+
+static inline void class_enqueue_task(struct task_struct *p,
+                                     prio_array_t * array)
+{
+       ckrm_lrq_t *lrq;
+       int effective_prio;
+
+       lrq = get_task_lrq(p);
+
+       cpu_demand_event(&p->demand_stat,CPU_DEMAND_ENQUEUE,0);
+       lrq->lrq_load += task_load(p);
+
+       if ((p->prio < lrq->top_priority) && (array == lrq->active))
+               set_top_priority(lrq, p->prio); 
+
+       if (! cls_in_classqueue(&lrq->classqueue_linkobj)) {
+               cpu_demand_event(get_task_lrq_stat(p),CPU_DEMAND_ENQUEUE,0);
+               effective_prio = get_effective_prio(lrq);
+               classqueue_enqueue(lrq->classqueue, &lrq->classqueue_linkobj, effective_prio);
+       } 
+
+}
+
+static inline void class_dequeue_task(struct task_struct *p,
+                                     prio_array_t * array)
+{
+       ckrm_lrq_t *lrq = get_task_lrq(p);
+       unsigned long load = task_load(p);
+
+       BUG_ON(lrq->lrq_load < load);
+       lrq->lrq_load -= load;
+
+       cpu_demand_event(&p->demand_stat,CPU_DEMAND_DEQUEUE,0);
+
+       if ((array == lrq->active) && (p->prio == lrq->top_priority)
+           && list_empty(&(array->queue[p->prio])))
+               set_top_priority(lrq,
+                                find_next_bit(array->bitmap, MAX_PRIO,
+                                              p->prio));
+}
+
+/*
+ *  called after a task is switched out. Update the local cvt accounting 
+ *  we need to stick with long instead of long long due to nonexistent 64-bit division
+ */
+static inline void update_local_cvt(struct task_struct *p, unsigned long nsec)
+{
+       ckrm_lrq_t * lrq = get_task_lrq(p);
+
+       unsigned long cvt_inc = nsec / local_class_weight(lrq);
+
+       lrq->local_cvt += cvt_inc;
+       lrq->uncounted_ns += nsec;
+
+       update_class_priority(lrq);
+}
                                                                                 
 static inline int class_preempts_curr(struct task_struct * p, struct task_struct* curr)
 {
@@ -459,14 +517,11 @@ static inline int get_ckrm_rand(unsigned long val)
        return rand;
 }
 
-void update_class_cputime(int this_cpu, int idle);
+void update_class_cputime(int this_cpu);
 
 /**********************************************/
 /*          PID_LOAD_BALANCING                */
 /**********************************************/
-
-#define CPU_PID_CTRL_TICK 32
-
 struct ckrm_load_struct {
        unsigned long load_p;   /*propotional*/
        unsigned long load_i;   /*integral   */
@@ -482,12 +537,26 @@ static inline void ckrm_load_init(ckrm_load_t* ckrm_load) {
 }
 
 void ckrm_load_sample(ckrm_load_t* ckrm_load,int cpu);
-long ckrm_get_pressure(ckrm_load_t* ckrm_load, int local_group);
+long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group);
 #define rq_ckrm_load(rq) (&((rq)->ckrm_load))
 
+static inline void ckrm_sched_tick(unsigned long j,int this_cpu,struct ckrm_load_struct* ckrm_load)
+{
+       read_lock(&class_list_lock);
 
-#endif /*CONFIG_CKRM_CPU_SCHEDULE */
-
+#ifdef CONFIG_SMP
+       ckrm_load_sample(ckrm_load,this_cpu);
 #endif
 
+       if (! (j % CVT_UPDATE_TICK)) {
+               //              printk("ckrm_sched j=%lu\n",j);
+               classqueue_update_base(get_cpu_classqueue(this_cpu));
+               update_class_cputime(this_cpu);
+       }
+
+       read_unlock(&class_list_lock);
+}
 
+#endif //CONFIG_CKRM_CPU_SCHEDULE
+
+#endif
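
The get_ckrm_usage() helper added above turns the per-class sliding window of nanosecond samples into a percentage.  The standalone model below mirrors that calculation, with ordinary 64-bit division in place of the kernel's do_div(); HZ is assumed to be 1000, and all names are local to the example.

#include <stdio.h>

#define HZ                 1000ULL
#define NSEC_PER_SEC       1000000000ULL
#define USAGE_SAMPLE_FREQ  HZ                        /* one sample per second */
#define NS_PER_SAMPLE      (USAGE_SAMPLE_FREQ * (NSEC_PER_SEC / HZ))
#define USAGE_WINDOW_SIZE  60

struct usage_model {
	unsigned long long samples[USAGE_WINDOW_SIZE];
	unsigned int sample_pointer;                  /* next slot to overwrite */
};

/* duration is in jiffies; returns class CPU usage as a percentage */
static unsigned int model_get_usage(const struct usage_model *u,
				    unsigned long long duration,
				    unsigned int online_cpus)
{
	unsigned long long total = 0;
	unsigned int nr_samples = duration / USAGE_SAMPLE_FREQ;
	unsigned int i, idx = u->sample_pointer;

	if (!nr_samples)
		nr_samples = 1;
	if (nr_samples > USAGE_WINDOW_SIZE)
		nr_samples = USAGE_WINDOW_SIZE;

	/* walk backwards over the most recent samples in the circular window */
	for (i = 0; i < nr_samples; i++) {
		if (!idx)
			idx = USAGE_WINDOW_SIZE;
		idx--;
		total += u->samples[idx];
	}
	return (unsigned int)(total * 100 / nr_samples / NS_PER_SAMPLE / online_cpus);
}

int main(void)
{
	struct usage_model u = { .sample_pointer = 2 };

	u.samples[0] = NS_PER_SAMPLE / 4;             /* 25% of one CPU for 1s */
	u.samples[1] = NS_PER_SAMPLE / 2;             /* 50% of one CPU for 1s */
	printf("usage over 2s on 1 cpu: %u%%\n",
	       model_get_usage(&u, 2 * HZ, 1));       /* prints 37 (i.e. 37.5%) */
	return 0;
}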
diff --git a/include/linux/ckrm_tc.h b/include/linux/ckrm_tc.h
index 6a57025..5650dd3 100644
@@ -1,18 +1,13 @@
 #include <linux/ckrm_rc.h>
 
-
-
 #define TASK_CLASS_TYPE_NAME "taskclass"
 
 typedef struct ckrm_task_class {
-       struct ckrm_core_class core;   
+       struct ckrm_core_class core;
 } ckrm_task_class_t;
 
-
 // Index into genmfdesc array, defined in rcfs/dir_modules.c,
 // which has the mfdesc entry that taskclass wants to use
 #define TC_MF_IDX  0
 
-
 extern int ckrm_forced_reclassify_pid(int pid, struct ckrm_task_class *cls);
-
diff --git a/include/linux/sched.h b/include/linux/sched.h
index eda93cb..dd50052 100644
@@ -607,6 +607,7 @@ struct task_struct {
        spinlock_t  ckrm_tsklock; 
        void       *ce_data;
 #ifdef CONFIG_CKRM_TYPE_TASKCLASS
+       // .. Hubertus should change to CONFIG_CKRM_TYPE_TASKCLASS 
        struct ckrm_task_class *taskclass;
        struct list_head        taskclass_link;
 #ifdef CONFIG_CKRM_CPU_SCHEDULE
diff --git a/kernel/ckrm/ckrm.c b/kernel/ckrm/ckrm.c
index e732fdf..f1cfb26 100644
@@ -82,7 +82,6 @@ inline unsigned int is_res_regd(struct ckrm_classtype *clstype, int resid)
            );
 }
 
-static 
 struct ckrm_res_ctlr *ckrm_resctlr_lookup(struct ckrm_classtype *clstype,
                                          const char *resname)
 {
@@ -102,8 +101,10 @@ struct ckrm_res_ctlr *ckrm_resctlr_lookup(struct ckrm_classtype *clstype,
        return NULL;
 }
 
+EXPORT_SYMBOL(ckrm_resctlr_lookup);
+
 /* given a classname return the class handle and its classtype*/
-void *ckrm_classobj(const char *classname, int *classTypeID)
+void *ckrm_classobj(char *classname, int *classTypeID)
 {
        int i;
 
@@ -863,10 +864,7 @@ int ckrm_class_show_shares(struct ckrm_core_class *core, struct seq_file *seq)
                atomic_inc(&clstype->nr_resusers[i]);
                rcbs = clstype->res_ctlrs[i];
                if (rcbs && rcbs->get_share_values) {
-                       int rc = (*rcbs->get_share_values)(core->res_class[i], 
-                                                          &shares);
-                       if (rc == -ENOSYS) 
-                               continue;
+                       (*rcbs->get_share_values) (core->res_class[i], &shares);
                        seq_printf(seq,"res=%s,guarantee=%d,limit=%d,"
                                   "total_guarantee=%d,max_limit=%d\n",
                                   rcbs->res_name, shares.my_guarantee,
index 1bf482f..f947f07 100644 (file)
 #include <linux/ckrm_sched.h>
 #include <linux/ckrm_classqueue.h>
 #include <linux/seq_file.h>
-#include <linux/parser.h>
-
-#define CPU_CTRL_NAME  "cpu"
 
 struct ckrm_res_ctlr cpu_rcbs;
 
-#define CKRM_CPU_USAGE_DETAIL_MAX 3
-static int usage_detail = 3;  /* 0: show usage 
-                              * 1: show settings
-                              * 2: show effectives
-                              * 3: show per runqueue stats
-                              */
-
-static int ckrm_cpu_set_mode(enum ckrm_sched_mode mode);
-
-/*
- * update effective share setting after:
- * -- remove class
- * -- change class share
- * we don't need to call update_effectives() when adding a new class since 
- * the default grt of a new class is 0
- * CAUTION: might need a lock here
- */
-static inline void update_class_effectives(void) 
-{
-       //      update_effectives();
-       ckrm_cpu_monitor(0);
-}
-
 /**
  * insert_cpu_class - insert a class to active_cpu_class list
  *
@@ -64,21 +38,25 @@ static inline void insert_cpu_class(struct ckrm_cpu_class *cls)
 /*
  *  initialize a class object and its local queues
  */
-
-CVT_t get_min_cvt_locking(int cpu);
-ckrm_lrq_t *rq_get_dflt_lrq(int cpu);
-
-static void init_cpu_class_lrq(struct ckrm_cpu_class *cls, 
-                              int cpu, int isdflt)
+void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares) 
 {
-       int j,k;
-       ckrm_lrq_t *queue = cls->local_queues[cpu];
+       int i,j,k;      
+       prio_array_t *array;    
+       ckrm_lrq_t* queue;
 
+       cls->shares = *shares;
+       cls->cnt_lock = SPIN_LOCK_UNLOCKED;
+       ckrm_cpu_stat_init(&cls->stat);
+       ckrm_usage_init(&cls->usage);
+       cls->magic = CKRM_CPU_CLASS_MAGIC;
+
+       for (i = 0 ; i < NR_CPUS ; i++) {
+               queue = &cls->local_queues[i];
        queue->active   = queue->arrays;
        queue->expired  = queue->arrays+1;
        
        for (j = 0; j < 2; j++) {
-               prio_array_t *array = queue->arrays + j;
+                       array = queue->arrays + j;
                for (k = 0; k < MAX_PRIO; k++) {
                        INIT_LIST_HEAD(array->queue + k);
                        __clear_bit(k, array->bitmap);
@@ -89,56 +67,20 @@ static void init_cpu_class_lrq(struct ckrm_cpu_class *cls,
        }
        
        queue->expired_timestamp = 0;
-       queue->best_expired_prio = MAX_PRIO;
        
        queue->cpu_class = cls;
-       queue->classqueue = get_cpu_classqueue(cpu);
+               queue->classqueue = get_cpu_classqueue(i);
        queue->top_priority = MAX_PRIO;
        cq_node_init(&queue->classqueue_linkobj);
-       queue->local_cvt = isdflt ? 0 : get_min_cvt_locking(cpu);
+               queue->local_cvt = 0;
        queue->lrq_load = 0;
        queue->local_weight = cpu_class_weight(cls);
-       if (queue->local_weight == 0)
-               queue->local_weight = 1;
-       queue->over_weight = 0;
-       queue->skewed_weight = CKRM_MAX_WEIGHT/2; /*otherwise class might starve on start*/
        queue->uncounted_ns = 0;
        queue->savings = 0;
-       queue->magic = CKRM_LRQ_MAGIC;
-}
-
-void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares) 
-{
-       int i;      
-       int isdflt;
-       struct ckrm_cpu_class *dfltcls;
-
-       dfltcls = get_default_cpu_class();
-
-       isdflt = (cls==dfltcls);
-
-       cls->shares = *shares;
-       cls->cnt_lock = SPIN_LOCK_UNLOCKED;
-       ckrm_cpu_stat_init(&cls->stat,isdflt ? CKRM_SHARE_MAX : 1);
-       ckrm_usage_init(&cls->usage);
-       cls->magic = CKRM_CPU_CLASS_MAGIC;
-
-       memset(cls->local_queues,0,NR_CPUS*sizeof(ckrm_lrq_t*));
-       
-       if (isdflt) {
-               for (i=0; i< NR_CPUS; i++) {
-                       cls->local_queues[i] = rq_get_dflt_lrq(i);
-                       init_cpu_class_lrq(cls,i,1);
-               }
-       } else {
-               for_each_cpu(i) {
-                       cls->local_queues[i] = kmalloc(sizeof(ckrm_lrq_t),
-                                                      GFP_KERNEL);
-                       BUG_ON(cls->local_queues[i]==NULL);
-                       init_cpu_class_lrq(cls,i,0);
-               }
+               queue->magic = 0x43FF43D7;
        }
 
+       // add to class list
        write_lock(&class_list_lock);
        insert_cpu_class(cls);
        write_unlock(&class_list_lock);
@@ -159,13 +101,13 @@ struct ckrm_cpu_class * ckrm_get_cpu_class(struct ckrm_core_class *core)
        struct ckrm_cpu_class * cls;
        cls = ckrm_get_res_class(core, cpu_rcbs.resid, struct ckrm_cpu_class);
        if (valid_cpu_class(cls))
-               return (ckrm_cpu_enabled() ? cls : get_default_cpu_class());
+               return cls;
        else
                return NULL;
 }
 
-void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, 
-                          struct ckrm_core_class *parent) 
+
+void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class *parent) 
 {              
        struct ckrm_cpu_class *cls;
 
@@ -194,14 +136,15 @@ void* ckrm_alloc_cpu_class(struct ckrm_core_class *core,
        return cls;
 }              
 
-void ckrm_cpu_class_queue_delete_sync(struct ckrm_cpu_class *clsptr);
-
+/*
+ * hzheng: this is not a stable implementation
+ *         need to check race condition issue here
+ */            
 static void ckrm_free_cpu_class(void *my_res) 
 {                      
        struct ckrm_cpu_class *cls = my_res, *parres, *childres;
        ckrm_core_class_t *child = NULL;
        int maxlimit;
-       int i;
 
        if (!cls) 
                return;
@@ -236,19 +179,10 @@ static void ckrm_free_cpu_class(void *my_res)
        list_del(&cls->links);
        write_unlock(&class_list_lock);
 
-       ckrm_cpu_class_queue_delete_sync(cls);
-
-       for_each_cpu(i) {
-               ckrm_lrq_t *lrq = get_ckrm_lrq(cls,i);
-               if (!lrq) continue;
-               lrq->magic = -99;
-               kfree(lrq);
-       }
        kfree(cls);
 
-       //call ckrm_cpu_monitor after class is removed
-       if (ckrm_cpu_enabled())
-               update_class_effectives();
+       //call ckrm_cpu_monitor after the class is removed
+       ckrm_cpu_monitor(0);
 }                              
 
 /*
@@ -260,12 +194,8 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share)
         struct ckrm_shares *cur = &cls->shares, *par;
         int rc = -EINVAL;
 
-       if (ckrm_cpu_disabled())
-               return -ENOSYS;
         if (!cls)
                return rc;
-       if (new_share->total_guarantee > CKRM_SHARE_MAX)
-               return -E2BIG;
 
         if (cls->parent) {
                 parres = ckrm_get_cpu_class(cls->parent);
@@ -285,7 +215,7 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share)
                new_share->my_guarantee = 0;
 
        rc = set_shares(new_share, cur, par);
-       if (!rc && cur->my_limit == CKRM_SHARE_DONTCARE)
+       if (cur->my_limit == CKRM_SHARE_DONTCARE)
                cur->my_limit = cur->max_limit;
 
 
@@ -295,7 +225,7 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share)
        }
 
        //call ckrm_cpu_monitor after the shares are changed
-       update_class_effectives();
+       ckrm_cpu_monitor(0);
 
        return rc;
 }                                                      
@@ -305,90 +235,22 @@ static int ckrm_cpu_get_share(void *my_res,
 {                      
        struct ckrm_cpu_class *cls = my_res;
 
-       if (ckrm_cpu_disabled())
-               return -ENOSYS;
         if (!cls)
                return -EINVAL;
-
        *shares = cls->shares;
        return 0;
 }                              
 
-/*
- *   get_ckrm_usage():
- *     obtain a sequence of <num> usage values
- *     returns the number of usage values reported.
- *
- *     report IN:  specifies the sequence of jiffies for which to report
- *                 must be ordered (smallest first)
- *            OUT: returns the usage in each field
- *
- */
-
-
-int ckrm_cpu_get_usage(struct ckrm_cpu_class* clsptr, 
-                      int num, ulong report[])
-{
-       struct ckrm_usage* usage = &clsptr->usage;
-       unsigned long long total = 0;
-       int i, idx, cur, num_ofs;
-
-       num_ofs = cur = i = 0;
-       idx = usage->sample_pointer;    
-
-       for ( num_ofs = 0; num_ofs < num ; num_ofs++ ) {
-               int nr_samples;
-               int duration = report[num_ofs]; 
-               unsigned long long totval = 0;
-
-               nr_samples = duration/USAGE_SAMPLE_FREQ?:1;
-               
-               if (nr_samples > USAGE_MAX_HISTORY)
-                       nr_samples = USAGE_MAX_HISTORY;
-
-               for ( ; i< nr_samples; i++) {
-                       if (! idx)
-                               idx = USAGE_MAX_HISTORY;
-                       idx --;
-                       total += usage->samples[idx];
-               }
-               totval = total * 1000;
-               do_div(totval,NS_PER_SAMPLE);
-               do_div(totval,nr_samples * cpus_weight(cpu_online_map));
-               report[num_ofs] = totval;
-       }
-
-        return num;
-}
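As a usage note for the interface described in the comment above, a caller-side fragment might look like the following (illustrative only, assuming a valid clsptr in kernel context; it mirrors the call being removed from ckrm_cpu_get_stats() further down in this patch):

ulong report[3] = { 2*HZ, 10*HZ, 60*HZ };	/* last 2s, 10s and 60s */
int n = ckrm_cpu_get_usage(clsptr, 3, report);
/* on return, report[0..2] hold the usage over each window,
 * and n is the number of entries reported (3 here) */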
-
 int ckrm_cpu_get_stats(void *my_res, struct seq_file * sfile)
 {
        struct ckrm_cpu_class *cls = my_res;
        struct ckrm_cpu_class_stat* stat = &cls->stat;
        ckrm_lrq_t* lrq;
        int i;
-       ulong usage[3] = { 2*HZ, 10*HZ, 60*HZ };
 
-       if (!cls || ckrm_cpu_disabled())
+       if (!cls) 
                return -EINVAL;
 
-       ckrm_cpu_get_usage(cls,3,usage);
-
-       /* this will, after full stabilization, become the only cpu usage statistics
-        */
-
-       seq_printf(sfile, "cpu-usage(2,10,60)= %lu %lu %lu\n",
-                  usage[0],usage[1],usage[2]);
-
-       if (usage_detail < 1) 
-               return 0;
-
-       /* for the extended statistics we can decide later whether to make the 
-        * additional statistics available through config options;
-        * either way they should be reported in a more concise form
-        * during stabilization, this is OK
-        */
-
        seq_printf(sfile, "-------- CPU Class Status Start---------\n");
        seq_printf(sfile, "Share:\n\tgrt= %d limit= %d total_grt= %d max_limit= %d\n",
                   cls->shares.my_guarantee,
@@ -399,35 +261,26 @@ int ckrm_cpu_get_stats(void *my_res, struct seq_file * sfile)
                   cls->shares.unused_guarantee,
                   cls->shares.cur_max_limit);
 
-       if (usage_detail < 2) 
-               goto out;
-
        seq_printf(sfile, "Effective:\n\tegrt= %d\n",stat->egrt);
        seq_printf(sfile, "\tmegrt= %d\n",stat->megrt);
        seq_printf(sfile, "\tehl= %d\n",stat->ehl);
        seq_printf(sfile, "\tmehl= %d\n",stat->mehl);
        seq_printf(sfile, "\teshare= %d\n",stat->eshare);
-       seq_printf(sfile, "\tmeshare= %d\n",stat->meshare);
+       seq_printf(sfile, "\tmeshare= %d\n",cpu_class_weight(cls));
        seq_printf(sfile, "\tmax_demand= %lu\n",stat->max_demand);
        seq_printf(sfile, "\ttotal_ns= %llu\n",stat->total_ns);
-       seq_printf(sfile, "\tusage(2,10,60)= %lu %lu %lu\n",
-                  usage[0],usage[1],usage[2]);
-
-       if (usage_detail < 3) 
-               goto out;
-
-       /* provide per run queue information */
+       seq_printf(sfile, "\tusage(2,10,60)= %d %d %d\n",
+                  get_ckrm_usage(cls,2*HZ),
+                  get_ckrm_usage(cls,10*HZ),
+                  get_ckrm_usage(cls,60*HZ)
+                  );
        for_each_online_cpu(i) {
                lrq = get_ckrm_lrq(cls,i);              
-               seq_printf(sfile, "\tlrq %d demand= %lu weight= %d "
-                          "lrq_load= %lu cvt= %llu sav= %llu\n",
-                          i,stat->local_stats[i].cpu_demand,
-                          local_class_weight(lrq),lrq->lrq_load,
-                          lrq->local_cvt,lrq->savings);
+               seq_printf(sfile, "\tlrq %d demand= %lu weight= %d lrq_load= %lu cvt= %llu sav= %llu\n",i,stat->local_stats[i].cpu_demand,local_class_weight(lrq),lrq->lrq_load,lrq->local_cvt,lrq->savings);
        }
 
-out:
        seq_printf(sfile, "-------- CPU Class Status END ---------\n");
+
        return 0;
 }
 
@@ -443,34 +296,10 @@ void ckrm_cpu_change_class(void *task, void *old, void *new)
        if (!task || ! old || !new)
                return; 
 
-       if (ckrm_cpu_disabled())
-               newcls = get_default_cpu_class();
        _ckrm_cpu_change_class(tsk,newcls);
 }                                                      
 
-enum config_token_t {
-       config_usage_detail,   /* define usage level                      */
-       config_disable,        /* always use default linux scheduling     */
-                              /* effectively disables the ckrm scheduler */
-       config_enable,         /* always uses ckrm scheduling behavior    */
-       config_err             /* parsing error */
-};
-
-#define CKRM_SCHED_MODE_DISABLED_STR "disabled"
-#define CKRM_SCHED_MODE_ENABLED_STR  "enabled"
-
-static char *ckrm_sched_mode_str[] = { 
-               CKRM_SCHED_MODE_DISABLED_STR,
-               CKRM_SCHED_MODE_ENABLED_STR
-};
-
-static match_table_t config_tokens = {
-       { config_disable,      "mode="CKRM_SCHED_MODE_DISABLED_STR },
-       { config_enable,       "mode="CKRM_SCHED_MODE_ENABLED_STR  },
-       { config_usage_detail, "usage_detail=%u"                   },
-       { config_err,          NULL                                }
-};
-
+/*dummy function, not used*/
 static int ckrm_cpu_show_config(void *my_res, struct seq_file *sfile)
 {
        struct ckrm_cpu_class *cls = my_res;
@@ -478,61 +307,23 @@ static int ckrm_cpu_show_config(void *my_res, struct seq_file *sfile)
        if (!cls) 
                return -EINVAL;
 
-       seq_printf(sfile, "res=%s,mode=%s",
-                  CPU_CTRL_NAME,ckrm_sched_mode_str[ckrm_sched_mode]);
-       if (!ckrm_cpu_disabled())  /* enabled || mixed */
-               seq_printf(sfile, ",usage_detail=%u",usage_detail);
-       seq_printf(sfile,"\n");
+       seq_printf(sfile, "cls=%s,parameter=somevalue\n","ckrm_cpu class");
        return 0;
 }
 
+/*dummy function, not used*/
 static int ckrm_cpu_set_config(void *my_res, const char *cfgstr)
 {
        struct ckrm_cpu_class *cls = my_res;
-       char *p;
-       char **cfgstr_p = (char**)&cfgstr;
-       substring_t args[MAX_OPT_ARGS];
-       int option,rc;
-       enum ckrm_sched_mode new_sched_mode;
 
        if (!cls) 
                return -EINVAL;
-
-       new_sched_mode = ckrm_sched_mode;       
-       rc = 0;
-
-       while ((p = strsep(cfgstr_p, ",")) != NULL) {
-               int token;
-               if (!*p)
-                       continue;
-               
-               token = match_token(p, config_tokens, args);
-               switch (token) {
-               case config_usage_detail:
-                       if (ckrm_cpu_disabled() || 
-                           (match_int(&args[0], &option)) ||
-                           (option > CKRM_CPU_USAGE_DETAIL_MAX))
-                       {
-                               return -EINVAL;
-                       }
-                       usage_detail = option;
-                       break;
-               case config_disable:
-                       new_sched_mode = CKRM_SCHED_MODE_DISABLED;
-                       break;
-               case config_enable:
-                       new_sched_mode = CKRM_SCHED_MODE_ENABLED;
-                       break;
-               case config_err:
-                       return -EINVAL;
-               }
-       }
-       rc = ckrm_cpu_set_mode(new_sched_mode);
-       return rc;
+       printk(KERN_DEBUG "ckrm_cpu config='%s'\n",cfgstr);
+       return 0;
 }
        
 struct ckrm_res_ctlr cpu_rcbs = {
-       .res_name          = CPU_CTRL_NAME,
+       .res_name          = "cpu",
        .res_hdepth        = 1,
        .resid             = -1,
        .res_alloc         = ckrm_alloc_cpu_class,
@@ -573,69 +364,14 @@ void init_cpu_classes(void)
 
        //init classqueues for each processor
        for (i=0; i < NR_CPUS; i++)
-               classqueue_init(get_cpu_classqueue(i),ckrm_cpu_enabled()); 
-
-       ckrm_alloc_cpu_class(NULL,NULL);
-}
-
-void ckrm_cpu_class_queue_update(int on);
-void ckrm_cpu_start_monitor(void);
-void ckrm_cpu_kill_monitor(void);
-
-static int ckrm_cpu_set_mode(enum ckrm_sched_mode mode) 
-{
-        struct task_struct *proc, *tsk;
-       struct ckrm_cpu_class *new_cls = NULL;
-       int i;
-
-       if (mode == ckrm_sched_mode)
-               return 0;
+               classqueue_init(get_cpu_classqueue(i)); 
 
-       printk("ckrm_cpu_set_mode from <%s> to <%s> pid=%d\n",
-                  ckrm_sched_mode_str[ckrm_sched_mode],
-                  ckrm_sched_mode_str[mode], 
-                  current->pid);
-
-       if (mode == CKRM_SCHED_MODE_DISABLED) {
-               ckrm_cpu_kill_monitor();
-               new_cls = get_default_cpu_class();
-       } else {
-               ckrm_cpu_class_queue_update(1);
-       }
-                             
-       /* run twice through the list to catch everyone,
-        * current and transient once
-         */
-
-        read_lock(&tasklist_lock);
-
-       ckrm_sched_mode = mode;
-       /* we have to run through the list twice
-        * first catch all existing tasks
-        * and then deal with some potential race condition
+       /*
+        * hzheng: initialize the default cpu class
+        *  required for E14/E15 since ckrm_init is called after sched_init
         */
-       for ( i=2 ; i-- ; ) {
-               /* lock class_list_lock ? */
-       
-               do_each_thread(proc, tsk) {
-                       if (mode == CKRM_SCHED_MODE_ENABLED) {
-                               new_cls = ckrm_get_res_class(class_core(tsk->taskclass),
-                                                            cpu_rcbs.resid,
-                                                            struct ckrm_cpu_class);
-                       }       
-                       _ckrm_cpu_change_class(tsk,new_cls);
-               } while_each_thread(proc, tsk);
+       ckrm_alloc_cpu_class(NULL,NULL);
        }
-        read_unlock(&tasklist_lock);
 
-       if (mode == CKRM_SCHED_MODE_DISABLED) 
-               ckrm_cpu_class_queue_update(0);
-       else 
-               ckrm_cpu_start_monitor();
-       return 0;
-}
 
 EXPORT_SYMBOL(ckrm_get_cpu_class);
-
-
-
index d8d6bd3..a608f4e 100644 (file)
 #include <asm/div64.h>
 #include <linux/ckrm_sched.h>
 
-// #define CONFIG_CKRM_SUPPORT_MAXLIMITS
-
 #define CPU_MONITOR_INTERVAL (HZ) /*how often do we adjust the shares*/
+#define CKRM_SHARE_MAX (1<<CKRM_SHARE_ACCURACY)
 
 #define CKRM_CPU_DEMAND_RUN 0
 #define CKRM_CPU_DEMAND_SLEEP 1
-//sample task cpu demand every 32ms
-#define CPU_DEMAND_TASK_RECALC  ( 32*1000*1000LL)
-#define CPU_DEMAND_CLASS_RECALC (256*1000*1000LL)
+//sample task cpu demand every 64ms
+#define CPU_DEMAND_TASK_RECALC  (64000000LL)
+#define CPU_DEMAND_CLASS_RECALC (256000000LL)
 #define CPU_DEMAND_TP_CLASS 0
 #define CPU_DEMAND_TP_TASK 1
 
-static void update_ckrm_idle(unsigned long surplus);
-
-void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu);
-int alloc_surplus(struct ckrm_core_class *root_core);
 extern struct ckrm_cpu_class *ckrm_get_cpu_class(struct ckrm_core_class *core);
+void update_ckrm_idle(unsigned long surplus);
 
 /*interface to share definition*/
-static inline int get_my_grt(struct ckrm_cpu_class *cls)
-{
-       return cls->shares.unused_guarantee;
-}
-
 static inline int get_soft_limit(struct ckrm_cpu_class *cls)
 {
        return cls->shares.my_limit;
@@ -72,57 +63,6 @@ static inline int get_myhard_limit(struct ckrm_cpu_class *cls)
        return cls->shares.total_guarantee;
 }
 
-static inline void set_eshare(struct ckrm_cpu_class_stat *stat,
-                                      int new_share)
-{
-       if (!new_share)
-               new_share = 1;
-
-       BUG_ON(new_share < 0);
-       stat->eshare = new_share;
-}
-
-static inline void set_meshare(struct ckrm_cpu_class_stat *stat,
-                                           int new_share)
-{
-       if (!new_share)
-               new_share = 1;
-
-       BUG_ON(new_share < 0);
-       stat->meshare = new_share;
-}
-
-/**
- *get_self_cpu_demand - get cpu demand of the class itself (excluding children)
- *
- * self_cpu_demand = sum(cpu demand of all local queues) 
- */
-static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat)
-{
-       int cpu_demand = 0;
-       int i;
-       int cpuonline = 0;
-
-       for_each_online_cpu(i) {
-               cpu_demand_check_sleep(stat,i);
-               cpu_demand += stat->local_stats[i].cpu_demand;
-               cpuonline ++;
-       }
-
-       return (cpu_demand/cpuonline);
-}
-
-/*
- * my max demand = min(cpu_demand, my effective hard limit)
- */
-static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat) 
-{
-       unsigned long mmax_demand = get_self_cpu_demand(stat);
-       if (mmax_demand > stat->mehl)
-               mmax_demand = stat->mehl;
-
-       return mmax_demand;
-}
 
 static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, int type)
 {
@@ -145,7 +85,7 @@ static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat,
        }
 }
 
-void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat, int eshares)
+void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat)
 {
        int i;
 
@@ -162,517 +102,10 @@ void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat, int eshares)
        stat->ehl = CKRM_SHARE_MAX; /*default: no limit*/
        stat->mehl = CKRM_SHARE_MAX; /*default: no limit */
 
-       stat->eshare = eshares;
-       stat->meshare = eshares;
-
-       stat->has_savings = 0;  
-       stat->demand_per_share = 0;
-
+       stat->eshare = CKRM_SHARE_MAX;
+       stat->meshare = CKRM_SHARE_MAX;
 }
 
-#if 0  // keep handy for debugging if necessary
-void ckrm_cpu_class_dump(struct ckrm_cpu_class *clsptr,int num)
-{
-       struct ckrm_cpu_class_stat* stat = &clsptr->stat;
-       printk("%d> %p[%d] mg=%d lim=%d tg=%d maxlim=%d ug=%d\n",num,
-               clsptr, (clsptr == get_default_cpu_class()),
-               clsptr->shares.my_guarantee, 
-               clsptr->shares.my_limit, 
-               clsptr->shares.total_guarantee,
-               clsptr->shares.max_limit, 
-               clsptr->shares.unused_guarantee);
-       printk("      egrt=%d megrt=%d ehl=%d mehl=%d esh=%d mesh=%d\n",
-               stat->egrt,stat->megrt,stat->ehl,stat->mehl,
-               stat->eshare,stat->meshare);
-}
-#endif
-
-/**********************************************/
-/*          surplus allocation                */
-/**********************************************/
-
-/*
- * surplus = egrt - demand
- * if surplus < 0, surplus = 0 
- */
-static inline int get_node_surplus(struct ckrm_cpu_class *cls)
-{
-       int surplus = cls->stat.egrt - cls->stat.max_demand;
-
-       if (surplus < 0)
-               surplus = 0;
-
-       return surplus;
-}
-
-/*
- * consume savings in advance because this class gives surplus to others
- * this is a quick hack, should be integrated with balance_savings()
- */
-static inline void consumed_surplus_savings(struct ckrm_cpu_class *clsptr, 
-                                           int savings_consumed) 
-{
-       long long total_savings;
-       ckrm_lrq_t* lrq;
-       int i;
-       int cpu_online = 0;
-       
-       total_savings = 0;
-       for_each_online_cpu(i) {
-               lrq = get_ckrm_lrq(clsptr,i);
-               total_savings += lrq->savings;
-               cpu_online ++;
-       }
-       
-       total_savings -= savings_consumed;
-       if (total_savings < 0)
-               total_savings = 0;
-
-       //get the average savings
-       do_div(total_savings,cpu_online);       
-       for_each_online_cpu(i) {
-               lrq = get_ckrm_lrq(clsptr,i);
-               lrq->savings = total_savings;
-       }
-}
-
-static inline int get_my_node_surplus(struct ckrm_cpu_class *cls)
-{
-       int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat);
-       int savings_consumed;
-
-       if (surplus < 0)
-               surplus = 0;
-
-       /*
-        * a quick hack about the hierarchy savings distribution 
-        * may not be the right way to do it
-        *
-        * since this node gives its surplus to other nodes, 
-        * its savings should be consumed
-        * suppose CPU_MONITOR_INTERVAL = (HZ) 
-        * savings_consumed is roughly how much savings will be consumed over the next second
-        */
-       if (surplus) {
-               savings_consumed = surplus * HZ * (NSEC_PER_MS >> CKRM_SHARE_SHIFT);
-               consumed_surplus_savings(cls, savings_consumed) ;
-       }
-
-       return surplus;
-}
-
-/*
- * all the classes in the queue consume the surplus in order
- * each class consumes an amount proportional to its egrt
- */
-static int consume_surplus_in_order(struct list_head* queue,
-                                          struct ckrm_cpu_class *p_cls,
-                                          int total_surplus)
-{
-       int total_grt = 0;
-       struct ckrm_cpu_class *clsptr;  
-
-       /*
-        * get total_grt of the classes in the queue
-        * total_grt can be maintained instead of re-calcuated each time
-        * total_grt can be maintained instead of re-calculated each time
-       list_for_each_entry(clsptr,queue,surplus_queue) {
-               if (unlikely(clsptr == p_cls))
-                       total_grt += clsptr->stat.megrt;
-               else
-                       total_grt += clsptr->stat.egrt;
-       }
-
-       if (! total_grt)
-               goto consume_out;
-       
-       //allocate in order
-       list_for_each_entry(clsptr,queue,surplus_queue) {               
-               int surplus_per_share;
-               int consumed, my_grt;
-
-               BUG_ON(! total_grt);
-               surplus_per_share = 
-                       (total_surplus << CKRM_SHARE_SHIFT) / total_grt;
-
-               if (surplus_per_share <= 0)
-                       break;
-
-               if (unlikely(clsptr == p_cls))  //self_node consuming
-                       my_grt =  clsptr->stat.megrt;
-               else
-                       my_grt = clsptr->stat.egrt;
-
-               BUG_ON(clsptr->stat.demand_per_share <= 0);
-
-               if (clsptr->stat.demand_per_share < surplus_per_share)
-                       surplus_per_share = clsptr->stat.demand_per_share;
-
-               consumed = surplus_per_share * my_grt;
-               consumed >>= CKRM_SHARE_SHIFT;
-               total_surplus -= consumed;
-               BUG_ON(total_surplus < 0);
-               total_grt -= my_grt;
-
-               if (unlikely(clsptr == p_cls))
-                       set_meshare(&clsptr->stat,clsptr->stat.meshare + consumed);                     
-               else
-                       set_eshare(&clsptr->stat,clsptr->stat.eshare + consumed);
-       }       
- consume_out:  
-       if (total_surplus <= 1) //if total_surplus is too small, no need to allocate again
-               total_surplus = 0;
-       return total_surplus;
-}
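To make the fixed-point arithmetic in consume_surplus_in_order() concrete, here is a stand-alone sketch of a single allocation step (illustrative only; SHARE_SHIFT is a placeholder for CKRM_SHARE_SHIFT, whose actual value lives in the ckrm headers, and the clipping against demand_per_share is omitted):

#include <stdio.h>

#define SHARE_SHIFT 13	/* placeholder for CKRM_SHARE_SHIFT */

int main(void)
{
	int total_surplus = 100;	/* surplus to hand out at this level */
	int total_grt = 80;		/* sum of egrt over the queue */
	int my_grt = 30;		/* this class's guarantee */

	/* surplus available per unit of guarantee, in fixed point */
	int surplus_per_share = (total_surplus << SHARE_SHIFT) / total_grt;

	/* what this class takes, proportional to its guarantee */
	int consumed = (surplus_per_share * my_grt) >> SHARE_SHIFT;

	printf("consumed %d of %d\n", consumed, total_surplus);	/* prints 37 of 100 */
	return 0;
}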
-
-/*
- * link all the children of parent and the parent itself using their surplus_queue field
- * link the whole queue using src_queue
- * if anything wrong return -1
- */
-static int get_class_surplus_queue(struct ckrm_core_class *parent,
-                                  struct list_head* src_queue)
-{
-       struct ckrm_core_class *child_core = NULL;
-       struct ckrm_cpu_class *p_cls,*c_cls;
-       int ret = -1;
-
-       p_cls = ckrm_get_cpu_class(parent);
-       if (! p_cls)
-               goto link_out;
-
-       INIT_LIST_HEAD(src_queue);
-
-       //add the parent node itself
-       list_add(&p_cls->surplus_queue,src_queue);
-       do {
-               child_core = ckrm_get_next_child(parent, child_core);
-               if (child_core) {
-                       c_cls = ckrm_get_cpu_class(child_core);                         
-                       if (! c_cls)
-                               goto link_out;
-                       list_add(&c_cls->surplus_queue,src_queue);
-               }
-       } while (child_core);
-
-       ret = 0;
-
- link_out:
-       return ret;
-}
-
-/*
- * insert the class to queue based on stat->demand_per_share
- * status: tested
- */
-static void insert_surplus_queue(struct list_head* queue, struct ckrm_cpu_class *clsptr)
-{
-       struct ckrm_cpu_class *cur_cls = NULL;  
-       int end_of_queue = 1;
-
-       list_for_each_entry(cur_cls,queue,surplus_queue) {
-               if (cur_cls->stat.demand_per_share >= clsptr->stat.demand_per_share) {
-                       end_of_queue = 0;
-                       break;
-               }
-       }
-
-       //insert the clsptr
-       if (! cur_cls || end_of_queue)
-               list_add_tail(&clsptr->surplus_queue,queue);
-       else
-               list_add_tail(&clsptr->surplus_queue,&cur_cls->surplus_queue);
-}
-
-/*
- * copy all classes in src_queue to dst_queue,
- * reorder the classes based on their normalized demand 
- * if a class already saturate (eshare >= demand), also remove it from src_queue
- * return the total guarantee of the selected classes
- *
- * @src_queue: source queue
- * @dst_queue: destination queue
- * @check_sl: check soft limit
- * @check_savings: only classes that have savings should be considered
- */
-
-static unsigned long reorder_surplus_queue(struct list_head* src_queue, 
-                                          struct list_head* dst_queue, 
-                                          int check_sl, int check_savings, 
-                                          struct ckrm_cpu_class *p_cls) 
-{
-       struct ckrm_cpu_class *clsptr, *tmp;    
-
-       INIT_LIST_HEAD(dst_queue);
-
-       list_for_each_entry_safe(clsptr,tmp,src_queue,surplus_queue) {
-               struct ckrm_cpu_class_stat* stat = &clsptr->stat;
-               int inc_limit;
-               int max_demand, eshare, esl,grt;
-
-               if (unlikely(clsptr == p_cls)) {
-                       max_demand = get_mmax_demand(stat);
-                       eshare  = stat->meshare;
-                       esl = get_mysoft_limit(clsptr);
-                       grt = stat->megrt;
-               } else {
-                       max_demand = stat->max_demand;
-                       eshare = stat->eshare;
-                       esl = get_soft_limit(clsptr);
-                       grt = stat->egrt;
-               }
-
-               //hard limit and demand limit
-               inc_limit = max_demand - eshare;
-               
-               //no additional share needed
-               if (inc_limit <= 0 || ! grt) {
-                       list_del(&clsptr->surplus_queue);
-                       continue;
-               }
-                       
-               //or no more savings
-               if (check_savings && ! stat->has_savings)
-                       continue;
-               
-               //check soft limit
-               if (check_sl) {
-                       int soft_limit;
-
-                       soft_limit = p_cls->stat.eshare * esl
-                               / p_cls->shares.total_guarantee;
-
-                       if (soft_limit < max_demand)
-                               inc_limit = soft_limit - eshare;
-                       if ( inc_limit <= 0)   /* can turn negative */
-                               continue;
-               }
-
-               BUG_ON(! grt);
-               //get the stat->demand_per_share
-               stat->demand_per_share = 
-                       (inc_limit << CKRM_SHARE_SHIFT) / grt;  
-
-               list_del_init(&clsptr->surplus_queue);
-               //insert the class to the queue
-               insert_surplus_queue(dst_queue,clsptr);
-       }
-       return 0;
-}
-
-/*
- * get all the surplus that should be reallocated to the children
- */
-static inline int get_total_surplus(struct ckrm_cpu_class *p_cls,
-                                   struct ckrm_core_class *parent) 
-{
-       struct ckrm_cpu_class *c_cls;
-       int total_surplus;
-       struct ckrm_core_class *child_core = NULL;
-
-       //additional share assigned to this sub node from parent
-       total_surplus = p_cls->stat.eshare - p_cls->stat.egrt;
-       BUG_ON(total_surplus < 0);
-
-       //surplus of this node
-       total_surplus += get_my_node_surplus(p_cls);
-       do {
-               child_core = ckrm_get_next_child(parent, child_core);
-               if (child_core) {
-                       c_cls = ckrm_get_cpu_class(child_core);                         
-                       if (! c_cls) {
-                               total_surplus = 0;
-                               break;
-                       }
-
-                       total_surplus += get_node_surplus(c_cls);                       
-               }
-       } while (child_core);
-
-       return total_surplus;
-}
-/**
- * alloc_surplus_node: re-allocate the shares for a single level
- * @parent: parent node
- * return the remaining surplus
- *
- * The surplus reallocation policy is like below.
- * -- the classes that have eshare >= demand don't need any additional share. 
- *     So they don't participate in the surplus allocation.
- * -- all the other classes received share in this order:
- * 1. has savings, not over soft limit
- * 2. has savings, but over soft limit
- * 3. no savings, not over soft limit
- * 4. no savings, over soft limit
- * 
- * In each of the 4 levels above, classes get surplus proportionally to their guarantees
- */
-static int alloc_surplus_node(struct ckrm_core_class *parent)
-{
-       struct ckrm_cpu_class *p_cls;
-       int total_surplus;
-       int ret = -1;
-       struct list_head src_queue, dst_queue;
-
-       p_cls = ckrm_get_cpu_class(parent);
-       if (! p_cls) //safety check
-               goto realloc_out;
-
-       ret = 0;
-       total_surplus = get_total_surplus(p_cls,parent);
-
-       if (! total_surplus) //no surplus to be allocated 
-               goto realloc_out;
-
-       /* 
-        * first round, allocated to tasks with savings, check_sl
-        */
-       get_class_surplus_queue(parent,&src_queue);
-       reorder_surplus_queue(&src_queue, &dst_queue, 1, 1,p_cls);
-       if (! list_empty(&dst_queue)) {
-               total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus);
-               if (! total_surplus)
-                       goto realloc_out;
-       }
-
-       /* 
-        * second round, check savings, but no check_sl
-        */
-       //merge the src_queue and dst_queue and reorder
-       list_splice(&dst_queue, &src_queue);
-       reorder_surplus_queue(&src_queue, &dst_queue, 0, 1,p_cls);
-       if (! list_empty(&dst_queue)) {
-               total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus);
-               if (! total_surplus)
-                       goto realloc_out;
-       }
-
-       /* 
-        * third round, no check savings, but check_sl
-        */
-       //merge the src_queue and dst_queue and reorder
-       list_splice(&dst_queue, &src_queue);
-       reorder_surplus_queue(&src_queue, &dst_queue, 1, 0,p_cls);
-       if (! list_empty(&dst_queue)) {
-               total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus);
-               if (! total_surplus)
-                       goto realloc_out;
-       }
-       /* 
-        * fourth round, no check savings, no check_sl
-        */
-       //merge the src_queue and dst_queue and reorder
-       list_splice(&dst_queue, &src_queue);
-       reorder_surplus_queue(&src_queue, &dst_queue, 0, 0,p_cls);
-       if (! list_empty(&dst_queue))
-               total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus);       
-       
- realloc_out:
-       return ret;
-}
-
-/*
- * return true if the class total savings > MIN_SAVINGS 
- */
-static int balance_local_savings(struct ckrm_cpu_class *clsptr, int cpu_online)
-{
-       unsigned long long total_savings;
-       ckrm_lrq_t* lrq;
-       int i;
-#define CLASS_MIN_SAVINGS (10 * NSEC_PER_MS)
-       
-       total_savings = 0;
-       for_each_online_cpu(i) {
-               lrq = get_ckrm_lrq(clsptr,i);
-               total_savings += lrq->savings;
-       }
-
-       if (total_savings < CLASS_MIN_SAVINGS)
-               return 0;
-
-       //get the average savings
-       do_div(total_savings,cpu_online);       
-       for_each_online_cpu(i) {
-               lrq = get_ckrm_lrq(clsptr,i);
-               lrq->savings = total_savings;
-       }
-
-       /*
-        * hzheng: this is another quick hack
-        * only say I have savings when this node has more demand
-        * ignoring the requirement of child classes
-        */
-       if (clsptr->stat.megrt < get_mmax_demand(&clsptr->stat))
-               return 1;
-       else
-               return 0;
-}
-
-/*
- * check savings status
- * set has_savings field if the class or its sub class has savings
- */
-static void check_savings_status(struct ckrm_core_class *root_core)
-{
-       struct ckrm_cpu_class *clsptr;
-       int cpu_online;
-
-       cpu_online = cpus_weight(cpu_online_map);       
-
-       //class status: demand, share,total_ns prio, index
-       list_for_each_entry(clsptr,&active_cpu_classes,links) 
-               clsptr->stat.has_savings = balance_local_savings(clsptr,cpu_online);
-}
-
-/**
- * alloc_surplus - reallocate unused shares
- *
- * class A's unused share should be allocated to its siblings
- * the re-allocation goes downward from the top
- */
-int alloc_surplus(struct ckrm_core_class *root_core)
-{
-       struct ckrm_core_class *cur_core, *child_core;
-       //      struct ckrm_cpu_class *cls;
-       int ret = -1;
-
-       check_savings_status(root_core);
-
-       /*initialize*/
-       cur_core = root_core;
-       child_core = NULL;
-       //      cls = ckrm_get_cpu_class(cur_core);
-
-       /*the ckrm idle tasks get whatever is remaining*/
-       /*hzheng: uncomment the following line for hard limit support */
-       //      update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand);
-       
- repeat:
-       //check exit
-       if (!cur_core)
-               return 0;
-
-       //visit this node only once
-       if (! child_core) 
-               if ( alloc_surplus_node(cur_core) < 0 )
-                       return ret;
-
-       //next child
-       child_core = ckrm_get_next_child(cur_core, child_core);
-       if (child_core) {
-               //go down
-               cur_core = child_core;
-               child_core = NULL;
-               goto repeat;
-       } else {                //no more child, go back
-               child_core = cur_core;
-               cur_core = child_core->hnode.parent;
-       }
-       goto repeat;
-}
-
-
-
 /**********************************************/
 /*          cpu demand                        */
 /**********************************************/
@@ -701,29 +134,27 @@ int alloc_surplus(struct ckrm_core_class *root_core)
  * how often should we recalculate the cpu demand
  * the number is in ns
  */
-static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat,
-                                         int state, unsigned long long len)
+static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat,int state, unsigned long long len)
 {      
        local_stat->total += len;
        if (state == CKRM_CPU_DEMAND_RUN)
                local_stat->run += len;
 
        if (local_stat->total >= local_stat->recalc_interval) {
-               local_stat->total >>= CKRM_SHARE_SHIFT;
-               if (unlikely(local_stat->run > ULONG_MAX))
-                       local_stat->run = ULONG_MAX;
+               local_stat->total >>= CKRM_SHARE_ACCURACY;
+               if (unlikely(local_stat->run > 0xFFFFFFFF))
+                       local_stat->run = 0xFFFFFFFF;
 
-               if (unlikely(local_stat->total > ULONG_MAX))
-                       local_stat->total = ULONG_MAX;
+               if (local_stat->total > 0xFFFFFFFF) 
+                       local_stat->total = 0xFFFFFFFF;
                        
                do_div(local_stat->run,(unsigned long)local_stat->total);
 
-               if (unlikely(local_stat->total > ULONG_MAX)) {
-                       //happens after very long sleep
+               if (local_stat->total > 0xFFFFFFFF) //happens after very long sleep
                        local_stat->cpu_demand = local_stat->run;
-               } else { 
-                       local_stat->cpu_demand =
-                            (local_stat->cpu_demand + local_stat->run) >> 1;
+               else {
+                       local_stat->cpu_demand += local_stat->run;
+                       local_stat->cpu_demand >>= 1;
                }
                local_stat->total = 0;
                local_stat->run = 0;
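The two-step update above (add the new run fraction, then shift right by one) is an exponential moving average with weight 1/2: each recalculation interval the stored demand moves halfway toward the latest sample. A stand-alone sketch (illustrative only, plain user-space C):

#include <stdio.h>

int main(void)
{
	unsigned long cpu_demand = 0;	/* stored estimate */
	unsigned long run = 800;	/* measured run fraction per interval */
	int i;

	for (i = 0; i < 5; i++) {
		/* same update as above: new = (old + sample) / 2 */
		cpu_demand += run;
		cpu_demand >>= 1;
		printf("interval %d: demand = %lu\n", i, cpu_demand);
	}
	/* prints 400, 600, 700, 750, 775 -- converging toward 800 */
	return 0;
}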
@@ -762,22 +193,54 @@ void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsign
        }
 }
 
-/** 
- * check all the class local queues
- * 
- * to deal with excessively long run/sleep states
- * -- whenever ckrm_cpu_monitor is called, check if the class is in sleep state; if yes, update the sleep record
+/** 
+ * check all the class local queues
+ * 
+ * to deal with excessively long run/sleep states
+ * -- whenever ckrm_cpu_monitor is called, check if the class is in sleep state; if yes, update the sleep record
+ */
+static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu)
+{
+       struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu];
+       unsigned long long sleep,now;
+       if (local_stat->last_sleep) {
+               now = sched_clock();
+               sleep = now - local_stat->last_sleep;
+               local_stat->last_sleep = now;
+               update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep);
+       }
+}
+
+/**
+ *get_self_cpu_demand - get cpu demand of the class itself (excluding children)
+ *
+ * self_cpu_demand = sum(cpu demand of all local queues) 
+ */
+static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat)
+{
+       int cpu_demand = 0;
+       int i;
+       int cpuonline = 0;
+
+       for_each_online_cpu(i) {
+               cpu_demand_check_sleep(stat,i);
+               cpu_demand += stat->local_stats[i].cpu_demand;
+               cpuonline ++;
+       }
+
+       return (cpu_demand/cpuonline);
+}
+
+/*
+ * my max demand = min(cpu_demand, my effective hard limit)
  */
-void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu)
+static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat) 
 {
-       struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu];
-       unsigned long long sleep,now;
-       if (local_stat->last_sleep) {
-               now = sched_clock();
-               sleep = now - local_stat->last_sleep;
-               local_stat->last_sleep = now;
-               update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep);
-       }
+       unsigned long mmax_demand = get_self_cpu_demand(stat);
+       if (mmax_demand > stat->mehl)
+               mmax_demand = stat->mehl;
+
+       return mmax_demand;
 }
 
 /**
@@ -838,6 +301,26 @@ static int update_max_demand(struct ckrm_core_class *root_core)
 /**********************************************/
 /*          effective guarantee & limit       */
 /**********************************************/
+static inline void set_eshare(struct ckrm_cpu_class_stat *stat,
+                                      int new_share)
+{
+       if (!new_share)
+               new_share = 1;
+
+       BUG_ON(new_share < 0);
+       stat->eshare = new_share;
+}
+
+static inline void set_meshare(struct ckrm_cpu_class_stat *stat,
+                                           int new_share)
+{
+       if (!new_share)
+               new_share = 1;
+
+       BUG_ON(new_share < 0);
+       stat->meshare = new_share;
+}
+
 /**
  *update_child_effective - update egrt, ehl, mehl for all children of parent
  *@parent: the parent node
@@ -863,7 +346,7 @@ static int update_child_effective(struct ckrm_core_class *parent)
                    p_cls->stat.egrt *
                    c_cls->shares.my_guarantee / p_cls->shares.total_guarantee;
 
-               c_cls->stat.megrt = c_cls->stat.egrt * get_my_grt(c_cls)
+               c_cls->stat.megrt = c_cls->stat.egrt * c_cls->shares.unused_guarantee
                        / c_cls->shares.total_guarantee;
                
                c_cls->stat.ehl =
@@ -889,9 +372,8 @@ static int update_child_effective(struct ckrm_core_class *parent)
  *
  * return -1 if anything went wrong (e.g. the structure changed during the process)
  */
-int update_effectives(void)
+static int update_effectives(struct ckrm_core_class *root_core)
 {
-       struct ckrm_core_class *root_core = get_default_cpu_class()->core;
        struct ckrm_core_class *cur_core, *child_core;
        struct ckrm_cpu_class *cls;
        int ret = -1;
@@ -902,7 +384,7 @@ int update_effectives(void)
 
        //initialize the effectives for root 
        cls->stat.egrt = CKRM_SHARE_MAX; /*egrt of the root is always 100% */
-       cls->stat.megrt = cls->stat.egrt * get_my_grt(cls)
+       cls->stat.megrt = cls->stat.egrt * cls->shares.unused_guarantee
                / cls->shares.total_guarantee;
        cls->stat.ehl = CKRM_SHARE_MAX * get_hard_limit(cls)
                / cls->shares.total_guarantee;
@@ -936,11 +418,288 @@ int update_effectives(void)
 }
 
 /**********************************************/
-/*           CKRM Idle Tasks                  */
+/*          surplus allocation                */
 /**********************************************/
 
-#ifdef CONFIG_CKRM_SUPPORT_MAXLIMITS
+/*
+ * surplus = egrt - demand
+ * if surplus < 0, surplus = 0 
+ */
+static inline int get_node_surplus(struct ckrm_cpu_class *cls)
+{
+       int surplus = cls->stat.egrt - cls->stat.max_demand;
+
+       if (surplus < 0)
+               surplus = 0;
+
+       return surplus;
+}
+
+static inline int get_my_node_surplus(struct ckrm_cpu_class *cls)
+{
+       int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat);
+
+       if (surplus < 0)
+               surplus = 0;
+
+       return surplus;
+}
+
+/**
+ * consume_surplus: decides how much surplus a node can consume
+ * @check_sl: if check_sl is set, then check the soft limit
+ * return how much was consumed
+ *
+ * implements all the CKRM scheduling requirements
+ * assume c_cls is valid
+ */
+static inline int consume_surplus(int surplus,
+                                      struct ckrm_cpu_class *c_cls,
+                                      struct ckrm_cpu_class *p_cls,
+                                      int check_sl
+                                      )
+{
+       int consumed = 0;
+       int inc_limit;
+       int total_grt = p_cls->shares.total_guarantee;
+
+       BUG_ON(surplus < 0);
+
+       /*can't consume more than demand or hard limit*/
+       if (c_cls->stat.eshare >= c_cls->stat.max_demand)
+               goto out;
+
+       //the surplus allocation is proportional to grt
+       consumed =
+               surplus * c_cls->shares.my_guarantee / total_grt;
+
+       if (! consumed) //no more share
+               goto out;
+
+       //hard limit and demand limit
+       inc_limit = c_cls->stat.max_demand - c_cls->stat.eshare;
+
+       if (check_sl) {
+               int esl = p_cls->stat.eshare * get_soft_limit(c_cls)
+                       /total_grt;
+               if (esl < c_cls->stat.max_demand)
+                       inc_limit = esl - c_cls->stat.eshare;
+       }
+
+       if (consumed > inc_limit)
+               consumed = inc_limit;
+
+        BUG_ON(consumed < 0);
+ out:          
+       return consumed;
+}
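A worked example of the clamping in consume_surplus() (illustrative only, with invented numbers): a child holding 25 of the parent's 100 total guarantee units is offered a quarter of the surplus, but never more than its remaining demand.

#include <stdio.h>

int main(void)
{
	int surplus = 400;			/* surplus offered at this level */
	int my_guarantee = 25, total_grt = 100;
	int max_demand = 260, eshare = 200;	/* invented class state */

	int consumed = surplus * my_guarantee / total_grt;	/* 100 */
	int inc_limit = max_demand - eshare;			/* 60  */

	if (consumed > inc_limit)	/* never hand out more than is demanded */
		consumed = inc_limit;

	printf("consumed = %d\n", consumed);	/* prints 60 */
	return 0;
}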
+
+/*
+ * how much a node can consume for itself?
+ */
+static inline int consume_self_surplus(int surplus,
+                                      struct ckrm_cpu_class *p_cls,
+                                      int check_sl
+                                      )
+{
+       int consumed = 0;
+       int inc_limit;
+       int total_grt = p_cls->shares.total_guarantee;
+       int max_demand = get_mmax_demand(&p_cls->stat);
+
+       BUG_ON(surplus < 0);
+
+       /*can't consume more than demand or hard limit*/
+       if (p_cls->stat.meshare >= max_demand)
+               goto out;
+
+       //the surplus allocation is proportional to grt
+       consumed =
+               surplus * p_cls->shares.unused_guarantee / total_grt;
+
+       if (! consumed) //no more share
+               goto out;
+
+       //hard limit and demand limit
+       inc_limit = max_demand - p_cls->stat.meshare;
+
+       if (check_sl) {
+               int mesl = p_cls->stat.eshare * get_mysoft_limit(p_cls)
+                       /total_grt;
+               if (mesl < max_demand)
+                       inc_limit = mesl - p_cls->stat.meshare;
+       }
+
+       if (consumed > inc_limit)
+               consumed = inc_limit;
+
+        BUG_ON(consumed < 0);
+ out:          
+       return consumed;
+}
+
+
+/*
+ * allocate surplus to all its children and also its default class
+ */
+static int alloc_surplus_single_round(
+                                     int surplus,
+                                     struct ckrm_core_class *parent,
+                                     struct ckrm_cpu_class *p_cls,
+                                     int check_sl)
+{
+       struct ckrm_cpu_class *c_cls;
+       struct ckrm_core_class *child_core = NULL;
+       int total_consumed = 0,consumed;
+
+       //first allocate to the default class
+       consumed  =
+               consume_self_surplus(surplus,p_cls,check_sl);
+
+       if (consumed > 0) {
+               set_meshare(&p_cls->stat,p_cls->stat.meshare + consumed);
+               total_consumed += consumed;
+       }
+
+       do {
+               child_core = ckrm_get_next_child(parent, child_core);
+               if (child_core)  {
+                       c_cls = ckrm_get_cpu_class(child_core);
+                       if (! c_cls)
+                               return -1;
+
+                       consumed    =
+                               consume_surplus(surplus, c_cls,
+                                                    p_cls,check_sl);
+                       if (consumed > 0) {
+                               set_eshare(&c_cls->stat,c_cls->stat.eshare + consumed);
+                               total_consumed += consumed;
+                       }
+               }
+       } while (child_core);
+
+       return total_consumed;
+}
+
+/**
+ * alloc_surplus_node: re-allocate the shares for children under parent
+ * @parent: parent node
+ * return the remaining surplus
+ *
+ * task:
+ *  1. get total surplus
+ *  2. allocate surplus
+ *  3. set the effective_share of each node
+ */
+static int alloc_surplus_node(struct ckrm_core_class *parent)
+{
+       struct ckrm_cpu_class *p_cls,*c_cls;
+       int total_surplus,consumed;
+       int check_sl;
+       int ret = -1;
+       struct ckrm_core_class *child_core = NULL;
+
+       p_cls = ckrm_get_cpu_class(parent);
+       if (! p_cls)
+               goto realloc_out;
+
+       /*
+        * get total surplus
+        */
+       total_surplus = p_cls->stat.eshare - p_cls->stat.egrt;
+       BUG_ON(total_surplus < 0);
+       total_surplus += get_my_node_surplus(p_cls);
+
+       do {
+               child_core = ckrm_get_next_child(parent, child_core);
+               if (child_core) {
+                       c_cls = ckrm_get_cpu_class(child_core);                         
+                       if (! c_cls)
+                               goto realloc_out;
+
+                       total_surplus += get_node_surplus(c_cls);
+               }
+       } while (child_core);
+
+
+       if (! total_surplus) {
+               ret = 0;
+               goto realloc_out;
+       }
+
+       /* 
+        * distributing the surplus 
+        * first with the check_sl enabled
+        * once all the classes have reached the soft limit, disable check_sl and try again
+        */
+       
+       check_sl = 1;
+       do {
+               consumed = alloc_surplus_single_round(total_surplus,parent,p_cls,check_sl);
+               if (consumed < 0) //something is wrong
+                       goto realloc_out;
+
+               if (! consumed)
+                       check_sl = 0;
+               else
+                       total_surplus -= consumed;
+
+       } while ((total_surplus > 0) && (consumed || check_sl) );
+
+       ret = 0;
+       
+ realloc_out:
+       return ret;
+}
+
+/**
+ * alloc_surplus - reallocate unused shares
+ *
+ * class A's unused share should be allocated to its siblings
+ * the re-allocation goes downward from the top
+ */
+static int alloc_surplus(struct ckrm_core_class *root_core)
+{
+       struct ckrm_core_class *cur_core, *child_core;
+       //      struct ckrm_cpu_class *cls;
+       int ret = -1;
+
+       /*initialize*/
+       cur_core = root_core;
+       child_core = NULL;
+       //      cls = ckrm_get_cpu_class(cur_core);
+
+       /*the ckrm idle tasks get whatever is remaining*/
+       /*hzheng: uncomment the following line for hard limit support */
+       //      update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand);
+       
+ repeat:
+       //check exit
+       if (!cur_core)
+               return 0;
+
+       //visit this node only once
+       if (! child_core) 
+               if ( alloc_surplus_node(cur_core) < 0 )
+                       return ret;
+
+       //next child
+       child_core = ckrm_get_next_child(cur_core, child_core);
+       if (child_core) {
+               //go down
+               cur_core = child_core;
+               child_core = NULL;
+               goto repeat;
+       } else {                //no more child, go back
+               child_core = cur_core;
+               cur_core = child_core->hnode.parent;
+       }
+       goto repeat;
+}
 
+/**********************************************/
+/*           CKRM Idle Tasks                  */
+/**********************************************/
 struct ckrm_cpu_class ckrm_idle_class_obj, *ckrm_idle_class;
 struct task_struct* ckrm_idle_tasks[NR_CPUS];
 
@@ -951,7 +710,7 @@ static inline int get_nr_idle(unsigned long surplus)
        int nr_idle = 0; 
        
        nr_idle = surplus * cpu_online;
-       nr_idle >>= CKRM_SHARE_SHIFT;
+       nr_idle >>= CKRM_SHARE_ACCURACY;
 
        if (surplus) 
                nr_idle ++;
@@ -963,8 +722,7 @@ static inline int get_nr_idle(unsigned long surplus)
 }
 
 /**
- * update_ckrm_idle: update the status of the idle class according 
- *                   to the new surplus
+ * update_ckrm_idle: update the status of the idle class according to the new surplus
  * surplus: new system surplus
  *
  * Task:
@@ -1058,20 +816,6 @@ void ckrm_start_ckrm_idle(void)
        }
 }
 
-void ckrm_stop_ckrm_idle(void)
-{
-       BUG_ON(1);   // not yet implemented
-}
-
-#else
-
-static inline void ckrm_start_ckrm_idle(void) { };
-static inline void ckrm_stop_ckrm_idle(void) { };
-static inline void update_ckrm_idle(unsigned long surplus) { };
-
-#endif
-
-
 /**********************************************/
 /*          Local Weight                      */
 /**********************************************/
@@ -1087,19 +831,8 @@ static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online)
        int i;
        unsigned long class_weight;
        unsigned long long lw;  
-       struct ckrm_cpu_class_stat *stat;
-       unsigned long oweight;
-       unsigned long skewed_limit;
-       /*
-        * if a local queue gets less than 1/SKEWED_SHARE_RATIO of the eshare
-        * then we set the skewed_share 
-        */
-#define SKEWED_SHARE_RATIO 8
-#define SKEWED_WEIGHT_MIN 3
        
-       /* get total pressure of the class, if there is not pressure (.. class is
-        * idle, then leave the weights as is
-        */
+       //get total pressure
        for_each_online_cpu(i) {
                lrq = get_ckrm_lrq(clsptr,i);
                total_pressure += lrq->lrq_load;
@@ -1108,54 +841,26 @@ static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online)
        if (! total_pressure)
                return;
        
-       stat = &clsptr->stat;
-
        class_weight = cpu_class_weight(clsptr) * cpu_online;
 
-       /* calculate or skewed limit weight */
-       skewed_limit = SHARE_TO_WEIGHT(stat->meshare/SKEWED_SHARE_RATIO);
-       if (skewed_limit < SKEWED_WEIGHT_MIN)
-               skewed_limit = SKEWED_WEIGHT_MIN;
-
-       /* calculate over_weight */     
-       BUG_ON(stat->meshare < stat->megrt);
-       oweight = ((stat->meshare - stat->megrt) << CKRM_SHARE_SHIFT) / stat->meshare;
-       oweight = SHARE_TO_WEIGHT(oweight);
-
        /*
         * update weight for each cpu, minimun is 1
         */
        for_each_online_cpu(i) {
                lrq = get_ckrm_lrq(clsptr,i);
-               lrq->over_weight = oweight;
-               if (! lrq->lrq_load) {
-                       /* give idle class a high share to boost 
-                        * interactiveness 
-                        */
+               if (! lrq->lrq_load)
+                       /*give an idle class a high share to boost interactivity */
                        lw = cpu_class_weight(clsptr); 
-                       if (unlikely(lw==0))
-                               lw = 1;
-               } else {
-                       lw = lrq->lrq_load;
-                       lw *= class_weight;
+               else {
+                       lw = lrq->lrq_load * class_weight;
                        do_div(lw,total_pressure);
-                       if (unlikely(lw==0))
+                       if (!lw)
                                lw = 1;
-                       else if (unlikely(lw > CKRM_MAX_WEIGHT))
-                               lw = CKRM_MAX_WEIGHT;
+                       else if (lw > CKRM_SHARE_MAX)
+                               lw = CKRM_SHARE_MAX;
                }       
-               BUG_ON(lw > CKRM_MAX_WEIGHT);
 
-               /* 
-                * set is_skewed and local_weight in proper order
-                * to avoid race condition
-                */
                lrq->local_weight = lw;
-               if (lw < skewed_limit) 
-                       lrq->skewed_weight = skewed_limit;
-               else
-                       lrq->skewed_weight = 0;
-               BUG_ON((local_class_weight(lrq) == 1) && (! lrq->skewed_weight));
        }
 }
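
The weight split that adjust_lrq_weight() performs per cpu reduces to: local_weight = lrq_load * class_weight / total_pressure, clamped to at least 1 and at most CKRM_SHARE_MAX, while an idle local queue keeps the full per-class weight. A user-space sketch with made-up loads and an assumed CKRM_SHARE_MAX value:

/* Illustrative only: per-cpu weight split as in adjust_lrq_weight().
 * CKRM_SHARE_MAX and all sample numbers are assumptions for the example. */
#include <stdio.h>

#define CKRM_SHARE_MAX  (1 << 16)       /* assumed scale, not the kernel value */

int main(void)
{
        unsigned long cpu_load[4] = { 300, 100, 0, 600 };   /* lrq->lrq_load */
        unsigned long per_class_weight = 512;               /* cpu_class_weight() */
        unsigned long class_weight = per_class_weight * 4;  /* ... * cpu_online */
        unsigned long total_pressure = 0;
        unsigned long long lw;
        int i;

        for (i = 0; i < 4; i++)
                total_pressure += cpu_load[i];

        for (i = 0; i < 4; i++) {
                if (!cpu_load[i]) {
                        /* idle local queue keeps the full per-class weight so
                         * the class stays responsive when work lands here */
                        lw = per_class_weight;
                } else {
                        lw = (unsigned long long)cpu_load[i] * class_weight
                                / total_pressure;
                        if (!lw)
                                lw = 1;
                        else if (lw > CKRM_SHARE_MAX)
                                lw = CKRM_SHARE_MAX;
                }
                printf("cpu%d local_weight = %llu\n", i, lw);
        }
        return 0;
}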
 
@@ -1200,11 +905,9 @@ void ckrm_cpu_monitor(int check_min)
        static unsigned long long last_check = 0;
        struct ckrm_core_class *root_core = get_default_cpu_class()->core;
        unsigned long long now; 
-       int loc;
-
-#define MIN_CPU_MONITOR_INTERVAL (100*1000*1000)  /* 100 MSEC */
+#define MIN_CPU_MONITOR_INTERVAL 100000000UL
 
-       if (ckrm_cpu_disabled() || !root_core)
+       if (!root_core)
                return;
 
        //do nothing if someone already holding the lock
@@ -1216,37 +919,29 @@ void ckrm_cpu_monitor(int check_min)
        now = sched_clock();
 
        //consecutive check should be at least 100ms apart
-       if (check_min && (now - last_check < MIN_CPU_MONITOR_INTERVAL))
-               goto outunlock_np;
+       if (check_min && ((now - last_check) < MIN_CPU_MONITOR_INTERVAL))
+               goto outunlock;
 
        last_check = now;
 
-       if (update_effectives() != 0) {
-               loc = 0;
+       if (update_effectives(root_core) != 0)
                goto outunlock;
-       }
        
-       if (update_max_demand(root_core) != 0) {
-               loc = 1;
+       if (update_max_demand(root_core) != 0)
                goto outunlock;
-       }
        
-#warning mef: alloc_surplus call back in system;
-       if (alloc_surplus(root_core) != 0) {
-               loc = 2;
+#ifndef ALLOC_SURPLUS_SUPPORT
+#warning "MEF taking out alloc_surplus"
+#else
+       if (alloc_surplus(root_core) != 0)
                goto outunlock;
-       }
+#endif
        
        adjust_local_weight();
 
- outunlock_np:
+ outunlock:    
        read_unlock(&class_list_lock);
        spin_unlock(&lock);
-       return;
-
- outunlock:    
-       printk("ckrm_cpu_monitor(%d) exits prematurely cause=%d\n",check_min,loc);
-       goto outunlock_np;
 }
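
ckrm_cpu_monitor() keeps its cost down with two guards: a spin_trylock so concurrent callers bail out, and a minimum 100 ms gap between real recomputations measured with sched_clock(). The interval guard on its own looks like the user-space sketch below, where clock_gettime() stands in for sched_clock():

/* Illustrative only: the 100 ms rate-limit pattern of ckrm_cpu_monitor().
 * clock_gettime() is a user-space stand-in for sched_clock(). */
#include <stdio.h>
#include <time.h>

#define MIN_MONITOR_INTERVAL 100000000ULL       /* 100 ms in ns */

static unsigned long long now_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (unsigned long long)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

int main(void)
{
        unsigned long long last_check = 0;
        int i, ran = 0;

        for (i = 0; i < 1000000; i++) {
                unsigned long long now = now_ns();

                if (now - last_check < MIN_MONITOR_INTERVAL)
                        continue;       /* checked less than 100 ms ago */
                last_check = now;
                ran++;                  /* ... do the expensive recomputation */
        }
        printf("monitor body ran %d time(s)\n", ran);
        return 0;
}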
 
 /*****************************************************/
@@ -1258,8 +953,6 @@ static int thread_exit = 0;
 static int ckrm_cpu_monitord(void *nothing)
 {
        daemonize("ckrm_cpu_ctrld");
-       printk("cpu_monitord started\n");
-       thread_exit = 0;
        for (;;) {
                /*sleep for sometime before next try*/
                set_current_state(TASK_INTERRUPTIBLE);
@@ -1275,19 +968,15 @@ static int ckrm_cpu_monitord(void *nothing)
        return 0;
 }
 
-void ckrm_cpu_start_monitor(void)
+void ckrm_start_monitor(void)
 {
-       if (cpu_monitor_pid != -1) {
-               /* already started ... */
-               return;
-       }       
        cpu_monitor_pid = kernel_thread(ckrm_cpu_monitord, 0, CLONE_KERNEL);
        if (cpu_monitor_pid < 0) {
                printk(KERN_DEBUG "ckrm_cpu_monitord for failed\n");
        }
 }
 
-void ckrm_cpu_kill_monitor(void)
+void ckrm_kill_monitor(void)
 {
        printk(KERN_DEBUG "killing process %d\n", cpu_monitor_pid);
        if (cpu_monitor_pid > 0) {
@@ -1299,12 +988,22 @@ void ckrm_cpu_kill_monitor(void)
        }
 }
 
-static int __init ckrm_cpu_init_monitor(void)
+int ckrm_cpu_monitor_init(void)
 {
-       if (ckrm_cpu_enabled()) 
-               ckrm_cpu_start_monitor();
+       ckrm_start_monitor();
+       /*hzheng: uncomment the following line for hard limit support */
+       //      ckrm_start_ckrm_idle();
        return 0;
 }
 
-__initcall(ckrm_cpu_init_monitor);
+void ckrm_cpu_monitor_exit(void)
+{
+       ckrm_kill_monitor();
+}
+
+module_init(ckrm_cpu_monitor_init);
+module_exit(ckrm_cpu_monitor_exit);
 
+MODULE_AUTHOR("Haoqiang Zheng <hzheng@cs.columbia.edu>");
+MODULE_DESCRIPTION("Hierarchical CKRM CPU Resource Monitor");
+MODULE_LICENSE("GPL");
index 143b259..555ba0a 100644 (file)
@@ -422,7 +422,7 @@ static struct rbce_class *create_rbce_class(const char *classname,
        return cls;
 }
 
-static struct rbce_class *get_class(const char *classname, int *classtype)
+static struct rbce_class *get_class(char *classname, int *classtype)
 {
        struct rbce_class *cls;
        void *classobj;
index fd7f8a2..80d5d49 100644 (file)
 #include <linux/ckrm_classqueue.h>
 
 #define cq_nr_member(cq) (cq->array.nr_active)
-#define CLASSQUEUE_MASK   (CLASSQUEUE_SIZE - 1)  
 
 /**
- * get_node_index - 
- *      translate the logical priority to the real index in the queue
+ * get_index - translate the logical priority to the real index in the queue
  * 
  * validate the position
  * a valid prio is [cq->base,cq->base + size -1]
- * check whether node is supposed to be enqeued beyond above window and 
- * if so set the need_repos flag 
  */
-static inline unsigned long get_node_index(struct classqueue_struct *cq, 
-                                          cq_node_t * node)
+static inline unsigned long get_index(struct classqueue_struct *cq, int *prio)
 {
        unsigned long index;
        int max_prio;
@@ -48,24 +43,22 @@ static inline unsigned long get_node_index(struct classqueue_struct *cq,
                return 0;
 
        max_prio = cq->base + (CLASSQUEUE_SIZE - 1);
-       if (unlikely(node->prio > max_prio)) {
-               node->real_prio = node->prio;
-               node->prio = max_prio;
-               node->need_repos = 1;
-       } else
-               node->need_repos = 0;
+       if (*prio > max_prio)
+               *prio = max_prio;
+       if (*prio < cq->base)
+               *prio = cq->base;
 
-       if (unlikely(node->prio < cq->base))
-               node->prio = cq->base;
+               index = (cq->base_offset + (*prio - cq->base)) ;
+       if (index >= CLASSQUEUE_SIZE)
+               index -= CLASSQUEUE_SIZE;
 
-               index = (cq->base_offset + (node->prio - cq->base)) ;
-       return ( index & CLASSQUEUE_MASK );   // ensure its in limits
+       return index;
 }
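
The simplified get_index() above is the heart of the circular class queue: a logical priority is first clamped into the current window [cq->base, cq->base + CLASSQUEUE_SIZE - 1] and then rotated by base_offset, wrapping once around the array. A self-contained sketch of the same arithmetic (the queue size and sample values are made up):

/* Illustrative only: the prio -> slot mapping of get_index().  Only the
 * clamp-then-wrap arithmetic mirrors the code above; the values are made up. */
#include <stdio.h>

#define CLASSQUEUE_SIZE 128

struct classqueue { int base; int base_offset; };

static unsigned long get_index(struct classqueue *cq, int *prio)
{
        unsigned long index;
        int max_prio = cq->base + (CLASSQUEUE_SIZE - 1);

        /* clamp the logical priority into the queue's current window */
        if (*prio > max_prio)
                *prio = max_prio;
        if (*prio < cq->base)
                *prio = cq->base;

        /* rotate by base_offset and wrap once around the array */
        index = cq->base_offset + (*prio - cq->base);
        if (index >= CLASSQUEUE_SIZE)
                index -= CLASSQUEUE_SIZE;
        return index;
}

int main(void)
{
        struct classqueue cq = { .base = 100, .base_offset = 120 };
        int prio = 130;

        /* 120 + (130 - 100) = 150, wraps to 22 */
        printf("prio %d -> index %lu\n", prio, get_index(&cq, &prio));
        return 0;
}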
 
 /**
  * initialize a class queue object
  */
-int classqueue_init(struct classqueue_struct *cq, int enabled)
+int classqueue_init(struct classqueue_struct *cq)
 {
        int i;
        struct cq_prio_array *array;
@@ -80,8 +73,7 @@ int classqueue_init(struct classqueue_struct *cq, int enabled)
        array->nr_active = 0;
 
        cq->base = 0;
-       cq->base_offset = 0;
-       cq->enabled = enabled;
+       cq->base_offset = -1;   //not valid yet
 
        return 0;
 }
@@ -96,7 +88,7 @@ void classqueue_enqueue(struct classqueue_struct *cq,
 
        //get real index
        if (cq_nr_member(cq)) {         
-               index = get_node_index(cq, node);
+               index = get_index(cq, &prio);
        } else {                //the first one
                cq->base = prio;
                cq->base_offset = 0;
@@ -131,8 +123,8 @@ void classqueue_update_prio(struct classqueue_struct *cq,
        if (! cls_in_classqueue(node)) 
                return;
 
+       index = get_index(cq, &new_pos);
        node->prio = new_pos;
-       index = get_node_index(cq, node);
 
        //remove from the original position
        list_del_init(&(node->list));
@@ -145,32 +137,10 @@ void classqueue_update_prio(struct classqueue_struct *cq,
        node->index = index;
 }
 
-
-static inline void __classqueue_update_base(struct classqueue_struct *cq, 
-                                           int new_base)
-{
-       int max_prio; 
-       if (unlikely(new_base <= cq->base)) // base will never move back
-               return; 
-       if (unlikely(!cq_nr_member(cq))) {  
-               cq->base_offset = 0;
-               cq->base = new_base;        // is this necessary ??
-               return;
-       }
-           
-       max_prio = cq->base + (CLASSQUEUE_SIZE - 1);
-       if (unlikely(new_base > max_prio))
-               new_base = max_prio;
-
-               cq->base_offset = (cq->base_offset + (new_base - cq->base)) & CLASSQUEUE_MASK; 
-       cq->base = new_base;
-}
 /**
  *classqueue_get_min_prio: return the priority of the last node in queue
  *
  * this function can be called without runqueue lock held
- * return 0 if there's nothing in the queue
  */
 static inline int classqueue_get_min_prio(struct classqueue_struct *cq)
 {
@@ -201,13 +171,9 @@ static inline int classqueue_get_min_prio(struct classqueue_struct *cq)
  */
 cq_node_t *classqueue_get_head(struct classqueue_struct *cq)
 {
-       cq_node_t *node;
+       cq_node_t *result = NULL;
        int pos;
-       int index;
-       int new_base;
 
-search_again:
-       node = NULL;
        /* 
         * search over the bitmap to get the first class in the queue
         */
@@ -217,38 +183,10 @@ search_again:
                pos = find_first_bit(cq->array.bitmap, CLASSQUEUE_SIZE);
 
        if (pos < CLASSQUEUE_SIZE) {
-               //BUG_ON(list_empty(&cq->array.queue[pos]));
-               node = list_entry(cq->array.queue[pos].next, cq_node_t, list);
+               BUG_ON(list_empty(&cq->array.queue[pos]));
+               result = list_entry(cq->array.queue[pos].next, cq_node_t, list);
        }
-
-       //check if the node need to be repositioned
-       if (likely(! node || ! node->need_repos)) 
-               return node;
-
-       // We need to reposition this node in the class queue
-       // BUG_ON(node->prio == node->real_prio);
-       
-       //remove from the original position
-       list_del_init(&(node->list));
-       if (list_empty(&cq->array.queue[node->index]))
-         __clear_bit(node->index, cq->array.bitmap);
-       
-       new_base = classqueue_get_min_prio(cq);
-       node->prio = node->real_prio;
-       
-       if (! new_base)
-               new_base  = node->real_prio;
-       else if (node->real_prio < new_base)
-               new_base  = node->real_prio;
-       __classqueue_update_base(cq,new_base);
-       
-       index = get_node_index(cq, node);               
-       //add to new positon, round robin for classes with same priority
-       list_add_tail(&(node->list), &cq->array.queue[index]);
-       __set_bit(index, cq->array.bitmap);     
-       node->index = index;
-       
-       goto search_again;              
+       return result;
 }
 
 /**
@@ -260,11 +198,14 @@ void classqueue_update_base(struct classqueue_struct *cq)
        int new_base;
        
        if (! cq_nr_member(cq)) {
-               cq->base = 0;
-               cq->base_offset = 0;
+               cq->base_offset = -1;   //not defined
                return;
        }
 
        new_base = classqueue_get_min_prio(cq);
-               __classqueue_update_base(cq,new_base);
+       
+       if (new_base > cq->base) {
+               cq->base_offset = get_index(cq, &new_base);
+               cq->base = new_base;
+       }
 }
index 26ffc69..7ed70d0 100644 (file)
@@ -20,28 +20,6 @@ LIST_HEAD(active_cpu_classes);   // list of active cpu classes; anchor
 
 struct ckrm_cpu_class default_cpu_class_obj;
 
-unsigned int ckrm_sched_mode __cacheline_aligned_in_smp = 
-#ifdef CONFIG_CKRM_CPU_SCHEDULE_AT_BOOT
-                       CKRM_SCHED_MODE_ENABLED;
-#else
-                       CKRM_SCHED_MODE_DISABLED;
-#endif
-
-static int __init ckrm_cpu_enabled_setup(char *str)
-{
-       ckrm_sched_mode = CKRM_SCHED_MODE_ENABLED;
-       return 1;
-}
-
-static int __init ckrm_cpu_disabled_setup(char *str)
-{
-       ckrm_sched_mode = CKRM_SCHED_MODE_DISABLED;
-       return 1;
-}
-
-__setup("ckrmcpu",  ckrm_cpu_enabled_setup);
-__setup("nockrmcpu",ckrm_cpu_disabled_setup);
-
 struct ckrm_cpu_class * get_default_cpu_class(void) {
        return (&default_cpu_class_obj);
 }
@@ -50,10 +28,7 @@ struct ckrm_cpu_class * get_default_cpu_class(void) {
 /*                CVT Management                       */
 /*******************************************************/
 
-//an absolute bonus of 200ms for classes when reactivated
-#define INTERACTIVE_BONUS(lrq) ((200*NSEC_PER_MS)/local_class_weight(lrq))
-
-static void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt)
+static inline void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt)
 {
        CVT_t min_cvt;
        CVT_t bonus;
@@ -62,7 +37,6 @@ static void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt)
        if (unlikely(! cur_cvt))
                return; 
 
-#define INTERACTIVE_BONUS_SUPPORT 1
 #ifndef INTERACTIVE_BONUS_SUPPORT
 #warning "ACB taking out interactive bonus calculation"        
        bonus = 0;
@@ -76,32 +50,43 @@ static void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt)
 #endif
 
        //cvt can't be negative
-       if (likely(cur_cvt > bonus))
+       if (cur_cvt > bonus)
                min_cvt = cur_cvt - bonus;
        else
                min_cvt = 0;
 
        if (lrq->local_cvt < min_cvt) { 
-               //      if (lrq->local_cvt < min_cvt && ! lrq_nr_running(lrq)) {
                CVT_t lost_cvt;
 
-               if (unlikely(lrq->local_cvt == 0)) {
-                       lrq->local_cvt = cur_cvt;
-                       return;
-               }
-               lost_cvt = min_cvt - lrq->local_cvt;
-               lost_cvt *= local_class_weight(lrq);
+               lost_cvt = scale_cvt(min_cvt - lrq->local_cvt,lrq);
                lrq->local_cvt = min_cvt;
-               BUG_ON(lost_cvt < 0);
 
                /* add what the class lost to its savings*/
-#if 1 /*zhq debugging*/
                lrq->savings += lost_cvt;              
-#endif
                if (lrq->savings > MAX_SAVINGS)
                        lrq->savings = MAX_SAVINGS; 
-#if 0 /* zhq debugging*/
-               printk("lrq= %x savings: %llu lost= %llu\n",(int)lrq,lrq->savings,lost_cvt);
+       } else if (lrq->savings) {
+               /*
+                * if a class has savings and is falling behind
+                * then start to use its savings in a leaky-bucket way
+                */
+               CVT_t savings_used;
+
+               savings_used = scale_cvt((lrq->local_cvt - min_cvt),lrq);
+               if (savings_used > lrq->savings)
+                       savings_used = lrq->savings;
+               
+               if (savings_used > SAVINGS_LEAK_SPEED)
+                       savings_used = SAVINGS_LEAK_SPEED;
+
+               BUG_ON(lrq->savings < savings_used);
+               lrq->savings -= savings_used;
+               unscale_cvt(savings_used,lrq);
+               BUG_ON(lrq->local_cvt < savings_used);
+#ifndef CVT_SAVINGS_SUPPORT
+#warning "ACB taking out cvt saving"
+#else
+               lrq->local_cvt -= savings_used;
 #endif
        }
 }
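
The savings handling added above behaves like a capped leaky bucket: cvt a class loses while it lags is banked up to MAX_SAVINGS, and once the class is ahead again the bank is drained by at most SAVINGS_LEAK_SPEED per check. The sketch below reproduces only that behaviour in user space, ignoring the scale_cvt()/unscale_cvt() weight scaling and the CVT_SAVINGS_SUPPORT compile-time gate; the constants are made up.

/* Illustrative only: leaky-bucket savings as in check_inactive_class(),
 * without the weight scaling.  MAX_SAVINGS/SAVINGS_LEAK_SPEED are made up. */
#include <stdio.h>

#define MAX_SAVINGS             1000
#define SAVINGS_LEAK_SPEED      50

struct lrq { unsigned long long local_cvt, savings; };

static void check_inactive(struct lrq *q, unsigned long long min_cvt)
{
        if (q->local_cvt < min_cvt) {
                /* class fell behind: bank the lost time, but cap the bank */
                q->savings += min_cvt - q->local_cvt;
                if (q->savings > MAX_SAVINGS)
                        q->savings = MAX_SAVINGS;
                q->local_cvt = min_cvt;
        } else if (q->savings) {
                /* class is ahead: spend savings slowly (leaky bucket) */
                unsigned long long used = q->local_cvt - min_cvt;

                if (used > q->savings)
                        used = q->savings;
                if (used > SAVINGS_LEAK_SPEED)
                        used = SAVINGS_LEAK_SPEED;
                q->savings -= used;
                q->local_cvt -= used;
        }
}

int main(void)
{
        struct lrq q = { .local_cvt = 500, .savings = 0 };
        int i;

        check_inactive(&q, 800);        /* falls behind: banks 300 */
        for (i = 0; i < 3; i++) {
                check_inactive(&q, 700);/* ahead: leaks at most 50 per check */
                printf("cvt=%llu savings=%llu\n", q.local_cvt, q.savings);
        }
        return 0;
}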
@@ -109,7 +94,7 @@ static void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt)
 /*
  * return the max_cvt of all the classes
  */
-CVT_t get_max_cvt(int this_cpu)
+static inline CVT_t get_max_cvt(int this_cpu)
 {
         struct ckrm_cpu_class *clsptr;
         ckrm_lrq_t * lrq;
@@ -117,6 +102,7 @@ CVT_t get_max_cvt(int this_cpu)
 
         max_cvt = 0;
 
+        /*update class time, at the same time get max_cvt */
         list_for_each_entry(clsptr, &active_cpu_classes, links) {
                 lrq = get_ckrm_lrq(clsptr, this_cpu);
                 if (lrq->local_cvt > max_cvt)
@@ -126,23 +112,6 @@ CVT_t get_max_cvt(int this_cpu)
        return max_cvt;
 }
 
-CVT_t get_min_cvt(int this_cpu)
-{
-        struct ckrm_cpu_class *clsptr;
-        ckrm_lrq_t * lrq;
-        CVT_t max_cvt;
-
-        max_cvt = 0xFFFFFFFFFFFFFLLU;
-
-        list_for_each_entry(clsptr, &active_cpu_classes, links) {
-                lrq = get_ckrm_lrq(clsptr, this_cpu);
-                if (lrq->local_cvt < max_cvt)
-                        max_cvt = lrq->local_cvt;
-        }
-
-       return max_cvt;
-}
-
 /**
  * update_class_cputime - updates cvt of inactive classes
  * -- an inactive class shouldn't starve others when it comes back
@@ -151,7 +120,7 @@ CVT_t get_min_cvt(int this_cpu)
  * 
  * class_list_lock must have been acquired 
  */
-void update_class_cputime(int this_cpu, int idle)
+void update_class_cputime(int this_cpu)
 {
        struct ckrm_cpu_class *clsptr;
        ckrm_lrq_t * lrq;
@@ -209,36 +178,11 @@ void update_class_cputime(int this_cpu, int idle)
 /*******************************************************/
 /*                PID load balancing stuff             */
 /*******************************************************/
+#define PID_SAMPLE_T 32
 #define PID_KP 20
 #define PID_KI 60
 #define PID_KD 20
 
-/*
- * runqueue load is the local_weight of all the classes on this cpu
- * must be called with class_list_lock held
- */
-static unsigned long ckrm_cpu_load(int cpu)
-{
-       struct ckrm_cpu_class *clsptr;
-       ckrm_lrq_t* lrq;
-       struct ckrm_cpu_demand_stat* l_stat;
-       int total_load = 0;
-       int load;
-
-       list_for_each_entry(clsptr,&active_cpu_classes,links) {
-               lrq =  get_ckrm_lrq(clsptr,cpu);
-               l_stat = get_cls_local_stat(clsptr,cpu);
-
-               load = WEIGHT_TO_SHARE(lrq->local_weight);
-               
-               if (l_stat->cpu_demand < load)
-                       load = l_stat->cpu_demand;
-               total_load += load;
-       }       
-       return total_load;
-}
-
-
 /**
  * sample pid load periodically
  */
@@ -248,6 +192,11 @@ void ckrm_load_sample(ckrm_load_t* pid,int cpu)
        long load;
        long err;
 
+       if (jiffies % PID_SAMPLE_T)
+               return;
+
+       adjust_local_weight();  
+
        load = ckrm_cpu_load(cpu);
        err = load - pid->load_p;
        pid->load_d = err;
@@ -257,7 +206,7 @@ void ckrm_load_sample(ckrm_load_t* pid,int cpu)
        pid->load_i /= 10;
 }
 
-long ckrm_get_pressure(ckrm_load_t* ckrm_load, int local_group)
+long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group)
 {
        long pressure;
        pressure = ckrm_load->load_p * PID_KP;
@@ -266,58 +215,3 @@ long ckrm_get_pressure(ckrm_load_t* ckrm_load, int local_group)
        pressure /= 100;
        return pressure;
 }
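
Only fragments of ckrm_load_sample() and pid_get_pressure() are visible in these hunks, but together they form a small PID-style load estimator: a proportional term (the latest sample), a derivative term (the change since the last sample) and a decaying integral term, combined through the PID_KP/PID_KI/PID_KD gains and divided by 100. The sketch below assumes that standard combination; the exact updates of load_p and load_i and the role of the local_group argument are elided by the diff, so those details are assumptions.

/* Illustrative only: a PID-style pressure estimate in the spirit of
 * ckrm_load_sample()/pid_get_pressure().  The load_p/load_i update rules and
 * the full pressure formula are assumptions, not the kernel's exact code. */
#include <stdio.h>

#define PID_KP 20
#define PID_KI 60
#define PID_KD 20

struct ckrm_load { long load_p, load_i, load_d; };

static void load_sample(struct ckrm_load *pid, long load)
{
        long err = load - pid->load_p;

        pid->load_d = err;              /* derivative term */
        pid->load_p = load;             /* proportional term (assumed update) */
        pid->load_i *= 9;               /* decaying integral term (assumed) */
        pid->load_i += load;
        pid->load_i /= 10;
}

static long get_pressure(const struct ckrm_load *pid)
{
        return (pid->load_p * PID_KP +
                pid->load_d * PID_KD +
                pid->load_i * PID_KI) / 100;
}

int main(void)
{
        struct ckrm_load pid = { 0, 0, 0 };
        long samples[] = { 100, 120, 90, 200, 200 };    /* stand-in for ckrm_cpu_load() */
        unsigned i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                load_sample(&pid, samples[i]);
                printf("load=%ld pressure=%ld\n", samples[i], get_pressure(&pid));
        }
        return 0;
}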
-
-/*
- *  called after a task is switched out. Update the local cvt accounting 
- *  we need to stick with long instead of long long due to nonexistent 
- *  64-bit division
- */
-void update_local_cvt(struct task_struct *p, unsigned long nsec)
-{
-       ckrm_lrq_t * lrq = get_task_lrq(p);
-       unsigned long cvt_inc;
-
-       /*
-        * consume from savings if eshare is larger than egrt
-        */
-       if (lrq->savings && lrq->over_weight) {
-               unsigned long savings_used;
-
-               savings_used = nsec;
-               savings_used >>= CKRM_WEIGHT_SHIFT;
-               savings_used *= lrq->over_weight;
-               if (savings_used > lrq->savings)
-                       savings_used = lrq->savings;
-               lrq->savings -= savings_used;   
-       }
-
-       //BUG_ON(local_class_weight(lrq) == 0);
-       cvt_inc = nsec / local_class_weight(lrq); 
-
-       /* 
-        * For a certain processor, CKRM allocates CPU time propotional 
-        * to the class's local_weight. So once a class consumed nsec, 
-        * it will wait for X (nsec) for its next turn.
-        *
-        * X is calculated based on the following fomular
-        *     nsec / local_weight < X / (CKRM_MAX_WEIGHT - local_weight)
-        * if local_weight is small, then approximated as
-        *     nsec / local_weight < X / (CKRM_MAX_WEIGHT)
-        */
-#define CVT_STARVATION_LIMIT (200LL*NSEC_PER_MS)
-#define CVT_STARVATION_INC_LIMIT (CVT_STARVATION_LIMIT >> CKRM_WEIGHT_SHIFT)
-
-       if (unlikely(lrq->skewed_weight)) {
-               unsigned long long starvation_limit = CVT_STARVATION_INC_LIMIT;
-               
-               starvation_limit *= local_class_weight(lrq);
-               if (unlikely(cvt_inc > starvation_limit))         
-                       cvt_inc = nsec / lrq->skewed_weight;
-       }
-
-       /* now update the CVT accounting */
-
-       lrq->local_cvt += cvt_inc;
-       lrq->uncounted_ns += nsec;
-       update_class_priority(lrq);
-}
index 42af615..f835611 100644 (file)
@@ -45,8 +45,6 @@
 #include <asm/tlb.h>
 
 #include <asm/unistd.h>
-#include <linux/ckrm_classqueue.h>
-#include <linux/ckrm_sched.h>
 
 #ifdef CONFIG_NUMA
 #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
@@ -207,6 +205,8 @@ unsigned int task_timeslice(task_t *p)
  */
 
 typedef struct runqueue runqueue_t;
+#include <linux/ckrm_classqueue.h>
+#include <linux/ckrm_sched.h>
 
 /*
  * This is the main, per-CPU runqueue data structure.
@@ -227,19 +227,17 @@ struct runqueue {
        unsigned long cpu_load;
 #endif
        unsigned long long nr_switches, nr_preempt;
-       unsigned long nr_uninterruptible;
+       unsigned long expired_timestamp, nr_uninterruptible;
        unsigned long long timestamp_last_tick;
        task_t *curr, *idle;
        struct mm_struct *prev_mm;
 #ifdef CONFIG_CKRM_CPU_SCHEDULE
        struct classqueue_struct classqueue;   
        ckrm_load_t ckrm_load;
-       ckrm_lrq_t   dflt_lrq; /* local runqueue of the default class */
 #else
         prio_array_t *active, *expired, arrays[2];
-       unsigned long expired_timestamp;
-       int best_expired_prio;
 #endif
+       int best_expired_prio;
        atomic_t nr_iowait;
 
 #ifdef CONFIG_SMP
@@ -322,72 +320,10 @@ static inline void rq_unlock(runqueue_t *rq)
        spin_unlock_irq(&rq->lock);
 }
 
-static inline void idle_balance(int this_cpu, runqueue_t *this_rq);
-static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq);
-
 #ifdef CONFIG_CKRM_CPU_SCHEDULE
-
-#define ckrm_rq_cpu_disabled(rq) (!rq->classqueue.enabled)
-#define ckrm_rq_cpu_enabled(rq)  ( rq->classqueue.enabled)
-
-static inline void class_enqueue_task(struct task_struct *p,
-                                     prio_array_t * array)
-{
-       ckrm_lrq_t *lrq;
-       int effective_prio;
-       
-       if (ckrm_rq_cpu_disabled(task_rq(p)))
-               return;
-       
-       lrq = get_task_lrq(p);
-       // BUG_ON(lrq==NULL); 
-       
-       cpu_demand_event(&p->demand_stat,CPU_DEMAND_ENQUEUE,0);
-       lrq->lrq_load += task_load(p);
-       
-       if ((p->prio < lrq->top_priority) && (array == lrq->active))
-               set_top_priority(lrq, p->prio); 
-       
-       if (! cls_in_classqueue(&lrq->classqueue_linkobj)) {
-               cpu_demand_event(get_task_lrq_stat(p),CPU_DEMAND_ENQUEUE,0);
-               effective_prio = get_effective_prio(lrq);
-               classqueue_enqueue(lrq->classqueue, &lrq->classqueue_linkobj, 
-                                  effective_prio);
-       } 
-       
-}
-
-static inline void class_dequeue_task(struct task_struct *p,
-                                     prio_array_t * array)
-{
-       ckrm_lrq_t *lrq;
-       unsigned long load;
-       
-       if (ckrm_rq_cpu_disabled(task_rq(p)))
-               return;
-       
-       lrq = get_task_lrq(p);
-       load = task_load(p); 
-       
-       // BUG_ON(lrq->lrq_load < load);        
-       
-       lrq->lrq_load -= load;
-       
-       cpu_demand_event(&p->demand_stat,CPU_DEMAND_DEQUEUE,0);
-       
-       if ((array == lrq->active) && (p->prio == lrq->top_priority)
-           && list_empty(&(array->queue[p->prio])))
-               set_top_priority(lrq,find_next_bit(array->bitmap, MAX_PRIO,
-                                                  p->prio));
-}
-
 static inline ckrm_lrq_t *rq_get_next_class(struct runqueue *rq)
 {
-       cq_node_t *node;
-
-       if (ckrm_rq_cpu_disabled(rq)) 
-               return &rq->dflt_lrq;
-       node = classqueue_get_head(&rq->classqueue);
+       cq_node_t *node = classqueue_get_head(&rq->classqueue);
        return ((node) ? class_list_entry(node) : NULL);
 }
 
@@ -406,189 +342,51 @@ CVT_t get_local_cur_cvt(int cpu)
                return 0;
 }
 
-static inline struct task_struct * rq_get_next_task(struct runqueue* rq,
-                                                   int cpu) 
+static inline struct task_struct * rq_get_next_task(struct runqueue* rq) 
 {
        prio_array_t               *array;
        struct task_struct         *next;
        ckrm_lrq_t *queue;
        int idx;
+       int cpu = smp_processor_id();
 
-       if (ckrm_rq_cpu_disabled(rq)) {
-               /* original code from schedule(void) 
-                * see also code in non CKRM configuration
-                */
-               struct list_head *array_queue;
-               ckrm_lrq_t  *lrq = get_ckrm_lrq(get_default_cpu_class(),cpu);
-
-               if (unlikely(!rq->nr_running)) {
-                       idle_balance(cpu, rq);
-                       if (!rq->nr_running) {
-                               rq->dflt_lrq.expired_timestamp = 0;
-                               wake_sleeping_dependent(cpu, rq);
-                               return NULL;
-                       }
-               }
-
-               array = lrq->active;
-               if (unlikely(!array->nr_active)) {
-                       /*
-                        * Switch the active and expired arrays.
-                        */
-                       lrq->active = lrq->expired;
-                       lrq->expired = array;
-                       array = lrq->active; 
-                       lrq->expired_timestamp = 0;
-                       lrq->best_expired_prio = MAX_PRIO;
-               }
-
-               idx = sched_find_first_bit(array->bitmap);
-               array_queue = array->queue + idx;
-               next = list_entry(array_queue->next, task_t, run_list);
-               return next;
-       }
-
-       /*-- CKRM SCHEDULER --*/
+       // it is guaranteed by the ( rq->nr_running > 0 ) check in
+       // schedule() that a task will be found.
        
  retry_next_class:
-       /* we can't use (rq->nr_running == 0) to declare idleness
-        * first we have to make sure that the class runqueue is properly
-        * processed. This is due to two facts/requirements:
-        * (a) when the last task is removed form an lrq we do not remove
-        *     the lrq from the class runqueue. As a result the lrq is 
-        *     selected again and we can perform necessary 
-        *     expired switches.
-        * (b) perform outstanding expired switches
-        * 
-        */
-
        queue = rq_get_next_class(rq);
-       if (unlikely(queue == NULL)) {
-               idle_balance(cpu, rq);
-               if (!rq->nr_running) {
-                       rq->dflt_lrq.expired_timestamp = 0;
-                       wake_sleeping_dependent(cpu, rq);
-                       return NULL;
-               }
-               goto retry_next_class; // try again
-       }
+       // BUG_ON( !queue );
 
        array = queue->active;
        if (unlikely(!array->nr_active)) {
                queue->active = queue->expired;
                queue->expired = array;
-               array = queue->active;
                queue->expired_timestamp = 0;
 
-               if (array->nr_active)
+               if (queue->active->nr_active)
                        set_top_priority(queue,
-                                        find_first_bit(array->bitmap,MAX_PRIO));
+                                        find_first_bit(queue->active->bitmap, MAX_PRIO));
                else {
-                       /* since we do not dequeue a lrq when it becomes empty
-                        * but rely on the switching mechanism, we must dequeue
-                        * at this point
-                        */
                        classqueue_dequeue(queue->classqueue,
                                           &queue->classqueue_linkobj);
-                       cpu_demand_event(get_rq_local_stat(queue,cpu),
-                                        CPU_DEMAND_DEQUEUE,0);
+                       cpu_demand_event(get_rq_local_stat(queue,cpu),CPU_DEMAND_DEQUEUE,0);
                }
                goto retry_next_class;                          
        }
+       // BUG_ON(!array->nr_active);
 
        idx = queue->top_priority;
-       //BUG_ON(!array->nr_active);
        //BUG_ON(idx == MAX_PRIO);
-       //BUG_ON(list_empty(array->queue+idx));
        next = task_list_entry(array->queue[idx].next);
        return next;
 }
-
-static inline void ckrm_account_task(struct runqueue* rq, 
-                                    struct task_struct *prev, 
-                                    unsigned long long now)
-{
-       if ((prev != rq->idle) && ckrm_rq_cpu_enabled(rq) ) {
-               unsigned long long run = now - prev->timestamp;
-               ckrm_lrq_t * lrq = get_task_lrq(prev);
-
-               lrq->lrq_load -= task_load(prev);
-               cpu_demand_event(&prev->demand_stat,CPU_DEMAND_DESCHEDULE,run);
-               lrq->lrq_load += task_load(prev);
-
-               cpu_demand_event(get_task_lrq_stat(prev),CPU_DEMAND_DESCHEDULE,run);
-               update_local_cvt(prev, run);
-       }
-
-}
-
-#ifdef CONFIG_SMP
-#define COND_SMP(dflt,cond) (cond)
-#else
-#define COND_SMP(dflt,cond) (dflt)
-#endif
-
-static inline void ckrm_sched_tick(unsigned long j,int this_cpu, int idle,
-                                  runqueue_t *rq)
-{
-       /* first determine whether we have to do anything
-        * without grabing the global lock
-        */
-
-       int sample, update;
-
-#ifdef __SIMULATOR__
-       if ((this_cpu == 0) && (j % 1000) == 0) {
-               ckrm_cpu_monitor(1);
-       }
-#endif
-       
-       if (ckrm_rq_cpu_disabled(rq))
-               return;
-       
-       update = (j % CVT_UPDATE_TICK);
-       sample = COND_SMP(1,(j % CPU_PID_CTRL_TICK)); 
-       
-// avoid taking the global class_list lock on every tick 
-       if (likely(update && sample))
-               return;   // nothing to be done;
-       
-       read_lock(&class_list_lock);
-       
-#ifdef CONFIG_SMP
-       if (sample==0) {
-               ckrm_load_sample(rq_ckrm_load(rq),this_cpu);
-       }
-#endif
-       
-       if (update==0) {
-               classqueue_update_base(get_cpu_classqueue(this_cpu));
-               update_class_cputime(this_cpu,idle);
-               // occasionally we need to call the weight adjustment
-               // for SMP systems
-               if (COND_SMP(0,(this_cpu==0)))
-                       adjust_local_weight();   
-       }
-       
-       read_unlock(&class_list_lock);
-}
-
 #else /*! CONFIG_CKRM_CPU_SCHEDULE*/
-static inline struct task_struct * rq_get_next_task(struct runqueue* rq,
-                                                   int cpu) 
+static inline struct task_struct * rq_get_next_task(struct runqueue* rq) 
 {
        prio_array_t *array;
         struct list_head *queue;
        int idx;
 
-       if (unlikely(!rq->nr_running)) {
-               idle_balance(cpu, rq);
-                if (!rq->nr_running) {
-                        rq->expired_timestamp = 0;
-                        wake_sleeping_dependent(cpu, rq);
-                        return NULL;
-                }
-       }
        array = rq->active;
        if (unlikely(!array->nr_active)) {
                /*
@@ -606,17 +404,11 @@ static inline struct task_struct * rq_get_next_task(struct runqueue* rq,
        return list_entry(queue->next, task_t, run_list);
 }
 
-static inline void class_enqueue_task(struct task_struct* p, 
-                                     prio_array_t *array) { }
-static inline void class_dequeue_task(struct task_struct* p, 
-                                     prio_array_t *array) { }
+static inline void class_enqueue_task(struct task_struct* p, prio_array_t *array) { }
+static inline void class_dequeue_task(struct task_struct* p, prio_array_t *array) { }
 static inline void init_cpu_classes(void) { }
-static inline void ckrm_sched_tick(int j,int this_cpu,int idle, void* arg) {}
-static inline void ckrm_account_task(struct runqueue* rq, struct 
-                                    task_struct *prev, 
-                                    unsigned long long now)  { }
 #define rq_ckrm_load(rq) NULL
-
+static inline void ckrm_sched_tick(int j,int this_cpu,void* name) {}
 #endif  /* CONFIG_CKRM_CPU_SCHEDULE */
 
 /*
@@ -1766,48 +1558,61 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
        return 1;
 }
 
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+static inline int ckrm_preferred_task(task_t *tmp,long min, long max, 
+                                     int phase, enum idle_type idle)
+{
+       long pressure = task_load(tmp);
+       
+       if (pressure > max) 
+               return 0;
+
+       if ((idle == NOT_IDLE) && ! phase && (pressure <= min))
+               return 0;
+       return 1;
+}
+
 /*
- * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
- * as part of a balancing operation within "domain". Returns the number of
- * tasks moved.
- *
- * Called with both runqueues locked.
+ * move tasks for a specific local class
+ * return number of tasks pulled
  */
-static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
-                     unsigned long max_nr_move, struct sched_domain *sd,
-                     enum idle_type idle)
+static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq,
+                                     runqueue_t *this_rq,
+                                     runqueue_t *busiest,
+                                     struct sched_domain *sd,
+                                     int this_cpu,
+                                     enum idle_type idle,
+                                     long* pressure_imbalance) 
 {
        prio_array_t *array, *dst_array;
        struct list_head *head, *curr;
-       int idx, pulled = 0;
        task_t *tmp;
-#if CONFIG_CKRM_CPU_SCHEDULE
-       /* need to distinguish between the runqueues and the class
-         * local runqueues.
-        * we know we can get here only if the dflt class is present
+       int idx;
+       int pulled = 0;
+       int phase = -1;
+       long pressure_min, pressure_max;
+       /*hzheng: magic : 90% balance is enough*/
+       long balance_min = *pressure_imbalance / 10; 
+       /*
+        * we don't want to migrate tasks that will reverse the balance
+        * or tasks that make too small a difference
         */
-       ckrm_lrq_t *l_this_rq = &this_rq->dflt_lrq;
-       ckrm_lrq_t *l_busiest = &busiest->dflt_lrq;
-#else
-#define l_busiest busiest
-#define l_this_rq this_rq
-#endif
-
-       if (max_nr_move <= 0 || busiest->nr_running <= 1)
-               goto out;
-
+#define CKRM_BALANCE_MAX_RATIO 100
+#define CKRM_BALANCE_MIN_RATIO 1
+ start:
+       phase ++;
        /*
         * We first consider expired tasks. Those will likely not be
         * executed in the near future, and they are most likely to
         * be cache-cold, thus switching CPUs has the least effect
         * on them.
         */
-       if (l_busiest->expired->nr_active) {
-               array = l_busiest->expired;
-               dst_array = l_this_rq->expired;
+       if (src_lrq->expired->nr_active) {
+               array = src_lrq->expired;
+               dst_array = dst_lrq->expired;
        } else {
-               array = l_busiest->active;
-               dst_array = l_this_rq->active;
+               array = src_lrq->active;
+               dst_array = dst_lrq->active;
        }
 
 new_array:
@@ -1819,12 +1624,15 @@ skip_bitmap:
        else
                idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
        if (idx >= MAX_PRIO) {
-               if (array == l_busiest->expired && l_busiest->active->nr_active) {
-                       array = l_busiest->active;
-                       dst_array = l_this_rq->active;
+               if (array == src_lrq->expired && src_lrq->active->nr_active) {
+                       array = src_lrq->active;
+                       dst_array = dst_lrq->active;
                        goto new_array;
                }
-               goto out;
+               if ((! phase) && (! pulled) && (idle != IDLE))
+                       goto start; //try again
+               else 
+                       goto out; //finished search for this lrq
        }
 
        head = array->queue + idx;
@@ -1840,63 +1648,179 @@ skip_queue:
                idx++;
                goto skip_bitmap;
        }
-       pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
+
+       pressure_min = *pressure_imbalance * CKRM_BALANCE_MIN_RATIO/100;
+       pressure_max = *pressure_imbalance * CKRM_BALANCE_MAX_RATIO/100;
+       /*
+        * skip the tasks that will reverse the balance too much
+        */
+       if (ckrm_preferred_task(tmp,pressure_min,pressure_max,phase,idle)) {
+               *pressure_imbalance -= task_load(tmp);
+               pull_task(busiest, array, tmp, 
+                         this_rq, dst_array, this_cpu);
        pulled++;
 
-       /* We only want to steal up to the prescribed number of tasks. */
-       if (pulled < max_nr_move) {
+               if (*pressure_imbalance <= balance_min)
+                       goto out;
+       }
+               
                if (curr != head)
                        goto skip_queue;
                idx++;
                goto skip_bitmap;
-       }
 out:
        return pulled;
 }
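
ckrm_cls_move_tasks() never aims for a perfect balance: a task is pulled only if its load fits under the remaining imbalance (and, on a non-idle cpu's first pass, is not too small to matter), and the loop stops once roughly 90% of the original pressure imbalance has been moved. A user-space sketch of that filtering with made-up task loads:

/* Illustrative only: the pressure-filtering loop of ckrm_cls_move_tasks().
 * Task loads and the starting imbalance are made up. */
#include <stdio.h>

#define BALANCE_MAX_RATIO 100
#define BALANCE_MIN_RATIO 1

int main(void)
{
        long task_load[] = { 5, 700, 120, 90, 260, 40 };
        int ntasks = sizeof(task_load) / sizeof(task_load[0]);
        long imbalance = 600;
        long balance_min = imbalance / 10;      /* "90% balance is enough" */
        int phase, i, pulled = 0;

        for (phase = 0; phase <= 1 && imbalance > balance_min; phase++) {
                for (i = 0; i < ntasks && imbalance > balance_min; i++) {
                        long load = task_load[i];
                        long max = imbalance * BALANCE_MAX_RATIO / 100;
                        long min = imbalance * BALANCE_MIN_RATIO / 100;

                        if (!load)
                                continue;       /* already pulled */
                        if (load > max)
                                continue;       /* would reverse the balance */
                        if (!phase && load <= min)
                                continue;       /* too small; first pass only
                                                   (kernel also checks NOT_IDLE) */

                        imbalance -= load;
                        task_load[i] = 0;
                        pulled++;
                        printf("pull load %ld, imbalance now %ld\n",
                               load, imbalance);
                }
        }
        printf("pulled %d tasks\n", pulled);
        return 0;
}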
 
+static inline long ckrm_rq_imbalance(runqueue_t *this_rq,runqueue_t *dst_rq)
+{
+       long imbalance;
 /*
- * find_busiest_group finds and returns the busiest CPU group within the
- * domain. It calculates and returns the number of tasks which should be
- * moved to restore balance via the imbalance parameter.
+        * make sure after balance, imbalance' > - imbalance/2
+        * make sure that after balancing, imbalance' > -imbalance/2,
+        * i.e. we don't want the imbalance to be reversed too much
-static struct sched_group *
-find_busiest_group(struct sched_domain *sd, int this_cpu,
-                  unsigned long *imbalance, enum idle_type idle)
+       imbalance = pid_get_pressure(rq_ckrm_load(dst_rq),0) 
+               - pid_get_pressure(rq_ckrm_load(this_rq),1);
+       imbalance /= 2;
+       return imbalance;
+}
+
+/*
+ * try to balance the two runqueues
+ *
+ * Called with both runqueues locked.
+ * if move_tasks is called, it will try to move at least one task over
+ */
+static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
+                     unsigned long max_nr_move, struct sched_domain *sd,
+                     enum idle_type idle)
 {
-       struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
-       unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+       struct ckrm_cpu_class *clsptr,*vip_cls = NULL;
+       ckrm_lrq_t* src_lrq,*dst_lrq;
+       long pressure_imbalance, pressure_imbalance_old;
+       int src_cpu = task_cpu(busiest->curr);
+       struct list_head *list;
+       int pulled = 0;
+       long imbalance;
 
-       max_load = this_load = total_load = total_pwr = 0;
+       imbalance =  ckrm_rq_imbalance(this_rq,busiest);
 
-       do {
-               cpumask_t tmp;
-               unsigned long load;
-               int local_group;
-               int i, nr_cpus = 0;
+       if ((idle == NOT_IDLE && imbalance <= 0) || busiest->nr_running <= 1)
+               goto out;
 
-               local_group = cpu_isset(this_cpu, group->cpumask);
+       //try to find the vip class
+        list_for_each_entry(clsptr,&active_cpu_classes,links) {
+               src_lrq = get_ckrm_lrq(clsptr,src_cpu);
 
-               /* Tally up the load of all CPUs in the group */
-               avg_load = 0;
-               cpus_and(tmp, group->cpumask, cpu_online_map);
-               if (unlikely(cpus_empty(tmp)))
-                       goto nextgroup;
+               if (! lrq_nr_running(src_lrq))
+                       continue;
 
-               for_each_cpu_mask(i, tmp) {
-                       /* Bias balancing toward cpus of our domain */
-                       if (local_group)
-                               load = target_load(i);
-                       else
-                               load = source_load(i);
+               if (!vip_cls ||
+                   cpu_class_weight(vip_cls) < cpu_class_weight(clsptr))
+                       vip_cls = clsptr;
+       }
 
-                       nr_cpus++;
-                       avg_load += load;
-               }
+       /*
+        * search starting from the most significant class;
+        * hopefully, fewer tasks will be migrated this way
+        */
+       clsptr = vip_cls;
 
-               if (!nr_cpus)
-                       goto nextgroup;
+ move_class:
+       if (! clsptr)
+               goto out;
+       
 
-               total_load += avg_load;
+       src_lrq = get_ckrm_lrq(clsptr,src_cpu);
+       if (! lrq_nr_running(src_lrq))
+               goto other_class;
+       
+       dst_lrq = get_ckrm_lrq(clsptr,this_cpu);
+
+       //how much pressure for this class should be transferred
+       pressure_imbalance = src_lrq->lrq_load * imbalance/src_lrq->local_weight;
+       if (pulled && ! pressure_imbalance) 
+               goto other_class;
+       
+       pressure_imbalance_old = pressure_imbalance;
+       
+       //move tasks
+       pulled += 
+               ckrm_cls_move_tasks(src_lrq,dst_lrq,
+                                   this_rq,
+                                   busiest,
+                                   sd,this_cpu,idle,
+                                   &pressure_imbalance);
+
+       /* 
+        * hzheng: 2 is another magic number
+        * stop balancing if the imbalance is less than 25% of the original
+        */
+       if (pressure_imbalance <= (pressure_imbalance_old >> 2))
+               goto out;
+               
+       //update imbalance
+       imbalance *= pressure_imbalance / pressure_imbalance_old;
+ other_class:
+       //who is next?
+       list = clsptr->links.next;
+       if (list == &active_cpu_classes)
+               list = list->next;
+       clsptr = list_entry(list, typeof(*clsptr), links);
+       if (clsptr != vip_cls)
+               goto move_class;
+ out:
+       return pulled;
+}
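
The class iteration in this move_tasks() is worth spelling out: the class with the largest weight (the "vip" class) is handled first, the active class list is then walked circularly from there, and for each class the runqueue imbalance is turned into a per-class pressure target via lrq_load * imbalance / local_weight. The sketch below shows only that split with made-up weights and loads; the kernel additionally rescales the remaining imbalance after each class, which is omitted here.

/* Illustrative only: vip-first class walk and per-class pressure targets.
 * All weights, loads and the imbalance are made-up sample values. */
#include <stdio.h>

struct cls { const char *name; long weight; long lrq_load; long local_weight; };

int main(void)
{
        struct cls cls[] = {
                { "default", 100, 400, 200 },
                { "gold",    600, 900, 700 },
                { "silver",  300, 100, 250 },
        };
        int n = sizeof(cls) / sizeof(cls[0]);
        long imbalance = 500;
        int vip = 0, i;

        for (i = 1; i < n; i++)                 /* find the heaviest class */
                if (cls[i].weight > cls[vip].weight)
                        vip = i;

        /* walk the class list circularly, starting from the vip class */
        for (i = 0; i < n; i++) {
                struct cls *c = &cls[(vip + i) % n];
                long target = c->lrq_load * imbalance / c->local_weight;

                printf("%s: move ~%ld units of pressure\n", c->name, target);
        }
        return 0;
}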
+
+/**
+ * ckrm_check_balance - is load balancing necessary?
+ * return 0 if load balancing is not necessary
+ * otherwise return the average load of the system
+ * also, update nr_group
+ *
+ * heuristics: 
+ *   no load balancing if its load is above the average
+ *   no load balancing if its load is far more than the minimum
+ * task:
+ *   read the status of all the runqueues
+ */
+static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu,
+                                            enum idle_type idle, int* nr_group)
+{
+       struct sched_group *group = sd->groups;
+       unsigned long min_load, max_load, avg_load;
+       unsigned long total_load, this_load, total_pwr;
+
+       max_load = this_load = total_load = total_pwr = 0;
+       min_load = 0xFFFFFFFF;
+       *nr_group = 0;
+
+       do {
+               cpumask_t tmp;
+               unsigned long load;
+               int local_group;
+               int i, nr_cpus = 0;
+
+               /* Tally up the load of all CPUs in the group */
+               cpus_and(tmp, group->cpumask, cpu_online_map);
+               if (unlikely(cpus_empty(tmp)))
+                       goto nextgroup;
+
+               avg_load = 0;
+               local_group = cpu_isset(this_cpu, group->cpumask);
+
+               for_each_cpu_mask(i, tmp) {
+                       load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),local_group);
+                       nr_cpus++;
+                       avg_load += load;
+               }
+
+               if (!nr_cpus)
+                       goto nextgroup;
+
+               total_load += avg_load;
                total_pwr += group->cpu_power;
 
                /* Adjust by relative CPU power of the group */
@@ -1904,156 +1828,106 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 
                if (local_group) {
                        this_load = avg_load;
-                       this = group;
                        goto nextgroup;
                } else if (avg_load > max_load) {
                        max_load = avg_load;
-                       busiest = group;
+               }      
+               if (avg_load < min_load) {
+                       min_load = avg_load;
                }
 nextgroup:
                group = group->next;
+               *nr_group = *nr_group + 1;
        } while (group != sd->groups);
 
-       if (!busiest || this_load >= max_load)
+       if (!max_load || this_load >= max_load)
                goto out_balanced;
 
        avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
 
-       if (this_load >= avg_load ||
-                       100*max_load <= sd->imbalance_pct*this_load)
-               goto out_balanced;
-
-       /*
-        * We're trying to get all the cpus to the average_load, so we don't
-        * want to push ourselves above the average load, nor do we wish to
-        * reduce the max loaded cpu below the average load, as either of these
-        * actions would just result in more rebalancing later, and ping-pong
-        * tasks around. Thus we look for the minimum possible imbalance.
-        * Negative imbalances (*we* are more loaded than anyone else) will
-        * be counted as no imbalance for these purposes -- we can't fix that
-        * by pulling tasks to us.  Be careful of negative numbers as they'll
-        * appear as very large values with unsigned longs.
-        */
-       *imbalance = min(max_load - avg_load, avg_load - this_load);
-
-       /* How much load to actually move to equalise the imbalance */
-       *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
-                               / SCHED_LOAD_SCALE;
-
-       if (*imbalance < SCHED_LOAD_SCALE - 1) {
-               unsigned long pwr_now = 0, pwr_move = 0;
-               unsigned long tmp;
-
-               if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
-                       *imbalance = 1;
-                       return busiest;
-               }
-
-               /*
-                * OK, we don't have enough imbalance to justify moving tasks,
-                * however we may be able to increase total CPU power used by
-                * moving them.
+       /* hzheng: debugging: 105 is a magic number;
+        * the original check was 100*max_load <= sd->imbalance_pct*this_load,
+        * so imbalance_pct should be used here instead of the constant
                 */
-
-               pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
-               pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
-               pwr_now /= SCHED_LOAD_SCALE;
-
-               /* Amount of load we'd subtract */
-               tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
-               if (max_load > tmp)
-                       pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
-                                                       max_load - tmp);
-
-               /* Amount of load we'd add */
-               tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
-               if (max_load < tmp)
-                       tmp = max_load;
-               pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
-               pwr_move /= SCHED_LOAD_SCALE;
-
-               /* Move if we gain another 8th of a CPU worth of throughput */
-               if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
+       if (this_load > avg_load 
+           || 100*max_load < 105*this_load
+           || 100*min_load < 70*this_load
+           )
                        goto out_balanced;
 
-               *imbalance = 1;
-               return busiest;
-       }
-
-       /* Get rid of the scaling factor, rounding down as we divide */
-       *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE;
-
-       return busiest;
-
+       return avg_load;
 out_balanced:
-       if (busiest && (idle == NEWLY_IDLE ||
-                       (idle == IDLE && max_load > SCHED_LOAD_SCALE)) ) {
-               *imbalance = 1;
-               return busiest;
-       }
-
-       *imbalance = 0;
-       return NULL;
+       return 0;
 }
 
-/*
- * find_busiest_queue - find the busiest runqueue among the cpus in group.
+/**
+ * any group that has above average load is considered busy
+ * find the busiest queue from any of the busy groups
  */
-static runqueue_t *find_busiest_queue(struct sched_group *group)
+static runqueue_t *
+ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu,
+                    unsigned long avg_load, enum idle_type idle,
+                    int nr_group)
 {
-       cpumask_t tmp;
-       unsigned long load, max_load = 0;
+       struct sched_group *group;
        runqueue_t *busiest = NULL;
+       unsigned long rand;
+       
+       group = sd->groups;
+       rand = get_ckrm_rand(nr_group);
+       nr_group = 0;
+
+       do {
+               unsigned long load,total_load,max_load;
+               cpumask_t tmp;
        int i;
+               runqueue_t * grp_busiest;
 
        cpus_and(tmp, group->cpumask, cpu_online_map);
-       for_each_cpu_mask(i, tmp) {
-               load = source_load(i);
+               if (unlikely(cpus_empty(tmp)))
+                       goto find_nextgroup;
 
+               total_load = 0;
+               max_load = 0;
+               grp_busiest = NULL;
+               for_each_cpu_mask(i, tmp) {
+                       load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),0);
+                       total_load += load;
                if (load > max_load) {
                        max_load = load;
-                       busiest = cpu_rq(i);
+                               grp_busiest = cpu_rq(i);
                }
        }
 
-       return busiest;
+               total_load = (total_load * SCHED_LOAD_SCALE) / group->cpu_power;
+               if (total_load > avg_load) {
+                       busiest = grp_busiest;
+                       if (nr_group >= rand)
+                               break;
 }
+       find_nextgroup:         
+               group = group->next;
+               nr_group ++;
+       } while (group != sd->groups);
 
-/*
- * Check this_cpu to ensure it is balanced within domain. Attempt to move
- * tasks if there is an imbalance.
- *
- * Called with this_rq unlocked.
- */
-
-static inline int ckrm_load_balance(int this_cpu, runqueue_t *this_rq,
-                                   struct sched_domain *sd, 
-                                   enum idle_type idle)
-#ifndef CONFIG_CKRM_CPU_SCHEDULE
-{
-       return -1;
+       return busiest;
 }
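
Instead of always pulling from the single busiest group, ckrm_find_busy_queue() treats every group whose load exceeds the average as a candidate and stops at a randomly chosen one, so that cpus balancing at the same time do not all converge on the same runqueue. A sketch of that selection, with rand() standing in for get_ckrm_rand() and made-up loads:

/* Illustrative only: random pick among above-average groups, as in
 * ckrm_find_busy_queue().  rand() stands in for get_ckrm_rand(). */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        unsigned long group_load[] = { 300, 500, 460, 440, 410 };
        int ngroups = sizeof(group_load) / sizeof(group_load[0]);
        unsigned long avg_load = 422;   /* as returned by the balance check */
        int target = rand() % ngroups;  /* stand-in for get_ckrm_rand(nr_group) */
        int i, chosen = -1;

        for (i = 0; i < ngroups; i++) {
                if (group_load[i] <= avg_load)
                        continue;       /* not busy enough to be a candidate */
                chosen = i;             /* remember the latest busy group */
                if (i >= target)
                        break;          /* stop at (or after) the random pick */
        }
        printf("target=%d -> pull from group %d (load %lu)\n",
               target, chosen, chosen >= 0 ? group_load[chosen] : 0);
        return 0;
}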
-#endif
-;
 
-static int load_balance(int this_cpu, runqueue_t *this_rq,
+/**
+ * ckrm_load_balance - pressure-based load balancing algorithm used by CKRM
+ */
+static int ckrm_load_balance(int this_cpu, runqueue_t *this_rq,
                        struct sched_domain *sd, enum idle_type idle)
 {
-       struct sched_group *group;
        runqueue_t *busiest;
-       unsigned long imbalance;
-       int nr_moved;
-
-       spin_lock(&this_rq->lock);
-
-       if ((nr_moved = ckrm_load_balance(this_cpu,this_rq,sd,idle)) != -1)
-               goto out_balanced;
+       unsigned long avg_load;
+       int nr_moved,nr_group;
 
-       group = find_busiest_group(sd, this_cpu, &imbalance, idle);
-       if (!group)
+       avg_load = ckrm_check_balance(sd, this_cpu, idle, &nr_group);
+       if (! avg_load)
                goto out_balanced;
 
-       busiest = find_busiest_queue(group);
+       busiest = ckrm_find_busy_queue(sd,this_cpu,avg_load,idle,nr_group);
        if (!busiest)
                goto out_balanced;
        /*
@@ -2076,34 +1950,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
                 */
                double_lock_balance(this_rq, busiest);
                nr_moved = move_tasks(this_rq, this_cpu, busiest,
-                                               imbalance, sd, idle);
+                                     0,sd, idle);              
                spin_unlock(&busiest->lock);
+               if (nr_moved) {
+                       adjust_local_weight();
        }
-       spin_unlock(&this_rq->lock);
+                       }
 
-       if (!nr_moved) {
-               sd->nr_balance_failed++;
-
-               if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
-                       int wake = 0;
-
-                       spin_lock(&busiest->lock);
-                       if (!busiest->active_balance) {
-                               busiest->active_balance = 1;
-                               busiest->push_cpu = this_cpu;
-                               wake = 1;
-                       }
-                       spin_unlock(&busiest->lock);
-                       if (wake)
-                               wake_up_process(busiest->migration_thread);
-
-                       /*
-                        * We've kicked active balancing, reset the failure
-                        * counter.
-                        */
-                       sd->nr_balance_failed = sd->cache_nice_tries;
-               }
-       } else
+       if (!nr_moved) 
+               sd->nr_balance_failed ++;
+       else
                sd->nr_balance_failed = 0;
 
        /* We were unbalanced, so reset the balancing interval */
@@ -2112,8 +1968,6 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
        return nr_moved;
 
 out_balanced:
-       spin_unlock(&this_rq->lock);
-
        /* tune up the balancing interval */
        if (sd->balance_interval < sd->max_interval)
                sd->balance_interval *= 2;
@@ -2122,282 +1976,629 @@ out_balanced:
 }
 
 /*
- * Check this_cpu to ensure it is balanced within domain. Attempt to move
- * tasks if there is an imbalance.
- *
- * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
- * this_rq is locked.
+ * this_rq->lock is already held
  */
-static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
+static inline int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
                                struct sched_domain *sd)
 {
-       struct sched_group *group;
-       runqueue_t *busiest = NULL;
-       unsigned long imbalance;
-       int nr_moved;
+       int ret;
+       read_lock(&class_list_lock);
+       ret = ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE);
+       read_unlock(&class_list_lock);
+       return ret;
+}
 
-       if ((nr_moved = ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE)) != -1)
-               goto out;
+static inline int load_balance(int this_cpu, runqueue_t *this_rq,
+                       struct sched_domain *sd, enum idle_type idle)
+{
+       int ret;
 
-       nr_moved = 0;
-       group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
-       if (!group)
-               goto out;
+       spin_lock(&this_rq->lock);
+       read_lock(&class_list_lock);
+       ret = ckrm_load_balance(this_cpu, this_rq, sd, idle);
+       read_unlock(&class_list_lock);
+       spin_unlock(&this_rq->lock);
+       return ret;
+}
+#else /*! CONFIG_CKRM_CPU_SCHEDULE */
+/*
+ * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
+ * as part of a balancing operation within "domain". Returns the number of
+ * tasks moved.
+ *
+ * Called with both runqueues locked.
+ */
+static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
+                     unsigned long max_nr_move, struct sched_domain *sd,
+                     enum idle_type idle)
+{
+       prio_array_t *array, *dst_array;
+       struct list_head *head, *curr;
+       int idx, pulled = 0;
+       task_t *tmp;
 
-       busiest = find_busiest_queue(group);
-       if (!busiest || busiest == this_rq)
+       if (max_nr_move <= 0 || busiest->nr_running <= 1)
                goto out;
 
-       /* Attempt to move tasks */
-       double_lock_balance(this_rq, busiest);
-
-       nr_moved = move_tasks(this_rq, this_cpu, busiest,
-                                       imbalance, sd, NEWLY_IDLE);
-
-       spin_unlock(&busiest->lock);
-
-out:
-       return nr_moved;
-}
-
 /*
- * idle_balance is called by schedule() if this_cpu is about to become
- * idle. Attempts to pull tasks from other CPUs.
+        * We first consider expired tasks. Those will likely not be
+        * executed in the near future, and they are most likely to
+        * be cache-cold, thus switching CPUs has the least effect
+        * on them.
  */
-static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
-{
-       struct sched_domain *sd;
+       if (busiest->expired->nr_active) {
+               array = busiest->expired;
+               dst_array = this_rq->expired;
+       } else {
+               array = busiest->active;
+               dst_array = this_rq->active;
+       }
 
-       for_each_domain(this_cpu, sd) {
-               if (sd->flags & SD_BALANCE_NEWIDLE) {
-                       if (load_balance_newidle(this_cpu, this_rq, sd)) {
-                               /* We've pulled tasks over so stop searching */
-                               break;
+new_array:
+       /* Start searching at priority 0: */
+       idx = 0;
+skip_bitmap:
+       if (!idx)
+               idx = sched_find_first_bit(array->bitmap);
+       else
+               idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
+       if (idx >= MAX_PRIO) {
+               if (array == busiest->expired && busiest->active->nr_active) {
+                       array = busiest->active;
+                       dst_array = this_rq->active;
+                       goto new_array;
+               }
+               goto out;
                        }
+
+       head = array->queue + idx;
+       curr = head->prev;
+skip_queue:
+       tmp = list_entry(curr, task_t, run_list);
+
+       curr = curr->prev;
+
+       if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
+               if (curr != head)
+                       goto skip_queue;
+               idx++;
+               goto skip_bitmap;
                }
+       pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
+       pulled++;
+
+       /* We only want to steal up to the prescribed number of tasks. */
+       if (pulled < max_nr_move) {
+               if (curr != head)
+                       goto skip_queue;
+               idx++;
+               goto skip_bitmap;
        }
+out:
+       return pulled;
 }
 
 /*
- * active_load_balance is run by migration threads. It pushes a running
- * task off the cpu. It can be required to correctly have at least 1 task
- * running on each physical CPU where possible, and not have a physical /
- * logical imbalance.
- *
- * Called with busiest locked.
+ * find_busiest_group finds and returns the busiest CPU group within the
+ * domain. It calculates and returns the number of tasks which should be
+ * moved to restore balance via the imbalance parameter.
  */
-static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
+static struct sched_group *
+find_busiest_group(struct sched_domain *sd, int this_cpu,
+                  unsigned long *imbalance, enum idle_type idle)
 {
-       struct sched_domain *sd;
-       struct sched_group *group, *busy_group;
-       int i;
-
-       if (busiest->nr_running <= 1)
-               return;
-
-       for_each_domain(busiest_cpu, sd)
-               if (cpu_isset(busiest->push_cpu, sd->span))
-                       break;
-       if (!sd) {
-               WARN_ON(1);
-               return;
-       }
+       struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
+       unsigned long max_load, avg_load, total_load, this_load, total_pwr;
 
-       group = sd->groups;
-       while (!cpu_isset(busiest_cpu, group->cpumask))
-               group = group->next;
-       busy_group = group;
+       max_load = this_load = total_load = total_pwr = 0;
 
-       group = sd->groups;
        do {
                cpumask_t tmp;
-               runqueue_t *rq;
-               int push_cpu = 0;
+               unsigned long load;
+               int local_group;
+               int i, nr_cpus = 0;
 
-               if (group == busy_group)
-                       goto next_group;
+               local_group = cpu_isset(this_cpu, group->cpumask);
 
+               /* Tally up the load of all CPUs in the group */
+               avg_load = 0;
                cpus_and(tmp, group->cpumask, cpu_online_map);
-               if (!cpus_weight(tmp))
-                       goto next_group;
+               if (unlikely(cpus_empty(tmp)))
+                       goto nextgroup;
 
                for_each_cpu_mask(i, tmp) {
-                       if (!idle_cpu(i))
-                               goto next_group;
-                       push_cpu = i;
+                       /* Bias balancing toward cpus of our domain */
+                       if (local_group)
+                               load = target_load(i);
+                       else
+                               load = source_load(i);
+
+                       nr_cpus++;
+                       avg_load += load;
                }
 
-               rq = cpu_rq(push_cpu);
+               if (!nr_cpus)
+                       goto nextgroup;
 
-               /*
-                * This condition is "impossible", but since load
-                * balancing is inherently a bit racy and statistical,
-                * it can trigger.. Reported by Bjorn Helgaas on a
-                * 128-cpu setup.
-                */
-               if (unlikely(busiest == rq))
-                       goto next_group;
-               double_lock_balance(busiest, rq);
-               move_tasks(rq, push_cpu, busiest, 1, sd, IDLE);
-               spin_unlock(&rq->lock);
-next_group:
+               total_load += avg_load;
+               total_pwr += group->cpu_power;
+
+               /* Adjust by relative CPU power of the group */
+               avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+
+               if (local_group) {
+                       this_load = avg_load;
+                       this = group;
+                       goto nextgroup;
+               } else if (avg_load > max_load) {
+                       max_load = avg_load;
+                       busiest = group;
+               }
+nextgroup:
                group = group->next;
        } while (group != sd->groups);
-}
-
-/*
- * rebalance_tick will get called every timer tick, on every CPU.
- *
- * It checks each scheduling domain to see if it is due to be balanced,
- * and initiates a balancing operation if so.
- *
- * Balancing parameters are set up in arch_init_sched_domains.
- */
 
-/* Don't have all balancing operations going off at once */
-#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
+       if (!busiest || this_load >= max_load)
+               goto out_balanced;
 
-static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
-                          enum idle_type idle)
-{
-       unsigned long old_load, this_load;
-       unsigned long j = jiffies + CPU_OFFSET(this_cpu);
-       struct sched_domain *sd;
+       avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
 
-       ckrm_sched_tick(j,this_cpu,(idle != NOT_IDLE),this_rq);
+       if (this_load >= avg_load ||
+                       100*max_load <= sd->imbalance_pct*this_load)
+               goto out_balanced;
 
-       /* Update our load */
-       old_load = this_rq->cpu_load;
-       this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
        /*
-        * Round up the averaging division if load is increasing. This
-        * prevents us from getting stuck on 9 if the load is 10, for
-        * example.
+        * We're trying to get all the cpus to the average_load, so we don't
+        * want to push ourselves above the average load, nor do we wish to
+        * reduce the max loaded cpu below the average load, as either of these
+        * actions would just result in more rebalancing later, and ping-pong
+        * tasks around. Thus we look for the minimum possible imbalance.
+        * Negative imbalances (*we* are more loaded than anyone else) will
+        * be counted as no imbalance for these purposes -- we can't fix that
+        * by pulling tasks to us.  Be careful of negative numbers as they'll
+        * appear as very large values with unsigned longs.
         */
-       if (this_load > old_load)
-               old_load++;
-       this_rq->cpu_load = (old_load + this_load) / 2;
+       *imbalance = min(max_load - avg_load, avg_load - this_load);
 
-       for_each_domain(this_cpu, sd) {
-               unsigned long interval = sd->balance_interval;
+       /* How much load to actually move to equalise the imbalance */
+       *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
+                               / SCHED_LOAD_SCALE;
 
-               if (idle != IDLE)
-                       interval *= sd->busy_factor;
+       if (*imbalance < SCHED_LOAD_SCALE - 1) {
+               unsigned long pwr_now = 0, pwr_move = 0;
+               unsigned long tmp;
 
-               /* scale ms to jiffies */
-               interval = msecs_to_jiffies(interval);
-               if (unlikely(!interval))
-                       interval = 1;
+               if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
+                       *imbalance = 1;
+                       return busiest;
+               }
 
-               if (j - sd->last_balance >= interval) {
-                       if (load_balance(this_cpu, this_rq, sd, idle)) {
-                               /* We've pulled tasks over so no longer idle */
-                               idle = NOT_IDLE;
-                       }
-                       sd->last_balance += interval;
+               /*
+                * OK, we don't have enough imbalance to justify moving tasks,
+                * however we may be able to increase total CPU power used by
+                * moving them.
+                */
+
+               pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
+               pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
+               pwr_now /= SCHED_LOAD_SCALE;
+
+               /* Amount of load we'd subtract */
+               tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
+               if (max_load > tmp)
+                       pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
+                                                       max_load - tmp);
+
+               /* Amount of load we'd add */
+               tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
+               if (max_load < tmp)
+                       tmp = max_load;
+               pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
+               pwr_move /= SCHED_LOAD_SCALE;
+
+               /* Move if we gain another 8th of a CPU worth of throughput */
+               if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
+                       goto out_balanced;
+
+               *imbalance = 1;
+               return busiest;
                }
+
+       /* Get rid of the scaling factor, rounding down as we divide */
+       *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE;
+
+       return busiest;
+
+out_balanced:
+       if (busiest && (idle == NEWLY_IDLE ||
+                       (idle == IDLE && max_load > SCHED_LOAD_SCALE)) ) {
+               *imbalance = 1;
+               return busiest;
        }
+
+       *imbalance = 0;
+       return NULL;
 }
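
For reference, a standalone user-space sketch (not part of the patch) of the imbalance arithmetic restored in find_busiest_group() above; SCHED_LOAD_SCALE and cpu_power are taken as 128, as for a plain CPU in this kernel, the sample loads are illustrative, and the small-imbalance special case is omitted:

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL          /* as in 2.6.8 */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        /* Illustrative numbers: loads are nr_running scaled by SCHED_LOAD_SCALE. */
        unsigned long this_load = 1 * SCHED_LOAD_SCALE;
        unsigned long max_load  = 4 * SCHED_LOAD_SCALE;
        unsigned long avg_load  = 2 * SCHED_LOAD_SCALE;
        unsigned long this_power = SCHED_LOAD_SCALE, busiest_power = SCHED_LOAD_SCALE;
        unsigned long imbalance;

        /* Never pull past the domain average in either direction. */
        imbalance = min_ul(max_load - avg_load, avg_load - this_load);
        /* Adjust by the weaker of the two groups' cpu_power. */
        imbalance = (imbalance * min_ul(busiest_power, this_power)) / SCHED_LOAD_SCALE;
        /* Drop the scaling factor to get a task count. */
        imbalance = (imbalance + 1) / SCHED_LOAD_SCALE;

        printf("move up to %lu task(s)\n", imbalance);  /* prints 1 */
        return 0;
}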
-#else /* SMP*/
+
 /*
- * on UP we do not need to balance between CPUs:
+ * find_busiest_queue - find the busiest runqueue among the cpus in group.
  */
-static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
+static runqueue_t *find_busiest_queue(struct sched_group *group)
 {
-       ckrm_sched_tick(jiffies,cpu,(idle != NOT_IDLE),rq);
+       cpumask_t tmp;
+       unsigned long load, max_load = 0;
+       runqueue_t *busiest = NULL;
+       int i;
+
+       cpus_and(tmp, group->cpumask, cpu_online_map);
+       for_each_cpu_mask(i, tmp) {
+               load = source_load(i);
+
+               if (load > max_load) {
+                       max_load = load;
+                       busiest = cpu_rq(i);
+               }
 }
 
-static inline void idle_balance(int cpu, runqueue_t *rq)
-{
+       return busiest;
 }
-#endif
 
-static inline int wake_priority_sleeper(runqueue_t *rq)
+/*
+ * Check this_cpu to ensure it is balanced within domain. Attempt to move
+ * tasks if there is an imbalance.
+ *
+ * Called with this_rq unlocked.
+ */
+static int load_balance(int this_cpu, runqueue_t *this_rq,
+                       struct sched_domain *sd, enum idle_type idle)
 {
-#ifdef CONFIG_SCHED_SMT
+       struct sched_group *group;
+       runqueue_t *busiest;
+       unsigned long imbalance;
+       int nr_moved;
+
+       spin_lock(&this_rq->lock);
+
+       group = find_busiest_group(sd, this_cpu, &imbalance, idle);
+       if (!group)
+               goto out_balanced;
+
+       busiest = find_busiest_queue(group);
+       if (!busiest)
+               goto out_balanced;
        /*
-        * If an SMT sibling task has been put to sleep for priority
-        * reasons reschedule the idle task to see if it can now run.
+        * This should be "impossible", but since load
+        * balancing is inherently racy and statistical,
+        * it could happen in theory.
         */
-       if (rq->nr_running) {
-               resched_task(rq->idle);
-               return 1;
+       if (unlikely(busiest == this_rq)) {
+               WARN_ON(1);
+               goto out_balanced;
        }
-#endif
-       return 0;
+
+       nr_moved = 0;
+       if (busiest->nr_running > 1) {
+               /*
+                * Attempt to move tasks. If find_busiest_group has found
+                * an imbalance but busiest->nr_running <= 1, the group is
+                * still unbalanced. nr_moved simply stays zero, so it is
+                * correctly treated as an imbalance.
+                */
+               double_lock_balance(this_rq, busiest);
+               nr_moved = move_tasks(this_rq, this_cpu, busiest,
+                                               imbalance, sd, idle);
+               spin_unlock(&busiest->lock);
 }
+       spin_unlock(&this_rq->lock);
 
-DEFINE_PER_CPU(struct kernel_stat, kstat);
-EXPORT_PER_CPU_SYMBOL(kstat);
+       if (!nr_moved) {
+               sd->nr_balance_failed++;
+
+               if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
+                       int wake = 0;
+
+                       spin_lock(&busiest->lock);
+                       if (!busiest->active_balance) {
+                               busiest->active_balance = 1;
+                               busiest->push_cpu = this_cpu;
+                               wake = 1;
+                       }
+                       spin_unlock(&busiest->lock);
+                       if (wake)
+                               wake_up_process(busiest->migration_thread);
 
 /*
- * We place interactive tasks back into the active array, if possible.
- *
- * To guarantee that this does not starve expired tasks we ignore the
- * interactivity of a task if the first expired task had to wait more
- * than a 'reasonable' amount of time. This deadline timeout is
- * load-dependent, as the frequency of array switched decreases with
- * increasing number of running tasks. We also ignore the interactivity
- * if a better static_prio task has expired:
+                        * We've kicked active balancing, reset the failure
+                        * counter.
  */
+                       sd->nr_balance_failed = sd->cache_nice_tries;
+               }
+       } else
+               sd->nr_balance_failed = 0;
 
-#ifndef CONFIG_CKRM_CPU_SCHEDULE
-#define EXPIRED_STARVING(rq) \
-               ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
-               (jiffies - (rq)->expired_timestamp >= \
-                       STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
-                       ((rq)->curr->static_prio > (rq)->best_expired_prio))
-#else
-/* we need to scale the starvation based on weight 
- * classes with small weight have longer expiration starvation
- */
-#define EXPIRED_STARVING(rq) \
-                ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
-               (jiffies - (rq)->expired_timestamp >= \
-                       (((STARVATION_LIMIT * (lrq_nr_running(rq)) + 1)*CKRM_MAX_WEIGHT)/rq->local_weight)))) || \
-                       (this_rq()->curr->static_prio > (rq)->best_expired_prio))
-#endif
+       /* We were unbalanced, so reset the balancing interval */
+       sd->balance_interval = sd->min_interval;
+
+       return nr_moved;
+
+out_balanced:
+       spin_unlock(&this_rq->lock);
+
+       /* tune up the balancing interval */
+       if (sd->balance_interval < sd->max_interval)
+               sd->balance_interval *= 2;
+
+       return 0;
+}
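
A small standalone sketch (illustrative values only; the cache_nice_tries value is hypothetical) of the failure-escalation path in the restored load_balance() above: once nr_balance_failed exceeds cache_nice_tries + 2 the busiest CPU's migration thread would be woken, and the counter is reset to cache_nice_tries so later kicks need fewer consecutive failures:

#include <stdio.h>

int main(void)
{
        unsigned int cache_nice_tries = 1;      /* e.g. a CPU-level domain */
        unsigned int nr_balance_failed = 0;

        for (int attempt = 1; attempt <= 8; attempt++) {
                int nr_moved = 0;               /* pretend every pull fails */

                if (!nr_moved) {
                        nr_balance_failed++;
                        if (nr_balance_failed > cache_nice_tries + 2) {
                                /* here the real code wakes busiest->migration_thread */
                                printf("attempt %d: kick active balance\n", attempt);
                                nr_balance_failed = cache_nice_tries;
                        }
                } else
                        nr_balance_failed = 0;
        }
        return 0;       /* kicks on attempts 4 and 7 */
}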
 
 /*
- * This function gets called by the timer code, with HZ frequency.
- * We call it with interrupts disabled.
+ * Check this_cpu to ensure it is balanced within domain. Attempt to move
+ * tasks if there is an imbalance.
  *
- * It also gets called by the fork code, when changing the parent's
- * timeslices.
+ * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
+ * this_rq is locked.
  */
-void scheduler_tick(int user_ticks, int sys_ticks)
+static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
+                               struct sched_domain *sd)
 {
-       int cpu = smp_processor_id();
-       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-       runqueue_t *rq = this_rq();
-       task_t *p = current;
+       struct sched_group *group;
+       runqueue_t *busiest = NULL;
+       unsigned long imbalance;
+       int nr_moved = 0;
 
-       rq->timestamp_last_tick = sched_clock();
+       group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
+       if (!group)
+               goto out;
 
-       if (rcu_pending(cpu))
-               rcu_check_callbacks(cpu, user_ticks);
+       busiest = find_busiest_queue(group);
+       if (!busiest || busiest == this_rq)
+               goto out;
 
-       /* note: this timer irq context must be accounted for as well */
-       if (hardirq_count() - HARDIRQ_OFFSET) {
-               cpustat->irq += sys_ticks;
-               sys_ticks = 0;
-       } else if (softirq_count()) {
-               cpustat->softirq += sys_ticks;
-               sys_ticks = 0;
-       }
+       /* Attempt to move tasks */
+       double_lock_balance(this_rq, busiest);
 
-       if (p == rq->idle) {
-#ifdef CONFIG_VSERVER_HARDCPU
-               if (!--rq->idle_tokens && !list_empty(&rq->hold_queue))
-                       set_need_resched();     
-#endif
+       nr_moved = move_tasks(this_rq, this_cpu, busiest,
+                                       imbalance, sd, NEWLY_IDLE);
 
-               if (atomic_read(&rq->nr_iowait) > 0)
-                       cpustat->iowait += sys_ticks;
+       spin_unlock(&busiest->lock);
+
+out:
+       return nr_moved;
+}
+#endif /* CONFIG_CKRM_CPU_SCHEDULE*/
+
+
+/*
+ * idle_balance is called by schedule() if this_cpu is about to become
+ * idle. Attempts to pull tasks from other CPUs.
+ */
+static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
+{
+       struct sched_domain *sd;
+
+       for_each_domain(this_cpu, sd) {
+               if (sd->flags & SD_BALANCE_NEWIDLE) {
+                       if (load_balance_newidle(this_cpu, this_rq, sd)) {
+                               /* We've pulled tasks over so stop searching */
+                               break;
+                       }
+               }
+       }
+}
+
+/*
+ * active_load_balance is run by migration threads. It pushes a running
+ * task off the cpu. It can be required to correctly have at least 1 task
+ * running on each physical CPU where possible, and not have a physical /
+ * logical imbalance.
+ *
+ * Called with busiest locked.
+ */
+static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
+{
+       struct sched_domain *sd;
+       struct sched_group *group, *busy_group;
+       int i;
+
+       if (busiest->nr_running <= 1)
+               return;
+
+       for_each_domain(busiest_cpu, sd)
+               if (cpu_isset(busiest->push_cpu, sd->span))
+                       break;
+       if (!sd) {
+               WARN_ON(1);
+               return;
+       }
+
+       group = sd->groups;
+       while (!cpu_isset(busiest_cpu, group->cpumask))
+               group = group->next;
+       busy_group = group;
+
+       group = sd->groups;
+       do {
+               cpumask_t tmp;
+               runqueue_t *rq;
+               int push_cpu = 0;
+
+               if (group == busy_group)
+                       goto next_group;
+
+               cpus_and(tmp, group->cpumask, cpu_online_map);
+               if (!cpus_weight(tmp))
+                       goto next_group;
+
+               for_each_cpu_mask(i, tmp) {
+                       if (!idle_cpu(i))
+                               goto next_group;
+                       push_cpu = i;
+               }
+
+               rq = cpu_rq(push_cpu);
+
+               /*
+                * This condition is "impossible", but since load
+                * balancing is inherently a bit racy and statistical,
+                * it can trigger.. Reported by Bjorn Helgaas on a
+                * 128-cpu setup.
+                */
+               if (unlikely(busiest == rq))
+                       goto next_group;
+               double_lock_balance(busiest, rq);
+               move_tasks(rq, push_cpu, busiest, 1, sd, IDLE);
+               spin_unlock(&rq->lock);
+next_group:
+               group = group->next;
+       } while (group != sd->groups);
+}
+
+/*
+ * rebalance_tick will get called every timer tick, on every CPU.
+ *
+ * It checks each scheduling domain to see if it is due to be balanced,
+ * and initiates a balancing operation if so.
+ *
+ * Balancing parameters are set up in arch_init_sched_domains.
+ */
+
+/* Don't have all balancing operations going off at once */
+#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
+
+static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
+                          enum idle_type idle)
+{
+       unsigned long old_load, this_load;
+       unsigned long j = jiffies + CPU_OFFSET(this_cpu);
+       struct sched_domain *sd;
+
+       /* Update our load */
+       old_load = this_rq->cpu_load;
+       this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
+       /*
+        * Round up the averaging division if load is increasing. This
+        * prevents us from getting stuck on 9 if the load is 10, for
+        * example.
+        */
+       if (this_load > old_load)
+               old_load++;
+       this_rq->cpu_load = (old_load + this_load) / 2;
+
+       for_each_domain(this_cpu, sd) {
+               unsigned long interval = sd->balance_interval;
+
+               if (idle != IDLE)
+                       interval *= sd->busy_factor;
+
+               /* scale ms to jiffies */
+               interval = msecs_to_jiffies(interval);
+               if (unlikely(!interval))
+                       interval = 1;
+
+               if (j - sd->last_balance >= interval) {
+                       if (load_balance(this_cpu, this_rq, sd, idle)) {
+                               /* We've pulled tasks over so no longer idle */
+                               idle = NOT_IDLE;
+                       }
+                       sd->last_balance += interval;
+               }
+       }
+}
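
A minimal sketch of the cpu_load averaging done in rebalance_tick() above, using the 9-versus-10 case from the comment (plain integers rather than SCHED_LOAD_SCALE units) to show why the average is rounded up while load is rising:

#include <stdio.h>

int main(void)
{
        unsigned long plain = 9, rounded = 9;
        const unsigned long target = 10;        /* steady instantaneous load */

        for (int tick = 0; tick < 4; tick++) {
                /* Without the round-up the average never reaches the target. */
                plain = (plain + target) / 2;

                /* rebalance_tick() bumps old_load by one when load is rising. */
                unsigned long old = rounded;
                if (target > old)
                        old++;
                rounded = (old + target) / 2;

                printf("tick %d: plain=%lu rounded=%lu\n", tick, plain, rounded);
        }
        return 0;       /* plain stays at 9, rounded settles at 10 */
}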
+#else /* SMP*/
+/*
+ * on UP we do not need to balance between CPUs:
+ */
+static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
+{
+}
+static inline void idle_balance(int cpu, runqueue_t *rq)
+{
+}
+#endif
+
+static inline int wake_priority_sleeper(runqueue_t *rq)
+{
+#ifdef CONFIG_SCHED_SMT
+       /*
+        * If an SMT sibling task has been put to sleep for priority
+        * reasons reschedule the idle task to see if it can now run.
+        */
+       if (rq->nr_running) {
+               resched_task(rq->idle);
+               return 1;
+       }
+#endif
+       return 0;
+}
+
+DEFINE_PER_CPU(struct kernel_stat, kstat);
+EXPORT_PER_CPU_SYMBOL(kstat);
+
+/*
+ * We place interactive tasks back into the active array, if possible.
+ *
+ * To guarantee that this does not starve expired tasks we ignore the
+ * interactivity of a task if the first expired task had to wait more
+ * than a 'reasonable' amount of time. This deadline timeout is
+ * load-dependent, as the frequency of array switched decreases with
+ * increasing number of running tasks. We also ignore the interactivity
+ * if a better static_prio task has expired:
+ */
+
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
+#define EXPIRED_STARVING(rq) \
+       ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
+               (jiffies - (rq)->expired_timestamp >= \
+                       STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
+                       ((rq)->curr->static_prio > (rq)->best_expired_prio))
+#else
+#define EXPIRED_STARVING(rq) \
+               (STARVATION_LIMIT && ((rq)->expired_timestamp && \
+               (jiffies - (rq)->expired_timestamp >= \
+                       STARVATION_LIMIT * (lrq_nr_running(rq)) + 1)))
+#endif
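
A rough sketch of the timeout clause of EXPIRED_STARVING() above, assuming HZ=1000 and approximating STARVATION_LIMIT by HZ (in this kernel it is MAX_SLEEP_AVG, roughly one second of jiffies); expired_starving() and the sample numbers are illustrative, and the best_expired_prio clause of the non-CKRM variant is left out:

#include <stdio.h>

#define HZ                      1000            /* assumed */
#define STARVATION_LIMIT        HZ              /* approximation of MAX_SLEEP_AVG */

/* 1 if the first expired task has waited longer than the load-dependent deadline. */
static int expired_starving(unsigned long jiffies_now,
                            unsigned long expired_timestamp,
                            unsigned long nr_running)
{
        if (!STARVATION_LIMIT || !expired_timestamp)
                return 0;
        return jiffies_now - expired_timestamp >=
                STARVATION_LIMIT * nr_running + 1;
}

int main(void)
{
        /* With 4 runnable tasks the deadline is about 4 seconds. */
        printf("%d\n", expired_starving(5000, 500, 4)); /* 4500 >= 4001 -> 1 */
        printf("%d\n", expired_starving(3000, 500, 4)); /* 2500 <  4001 -> 0 */
        return 0;
}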
+
+/*
+ * This function gets called by the timer code, with HZ frequency.
+ * We call it with interrupts disabled.
+ *
+ * It also gets called by the fork code, when changing the parent's
+ * timeslices.
+ */
+void scheduler_tick(int user_ticks, int sys_ticks)
+{
+       int cpu = smp_processor_id();
+       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+       runqueue_t *rq = this_rq();
+       task_t *p = current;
+
+       rq->timestamp_last_tick = sched_clock();
+
+       if (rcu_pending(cpu))
+               rcu_check_callbacks(cpu, user_ticks);
+
+       /* note: this timer irq context must be accounted for as well */
+       if (hardirq_count() - HARDIRQ_OFFSET) {
+               cpustat->irq += sys_ticks;
+               sys_ticks = 0;
+       } else if (softirq_count()) {
+               cpustat->softirq += sys_ticks;
+               sys_ticks = 0;
+       }
+
+       if (p == rq->idle) {
+#ifdef CONFIG_VSERVER_HARDCPU
+               if (!--rq->idle_tokens && !list_empty(&rq->hold_queue))
+                       set_need_resched();     
+#endif
+
+               if (atomic_read(&rq->nr_iowait) > 0)
+                       cpustat->iowait += sys_ticks;
                else
                        cpustat->idle += sys_ticks;
                if (wake_priority_sleeper(rq))
                        goto out;
+               ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq));
                rebalance_tick(cpu, rq, IDLE);
                return;
        }
@@ -2438,11 +2639,8 @@ void scheduler_tick(int user_ticks, int sys_ticks)
        }
        if (vx_need_resched(p)) {
 #ifdef CONFIG_CKRM_CPU_SCHEDULE
-               /* we redefine RQ to be a local runqueue */
-               ckrm_lrq_t* rq;
-               runqueue_t *cpu_rq = this_rq();
-               rq = ckrm_rq_cpu_enabled(cpu_rq) ? get_task_lrq(p) 
-                                                : &(cpu_rq->dflt_lrq);
+               /* Hubertus ... we can abstract this out */
+               ckrm_lrq_t* rq = get_task_lrq(p);
 #endif
                dequeue_task(p, rq->active);
                set_tsk_need_resched(p);
@@ -2454,8 +2652,8 @@ void scheduler_tick(int user_ticks, int sys_ticks)
                        rq->expired_timestamp = jiffies;
                if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
                        enqueue_task(p, rq->expired);
-                       if (p->static_prio < rq->best_expired_prio)
-                               rq->best_expired_prio = p->static_prio;
+                       if (p->static_prio < this_rq()->best_expired_prio)
+                               this_rq()->best_expired_prio = p->static_prio;
                } else
                        enqueue_task(p, rq->active);
        } else {
@@ -2489,6 +2687,7 @@ void scheduler_tick(int user_ticks, int sys_ticks)
 out_unlock:
        spin_unlock(&rq->lock);
 out:
+       ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq));
        rebalance_tick(cpu, rq, NOT_IDLE);
 }
 
@@ -2589,7 +2788,10 @@ asmlinkage void __sched schedule(void)
        unsigned long long now;
        unsigned long run_time;
        int cpu;
-
+#ifdef CONFIG_VSERVER_HARDCPU          
+       struct vx_info *vxi;
+       int maxidle = -HZ;
+#endif
 
        /*
         * If crash dump is in progress, this other cpu's
@@ -2600,6 +2802,7 @@ asmlinkage void __sched schedule(void)
         if (unlikely(dump_oncpu))
                 goto dump_scheduling_disabled;
 
+       //WARN_ON(system_state == SYSTEM_BOOTING);
        /*
         * Test if we are atomic.  Since do_exit() needs to call into
         * schedule() atomically, we ignore that path for now.
@@ -2634,8 +2837,19 @@ need_resched:
 
        spin_lock_irq(&rq->lock);
 
-       ckrm_account_task(rq,prev,now);
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+       if (prev != rq->idle) {
+               unsigned long long run = now - prev->timestamp;
+               ckrm_lrq_t * lrq = get_task_lrq(prev);
+
+               lrq->lrq_load -= task_load(prev);
+               cpu_demand_event(&prev->demand_stat,CPU_DEMAND_DESCHEDULE,run);
+               lrq->lrq_load += task_load(prev);
 
+               cpu_demand_event(get_task_lrq_stat(prev),CPU_DEMAND_DESCHEDULE,run);
+               update_local_cvt(prev, run);
+       }
+#endif
        /*
         * if entering off of a kernel preemption go straight
         * to picking the next task.
@@ -2672,7 +2886,7 @@ need_resched:
                                next->state &= ~TASK_ONHOLD;
                                recalc_task_prio(next, now);
                                __activate_task(next, rq);
-                               // printk("×·· unhold %p\n", next);
+                               // printk("··· unhold %p\n", next);
                                break;
                        }
                        if ((ret < 0) && (maxidle < ret))
@@ -2683,11 +2897,19 @@ need_resched:
        
  pick_next:
 #endif
-       next = rq_get_next_task(rq,cpu);
-       if (unlikely(next == NULL)) {
+       if (unlikely(!rq->nr_running)) {
+               idle_balance(cpu, rq);
+                if (!rq->nr_running) {
                next = rq->idle;
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+                        rq->expired_timestamp = 0;
+#endif
+                        wake_sleeping_dependent(cpu, rq);
                goto switch_tasks;
        }
+       }
+
+       next = rq_get_next_task(rq);
 
        if (dependent_sleeper(cpu, rq, next)) {
                next = rq->idle;
@@ -2759,7 +2981,6 @@ switch_tasks:
        if (test_thread_flag(TIF_NEED_RESCHED))
                goto need_resched;
 
-       
        return;
        
  dump_scheduling_disabled:
@@ -2995,22 +3216,7 @@ long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long
 
 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
 
-void fastcall __sched sleep_on(wait_queue_head_t *q)
-{
-       SLEEP_ON_VAR
-
-        SLEEP_ON_BKLCHECK
-
-       current->state = TASK_UNINTERRUPTIBLE;
-
-       SLEEP_ON_HEAD
-       schedule();
-       SLEEP_ON_TAIL
-}
-
-EXPORT_SYMBOL(sleep_on);
-
-long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
+long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
 {
        SLEEP_ON_VAR
 
@@ -3763,6 +3969,8 @@ void show_state(void)
        read_unlock(&tasklist_lock);
 }
 
+EXPORT_SYMBOL_GPL(show_state);
+
 void __devinit init_idle(task_t *idle, int cpu)
 {
        runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle));
@@ -4442,19 +4650,20 @@ void __init sched_init(void)
                        for (k = 0; k < MAX_PRIO; k++) {
                                INIT_LIST_HEAD(array->queue + k);
                                __clear_bit(k, array->bitmap);
-                       }
+               }
                        // delimiter for bitsearch
                        __set_bit(MAX_PRIO, array->bitmap);
-               }
+       }
 
                rq->active = rq->arrays;
                rq->expired = rq->arrays + 1;
-               rq->best_expired_prio = MAX_PRIO;
 #else
                rq = cpu_rq(i);
                spin_lock_init(&rq->lock);
 #endif
 
+               rq->best_expired_prio = MAX_PRIO;
+
 #ifdef CONFIG_SMP
                rq->sd = &sched_domain_init;
                rq->cpu_load = 0;
@@ -4467,7 +4676,7 @@ void __init sched_init(void)
                INIT_LIST_HEAD(&rq->migration_queue);
 #endif
 #ifdef CONFIG_VSERVER_HARDCPU          
-               INIT_LIST_HEAD(&rq->hold_queue);
+               INIT_LIST_HEAD(&rq->hold_queue);
 #endif
                atomic_set(&rq->nr_iowait, 0);
        }
@@ -4503,17 +4712,17 @@ void __might_sleep(char *file, int line, int atomic_depth)
 #ifndef CONFIG_PREEMPT
        atomic_depth = 0;
 #endif
-       if ((in_atomic() || irqs_disabled()) &&
+       if (((in_atomic() != atomic_depth) || irqs_disabled()) &&
            system_state == SYSTEM_RUNNING) {
                if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
                        return;
                prev_jiffy = jiffies;
                printk(KERN_ERR "Debug: sleeping function called from invalid"
                                " context at %s:%d\n", file, line);
-               printk("in_atomic():%d, irqs_disabled():%d\n",
-                       in_atomic(), irqs_disabled());
+               printk("in_atomic():%d[expected: %d], irqs_disabled():%d\n",
+                       in_atomic(), atomic_depth, irqs_disabled());
                dump_stack();
-       }
+}
 #endif
 }
 EXPORT_SYMBOL(__might_sleep);
@@ -4530,7 +4739,7 @@ EXPORT_SYMBOL(__might_sleep);
  * hand while permitting preemption.
  *
  * Called inside preempt_disable().
- */
+       */
 void __sched __preempt_spin_lock(spinlock_t *lock)
 {
        if (preempt_count() > 1) {
@@ -4569,28 +4778,14 @@ EXPORT_SYMBOL(__preempt_write_lock);
 int task_running_sys(struct task_struct *p)
 {
        return task_running(task_rq(p),p);
-}
+               }
 EXPORT_SYMBOL(task_running_sys);
 #endif
 
 #ifdef CONFIG_CKRM_CPU_SCHEDULE
-
-/********************************************************************
- *
- *  CKRM Scheduler additions
- * 
- *  (a) helper functions
- *  (b) load balancing code
- *
- *  These are required here to avoid having to externalize many
- *  of the definitions in sched.c
- *
- * 
- ********************************************************************/
-
 /**
  * return the classqueue object of a certain processor
- */
+                      */
 struct classqueue_struct * get_cpu_classqueue(int cpu)
 {
        return (& (cpu_rq(cpu)->classqueue) );
@@ -4604,7 +4799,7 @@ void _ckrm_cpu_change_class(task_t *tsk, struct ckrm_cpu_class *newcls)
        prio_array_t *array;
        struct runqueue *rq;
        unsigned long flags;
-
+       
        rq = task_rq_lock(tsk,&flags); 
        array = tsk->array;
        if (array) {
@@ -4616,559 +4811,4 @@ void _ckrm_cpu_change_class(task_t *tsk, struct ckrm_cpu_class *newcls)
 
        task_rq_unlock(rq,&flags);
 }
-
-/**
- * get_min_cvt_locking - get the minimum cvt on a particular cpu under rqlock
- */
-
-CVT_t get_min_cvt(int cpu);
-
-CVT_t get_min_cvt_locking(int cpu)
-{
-       CVT_t cvt;
-       struct runqueue *rq = cpu_rq(cpu);
-       spin_lock(&rq->lock);
-       cvt = get_min_cvt(cpu);
-       spin_unlock(&rq->lock);
-       return cvt;
-}
-
-ckrm_lrq_t *rq_get_dflt_lrq(int cpu)
-{
-       return &(cpu_rq(cpu)->dflt_lrq);
-}
-
-#ifdef CONFIG_SMP
-
-/**************  CKRM Load Balancing code ************************/
-
-static inline int ckrm_preferred_task(task_t *tmp,long min, long max, 
-                                     int phase, enum idle_type idle)
-{
-       long pressure = task_load(tmp);
-       
-       if (pressure > max) 
-               return 0;
-
-       if ((idle == NOT_IDLE) && ! phase && (pressure <= min))
-               return 0;
-       return 1;
-}
-
-/*
- * move tasks for a specific local class
- * return number of tasks pulled
- */
-static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq,
-                                     runqueue_t *this_rq,
-                                     runqueue_t *busiest,
-                                     struct sched_domain *sd,
-                                     int this_cpu,
-                                     enum idle_type idle,
-                                     long* pressure_imbalance) 
-{
-       prio_array_t *array, *dst_array;
-       struct list_head *head, *curr;
-       task_t *tmp;
-       int idx;
-       int pulled = 0;
-       int phase = -1;
-       long pressure_min, pressure_max;
-       /*hzheng: magic : 90% balance is enough*/
-       long balance_min = *pressure_imbalance / 10; 
-/*
- * we don't want to migrate tasks that will reverse the balance
- *     or the tasks that make too small a difference
- */
-#define CKRM_BALANCE_MAX_RATIO 100
-#define CKRM_BALANCE_MIN_RATIO 1
- start:
-       phase ++;
-       /*
-        * We first consider expired tasks. Those will likely not be
-        * executed in the near future, and they are most likely to
-        * be cache-cold, thus switching CPUs has the least effect
-        * on them.
-        */
-       if (src_lrq->expired->nr_active) {
-               array = src_lrq->expired;
-               dst_array = dst_lrq->expired;
-       } else {
-               array = src_lrq->active;
-               dst_array = dst_lrq->active;
-       }
-       
- new_array:
-       /* Start searching at priority 0: */
-       idx = 0;
- skip_bitmap:
-       if (!idx)
-               idx = sched_find_first_bit(array->bitmap);
-       else
-               idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
-       if (idx >= MAX_PRIO) {
-               if (array == src_lrq->expired && src_lrq->active->nr_active) {
-                       array = src_lrq->active;
-                       dst_array = dst_lrq->active;
-                       goto new_array;
-               }
-               if ((! phase) && (! pulled) && (idle != IDLE))
-                       goto start; //try again
-               else 
-                       goto out; //finished search for this lrq
-       }
-       
-       head = array->queue + idx;
-       curr = head->prev;
- skip_queue:
-       tmp = list_entry(curr, task_t, run_list);
-       
-       curr = curr->prev;
-       
-       if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
-               if (curr != head)
-                       goto skip_queue;
-               idx++;
-               goto skip_bitmap;
-       }
-
-       pressure_min = *pressure_imbalance * CKRM_BALANCE_MIN_RATIO/100;
-       pressure_max = *pressure_imbalance * CKRM_BALANCE_MAX_RATIO/100;
-       /*
-        * skip the tasks that will reverse the balance too much
-        */
-       if (ckrm_preferred_task(tmp,pressure_min,pressure_max,phase,idle)) {
-               *pressure_imbalance -= task_load(tmp);
-               pull_task(busiest, array, tmp, 
-                         this_rq, dst_array, this_cpu);
-               pulled++;
-
-               if (*pressure_imbalance <= balance_min)
-                       goto out;
-       }
-               
-       if (curr != head)
-               goto skip_queue;
-       idx++;
-       goto skip_bitmap;
- out:         
-       return pulled;
-}
-
-static inline long ckrm_rq_imbalance(runqueue_t *this_rq,runqueue_t *dst_rq)
-{
-       long imbalance;
-       /*
-        * make sure after balance, imbalance' > - imbalance/2
-        * we don't want the imbalance to be reversed too much
-        */
-       imbalance = ckrm_get_pressure(rq_ckrm_load(dst_rq),0) 
-               - ckrm_get_pressure(rq_ckrm_load(this_rq),1);
-       imbalance /= 2;
-       return imbalance;
-}
-
-/*
- * try to balance the two runqueues
- *
- * Called with both runqueues locked.
- * if move_tasks is called, it will try to move at least one task over
- */
-static int ckrm_move_tasks(runqueue_t *this_rq, int this_cpu, 
-                          runqueue_t *busiest,
-                          unsigned long max_nr_move, struct sched_domain *sd,
-                          enum idle_type idle)
-{
-       struct ckrm_cpu_class *clsptr,*vip_cls = NULL;
-       ckrm_lrq_t* src_lrq,*dst_lrq;
-       long pressure_imbalance, pressure_imbalance_old;
-       int src_cpu = task_cpu(busiest->curr);
-       struct list_head *list;
-       int pulled = 0;
-       long imbalance;
-
-       imbalance =  ckrm_rq_imbalance(this_rq,busiest);
-
-       if ((idle == NOT_IDLE && imbalance <= 0) || busiest->nr_running <= 1)
-               goto out;
-
-       //try to find the vip class
-        list_for_each_entry(clsptr,&active_cpu_classes,links) {
-               src_lrq = get_ckrm_lrq(clsptr,src_cpu);
-
-               if (! lrq_nr_running(src_lrq))
-                       continue;
-
-               if (! vip_cls || cpu_class_weight(vip_cls) < cpu_class_weight(clsptr) )  
-                       {
-                               vip_cls = clsptr;
-                       }
-       }
-
-       /*
-        * do search from the most significant class
-        * hopefully, fewer tasks will be migrated this way
-        */
-       clsptr = vip_cls;
-
- move_class:
-       if (! clsptr)
-               goto out;
-       
-
-       src_lrq = get_ckrm_lrq(clsptr,src_cpu);
-       if (! lrq_nr_running(src_lrq))
-               goto other_class;
-       
-       dst_lrq = get_ckrm_lrq(clsptr,this_cpu);
-
-       //how much pressure for this class should be transferred
-       pressure_imbalance = (src_lrq->lrq_load * imbalance)/WEIGHT_TO_SHARE(src_lrq->local_weight);
-       if (pulled && ! pressure_imbalance) 
-               goto other_class;
-       
-       pressure_imbalance_old = pressure_imbalance;
-       
-       //move tasks
-       pulled += 
-               ckrm_cls_move_tasks(src_lrq,dst_lrq,
-                                   this_rq,
-                                   busiest,
-                                   sd,this_cpu,idle,
-                                   &pressure_imbalance);
-
-       /* 
-        * hzheng: 2 is another magic number
-        * stop balancing if the imbalance is less than 25% of the orig
-        */
-       if (pressure_imbalance <= (pressure_imbalance_old >> 2))
-               goto out;
-               
-       //update imbalance
-       imbalance *= pressure_imbalance / pressure_imbalance_old;
- other_class:
-       //who is next?
-       list = clsptr->links.next;
-       if (list == &active_cpu_classes)
-               list = list->next;
-       clsptr = list_entry(list, typeof(*clsptr), links);
-       if (clsptr != vip_cls)
-               goto move_class;
- out:
-       return pulled;
-}
-
-/**
- * ckrm_check_balance - is load balancing necessary?
- * return 0 if load balancing is not necessary
- * otherwise return the average load of the system
- * also, update nr_group
- *
- * heuristics: 
- *   no load balancing if its load is over average
- *   no load balancing if its load is far more than the min
- * task:
- *   read the status of all the runqueues
- */
-static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu,
-                                            enum idle_type idle, int* nr_group)
-{
-       struct sched_group *group = sd->groups;
-       unsigned long min_load, max_load, avg_load;
-       unsigned long total_load, this_load, total_pwr;
-
-       max_load = this_load = total_load = total_pwr = 0;
-       min_load = 0xFFFFFFFF;
-       *nr_group = 0;
-
-       do {
-               cpumask_t tmp;
-               unsigned long load;
-               int local_group;
-               int i, nr_cpus = 0;
-
-               /* Tally up the load of all CPUs in the group */
-               cpus_and(tmp, group->cpumask, cpu_online_map);
-               if (unlikely(cpus_empty(tmp)))
-                       goto nextgroup;
-
-               avg_load = 0;
-               local_group = cpu_isset(this_cpu, group->cpumask);
-
-               for_each_cpu_mask(i, tmp) {
-                       load = ckrm_get_pressure(rq_ckrm_load(cpu_rq(i)),local_group);
-                       nr_cpus++;
-                       avg_load += load;
-               }
-
-               if (!nr_cpus)
-                       goto nextgroup;
-
-               total_load += avg_load;
-               total_pwr += group->cpu_power;
-
-               /* Adjust by relative CPU power of the group */
-               avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
-
-               if (local_group) {
-                       this_load = avg_load;
-                       goto nextgroup;
-               } else if (avg_load > max_load) {
-                       max_load = avg_load;
-               }      
-               if (avg_load < min_load) {
-                       min_load = avg_load;
-               }
-nextgroup:
-               group = group->next;
-               *nr_group = *nr_group + 1;
-       } while (group != sd->groups);
-
-       if (!max_load || this_load >= max_load)
-               goto out_balanced;
-
-       avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
-
-       /* hzheng: debugging: 105 is a magic number
-        * 100*max_load <= sd->imbalance_pct*this_load)
-        * should use imbalance_pct instead
-        */
-       if (this_load > avg_load 
-           || 100*max_load < 105*this_load
-           || 100*min_load < 70*this_load
-           )
-               goto out_balanced;
-
-       return avg_load;
- out_balanced:
-       return 0;
-}
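
A standalone sketch of the heuristics in the ckrm_check_balance() removed above; worth_balancing() and the sample loads are illustrative, and the average/min/max group loads are taken as inputs instead of being recomputed from the sched groups:

#include <stdio.h>

/* Mirror of the checks above: skip balancing when this CPU is already at or
 * above the busiest or the average, when the busiest group is within ~5% of
 * us, or when we already carry far more than the least loaded group. */
static int worth_balancing(unsigned long this_load, unsigned long avg_load,
                           unsigned long min_load, unsigned long max_load)
{
        if (!max_load || this_load >= max_load)
                return 0;
        if (this_load > avg_load)
                return 0;
        if (100 * max_load < 105 * this_load)
                return 0;
        if (100 * min_load < 70 * this_load)
                return 0;
        return 1;
}

int main(void)
{
        printf("%d\n", worth_balancing(100, 150, 90, 200));    /* 1: pull */
        printf("%d\n", worth_balancing(100, 150, 60, 200));    /* 0: min too low */
        return 0;
}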
-
-/**
- * any group that has above average load is considered busy
- * find the busiest queue from any of busy group
- */
-static runqueue_t *
-ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu,
-                    unsigned long avg_load, enum idle_type idle,
-                    int nr_group)
-{
-       struct sched_group *group;
-       runqueue_t * busiest=NULL;
-       unsigned long rand;
-       
-       group = sd->groups;
-       rand = get_ckrm_rand(nr_group);
-       nr_group = 0;
-
-       do {
-               unsigned long load,total_load,max_load;
-               cpumask_t tmp;
-               int i;
-               runqueue_t * grp_busiest;
-
-               cpus_and(tmp, group->cpumask, cpu_online_map);
-               if (unlikely(cpus_empty(tmp)))
-                       goto find_nextgroup;
-
-               total_load = 0;
-               max_load = 0;
-               grp_busiest = NULL;
-               for_each_cpu_mask(i, tmp) {
-                       load = ckrm_get_pressure(rq_ckrm_load(cpu_rq(i)),0);
-                       total_load += load;
-                       if (load > max_load) {
-                               max_load = load;
-                               grp_busiest = cpu_rq(i);
-                       }                               
-               }
-
-               total_load = (total_load * SCHED_LOAD_SCALE) / group->cpu_power;
-               if (total_load > avg_load) {
-                       busiest = grp_busiest;
-                       if (nr_group >= rand)
-                               break;
-               }
-       find_nextgroup:         
-               group = group->next;
-               nr_group ++;
-       } while (group != sd->groups);
-
-       return busiest;
-}
-
-/**
- * load_balance - pressure based load balancing algorithm used by ckrm
- */
-static int ckrm_load_balance_locked(int this_cpu, runqueue_t *this_rq,
-                                   struct sched_domain *sd, 
-                                   enum idle_type idle)
-{
-       runqueue_t *busiest;
-       unsigned long avg_load;
-       int nr_moved,nr_group;
-
-       avg_load = ckrm_check_balance(sd, this_cpu, idle, &nr_group);
-       if (! avg_load)
-               goto out_balanced;
-
-       busiest = ckrm_find_busy_queue(sd,this_cpu,avg_load,idle,nr_group);
-       if (! busiest)
-               goto out_balanced;
-       /*
-        * This should be "impossible", but since load
-        * balancing is inherently racy and statistical,
-        * it could happen in theory.
-        */
-       if (unlikely(busiest == this_rq)) {
-               WARN_ON(1);
-               goto out_balanced;
-       }
-
-       nr_moved = 0;
-       if (busiest->nr_running > 1) {
-               /*
-                * Attempt to move tasks. If find_busiest_group has found
-                * an imbalance but busiest->nr_running <= 1, the group is
-                * still unbalanced. nr_moved simply stays zero, so it is
-                * correctly treated as an imbalance.
-                */
-               double_lock_balance(this_rq, busiest);
-               nr_moved = ckrm_move_tasks(this_rq, this_cpu, busiest,
-                                          0,sd, idle);         
-               spin_unlock(&busiest->lock);
-               if (nr_moved) {
-                       adjust_local_weight();
-               }
-       }
-
-       if (!nr_moved) 
-               sd->nr_balance_failed ++;
-       else
-               sd->nr_balance_failed  = 0;             
-
-       /* We were unbalanced, so reset the balancing interval */
-       sd->balance_interval = sd->min_interval;
-
-       return nr_moved;
-
-out_balanced:
-       /* tune up the balancing interval */
-       if (sd->balance_interval < sd->max_interval)
-               sd->balance_interval *= 2;
-
-       return 0;
-}
-
-static inline int ckrm_load_balance(int this_cpu, runqueue_t *this_rq,
-                                   struct sched_domain *sd, 
-                                   enum idle_type idle)
-{
-       int ret;
-
-       if (ckrm_rq_cpu_disabled(this_rq)) 
-               return -1;
-       //spin_lock(&this_rq->lock);
-       read_lock(&class_list_lock);
-       ret = ckrm_load_balance_locked(this_cpu,this_rq,sd,idle);
-       // ret = ckrm_load_balance_locked(this_cpu,this_rq,sd,NEWLY_IDLE);
-       read_unlock(&class_list_lock);
-       //spin_unlock(&this_rq->lock);
-       return ret;
-}
-
-#endif   // CONFIG_SMP
-
-
-void ckrm_cpu_class_queue_update(int on)
-{
-       /* This is called when the mode changes from disabled
-        * to enabled (on=1) or vice versa (on=0).
-        * we make sure that all classqueues on all cpus
-        * either have the default class enqueued (on=1) or 
-        * all classes dequeued (on=0). 
-        * If this is not done, a race condition persists when
-        * flipping ckrm_sched_mode, and rq_get_next_task would
-        * need more complicated code for the case where runnable
-        * tasks exist but no class is enqueued.
-        */
-
-       int i;
-       runqueue_t *rq;
-       ckrm_lrq_t *lrq;
-       struct ckrm_cpu_class *clsptr;
-
-       if (on) {       
-               BUG_ON(ckrm_cpu_enabled());
-               for_each_cpu(i) {
-                       rq = cpu_rq(i);
-                       BUG_ON(ckrm_rq_cpu_enabled(rq));
-                       lrq = &rq->dflt_lrq;
-                       spin_lock(&rq->lock);
-
-                       BUG_ON(cls_in_classqueue(&lrq->classqueue_linkobj));
-
-                       classqueue_init(&rq->classqueue,1);
-                       lrq->top_priority = find_first_bit(lrq->active->bitmap,
-                                                          MAX_PRIO),
-                       classqueue_enqueue(lrq->classqueue, 
-                                          &lrq->classqueue_linkobj, 0);
-                       spin_unlock(&rq->lock);
-#if 0
-                       printk("UPDATE(%d) run=%lu:%d:%d %d:%d->%d\n", i,
-                               rq->nr_running,lrq->active->nr_active,
-                               lrq->expired->nr_active,
-                               find_first_bit(lrq->active->bitmap,MAX_PRIO),
-                               find_first_bit(lrq->expired->bitmap,MAX_PRIO),
-                               lrq->top_priority);
 #endif
-               }
-       } else {
-               for_each_cpu(i) {
-                       rq = cpu_rq(i);
-                       spin_lock(&rq->lock);
-
-                       /* walk through all classes and make sure they
-                        * are not enqueued
-                        */
-                       write_lock(&class_list_lock);
-                       list_for_each_entry(clsptr,&active_cpu_classes,links) {
-                               lrq = get_ckrm_lrq(clsptr,i);
-                               BUG_ON((lrq != &rq->dflt_lrq) && lrq_nr_running(lrq));  // must be empty
-                               if (cls_in_classqueue(&lrq->classqueue_linkobj)) 
-                                       classqueue_dequeue(lrq->classqueue,
-                                                       &lrq->classqueue_linkobj);
-                       }
-                       rq->classqueue.enabled = 0;
-                       write_unlock(&class_list_lock);
-                       spin_unlock(&rq->lock);
-               }
-       }
-}
-
-/*
- * callback when a class is getting deleted
- * need to remove it from the class runqueue. see (class_queue_update)
- */
-
-void ckrm_cpu_class_queue_delete_sync(struct ckrm_cpu_class *clsptr)
-{
-       int i;
-       
-       for_each_cpu(i) {
-               runqueue_t *rq = cpu_rq(i);
-               ckrm_lrq_t *lrq = get_ckrm_lrq(clsptr,i);
-
-               spin_lock(&rq->lock);
-               write_lock(&class_list_lock);
-               BUG_ON(lrq_nr_running(lrq));  // must be empty
-               if (cls_in_classqueue(&lrq->classqueue_linkobj)) 
-                       classqueue_dequeue(lrq->classqueue,
-                                          &lrq->classqueue_linkobj);
-               write_unlock(&class_list_lock);
-               spin_unlock(&rq->lock);
-       }
-}
-
-#endif  // CONFIG_CKRM_CPU_SCHEDULE