various fixes to ckrm core and the cpu controller
author		Marc Fiuczynski <mef@cs.princeton.edu>
		Tue, 7 Sep 2004 19:54:07 +0000 (19:54 +0000)
committer	Marc Fiuczynski <mef@cs.princeton.edu>
		Tue, 7 Sep 2004 19:54:07 +0000 (19:54 +0000)
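
The bulk of the change reworks the CKRM CPU controller's accounting and share allocation:

- rename struct ckrm_local_runqueue to struct ckrm_runqueue (ckrm_lrq_t) and drop the global_cvt/uncounted_cvt bookkeeping;
- replace effective_guarantee/effective_limit/effective_share/self_effective_share with egrt/megrt, ehl/mehl and eshare/meshare, and track max_demand per class;
- track cpu demand per task (struct ckrm_cpu_demand_stat in task_struct) with separate recalculation intervals for tasks and classes;
- rename CLASS_BONUS_RATE/PRIORITY_BONUS_RATE to CLASS_QUANTIZER/PRIORITY_QUANTIZER and start new local runqueues at CVT_INTERACTIVE_BONUS;
- add lrq_load/local_weight bookkeeping and a PID-style load sample (struct ckrm_load_struct) for load balancing;
- introduce a CKRM idle class with per-cpu idle tasks as groundwork for hard limits;
- fold CONFIG_CKRM_CPU_MONITOR into CONFIG_CKRM_CPU_SCHEDULE (now default y), shorten the monitor interval from 4*HZ to 2*HZ, and raise share accuracy from 7 to 10 bits;
- tag cpu classes with CKRM_CPU_CLASS_MAGIC and bail out of tree walks when a class is invalid;
- rename the resource controller from "CKRM CPU Class" to "cpu" and rework the /rcfs stats output.
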
16 files changed:
include/linux/ckrm_classqueue.h
include/linux/ckrm_sched.h
include/linux/sched.h
init/Kconfig
init/main.c
kernel/Makefile
kernel/ckrm/Makefile
kernel/ckrm/ckrm_cpu_class.c
kernel/ckrm/ckrm_cpu_monitor.c
kernel/ckrm/ckrm_tc.c
kernel/ckrm/rbce/rbcemod.c
kernel/ckrm_classqueue.c
kernel/ckrm_sched.c
kernel/sched.c
kernel/vserver/dlimit.c
kernel/vserver/sysctl.c

diff --git a/include/linux/ckrm_classqueue.h b/include/linux/ckrm_classqueue.h
index 1bdf9b7..a825336 100644
@@ -116,7 +116,7 @@ void classqueue_update_prio(struct classqueue_struct *cq, cq_node_t * node, int
 cq_node_t *classqueue_get_head(struct classqueue_struct *cq);
 
 /*update the base priority of the classqueue*/
-void classqueue_update_base(struct classqueue_struct *cq, int new_base);
+void classqueue_update_base(struct classqueue_struct *cq);
 
 /**
  * class_compare_prio: compare the priority of these two nodes
diff --git a/include/linux/ckrm_sched.h b/include/linux/ckrm_sched.h
index 9d82214..6b55e2c 100644
 #ifndef _CKRM_SCHED_H
 #define _CKRM_SCHED_H
 
-#define CC_BUG_ON_DO(cond,action)  do { if (cond)  action; BUG_ON(cond); } while(0)
-#define CC_BUG_ON(cond)            BUG_ON(cond)
-
 #include <linux/sched.h>
 #include <linux/ckrm_rc.h>
 #include <linux/ckrm_classqueue.h>
-
-//update every second
-#define CVT_UPDATE_TICK     (1*HZ/1 ?: 1)
-#define CLASS_BONUS_RATE 22    // shift from ns to increase class bonus
-#define PRIORITY_BONUS_RATE 0  // ??  Hubertus
+#include <linux/random.h>
 
 #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
+
 struct prio_array {
-       int nr_active;
+       unsigned int nr_active;
        unsigned long bitmap[BITMAP_SIZE];
        struct list_head queue[MAX_PRIO];
 };
 
-struct ckrm_local_runqueue {
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+#define rq_active(p,rq)   (get_task_lrq(p)->active)
+#define rq_expired(p,rq)  (get_task_lrq(p)->expired)
+int __init init_ckrm_sched_res(void);
+#else
+#define rq_active(p,rq)   (rq->active)
+#define rq_expired(p,rq)  (rq->expired)
+static inline void init_ckrm_sched_res(void) {}
+static inline int ckrm_cpu_monitor_init(void) {return 0;}
+#endif
+
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+struct ckrm_runqueue {
        cq_node_t classqueue_linkobj;   /*links in classqueue */
        struct ckrm_cpu_class *cpu_class;       // class it belongs to
        struct classqueue_struct *classqueue;   // classqueue it belongs to
-       CVT_t uncounted_cvt;
        unsigned long long uncounted_ns;
 
        prio_array_t *active, *expired, arrays[2];
@@ -55,19 +60,15 @@ struct ckrm_local_runqueue {
         * updated on enqueue, dequeue
         */
        int top_priority;
-       CVT_t local_cvt;        // snapshot of local_cvt, update on every loadbalance
+       CVT_t local_cvt;
+
+       unsigned long lrq_load;
+       int local_weight; 
+
        unsigned long magic;    //for debugging
 };
 
-/**
- * @last_sleep: the last time it sleeps, last_sleep = 0 when not sleeping
- */
-struct ckrm_cpu_class_local_stat {
-       unsigned long long run;
-       unsigned long long total;
-       unsigned long long last_sleep;
-       unsigned long cpu_demand; /*estimated cpu demand */
-};
+typedef struct ckrm_runqueue ckrm_lrq_t;
 
 /**
  * ckrm_cpu_class_stat - cpu usage statistics maintained for each class
@@ -78,23 +79,24 @@ struct ckrm_cpu_class_stat {
 
        unsigned long long total_ns;    /*how much nano-secs it has consumed */
 
-       struct ckrm_cpu_class_local_stat local_stats[NR_CPUS];
-       unsigned long cpu_demand;
+       struct ckrm_cpu_demand_stat local_stats[NR_CPUS];
+
+       /* 
+        * 
+        */
+       unsigned long max_demand; /* the maximum a class can consume */
+       int egrt,megrt; /*effective guarantee, my effective guarantee*/
+       int ehl,mehl; /*effective hard limit, my effective hard limit*/
 
-       /*temp stat used by cpu monitor */
-       int effective_guarantee;
-       int effective_limit;
-       int glut;               //true or false
        /*
-        * effective_share: for both default class and its children
-        * self_effective_share: just for the default class
+        * eshare: for both default class and its children
+        * meshare: just for the default class
         */
-       int effective_share;
-       int self_effective_share;
+       int eshare;
+       int meshare;
 };
 
-typedef struct ckrm_cpu_class_stat ckrm_stat_t;
-
+#define CKRM_CPU_CLASS_MAGIC 0x7af2abe3
 /*
  * manages the class status
  * there should be only one instance of this object for each class in the whole system  
@@ -104,72 +106,67 @@ struct ckrm_cpu_class {
        struct ckrm_core_class *parent;
        struct ckrm_shares shares;
        spinlock_t cnt_lock;    // always grab parent's lock first and then child's
-       CVT_t global_cvt;       // total cummulative virtual time
        struct ckrm_cpu_class_stat stat;
        struct list_head links; // for linking up in cpu classes
-       struct ckrm_local_runqueue local_queues[NR_CPUS];       // runqueues 
+       ckrm_lrq_t local_queues[NR_CPUS];       // runqueues 
+       unsigned long magic;    //for debugging
 };
 
-#if CONFIG_CKRM_CPU_SCHEDULE
-#define rq_active(p,rq)   (get_task_class_queue(p)->active)
-#define rq_expired(p,rq)  (get_task_class_queue(p)->expired)
-#else
-#define rq_active(p,rq)   (rq->active)
-#define rq_expired(p,rq)  (rq->expired)
-#endif
-
-//#define cpu_class_weight(cls) (cls->shares.my_guarantee)
-#define cpu_class_weight(cls) (cls->stat.self_effective_share)
+#define cpu_class_weight(cls) (cls->stat.meshare)
+#define local_class_weight(lrq) (lrq->local_weight)
 
-#define bpt_queue(cpu) (& (cpu_rq(cpu)->classqueue) )
-CVT_t get_min_cvt(int cpu);
+static inline int valid_cpu_class(struct ckrm_cpu_class * cls)
+{
+       return (cls && cls->magic == CKRM_CPU_CLASS_MAGIC);
+}
 
 struct classqueue_struct *get_cpu_classqueue(int cpu);
+struct ckrm_cpu_class * get_default_cpu_class(void);
 
-extern struct ckrm_cpu_class default_cpu_class_obj;
-#define default_cpu_class (&default_cpu_class_obj)
+#define lrq_nr_running(lrq) \
+             (lrq->active->nr_active + lrq->expired->nr_active)
 
-#define local_queue_nr_running(local_queue) \
-             (local_queue->active->nr_active + local_queue->expired->nr_active)
-
-static inline struct ckrm_local_runqueue *
-get_ckrm_local_runqueue(struct ckrm_cpu_class*cls, int cpu)
+static inline ckrm_lrq_t *
+get_ckrm_lrq(struct ckrm_cpu_class*cls, int cpu)
 {
        return &(cls->local_queues[cpu]);
 }
 
-static inline struct ckrm_local_runqueue *get_task_class_queue(struct task_struct *p)
+static inline ckrm_lrq_t *get_task_lrq(struct task_struct *p)
 {
        return &(p->cpu_class->local_queues[task_cpu(p)]);
 }
 
 #define task_list_entry(list)  list_entry(list,struct task_struct,run_list)
-#define class_list_entry(list) list_entry(list,struct ckrm_local_runqueue,classqueue_linkobj)
+#define class_list_entry(list) list_entry(list,struct ckrm_runqueue,classqueue_linkobj)
 
 /* some additional interfaces exported from sched.c */
 struct runqueue;
-void dequeue_task(struct task_struct *p, prio_array_t * array);
-void enqueue_task(struct task_struct *p, prio_array_t * array);
-struct runqueue *task_rq_lock(task_t * p, unsigned long *flags);
-void task_rq_unlock(struct runqueue *rq, unsigned long *flags);
-extern spinlock_t cvt_lock;
 extern rwlock_t class_list_lock;
 extern struct list_head active_cpu_classes;
+unsigned int task_timeslice(task_t *p);
+void _ckrm_cpu_change_class(task_t *task, struct ckrm_cpu_class *newcls);
 
-/*functions exported by ckrm_cpu_class.c*/
-int __init init_ckrm_sched_res(void);
 void init_cpu_classes(void);
+void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares);
+void ckrm_cpu_change_class(void *task, void *old, void *new);
+
 
-/*functions exported by ckrm_cpu_monitor.c*/
-void ckrm_cpu_monitor(void);
-void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat);
 #define CPU_DEMAND_ENQUEUE 0
 #define CPU_DEMAND_DEQUEUE 1
 #define CPU_DEMAND_DESCHEDULE 2
-void cpu_demand_event(struct ckrm_cpu_class_local_stat* local_stat, int event, unsigned long long len);
+#define CPU_DEMAND_INIT 3
+
+/*functions exported by ckrm_cpu_monitor.c*/
+void ckrm_cpu_monitor(void);
+int ckrm_cpu_monitor_init(void);
+void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat);
+void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len);
+void adjust_local_weight(void);
 
-#define get_task_local_stat(p) (&(p)->cpu_class->stat.local_stats[task_cpu(p)])
-#define get_rq_local_stat(lrq,cpu) (&(lrq)->cpu_class->stat.local_stats[cpu])
+#define get_task_lrq_stat(p) (&(p)->cpu_class->stat.local_stats[task_cpu(p)])
+#define get_cls_local_stat(cls,cpu) (&(cls)->stat.local_stats[cpu])
+#define get_rq_local_stat(lrq,cpu) (get_cls_local_stat((lrq)->cpu_class,cpu))
 
 /**
  * get_effective_prio: return the effective priority of a class local queue
@@ -181,14 +178,15 @@ void cpu_demand_event(struct ckrm_cpu_class_local_stat* local_stat, int event, u
  * currently, prio increases by 1 if either: top_priority increases by one
  *                                   or, local_cvt increases by 4ms
  */
-static inline int get_effective_prio(struct ckrm_local_runqueue * lcq)
+#define CLASS_QUANTIZER 22     //shift from ns to increase class bonus
+#define PRIORITY_QUANTIZER 0   //controls how much a high prio task can borrow
+#define CVT_INTERACTIVE_BONUS ((CLASSQUEUE_SIZE << CLASS_QUANTIZER)*2)
+static inline int get_effective_prio(ckrm_lrq_t * lrq)
 {
        int prio;
 
-       // cumulative usage
-       prio = lcq->local_cvt >> CLASS_BONUS_RATE;
-       // queue urgency
-       prio += lcq->top_priority >> PRIORITY_BONUS_RATE;
+       prio = lrq->local_cvt >> CLASS_QUANTIZER;  // cumulative usage
+       prio += lrq->top_priority >> PRIORITY_QUANTIZER; // queue urgency
 
        return prio;
 }
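
Illustration (not part of the patch): with CLASS_QUANTIZER = 22, a local runqueue has to accumulate 2^22 ns of cvt, roughly 4.2 ms, before its effective priority rises by one, which is where the "4ms" in the comment above comes from; PRIORITY_QUANTIZER = 0 feeds top_priority through unshifted. A standalone sketch of the same arithmetic with made-up inputs:

	/* Illustrative sketch only -- mirrors get_effective_prio() above. */
	#include <stdio.h>

	#define CLASS_QUANTIZER    22   /* shift from ns of cvt to priority units */
	#define PRIORITY_QUANTIZER  0   /* top_priority passes through unscaled   */

	int main(void)
	{
		unsigned long long local_cvt = 8ULL << 20;  /* ~8.4 ms of cvt (hypothetical)  */
		int top_priority = 100;                     /* hypothetical queue priority    */

		int prio = (int)(local_cvt >> CLASS_QUANTIZER)    /* cumulative usage: 2 */
			 + (top_priority >> PRIORITY_QUANTIZER);  /* queue urgency: 100  */

		printf("effective prio = %d\n", prio);            /* prints 102 */
		return 0;
	}
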
@@ -206,9 +204,8 @@ static inline int get_effective_prio(struct ckrm_local_runqueue * lcq)
  *      -- rq_get_next_task (queue switch)
  *   -- update_local_cvt
  *      -- schedule
- *   -- update_global_cvt
  */
-static inline void update_class_priority(struct ckrm_local_runqueue *local_rq)
+static inline void update_class_priority(ckrm_lrq_t *local_rq)
 {
        int effective_prio = get_effective_prio(local_rq);
        classqueue_update_prio(local_rq->classqueue,
@@ -220,42 +217,81 @@ static inline void update_class_priority(struct ckrm_local_runqueue *local_rq)
  *  set the new top priority and reposition the queue
  *  called when: task enqueue/dequeue and queue switch
  */
-static inline void set_top_priority(struct ckrm_local_runqueue *class_queue,
+static inline void set_top_priority(ckrm_lrq_t *lrq,
                                    int new_priority)
 {
-       class_queue->top_priority = new_priority;
-       update_class_priority(class_queue);
+       lrq->top_priority = new_priority;
+       update_class_priority(lrq);
+}
+
+/*
+ * task_load: how much load this task counts
+ */
+static inline unsigned long task_load(struct task_struct* p)
+{
+       return (task_timeslice(p) * p->demand_stat.cpu_demand);
+}
+
+/*
+ * runqueue load is the local_weight of all the classes on this cpu
+ * must be called with class_list_lock held
+ */
+static inline unsigned long ckrm_cpu_load(int cpu)
+{
+       struct ckrm_cpu_class *clsptr;
+       ckrm_lrq_t* lrq;
+       struct ckrm_cpu_demand_stat* l_stat;
+       int total_load = 0;
+       int load;
+
+       list_for_each_entry(clsptr,&active_cpu_classes,links) {
+               lrq =  get_ckrm_lrq(clsptr,cpu);
+               l_stat = get_cls_local_stat(clsptr,cpu);
+               load = lrq->local_weight;
+               if (l_stat->cpu_demand < load)
+                       load = l_stat->cpu_demand;
+               total_load += load;
+       }       
+       return total_load;
 }
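
Illustration (not part of the patch): a class contributes min(local_weight, cpu_demand) to the per-cpu load, so a class with a large weight but little demand does not inflate the load figure. With made-up per-class numbers:

	/* Illustrative sketch only -- the same min(weight, demand) sum as ckrm_cpu_load() above. */
	#include <stdio.h>

	int main(void)
	{
		unsigned long local_weight[] = { 512, 256, 256 };  /* hypothetical class weights on this cpu */
		unsigned long cpu_demand[]   = { 300, 400, 100 };  /* hypothetical local demand estimates    */
		unsigned long total_load = 0;

		for (int i = 0; i < 3; i++) {
			unsigned long load = local_weight[i];
			if (cpu_demand[i] < load)
				load = cpu_demand[i];   /* never more than the class actually demands */
			total_load += load;
		}
		printf("ckrm_cpu_load = %lu\n", total_load);  /* 300 + 256 + 100 = 656 */
		return 0;
	}
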
 
 static inline void class_enqueue_task(struct task_struct *p,
                                      prio_array_t * array)
 {
-       struct ckrm_local_runqueue *queue;
+       ckrm_lrq_t *lrq;
        int effective_prio;
 
-       queue = get_task_class_queue(p);
 
-       if (! cls_in_classqueue(&queue->classqueue_linkobj)) {
-               cpu_demand_event(get_task_local_stat(p),CPU_DEMAND_ENQUEUE,0);
-               /*make sure the cvt of this class is up to date*/
-               queue->local_cvt = get_min_cvt(task_cpu(p));
-               effective_prio = get_effective_prio(queue);
-               classqueue_enqueue(queue->classqueue, &queue->classqueue_linkobj, effective_prio);
+       lrq = get_task_lrq(p);
+
+       cpu_demand_event(&p->demand_stat,CPU_DEMAND_ENQUEUE,0);
+       lrq->lrq_load += task_load(p);
+
+       if ((p->prio < lrq->top_priority) && (array == lrq->active))
+               set_top_priority(lrq, p->prio); 
+
+       if (! cls_in_classqueue(&lrq->classqueue_linkobj)) {
+               cpu_demand_event(get_task_lrq_stat(p),CPU_DEMAND_ENQUEUE,0);
+               effective_prio = get_effective_prio(lrq);
+               classqueue_enqueue(lrq->classqueue, &lrq->classqueue_linkobj, effective_prio);
        } 
-       
-       if ((p->prio < queue->top_priority) && (array == queue->active))
-               set_top_priority(queue, p->prio);       
 
 }
 
 static inline void class_dequeue_task(struct task_struct *p,
                                      prio_array_t * array)
 {
-       struct ckrm_local_runqueue *queue = get_task_class_queue(p);
+       ckrm_lrq_t *lrq = get_task_lrq(p);
+       unsigned long load = task_load(p);
+       
+       BUG_ON(lrq->lrq_load < load);
+       lrq->lrq_load -= load;
 
-       if ((array == queue->active) && (p->prio == queue->top_priority)
+       cpu_demand_event(&p->demand_stat,CPU_DEMAND_DEQUEUE,0);
+
+       if ((array == lrq->active) && (p->prio == lrq->top_priority)
            && list_empty(&(array->queue[p->prio])))
-               set_top_priority(queue,
+               set_top_priority(lrq,
                                 find_next_bit(array->bitmap, MAX_PRIO,
                                               p->prio));
 }
@@ -266,32 +302,81 @@ static inline void class_dequeue_task(struct task_struct *p,
  */
 static inline void update_local_cvt(struct task_struct *p, unsigned long nsec)
 {
-       struct ckrm_local_runqueue *class_queue = get_task_class_queue(p);
-       struct ckrm_cpu_class *cls = class_queue->cpu_class;
+       ckrm_lrq_t * lrq = get_task_lrq(p);
+
+       unsigned long cvt_inc = nsec / local_class_weight(lrq);
 
-       unsigned long cvt_inc = nsec / cpu_class_weight(cls);
+       lrq->local_cvt += cvt_inc;
+       lrq->uncounted_ns += nsec;
 
-       class_queue->local_cvt += cvt_inc;
-       class_queue->uncounted_cvt += cvt_inc;
+       update_class_priority(lrq);
+}
+
+static inline int class_preempts_curr(struct task_struct * p, struct task_struct* curr)
+{
+       struct cq_node_struct* node1 = &(get_task_lrq(p)->classqueue_linkobj);
+       struct cq_node_struct* node2 = &(get_task_lrq(curr)->classqueue_linkobj);
 
-       class_queue->uncounted_ns += nsec;
-       update_class_priority(class_queue);
+       return (class_compare_prio(node1,node2) < 0);
 }
 
 /*
- * called during loadbalancing 
- * to charge the class with locally accumulated cvt
+ * return a random value with range [0, (val-1)]
  */
-void update_global_cvts(int this_cpu);
+static inline int get_ckrm_rand(unsigned long val)
+{
+       int rand;
 
-/**
- * 
- */
-static inline int class_preempts_curr(struct task_struct * p, struct task_struct* curr)
+       if (! val)
+               return 0;
+
+       get_random_bytes(&rand,sizeof(rand));
+       return (rand % val);
+}
+
+void update_class_cputime(int this_cpu);
+
+/**********************************************/
+/*          PID_LOAD_BALANCING                */
+/**********************************************/
+struct ckrm_load_struct {
+       unsigned long load_p;   /*proportional*/
+       unsigned long load_i;   /*integral   */
+       long load_d;   /*derivative */
+};
+
+typedef struct ckrm_load_struct ckrm_load_t;
+
+static inline void ckrm_load_init(ckrm_load_t* ckrm_load) {
+       ckrm_load->load_p = 0;
+       ckrm_load->load_i = 0;
+       ckrm_load->load_d = 0;
+}
+
+void ckrm_load_sample(ckrm_load_t* ckrm_load,int cpu);
+long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group);
+#define rq_ckrm_load(rq) (&((rq)->ckrm_load))
+
+static inline void ckrm_sched_tick(int j,int this_cpu,struct ckrm_load_struct* ckrm_load)
 {
-       struct cq_node_struct* node1 = &(get_task_class_queue(p)->classqueue_linkobj);
-       struct cq_node_struct* node2 = &(get_task_class_queue(curr)->classqueue_linkobj);
+#define CVT_UPDATE_TICK     ((HZ/2)?:1)
+#define CKRM_BASE_UPDATE_RATE 400
 
-       return (class_compare_prio(node1,node2) < 0);
+       read_lock(&class_list_lock);
+
+#ifdef CONFIG_SMP
+       ckrm_load_sample(ckrm_load,this_cpu);
+#endif
+
+       if (!(j % CVT_UPDATE_TICK))
+               update_class_cputime(this_cpu);
+
+       if (! (j % CKRM_BASE_UPDATE_RATE))
+               classqueue_update_base(get_cpu_classqueue(this_cpu));
+
+       read_unlock(&class_list_lock);
 }
+
+#endif /*CONFIG_CKRM_CPU_SCHEDULE */
+
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b922e87..0199d9d 100644
@@ -398,6 +398,24 @@ int set_current_groups(struct group_info *group_info);
 struct audit_context;          /* See audit.c */
 struct mempolicy;
 
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+/**
+ * ckrm_cpu_demand_stat - used to track the cpu demand of a task/class
+ * @run: how much time it has been running since the counter started
+ * @total: total time since the counter started
+ * @last_sleep: the last time it sleeps, last_sleep = 0 when not sleeping
+ * @recalc_interval: how often do we recalculate the cpu_demand
+ * @cpu_demand: moving average of run/total
+ */
+struct ckrm_cpu_demand_stat {
+       unsigned long long run;
+       unsigned long long total;
+       unsigned long long last_sleep;
+       unsigned long long recalc_interval;
+       unsigned long cpu_demand; /*estimated cpu demand */
+};
+#endif
+
 struct task_struct {
        volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
        struct thread_info *thread_info;
@@ -489,7 +507,6 @@ struct task_struct {
 /* signal handlers */
        struct signal_struct *signal;
        struct sighand_struct *sighand;
-
        sigset_t blocked, real_blocked;
        struct sigpending pending;
 
@@ -548,7 +565,9 @@ struct task_struct {
        struct list_head        taskclass_link;
 #ifdef CONFIG_CKRM_CPU_SCHEDULE
         struct ckrm_cpu_class *cpu_class;
-#endif
+       //track cpu demand of this task
+       struct ckrm_cpu_demand_stat demand_stat;
+#endif //CONFIG_CKRM_CPU_SCHEDULE
 #endif // CONFIG_CKRM_TYPE_TASKCLASS
 #endif // CONFIG_CKRM
 
@@ -874,6 +893,7 @@ static inline int capable(int cap)
 }
 #endif
 
+
 /*
  * Routines for handling mm_structs
  */
@@ -1007,7 +1027,7 @@ static inline struct mm_struct * get_task_mm(struct task_struct * task)
 
        return mm;
 }
-
 /* set thread flags in other task's structures
  * - see asm/thread_info.h for TIF_xxxx flags available
  */
diff --git a/init/Kconfig b/init/Kconfig
index 7738741..1c01815 100644
@@ -172,21 +172,12 @@ config CKRM_RES_NUMTASKS
 config CKRM_CPU_SCHEDULE
        bool "CKRM CPU scheduler"
        depends on CKRM_TYPE_TASKCLASS
-       default m
+       default y
        help
          Use CKRM CPU scheduler instead of Linux Scheduler
        
          Say N if unsure, Y to use the feature.
 
-config CKRM_CPU_MONITOR
-       bool "CKRM CPU Resoure Monitor"
-       depends on CKRM_CPU_SCHEDULE
-       default m
-       help
-         Monitor CPU Resource Usage of the classes
-       
-         Say N if unsure, Y to use the feature.
-
 config CKRM_TYPE_SOCKETCLASS
        bool "Class Manager for socket groups"
        depends on CKRM
diff --git a/init/main.c b/init/main.c
index 5c3a795..502ae94 100644
 #include <asm/setup.h>
 
 #include <linux/ckrm.h>
-#ifdef CONFIG_CKRM_CPU_SCHEDULE
-int __init init_ckrm_sched_res(void);
-#else
-#define init_ckrm_sched_res() ((void)0)
-#endif
+#include <linux/ckrm_sched.h>
 
 /*
  * This is one of the first .c files built. Error out early
@@ -466,6 +462,7 @@ asmlinkage void __init start_kernel(void)
         * printk() and can access its per-cpu storage.
         */
        smp_prepare_boot_cpu();
+
        /*
         * Set up the scheduler prior starting any interrupts (such as the
         * timer interrupt). Full topology setup happens at smp_init()
diff --git a/kernel/Makefile b/kernel/Makefile
index 905f3c5..ec50010 100644
@@ -27,12 +27,9 @@ obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_IKCONFIG_PROC) += configs.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
-obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_classqueue.o
-obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_sched.o
+obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_classqueue.o ckrm_sched.o
 obj-$(CONFIG_AUDIT) += audit.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
-obj-$(CONFIG_KGDB) += kgdbstub.o
-
 
 ifneq ($(CONFIG_IA64),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/ckrm/Makefile b/kernel/ckrm/Makefile
index 58b9aad..c039f8e 100644
@@ -1,14 +1,12 @@
 #
-# Makefile for CKRM 
+# Makefile for CKRM
 #
 
 ifeq ($(CONFIG_CKRM),y)
-       obj-y = ckrm.o ckrmutils.o 
-endif
-
-obj-$(CONFIG_CKRM_TYPE_TASKCLASS) += ckrm_tc.o 
-obj-$(CONFIG_CKRM_RES_NUMTASKS) += ckrm_tasks.o
-
-obj-$(CONFIG_CKRM_TYPE_SOCKETCLASS) += ckrm_sockc.o        
-obj-$(CONFIG_CKRM_RES_LISTENAQ) += ckrm_listenaq.o  
-
+    obj-y = ckrm.o ckrmutils.o ckrm_tasks_stub.o rbce/
+endif  
+    obj-$(CONFIG_CKRM_TYPE_TASKCLASS)  += ckrm_tc.o
+    obj-$(CONFIG_CKRM_RES_NUMTASKS)    += ckrm_tasks.o
+    obj-$(CONFIG_CKRM_TYPE_SOCKETCLASS) += ckrm_sockc.o
+    obj-$(CONFIG_CKRM_RES_LISTENAQ)    += ckrm_listenaq.o
+    obj-$(CONFIG_CKRM_CPU_SCHEDULE)    += ckrm_cpu_class.o ckrm_cpu_monitor.o
diff --git a/kernel/ckrm/ckrm_cpu_class.c b/kernel/ckrm/ckrm_cpu_class.c
index 0ded7f3..a066e73 100644
 #include <linux/ckrm_classqueue.h>
 #include <linux/seq_file.h>
 
-
 struct ckrm_res_ctlr cpu_rcbs;
 
+/**
+ * insert_cpu_class - insert a class to active_cpu_class list
+ *
+ * insert the class in decreasing order of class weight
+ */
+static inline void insert_cpu_class(struct ckrm_cpu_class *cls)
+{
+       list_add(&cls->links,&active_cpu_classes);
+}
+
 /*
  *  initialize a class object and its local queues
  */
- static void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares) 
+void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares) 
 {
        int i,j,k;      
        prio_array_t *array;    
-       struct ckrm_local_runqueue* queue;
+       ckrm_lrq_t* queue;
+
+       cls->shares = *shares;
+       cls->cnt_lock = SPIN_LOCK_UNLOCKED;
+       ckrm_cpu_stat_init(&cls->stat);
+       cls->magic = CKRM_CPU_CLASS_MAGIC;
 
        for (i = 0 ; i < NR_CPUS ; i++) {
                queue = &cls->local_queues[i];
@@ -57,35 +71,37 @@ struct ckrm_res_ctlr cpu_rcbs;
                queue->classqueue = get_cpu_classqueue(i);
                queue->top_priority = MAX_PRIO;
                cq_node_init(&queue->classqueue_linkobj);
-               queue->local_cvt = 0;
-               queue->uncounted_cvt = 0;
+               queue->local_cvt = CVT_INTERACTIVE_BONUS;
+               queue->lrq_load = 0;
+               queue->local_weight = cpu_class_weight(cls);
                queue->uncounted_ns = 0;
                queue->magic = 0x43FF43D7;
        }
 
-       cls->shares = *shares;
-       cls->global_cvt = 0;
-       cls->cnt_lock = SPIN_LOCK_UNLOCKED;
-       ckrm_cpu_stat_init(&cls->stat);
-
        // add to class list
        write_lock(&class_list_lock);
-       list_add(&cls->links,&active_cpu_classes);
+       insert_cpu_class(cls);
        write_unlock(&class_list_lock);
 }
 
 static inline void set_default_share(ckrm_shares_t *shares)
 {
        shares->my_guarantee     = 0;
-       shares->my_limit         = CKRM_SHARE_DFLT_MAX_LIMIT;
        shares->total_guarantee  = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-       shares->max_limit        = CKRM_SHARE_DFLT_MAX_LIMIT;
        shares->unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-       shares->cur_max_limit    = CKRM_SHARE_DFLT_MAX_LIMIT;
+       shares->my_limit         = CKRM_SHARE_DFLT_MAX_LIMIT;
+       shares->max_limit        = CKRM_SHARE_DFLT_MAX_LIMIT;
+       shares->cur_max_limit    = 0;
 }
 
-struct ckrm_cpu_class * ckrm_get_cpu_class(struct ckrm_core_class *core) {
-       return ckrm_get_res_class(core, cpu_rcbs.resid, struct ckrm_cpu_class);
+struct ckrm_cpu_class * ckrm_get_cpu_class(struct ckrm_core_class *core)
+{
+       struct ckrm_cpu_class * cls;
+       cls = ckrm_get_res_class(core, cpu_rcbs.resid, struct ckrm_cpu_class);
+       if (valid_cpu_class(cls))
+               return cls;
+       else
+               return NULL;
 }
 
 
@@ -94,7 +110,7 @@ void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class
        struct ckrm_cpu_class *cls;
 
        if (! parent) /*root class*/
-               cls =  default_cpu_class;
+               cls =  get_default_cpu_class();
        else
                cls = (struct ckrm_cpu_class *) kmalloc(sizeof(struct ckrm_cpu_class),GFP_ATOMIC);
 
@@ -113,7 +129,7 @@ void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class
                        cls->parent = parent;
                }
        } else
-               printk("alloc_cpu_class failed GFP_ATOMIC\n");
+               printk(KERN_ERR"alloc_cpu_class failed\n");
 
        return cls;
 }              
@@ -132,7 +148,7 @@ static void ckrm_free_cpu_class(void *my_res)
                return;
 
        /*the default class can't be freed*/
-       if (cls == default_cpu_class)
+       if (cls == get_default_cpu_class())
                return;
 
        // Assuming there will be no children when this function is called
@@ -187,7 +203,16 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share)
                 parres = NULL;
         }
 
+       /*
+        * hzheng: CKRM_SHARE_DONTCARE should be handled
+        */
+       if (new_share->my_guarantee == CKRM_SHARE_DONTCARE)
+               new_share->my_guarantee = 0;
+
        rc = set_shares(new_share, cur, par);
+       if (cur->my_limit == CKRM_SHARE_DONTCARE)
+               cur->my_limit = cur->max_limit;
+
 
        spin_unlock(&cls->cnt_lock);
        if (cls->parent) {
@@ -196,9 +221,6 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share)
        return rc;
 }                                                      
                        
-/*
- * translate the global_CVT to ticks
- */
 static int ckrm_cpu_get_share(void *my_res,
                              struct ckrm_shares *shares)
 {                      
@@ -213,35 +235,37 @@ static int ckrm_cpu_get_share(void *my_res,
 int ckrm_cpu_get_stats(void *my_res, struct seq_file * sfile)
 {
        struct ckrm_cpu_class *cls = my_res;
+       struct ckrm_cpu_class_stat* stat = &cls->stat;
+       ckrm_lrq_t* lrq;
+       int i;
 
        if (!cls) 
                return -EINVAL;
 
        seq_printf(sfile, "-------- CPU Class Status Start---------\n");
-       seq_printf(sfile, "  gua= %d limit= %d\n",
+       seq_printf(sfile, "Share:\n\tgrt= %d limit= %d total_grt= %d max_limit= %d\n",
                   cls->shares.my_guarantee,
-                  cls->shares.my_limit);
-       seq_printf(sfile, "  total_gua= %d limit= %d\n",
+                  cls->shares.my_limit,
                   cls->shares.total_guarantee,
                   cls->shares.max_limit);
-       seq_printf(sfile, "  used_gua= %d cur_limit= %d\n",
+       seq_printf(sfile, "\tunused_grt= %d cur_max_limit= %d\n",
                   cls->shares.unused_guarantee,
                   cls->shares.cur_max_limit);
 
-       seq_printf(sfile, "  Share= %d\n",cpu_class_weight(cls));
-       seq_printf(sfile, "  cvt= %llu\n",cls->local_queues[0].local_cvt);
-       seq_printf(sfile, "  total_ns= %llu\n",cls->stat.total_ns);
-       seq_printf(sfile, "  prio= %d\n",cls->local_queues[0].classqueue_linkobj.prio);
-       seq_printf(sfile, "  index= %d\n",cls->local_queues[0].classqueue_linkobj.index);
-       seq_printf(sfile, "  run= %llu\n",cls->stat.local_stats[0].run);
-       seq_printf(sfile, "  total= %llu\n",cls->stat.local_stats[0].total);
-       seq_printf(sfile, "  cpu_demand= %lu\n",cls->stat.cpu_demand);
-
-       seq_printf(sfile, "  effective_guarantee= %d\n",cls->stat.effective_guarantee);
-       seq_printf(sfile, "  effective_limit= %d\n",cls->stat.effective_limit);
-       seq_printf(sfile, "  effective_share= %d\n",cls->stat.effective_share);
-       seq_printf(sfile, "-------- CPU Class Status END ---------\n");
+       seq_printf(sfile, "Effective:\n\tegrt= %d\n",stat->egrt);
+       seq_printf(sfile, "\tmegrt= %d\n",stat->megrt);
+       seq_printf(sfile, "\tehl= %d\n",stat->ehl);
+       seq_printf(sfile, "\tmehl= %d\n",stat->mehl);
+       seq_printf(sfile, "\teshare= %d\n",stat->eshare);
+       seq_printf(sfile, "\tmeshare= %d\n",cpu_class_weight(cls));
+       seq_printf(sfile, "\ttotal_ns= %llu\n",stat->total_ns);
+       seq_printf(sfile, "\tmax_demand= %lu\n",stat->max_demand);
+       for_each_online_cpu(i) {
+               lrq = get_ckrm_lrq(cls,i);              
+               seq_printf(sfile, "\tlrq %d demand= %lu weight= %d lrq_load= %lu cvt= %llu\n",i,stat->local_stats[i].cpu_demand,local_class_weight(lrq),lrq->lrq_load,lrq->local_cvt);
+       }
 
+       seq_printf(sfile, "-------- CPU Class Status END ---------\n");
 
        return 0;
 }
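
For reference, the seq_file output produced by ckrm_cpu_get_stats() above renders roughly like this on a one-cpu box (all numbers invented):

	-------- CPU Class Status Start---------
	Share:
		grt= 0 limit= 100 total_grt= 100 max_limit= 100
		unused_grt= 100 cur_max_limit= 0
	Effective:
		egrt= 1024
		megrt= 1024
		ehl= 1024
		mehl= 1024
		eshare= 1024
		meshare= 1024
		total_ns= 1234567890
		max_demand= 512
		lrq 0 demand= 512 weight= 1024 lrq_load= 640 cvt= 8388608
	-------- CPU Class Status END ---------
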
@@ -249,28 +273,16 @@ int ckrm_cpu_get_stats(void *my_res, struct seq_file * sfile)
 /*
  * task will remain in the same cpu but on a different local runqueue
  */
-static void ckrm_cpu_change_class(void *task, void *old, void *new)
+void ckrm_cpu_change_class(void *task, void *old, void *new)
 {              
        struct task_struct *tsk = task;                    
        struct ckrm_cpu_class *newcls = new;
-       unsigned long flags;
-       struct runqueue *rq;
-       prio_array_t *array;
 
        /*sanity checking*/
        if (!task || ! old || !new)
                return; 
 
-       rq = task_rq_lock(tsk,&flags); 
-       array = tsk->array;
-       if (array) {
-               dequeue_task(tsk,array);
-               tsk->cpu_class = newcls;
-               enqueue_task(tsk,rq_active(tsk,rq));
-       } else {
-               tsk->cpu_class = newcls;
-       }
-       task_rq_unlock(rq,&flags);
+       _ckrm_cpu_change_class(tsk,newcls);
 }                                                      
 
 /*dummy function, not used*/
@@ -297,7 +309,7 @@ static int ckrm_cpu_set_config(void *my_res, const char *cfgstr)
 }
        
 struct ckrm_res_ctlr cpu_rcbs = {
-       .res_name          = "CKRM CPU Class",
+       .res_name          = "cpu",
        .res_hdepth        = 1,
        .resid             = -1,
        .res_alloc         = ckrm_alloc_cpu_class,
@@ -339,10 +351,11 @@ void init_cpu_classes(void)
        //init classqueues for each processor
        for (i=0; i < NR_CPUS; i++)
                classqueue_init(get_cpu_classqueue(i)); 
-/*
- * hzheng: initialize the default cpu class
- *         required for E14 since ckrm_init is called after sched_init
- */
+
+       /*
+        * hzheng: initialize the default cpu class
+        *  required for E14/E15 since ckrm_init is called after sched_init
+        */
        ckrm_alloc_cpu_class(NULL,NULL);
 }
 
diff --git a/kernel/ckrm/ckrm_cpu_monitor.c b/kernel/ckrm/ckrm_cpu_monitor.c
index 674ee6e..8d6f301 100644
 #include <asm/div64.h>
 #include <linux/ckrm_sched.h>
 
-#define CPU_MONITOR_INTERVAL (4*HZ) /*how often do we adjust the shares*/
-#define CKRM_SHARE_ACCURACY 7
+#define CPU_MONITOR_INTERVAL (2*HZ) /*how often do we adjust the shares*/
+#define CKRM_SHARE_ACCURACY 10
 #define CKRM_SHARE_MAX (1<<CKRM_SHARE_ACCURACY)
 
+#define CKRM_CPU_DEMAND_RUN 0
+#define CKRM_CPU_DEMAND_SLEEP 1
+//sample task cpu demand every 64ms
+#define CPU_DEMAND_TASK_RECALC  (64000000LL)
+#define CPU_DEMAND_CLASS_RECALC (256000000LL)
+#define CPU_DEMAND_TP_CLASS 0
+#define CPU_DEMAND_TP_TASK 1
+
 extern struct ckrm_cpu_class *ckrm_get_cpu_class(struct ckrm_core_class *core);
+void update_ckrm_idle(unsigned long surplus);
+
+/*interface to share definition*/
+static inline int get_soft_limit(struct ckrm_cpu_class *cls)
+{
+       return cls->shares.my_limit;
+}
+
+static inline int get_mysoft_limit(struct ckrm_cpu_class *cls)
+{
+       return cls->shares.total_guarantee;
+}
+
+static inline int get_hard_limit(struct ckrm_cpu_class *cls)
+{
+       return cls->shares.total_guarantee;
+}
+
+static inline int get_myhard_limit(struct ckrm_cpu_class *cls)
+{
+       return cls->shares.total_guarantee;
+}
+
+
+static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, int type)
+{
+       unsigned long long now = sched_clock();
+
+       local_stat->run = 0;
+       local_stat->total = 0;
+       local_stat->last_sleep = now;
+       switch (type) {
+       case CPU_DEMAND_TP_CLASS:
+               local_stat->recalc_interval = CPU_DEMAND_CLASS_RECALC;
+               local_stat->cpu_demand = 0; 
+               break;
+       case CPU_DEMAND_TP_TASK:
+               local_stat->recalc_interval = CPU_DEMAND_TASK_RECALC;
+               //for task, the init cpu_demand is copied from its parent
+               break;
+       default:
+               BUG();
+       }
+}
 
 void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat)
 {
        int i;
-       struct ckrm_cpu_class_local_stat* local_stat;
-       unsigned long long now = sched_clock();
 
        stat->stat_lock = SPIN_LOCK_UNLOCKED;
        stat->total_ns = 0;
-       stat->cpu_demand = 0;
+       stat->max_demand = 0;
 
        for (i=0; i< NR_CPUS; i++) {
-               local_stat = &stat->local_stats[i];
-               local_stat->run = 0;
-               local_stat->total = 0;
-               local_stat->last_sleep = now;
-               local_stat->cpu_demand = 0;             
+               cpu_demand_stat_init(&stat->local_stats[i],CPU_DEMAND_TP_CLASS);
        }
 
-       stat->effective_guarantee = 0;
-       stat->effective_limit = 0;
-       stat->glut = 0;
-       stat->effective_share = 100;
-       stat->self_effective_share = 100;
+       stat->egrt = 0;
+       stat->megrt = 0;
+       stat->ehl = CKRM_SHARE_MAX; /*default: no limit*/
+       stat->mehl = CKRM_SHARE_MAX; /*default: no limit */
+
+       stat->eshare = CKRM_SHARE_MAX;
+       stat->meshare = CKRM_SHARE_MAX;
 }
+
 /**********************************************/
 /*          cpu demand                        */
 /**********************************************/
@@ -77,52 +126,42 @@ void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat)
  */
 
 /**
- * update_cpu_demand - update a state change
+ * update_cpu_demand_stat - update the run/sleep accounting of a demand stat
  * 
- * should be called whenever the state of a local queue changes
+ * should be called whenever the state of a task or a class local queue changes
  * -- when deschedule : report how much run
  * -- when enqueue: report how much sleep
  *
- * to deal with excessive long run/sleep state
- * -- whenever the the ckrm_cpu_monitor is called, check if the class is in sleep state, if yes, then update sleep record
+ * how often should we recalculate the cpu demand
+ * the number is in ns
  */
-#define CKRM_CPU_DEMAND_RUN 0
-#define CKRM_CPU_DEMAND_SLEEP 1
-//how often should we recalculate the cpu demand, in ns
-#define CPU_DEMAND_CAL_THRESHOLD (1000000000LL)
-static inline void update_local_cpu_demand(struct ckrm_cpu_class_local_stat* local_stat,int state, unsigned long long len)
+static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat,int state, unsigned long long len)
 {      
        local_stat->total += len;
        if (state == CKRM_CPU_DEMAND_RUN)
                local_stat->run += len;
 
-       if (local_stat->total >= CPU_DEMAND_CAL_THRESHOLD) {
+       if (local_stat->total >= local_stat->recalc_interval) {
                local_stat->total >>= CKRM_SHARE_ACCURACY;
-               if (local_stat->total > 0xFFFFFFFF)
-                       local_stat->total = 0xFFFFFFFF;
+               if (unlikely(local_stat->run > 0xFFFFFFFF))
+                       local_stat->run = 0xFFFFFFFF;
 
+               if (local_stat->total > 0xFFFFFFFF) 
+                       local_stat->total = 0xFFFFFFFF;
+                       
                do_div(local_stat->run,(unsigned long)local_stat->total);
-               local_stat->cpu_demand +=local_stat->run;
-               local_stat->cpu_demand >>= 1;
+
+               if (local_stat->total > 0xFFFFFFFF) //happens after very long sleep
+                       local_stat->cpu_demand = local_stat->run;
+               else {
+                       local_stat->cpu_demand += local_stat->run;
+                       local_stat->cpu_demand >>= 1;
+               }
                local_stat->total = 0;
                local_stat->run = 0;
        }
 }
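
Illustration (not part of the patch): update_cpu_demand_stat() keeps cpu_demand as a decaying average of run/total, scaled so that running full time maps to CKRM_SHARE_MAX (1 << 10 in this patch). With made-up numbers for one recalculation window:

	/* Illustrative sketch only -- the same recalculation as update_cpu_demand_stat() above. */
	#include <stdio.h>

	#define CKRM_SHARE_ACCURACY 10                    /* demand scaled to 0..1024 */

	int main(void)
	{
		unsigned long long run   = 160000000ULL;  /* ns spent running in this window (hypothetical) */
		unsigned long long total = 256000000ULL;  /* window length in ns, run + sleep               */
		unsigned long cpu_demand = 400;           /* previous moving average (hypothetical)         */

		total >>= CKRM_SHARE_ACCURACY;            /* 256000000 >> 10 = 250000                       */
		run /= total;                             /* instantaneous demand: 160000000/250000 = 640   */
		cpu_demand = (cpu_demand + (unsigned long)run) / 2;  /* decaying average: (400+640)/2 = 520 */

		printf("cpu_demand = %lu\n", cpu_demand);
		return 0;
	}
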
 
-static inline void cpu_demand_update_run(struct ckrm_cpu_class_local_stat* local_stat, unsigned long long len)
-{
-       update_local_cpu_demand(local_stat,CKRM_CPU_DEMAND_RUN,len);
-}
-
-static inline void cpu_demand_update_sleep(struct ckrm_cpu_class_local_stat* local_stat, unsigned long long len)
-{
-       update_local_cpu_demand(local_stat,CKRM_CPU_DEMAND_SLEEP,len);
-}
-
-#define CPU_DEMAND_ENQUEUE 0
-#define CPU_DEMAND_DEQUEUE 1
-#define CPU_DEMAND_DESCHEDULE 2
-
 /**
  * cpu_demand_event - a cpu_demand event occurred
  * @event: one of the following three events:
@@ -131,19 +170,24 @@ static inline void cpu_demand_update_sleep(struct ckrm_cpu_class_local_stat* loc
  *   CPU_DEMAND_DESCHEDULE: one task belonging to a certain local class deschedules
  * @len: valid only for CPU_DEMAND_DESCHEDULE, how long the task has run
  */
-void cpu_demand_event(struct ckrm_cpu_class_local_stat* local_stat, int event, unsigned long long len) 
+void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len) 
 {      
        switch (event) {
        case CPU_DEMAND_ENQUEUE: 
                len = sched_clock() - local_stat->last_sleep;
                local_stat->last_sleep = 0;
-               cpu_demand_update_sleep(local_stat,len);
+               update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,len);
                break;
        case CPU_DEMAND_DEQUEUE:
-               local_stat->last_sleep = sched_clock();
+               if (! local_stat->last_sleep) {
+                       local_stat->last_sleep = sched_clock();
+               }
                break;
        case CPU_DEMAND_DESCHEDULE:
-               cpu_demand_update_run(local_stat,len);          
+               update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_RUN,len);
+               break;
+       case CPU_DEMAND_INIT: //for task init only
+               cpu_demand_stat_init(local_stat,CPU_DEMAND_TP_TASK);
                break;
        default:
                BUG();
@@ -152,18 +196,19 @@ void cpu_demand_event(struct ckrm_cpu_class_local_stat* local_stat, int event, u
 
 /** 
  * check all the class local queue
- * if local queueu is not in runqueue, then it's in sleep state
- * if compare to last sleep, 
+ * 
+ * to deal with excessive long run/sleep state
+ * -- whenever ckrm_cpu_monitor is called, check if the class is in sleep state; if yes, update the sleep record
  */
 static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu)
 {
-       struct ckrm_cpu_class_local_stat * local_stat = &stat->local_stats[cpu];
+       struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu];
        unsigned long long sleep,now;
        if (local_stat->last_sleep) {
                now = sched_clock();
                sleep = now - local_stat->last_sleep;
                local_stat->last_sleep = now;
-               cpu_demand_update_sleep(local_stat,sleep);
+               update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep);
        }
 }
 
@@ -172,51 +217,72 @@ static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int
  *
  * self_cpu_demand = sum(cpu demand of all local queues) 
  */
-static unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat
-                                               *stat)
+static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat)
 {
        int cpu_demand = 0;
        int i;
+       int cpuonline = 0;
 
        for_each_online_cpu(i) {
                cpu_demand_check_sleep(stat,i);
                cpu_demand += stat->local_stats[i].cpu_demand;
+               cpuonline ++;
        }
 
-       if (cpu_demand > CKRM_SHARE_MAX)
-               cpu_demand = CKRM_SHARE_MAX;
-       return cpu_demand;
+       return (cpu_demand/cpuonline);
 }
 
 /*
- * update effective cpu demand for each class
- * assume the root_core->parent == NULL
+ * my max demand = min(cpu_demand, my effective hard limit)
  */
-static void update_cpu_demand(struct ckrm_core_class *root_core)
+static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat) 
+{
+       unsigned long mmax_demand = get_self_cpu_demand(stat);
+       if (mmax_demand > stat->mehl)
+               mmax_demand = stat->mehl;
+
+       return mmax_demand;
+}
+
+/**
+ * update_max_demand: update effective cpu demand for each class
+ * return -1 on error
+ * 
+ * Assume: the root_core->parent == NULL
+ */
+static int update_max_demand(struct ckrm_core_class *root_core)
 {
        struct ckrm_core_class *cur_core, *child_core;
-       struct ckrm_cpu_class *cls;
+       struct ckrm_cpu_class *cls,*c_cls;
+       int ret = -1;
 
        cur_core = root_core;
        child_core = NULL;
-       /*
-        * iterate the tree
-        * update cpu_demand of each node
-        */
-      repeat:
-       if (!cur_core)
-               return;
+       
+ repeat:
+       if (!cur_core) { //normal exit
+               ret = 0;
+               goto out;
+       }
 
        cls = ckrm_get_cpu_class(cur_core);
+       if (! cls) //invalid c_cls, abort
+               goto out;
+
        if (!child_core)        //first child
-               cls->stat.cpu_demand = get_self_cpu_demand(&cls->stat);
+               cls->stat.max_demand = get_mmax_demand(&cls->stat);
        else {
-               cls->stat.cpu_demand +=
-                   ckrm_get_cpu_class(child_core)->stat.cpu_demand;
-               if (cls->stat.cpu_demand > CKRM_SHARE_MAX)
-                       cls->stat.cpu_demand = CKRM_SHARE_MAX;
+               c_cls = ckrm_get_cpu_class(child_core);
+               if (c_cls)
+                       cls->stat.max_demand += c_cls->stat.max_demand;
+               else //invalid c_cls, abort
+                       goto out;
        }
 
+       //check class hard limit
+       if (cls->stat.max_demand > cls->stat.ehl)
+               cls->stat.max_demand = cls->stat.ehl;
+
        //next child
        child_core = ckrm_get_next_child(cur_core, child_core);
        if (child_core) {
@@ -229,78 +295,111 @@ static void update_cpu_demand(struct ckrm_core_class *root_core)
                cur_core = child_core->hnode.parent;
        }
        goto repeat;
+ out:
+       return ret;
 }
 
 /**********************************************/
 /*          effective guarantee & limit       */
 /**********************************************/
-static inline void set_effective_share(struct ckrm_cpu_class_stat *stat,
+static inline void set_eshare(struct ckrm_cpu_class_stat *stat,
                                       int new_share)
 {
        if (!new_share)
                new_share = 1;
-       stat->effective_share = new_share;
+       stat->eshare = new_share;
 }
 
-static inline void set_self_effective_share(struct ckrm_cpu_class_stat *stat,
+static inline void set_meshare(struct ckrm_cpu_class_stat *stat,
                                            int new_share)
 {
        if (!new_share)
                new_share = 1;
-       stat->self_effective_share = new_share;
+       stat->meshare = new_share;
 }
 
-static inline void update_child_effective(struct ckrm_core_class *parent)
+/**
+ *update_child_effective - update egrt, ehl, mehl for all children of parent
+ *@parent: the parent node
+ *return -1 if anything wrong
+ *
+ */
+static int update_child_effective(struct ckrm_core_class *parent)
 {
        struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent);
-       struct ckrm_core_class *child_core = ckrm_get_next_child(parent, NULL);
+       struct ckrm_core_class *child_core;
+
+       if (! p_cls)
+               return -1;
 
+       child_core = ckrm_get_next_child(parent, NULL);
        while (child_core) {
                struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core);
+               if (! c_cls)
+                       return -1;
 
-               c_cls->stat.effective_guarantee =
-                   p_cls->stat.effective_guarantee *
+               c_cls->stat.egrt =
+                   p_cls->stat.egrt *
                    c_cls->shares.my_guarantee / p_cls->shares.total_guarantee;
-               c_cls->stat.effective_limit =
-                   p_cls->stat.effective_guarantee * c_cls->shares.my_limit /
-                   p_cls->shares.total_guarantee;
+
+               c_cls->stat.megrt = c_cls->stat.egrt * c_cls->shares.unused_guarantee
+                       / c_cls->shares.total_guarantee;
+               
+               c_cls->stat.ehl =
+                   p_cls->stat.ehl *
+                   get_hard_limit(c_cls) / p_cls->shares.total_guarantee;
+
+               c_cls->stat.mehl =
+                   c_cls->stat.ehl *
+                   get_myhard_limit(c_cls) / c_cls->shares.total_guarantee;
 
                child_core = ckrm_get_next_child(parent, child_core);
        };
-
+       return 0;
 }
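
Illustration (not part of the patch): with the parent at egrt = CKRM_SHARE_MAX = 1024 and total_guarantee = 100, a child holding my_guarantee = 30 and unused_guarantee = 10 out of its own total_guarantee of 100 (all made-up shares) ends up with egrt = 307 and megrt = 30; ehl/mehl follow the same proportional pattern using the hard-limit getters.

	/* Illustrative sketch only -- the same proportional split as update_child_effective() above. */
	#include <stdio.h>

	int main(void)
	{
		int p_egrt = 1024, p_total_grt = 100;   /* parent: CKRM_SHARE_MAX, 100 shares (hypothetical) */
		int my_grt = 30;                        /* child's my_guarantee (hypothetical)               */
		int unused_grt = 10, c_total_grt = 100; /* child's unused/total guarantee (hypothetical)     */

		int egrt  = p_egrt * my_grt / p_total_grt;      /* 1024 * 30 / 100 = 307 */
		int megrt = egrt * unused_grt / c_total_grt;    /* 307 * 10 / 100  = 30  */

		printf("egrt = %d, megrt = %d\n", egrt, megrt);
		return 0;
	}
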
 
-/*
- * update effective guarantee and effective limit
- * -- effective share = parent->effective->share * share/parent->total_share
- * -- effective limit = parent->effective->share * limit/parent->total_share
+/**
+ * update_effectives: update egrt, ehl, mehl for the whole tree
  * should be called only when class structure changed
+ *
+ * return -1 if anything wrong happened (eg: the structure changed during the process)
  */
-static void update_effective_guarantee_limit(struct ckrm_core_class *root_core)
+static int update_effectives(struct ckrm_core_class *root_core)
 {
-       struct ckrm_core_class *cur_core, *child_core = NULL;
+       struct ckrm_core_class *cur_core, *child_core;
        struct ckrm_cpu_class *cls;
 
        cur_core = root_core;
+       child_core = NULL;
        cls = ckrm_get_cpu_class(cur_core);
-       cls->stat.effective_guarantee = CKRM_SHARE_MAX;
-       cls->stat.effective_limit = cls->stat.effective_guarantee;
 
-      repeat:
+       //initialize the effectives for root 
+       cls->stat.egrt = CKRM_SHARE_MAX; /*egrt of the root is always 100% */
+       cls->stat.megrt = cls->stat.egrt * cls->shares.unused_guarantee
+               / cls->shares.total_guarantee;
+       cls->stat.ehl = CKRM_SHARE_MAX * get_hard_limit(cls)
+               / cls->shares.total_guarantee;
+       cls->stat.mehl = cls->stat.ehl * get_myhard_limit(cls)
+               / cls->shares.total_guarantee;
+       
+ repeat:
        //check exit
        if (!cur_core)
-               return;
+               return 0;
 
        //visit this node
-       update_child_effective(cur_core);
+       if (update_child_effective(cur_core) == -1) {
+               return -1; //invalid cur_core node
+       }
+
        //next child
        child_core = ckrm_get_next_child(cur_core, child_core);
+
        if (child_core) {
-               //go down
+               //go down to the next hier
                cur_core = child_core;
                child_core = NULL;
-               goto repeat;
-       } else {                //no more child, go back
+       } else { //no more child, go back
                child_core = cur_core;
                cur_core = child_core->hnode.parent;
        }
@@ -312,12 +411,12 @@ static void update_effective_guarantee_limit(struct ckrm_core_class *root_core)
 /**********************************************/
 
 /*
- * surplus = my_effective_share - demand
+ * surplus = egrt - demand
  * if surplus < 0, surplus = 0 
  */
 static inline int get_node_surplus(struct ckrm_cpu_class *cls)
 {
-       int surplus = cls->stat.effective_guarantee - cls->stat.cpu_demand;
+       int surplus = cls->stat.egrt - cls->stat.max_demand;
 
        if (surplus < 0)
                surplus = 0;
@@ -325,47 +424,81 @@ static inline int get_node_surplus(struct ckrm_cpu_class *cls)
        return surplus;
 }
 
-/*
- * consume the surplus
+static inline int get_my_node_surplus(struct ckrm_cpu_class *cls)
+{
+       int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat);
+
+       if (surplus < 0)
+               surplus = 0;
+
+       return surplus;
+}
+
+/**
+ * node_surplus_consume: consume the surplus
+ * @check_sl: if check_sl is set, then check soft_limit
+ * @total_grt: total guarantee 
  * return how much consumed
- * set glut when necessary
+ *
+ * implements all the CKRM Scheduling Requirements
+ * update total_grt if necessary 
  */
-static inline int node_surplus_consume(int old_surplus,
+static inline int node_surplus_consume(int surplus,
                                       struct ckrm_core_class *child_core,
-                                      struct ckrm_cpu_class *p_cls)
+                                      struct ckrm_cpu_class *p_cls,
+                                      int check_sl,
+                                      int *total_grt
+                                      )
 {
        int consumed = 0;
        int inc_limit;
+       int glut = 1;
 
        struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core);
 
-       if (c_cls->stat.glut)
+       if (! c_cls || ! *total_grt)
                goto out;
 
-       //check demand
-       if (c_cls->stat.effective_share >= c_cls->stat.cpu_demand) {
-               c_cls->stat.glut = 1;
+       /*can't consume more than demand or hard limit*/
+       if (c_cls->stat.eshare >= c_cls->stat.max_demand)
                goto out;
-       }
 
        consumed =
-           old_surplus * c_cls->shares.my_guarantee /
-           p_cls->shares.total_guarantee;
+               surplus * c_cls->shares.my_guarantee / *total_grt;
 
-       //check limit
-       inc_limit = c_cls->stat.effective_limit - c_cls->stat.effective_share;
-       if (inc_limit <= consumed) {
-               c_cls->stat.glut = 1;
-               consumed = inc_limit;
+       if (! consumed) //no more share
+               goto out;
+
+       //hard limit and demand limit
+       inc_limit = c_cls->stat.max_demand - c_cls->stat.eshare;
+
+       if (check_sl) {
+               int esl = p_cls->stat.eshare * get_soft_limit(c_cls)
+                       /p_cls->shares.total_guarantee;
+               if (esl < c_cls->stat.max_demand)
+                       inc_limit = esl - c_cls->stat.eshare;
        }
 
-       c_cls->stat.effective_share += consumed;
-      out:
+
+       if (consumed > inc_limit)
+               consumed = inc_limit;
+       else
+               glut = 0;
+
+       c_cls->stat.eshare += consumed;
+
+ out:
+       if (glut) 
+               *total_grt -= c_cls->shares.my_guarantee;
+
        return consumed;
 }
 
-/*
- * re-allocate the shares for all the childs under this node
+/**
+ * alloc_surplus_node: re-allocate the shares for children under parent
+ * @parent: parent node
+ * return the remaining surplus
+ *
  * task:
  *  1. get total surplus
  *  2. allocate surplus
@@ -373,71 +506,99 @@ static inline int node_surplus_consume(int old_surplus,
  */
 static void alloc_surplus_node(struct ckrm_core_class *parent)
 {
-       int total_surplus = 0, old_surplus = 0;
+       int total_surplus , old_surplus;
        struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent);
        struct ckrm_core_class *child_core = NULL;
        int self_share;
+       int total_grt = p_cls->shares.total_guarantee;
+       int check_sl;
+
+       if (! p_cls)
+               return;
 
+       total_surplus = get_my_node_surplus(p_cls);
        /*
-        * calculate surplus 
-        * total_surplus = sum(child_surplus)
-        * reset glut flag
         * initialize effective_share
         */
        do {
                child_core = ckrm_get_next_child(parent, child_core);
                if (child_core) {
-                       struct ckrm_cpu_class *c_cls =
-                           ckrm_get_cpu_class(child_core);
-                       ckrm_stat_t *stat = &c_cls->stat;
+                       struct ckrm_cpu_class *c_cls;
+
+                       c_cls = ckrm_get_cpu_class(child_core);
+                       if (! c_cls)
+                               return;
 
                        total_surplus += get_node_surplus(c_cls);
-                       stat->glut = 0;
-                       set_effective_share(stat, stat->effective_guarantee);
+
+                       set_eshare(&c_cls->stat, c_cls->stat.egrt);
                }
        } while (child_core);
 
-       /*distribute the surplus */
+       if (! total_surplus)
+               goto realloc_out;
+
+       /* distribute the surplus */
        child_core = NULL;
+       check_sl = 1;
+       old_surplus = 0;
        do {
-               if (!child_core)        //keep the surplus of last round
+               if (!child_core) {//start a new round
+
+                       //ok, everybody reached the soft limit
+                       if (old_surplus == total_surplus) 
+                               check_sl = 0;
+
                        old_surplus = total_surplus;
+               }
 
                child_core = ckrm_get_next_child(parent, child_core);
-               if (child_core) {
+               if (child_core) 
                        total_surplus -=
-                           node_surplus_consume(old_surplus, child_core,
-                                                p_cls);
-               }
+                               node_surplus_consume(old_surplus, child_core,
+                                                    p_cls,check_sl,&total_grt);
                //start a new round if something is allocated in the last round
-       } while (child_core || (total_surplus != old_surplus));
+       } while (child_core || check_sl || total_surplus != old_surplus);
 
-       //any remaining surplus goes to the default class
-       self_share = p_cls->stat.effective_share *
+ realloc_out:
+       /*how much for itself*/
+       self_share = p_cls->stat.eshare *
            p_cls->shares.unused_guarantee / p_cls->shares.total_guarantee;
-       self_share += total_surplus;
 
-       set_self_effective_share(&p_cls->stat, self_share);
+       if (self_share < p_cls->stat.max_demand) {
+               /*any remaining surplus goes to the default class*/
+               self_share += total_surplus;    
+               if (self_share > p_cls->stat.max_demand)
+                       self_share = p_cls->stat.max_demand;
+       }
+       
+       set_meshare(&p_cls->stat, self_share);
 }
 
 /**
  * alloc_surplus - reallocate unused shares
  *
  * class A's unused share should be allocated to its siblings
+ * the re-allocation goes downward from the top
  */
-static void alloc_surplus(struct ckrm_core_class *root_core)
+static int alloc_surplus(struct ckrm_core_class *root_core)
 {
-       struct ckrm_core_class *cur_core, *child_core = NULL;
+       struct ckrm_core_class *cur_core, *child_core;
        struct ckrm_cpu_class *cls;
+       int ret = 0;
 
+       /*initialize*/
        cur_core = root_core;
+       child_core = NULL;
        cls = ckrm_get_cpu_class(cur_core);
-       cls->stat.glut = 0;
-       set_effective_share(&cls->stat, cls->stat.effective_guarantee);
+       set_eshare(&cls->stat, cls->stat.egrt);
+       /*the ckrm idle tasks get all what's remaining*/
+       /*hzheng: uncomment the following line for hard limit support */
+       //      update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand);
+       
       repeat:
        //check exit
        if (!cur_core)
-               return;
+               return ret;
 
        //visit this node
        alloc_surplus_node(cur_core);
@@ -455,6 +616,199 @@ static void alloc_surplus(struct ckrm_core_class *root_core)
        goto repeat;
 }
 
+/**********************************************/
+/*           CKRM Idle Tasks                  */
+/**********************************************/
+struct ckrm_cpu_class ckrm_idle_class_obj, *ckrm_idle_class;
+struct task_struct* ckrm_idle_tasks[NR_CPUS];
+
+/*how many ckrm idle tasks should I wakeup*/
+static inline int get_nr_idle(unsigned long surplus)
+{
+       int cpu_online = cpus_weight(cpu_online_map);   
+       int nr_idle = 0; 
+       
+       nr_idle = surplus * cpu_online;
+       nr_idle >>= CKRM_SHARE_ACCURACY;
+
+       if (surplus) 
+               nr_idle ++;
+
+       if (nr_idle > cpu_online)  
+               nr_idle = cpu_online;
+
+       return nr_idle;
+}
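
get_nr_idle() above turns the system-wide surplus share into a count of idle threads to keep runnable: the surplus is a fixed-point fraction of one CPU, scaled by the number of online CPUs, rounded up, and clamped to the CPU count. A standalone sketch, assuming a 13-bit CKRM_SHARE_ACCURACY (the real constant lives in the ckrm headers, so treat the value as illustrative):

    #include <stdio.h>

    #define CKRM_SHARE_ACCURACY 13  /* assumed here; defined in the ckrm headers */

    static int nr_idle_for_surplus(unsigned long surplus, int cpu_online)
    {
            int nr_idle = (surplus * cpu_online) >> CKRM_SHARE_ACCURACY;

            if (surplus)            /* round up: any surplus keeps at least one idle task */
                    nr_idle++;
            if (nr_idle > cpu_online)
                    nr_idle = cpu_online;
            return nr_idle;
    }

    int main(void)
    {
            /* half of the machine's capacity unused on a 4-way box */
            unsigned long half = 1UL << (CKRM_SHARE_ACCURACY - 1);
            printf("idle tasks to wake: %d\n", nr_idle_for_surplus(half, 4));
            return 0;
    }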
+
+/**
+ * update_ckrm_idle: update the status of the idle class according to the new surplus
+ * surplus: new system surplus
+ *
+ * Task:
+ * -- update share of the idle class 
+ * -- wake up idle tasks according to the surplus
+ */
+void update_ckrm_idle(unsigned long surplus)
+{
+       int nr_idle = get_nr_idle(surplus);
+       int i;
+       struct task_struct* idle_task;
+
+       set_eshare(&ckrm_idle_class->stat,surplus);
+       set_meshare(&ckrm_idle_class->stat,surplus);
+       /*wake up nr_idle idle tasks*/
+       for_each_online_cpu(i) {
+               idle_task = ckrm_idle_tasks[i];
+               /* the idle thread may have failed to start on this cpu */
+               if (! idle_task)
+                       continue;
+               if (unlikely(idle_task->cpu_class != ckrm_idle_class)) {
+                       ckrm_cpu_change_class(idle_task,
+                                             idle_task->cpu_class,
+                                             ckrm_idle_class);
+               }
+               if (i < nr_idle) {
+                       //activate it
+                       wake_up_process(idle_task);
+               } else {
+                       //deactivate it
+                       idle_task->state = TASK_INTERRUPTIBLE;
+                       set_tsk_need_resched(idle_task);
+               }
+       }
+}
+
+static int ckrm_cpu_idled(void *nothing)
+{
+       set_user_nice(current,19);
+       daemonize("ckrm_idle_task");
+
+       //deactivate it; it will be woken up by ckrm_cpu_monitor
+       current->state = TASK_INTERRUPTIBLE;
+       schedule();             
+
+       /*similar to cpu_idle */
+       while (1) {
+               while (!need_resched()) {
+                       ckrm_cpu_monitor();
+                       if (current_cpu_data.hlt_works_ok) {
+                               local_irq_disable();
+                               if (!need_resched()) {
+                                       set_tsk_need_resched(current);
+                                       safe_halt();
+                               } else
+                                       local_irq_enable();
+                       }
+               }
+               schedule();             
+       }
+       return 0;
+}
+
+/**
+ * ckrm_start_ckrm_idle:
+ *  create the ckrm_idle_class and start the idle tasks
+ *
+ */
+void ckrm_start_ckrm_idle(void)
+{
+       int i;
+       int ret;
+       ckrm_shares_t shares;
+       
+       ckrm_idle_class = &ckrm_idle_class_obj; 
+       memset(ckrm_idle_class,0,sizeof(*ckrm_idle_class));
+       /*don't care about the shares */
+       init_cpu_class(ckrm_idle_class,&shares);
+       printk(KERN_INFO "ckrm idle class %p created\n", ckrm_idle_class);
+       
+       for_each_online_cpu(i) {
+               ret = kernel_thread(ckrm_cpu_idled, 0, CLONE_KERNEL);
+               
+               /*warn on error, but the system should still work without it*/
+               if (ret < 0)
+                       printk(KERN_ERR"Warn: can't start ckrm idle tasks\n");
+               else {
+                       ckrm_idle_tasks[i] = find_task_by_pid(ret);
+                       if (!ckrm_idle_tasks[i])
+                               printk(KERN_ERR"Warn: can't find ckrm idle tasks %d\n",ret);
+               }
+       }
+}
+
+/**********************************************/
+/*          Local Weight                      */
+/**********************************************/
+/**
+ * adjust_lrq_weight: adjust the local weight for each cpu
+ *
+ * lrq->local_weight = lrq->lrq_load * class->weight / total_pressure
+ */
+static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online)
+{
+       unsigned long total_pressure = 0;
+       ckrm_lrq_t* lrq;
+       int i;
+       unsigned long class_weight;
+       unsigned long long lw;  
+
+       //get total pressure
+       for_each_online_cpu(i) {
+               lrq = get_ckrm_lrq(clsptr,i);
+               total_pressure += lrq->lrq_load;
+       }
+
+       if (! total_pressure)
+               return;
+       
+       class_weight = cpu_class_weight(clsptr) * cpu_online;
+
+       /*
+        * update weight for each cpu, minimum is 1
+        */
+       for_each_online_cpu(i) {
+               lrq = get_ckrm_lrq(clsptr,i);
+               if (! lrq->lrq_load)
+                       /*an idle lrq keeps the full class weight to boost interactivity */
+                       lw = cpu_class_weight(clsptr); 
+               else {
+                       lw = lrq->lrq_load * class_weight;
+                       do_div(lw,total_pressure);
+                       if (!lw)
+                               lw = 1;
+                       else if (lw > CKRM_SHARE_MAX)
+                               lw = CKRM_SHARE_MAX;
+               }
+               
+               lrq->local_weight = lw;
+       }
+}
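
adjust_lrq_weight() splits a class's weight across CPUs in proportion to each local runqueue's load, keeping every per-CPU weight in [1, CKRM_SHARE_MAX] and handing an idle lrq the full class weight. A hedged userspace rendering of the same proportional split; do_div is replaced by plain 64-bit division and CKRM_SHARE_MAX is an assumed value:

    #include <stdio.h>

    #define CKRM_SHARE_MAX (1UL << 13)   /* assumed; the real value comes from ckrm_sched.h */

    /* split class_weight * ncpu across CPUs in proportion to per-CPU load */
    static void split_local_weights(unsigned long class_weight,
                                    const unsigned long *load, int ncpu,
                                    unsigned long *lw_out)
    {
            unsigned long long total = 0;
            int i;

            for (i = 0; i < ncpu; i++)
                    total += load[i];

            for (i = 0; i < ncpu; i++) {
                    unsigned long long lw;

                    if (!load[i]) {
                            /* idle lrq: keep the full class weight to help interactivity */
                            lw_out[i] = class_weight;
                            continue;
                    }
                    lw = (unsigned long long)load[i] * class_weight * ncpu / total;
                    if (!lw)
                            lw = 1;
                    else if (lw > CKRM_SHARE_MAX)
                            lw = CKRM_SHARE_MAX;
                    lw_out[i] = lw;
            }
    }

    int main(void)
    {
            unsigned long load[4] = { 300, 100, 0, 100 }, lw[4];
            int i;

            split_local_weights(1024, load, 4, lw);
            for (i = 0; i < 4; i++)
                    printf("cpu%d weight %lu\n", i, lw[i]);
            return 0;
    }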
+
+/*
+ * assume called with class_list_lock read lock held
+ */
+void adjust_local_weight(void)
+{
+       static spinlock_t lock = SPIN_LOCK_UNLOCKED; 
+       struct ckrm_cpu_class *clsptr;
+       int cpu_online;
+
+       //do nothing if someone is already holding the lock
+       if (! spin_trylock(&lock))
+               return;
+
+       cpu_online = cpus_weight(cpu_online_map);       
+
+       //recompute the local weight of every active class
+       list_for_each_entry(clsptr,&active_cpu_classes,links) {
+               adjust_lrq_weight(clsptr,cpu_online);
+       }
+
+       spin_unlock(&lock);
+}
+
+/**********************************************/
+/*          Main                              */
+/**********************************************/
 /**
  *ckrm_cpu_monitor - adjust relative shares of the classes based on their progress
  *
@@ -464,13 +818,43 @@ static void alloc_surplus(struct ckrm_core_class *root_core)
  */
 void ckrm_cpu_monitor(void)
 {
-       struct ckrm_core_class *root_core = default_cpu_class->core;
+       static spinlock_t lock = SPIN_LOCK_UNLOCKED; 
+       static unsigned long long last_check = 0;
+       struct ckrm_core_class *root_core = get_default_cpu_class()->core;
+       unsigned long long now; 
+#define MIN_CPU_MONITOR_INTERVAL 100000000UL
+
        if (!root_core)
                return;
 
-       update_effective_guarantee_limit(root_core);
-       update_cpu_demand(root_core);
-       alloc_surplus(root_core);
+       //do nothing if someone is already holding the lock
+       if (! spin_trylock(&lock))
+               return;
+
+       read_lock(&class_list_lock);
+
+       now = sched_clock();
+
+       //consecutive checks should be at least 100ms apart
+       if (now - last_check < MIN_CPU_MONITOR_INTERVAL) {
+               goto outunlock;
+       }
+       last_check = now;
+
+
+       if (update_effectives(root_core) != 0)
+               goto outunlock;
+       
+       if (update_max_demand(root_core) != 0)
+               goto outunlock;
+       
+       if (alloc_surplus(root_core) != 0)
+               goto outunlock;
+       
+       adjust_local_weight();
+ outunlock:    
+       read_unlock(&class_list_lock);
+       spin_unlock(&lock);
 }
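
ckrm_cpu_monitor() guards itself with two cheap checks rather than a dedicated lock order: a spin_trylock so concurrent callers simply bail out, and a 100 ms minimum interval measured with sched_clock(). The sketch below shows the same skip-if-busy, skip-if-too-soon control flow in ordinary userspace C with pthreads; the names and the clock source are stand-ins, not the kernel API:

    #include <pthread.h>
    #include <stdio.h>
    #include <time.h>

    #define MIN_INTERVAL_NS 100000000ULL    /* 100 ms, as in ckrm_cpu_monitor() */

    static pthread_mutex_t monitor_lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned long long last_check;

    static unsigned long long now_ns(void)
    {
            struct timespec ts;
            clock_gettime(CLOCK_MONOTONIC, &ts);
            return (unsigned long long)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
    }

    /* sketch of the "skip if busy, skip if called too soon" guard */
    static void monitor_once(void)
    {
            unsigned long long now;

            if (pthread_mutex_trylock(&monitor_lock))
                    return;                 /* someone else is already monitoring */

            now = now_ns();
            if (now - last_check >= MIN_INTERVAL_NS) {
                    last_check = now;
                    printf("running share recomputation at %llu ns\n", now);
                    /* update_effectives(); update_max_demand(); alloc_surplus(); ... */
            }
            pthread_mutex_unlock(&monitor_lock);
    }

    int main(void)
    {
            monitor_once();
            monitor_once();         /* second call within 100 ms is skipped */
            return 0;
    }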
 
 /*****************************************************/
@@ -526,6 +910,8 @@ void ckrm_kill_monitor(void)
 int ckrm_cpu_monitor_init(void)
 {
        ckrm_start_monitor();
+       /*hzheng: uncomment the following line for hard limit support */
+       //      ckrm_start_ckrm_idle();
        return 0;
 }
 
index 3162664..5909727 100644 (file)
@@ -490,6 +490,7 @@ static void ckrm_reclassify_all_tasks(void)
                                } else {
                                        read_unlock(&tasklist_lock);
                                }
+                               pos++;
                        }
                }
 
index fa8d2c4..4ecb673 100644 (file)
@@ -1374,28 +1374,32 @@ int reclassify_pid(int pid)
 int set_tasktag(int pid, char *tag)
 {
        char *tp;
+       int rc = 0;
        struct task_struct *tsk;
        struct rbce_private_data *pdata;
+       int len;
 
        if (!tag) {
                return -EINVAL;
        }
-
-       if ((tsk = find_task_by_pid(pid)) == NULL) {
-               return -EINVAL;
-       }
-
-       tp = kmalloc(strlen(tag) + 1, GFP_ATOMIC);
-
+       len = strlen(tag) + 1;
+       tp = kmalloc(len, GFP_ATOMIC);
        if (!tp) {
                return -ENOMEM;
        }
+       strncpy(tp,tag,len);
+
+       read_lock(&tasklist_lock);
+       if ((tsk = find_task_by_pid(pid)) == NULL) {
+               rc = -EINVAL;
+               goto out;
+       }
 
        if (unlikely(!RBCE_DATA(tsk))) {
                RBCE_DATAP(tsk) = create_private_data(NULL, 0);
                if (!RBCE_DATA(tsk)) {
-                       kfree(tp);
-                       return -ENOMEM;
+                       rc = -ENOMEM;
+                       goto out;
                }
        }
        pdata = RBCE_DATA(tsk);
@@ -1403,10 +1407,14 @@ int set_tasktag(int pid, char *tag)
                kfree(pdata->app_tag);
        }
        pdata->app_tag = tp;
-       strcpy(pdata->app_tag, tag);
-       rbce_ckrm_reclassify(pid);
 
-       return 0;
+ out:
+       read_unlock(&tasklist_lock);
+       if (rc != 0) 
+               kfree(tp);
+       else 
+               rbce_ckrm_reclassify(pid);
+       return rc;
 }
 
 /*====================== Classification Functions =======================*/
index 1929aaf..0400844 100644 (file)
@@ -133,12 +133,16 @@ void classqueue_update_prio(struct classqueue_struct *cq,
        
        //add to new position, round robin for classes with the same priority
        list_add_tail(&(node->list), &cq->array.queue[index]);
-       __set_bit(index, cq->array.bitmap);
-       
+       __set_bit(index, cq->array.bitmap);     
        node->index = index;
 }
 
-cq_node_t *classqueue_get_head(struct classqueue_struct *cq)
+/**
+ * classqueue_get_min_prio: return the priority of the current head of the queue
+ *
+ * this function can be called without runqueue lock held
+ */
+static inline int classqueue_get_min_prio(struct classqueue_struct *cq)
 {
        cq_node_t *result = NULL;
        int pos;
@@ -147,9 +151,36 @@ cq_node_t *classqueue_get_head(struct classqueue_struct *cq)
         * search over the bitmap to get the first class in the queue
         */
        pos = find_next_bit(cq->array.bitmap, CLASSQUEUE_SIZE, cq->base_offset);
-       if (pos >= CLASSQUEUE_SIZE) {   //do circular search from the beginning
+       //do circular search from the beginning
+       if (pos >= CLASSQUEUE_SIZE) 
                pos = find_first_bit(cq->array.bitmap, CLASSQUEUE_SIZE);
+
+       if (pos < CLASSQUEUE_SIZE) {
+               result = list_entry(cq->array.queue[pos].next, cq_node_t, list);
+               if (list_empty(&cq->array.queue[pos]))
+                       result = NULL;
        }
+       if (result)
+               return result->prio;
+       else 
+               return 0;
+}
+
+/**
+ * this function must be called with runqueue lock held
+ */
+cq_node_t *classqueue_get_head(struct classqueue_struct *cq)
+{
+       cq_node_t *result = NULL;
+       int pos;
+
+       /* 
+        * search over the bitmap to get the first class in the queue
+        */
+       pos = find_next_bit(cq->array.bitmap, CLASSQUEUE_SIZE, cq->base_offset);
+       //do circular search from the beginning
+       if (pos >= CLASSQUEUE_SIZE) 
+               pos = find_first_bit(cq->array.bitmap, CLASSQUEUE_SIZE);
 
        if (pos < CLASSQUEUE_SIZE) {
                BUG_ON(list_empty(&cq->array.queue[pos]));
@@ -162,15 +193,17 @@ cq_node_t *classqueue_get_head(struct classqueue_struct *cq)
  * Moving the end of queue forward
  * the new_base here is logical, we need to translate to the absolute position
  */
-void classqueue_update_base(struct classqueue_struct *cq, int new_base)
+void classqueue_update_base(struct classqueue_struct *cq)
 {
-       if (!cq_nr_member(cq)) {
+       int new_base;
+       
+       if (! cq_nr_member(cq)) {
                cq->base_offset = -1;   //not defined
                return;
        }
 
-       //      assert(new_base >= cq->base);
-
+       new_base = classqueue_get_min_prio(cq);
+       
        if (new_base > cq->base) {
                cq->base_offset = get_index(cq, &new_base);
                cq->base = new_base;
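
classqueue_update_base() now derives the new base from the head of the queue itself, i.e. from a circular scan of the priority bitmap that starts at base_offset and wraps around to the beginning. A toy version of that scan using a plain flag array instead of find_next_bit()/find_first_bit(), purely for illustration:

    #include <stdio.h>

    #define QSIZE 8

    /* circular search: first set slot at or after start, else wrap to the beginning */
    static int first_busy_slot(const int *busy, int size, int start)
    {
            int i;

            for (i = start; i < size; i++)
                    if (busy[i])
                            return i;
            for (i = 0; i < start; i++)
                    if (busy[i])
                            return i;
            return size;            /* nothing queued */
    }

    int main(void)
    {
            int busy[QSIZE] = { 0, 1, 0, 0, 0, 1, 0, 0 };

            /* searching from offset 6 wraps around and finds slot 1 */
            printf("head slot: %d\n", first_busy_slot(busy, QSIZE, 6));
            return 0;
    }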
index ba716d4..5ba06e1 100644 (file)
 #include <linux/init.h>
 #include <linux/ckrm_sched.h>
 
+rwlock_t   class_list_lock = RW_LOCK_UNLOCKED;
+LIST_HEAD(active_cpu_classes);   // list of active cpu classes; anchor
+
+struct ckrm_cpu_class default_cpu_class_obj;
+
+struct ckrm_cpu_class * get_default_cpu_class(void) {
+       return (&default_cpu_class_obj);
+}
+
 /*******************************************************/
 /*                CVT Management                       */
 /*******************************************************/
-#define CVT_WINDOW_SIZE (CLASSQUEUE_SIZE << CLASS_BONUS_RATE)
-static CVT_t max_CVT = CVT_WINDOW_SIZE;
-
-/*
- *  Also ensure that the classes global cvt is upgraded to the 
- * minimum CVT in the system, as a class might not have run for a while
+/**
+ * update_class_cputime - update the total cpu time received by a class
+ * 
+ * class_list_lock must have been acquired 
  */
-static void update_global_cvt(struct ckrm_cpu_class *cpu_class, int cpu)
+void update_class_cputime(int this_cpu)
 {
-       struct ckrm_local_runqueue *class_queue =
-           get_ckrm_local_runqueue(cpu_class, cpu);
-       CVT_t min_cvt;
-       CVT_t local_cvt_old = class_queue->local_cvt;
+       struct ckrm_cpu_class *clsptr;
+       ckrm_lrq_t * lrq;
+       CVT_t max_cvt, min_cvt;
 
-       spin_lock(&cvt_lock);
-       if (class_queue->uncounted_cvt) {
-               cpu_class->global_cvt += class_queue->uncounted_cvt;
-               class_queue->uncounted_cvt = 0;
-       }
-       min_cvt = max_CVT - CVT_WINDOW_SIZE;
-       if (cpu_class->global_cvt < min_cvt)
-               cpu_class->global_cvt = min_cvt;
-       else  if (cpu_class->global_cvt > max_CVT)
-               max_CVT = cpu_class->global_cvt;
+       max_cvt = 0;
+
+       /*update class time, at the same time get max_cvt */
+       list_for_each_entry(clsptr, &active_cpu_classes, links) {
+               lrq = get_ckrm_lrq(clsptr, this_cpu);
 
-/* update local cvt from global cvt*/
-#if 0
-       class_queue->local_cvt = cpu_class->global_cvt;
-#endif
-       spin_unlock(&cvt_lock);
+               spin_lock(&clsptr->stat.stat_lock);
+               clsptr->stat.total_ns += lrq->uncounted_ns;
+               spin_unlock(&clsptr->stat.stat_lock);
 
-       if (class_queue->local_cvt != local_cvt_old)
-               update_class_priority(class_queue);
+               lrq->uncounted_ns = 0;
+               if (lrq->local_cvt > max_cvt)
+                       max_cvt = lrq->local_cvt;
+       }
+       /* guard against underflow while max_cvt is still small */
+       min_cvt = (max_cvt > CVT_INTERACTIVE_BONUS) ?
+               max_cvt - CVT_INTERACTIVE_BONUS : 0;
+
+       /*check again, make sure no class gets too small a cvt*/
+       list_for_each_entry(clsptr, &active_cpu_classes, links) {
+               lrq = get_ckrm_lrq(clsptr, this_cpu);
+               if (lrq->local_cvt < min_cvt)
+                       lrq->local_cvt = min_cvt;
+       }
 }
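
The second pass in update_class_cputime() keeps every local CVT within CVT_INTERACTIVE_BONUS of the current maximum, so a class that slept for a long time cannot come back with an arbitrarily small virtual time and monopolize the CPU. The clamp in isolation; the bonus value here is assumed, the kernel defines the real one in ckrm_sched.h:

    #include <stdio.h>

    #define NCLS 3
    #define CVT_INTERACTIVE_BONUS 1000ULL   /* assumed value, for illustration */

    int main(void)
    {
            unsigned long long cvt[NCLS] = { 5000, 80, 4900 };
            unsigned long long max_cvt = 0, min_cvt;
            int i;

            for (i = 0; i < NCLS; i++)
                    if (cvt[i] > max_cvt)
                            max_cvt = cvt[i];

            /* no class may lag more than the interactive bonus behind the leader */
            min_cvt = max_cvt - CVT_INTERACTIVE_BONUS;
            for (i = 0; i < NCLS; i++)
                    if (cvt[i] < min_cvt)
                            cvt[i] = min_cvt;

            for (i = 0; i < NCLS; i++)
                    printf("class %d cvt %llu\n", i, cvt[i]);
            return 0;
    }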
 
-/*
- * class_list_lock must have been acquired 
+/*******************************************************/
+/*                PID load balancing stuff             */
+/*******************************************************/
+#define PID_SAMPLE_T 32
+#define PID_KP 20
+#define PID_KI 60
+#define PID_KD 20
+
+/**
+ * sample pid load periodically
  */
-void update_global_cvts(int this_cpu)
+void ckrm_load_sample(ckrm_load_t* pid,int cpu)
 {
-       struct ckrm_cpu_class *clsptr;
-       struct ckrm_local_runqueue *class_queue;
+       long load;
+       long err;
 
-       /*for each class*/
-       list_for_each_entry(clsptr, &active_cpu_classes, links) {
-               update_global_cvt(clsptr, this_cpu);
-               class_queue = get_ckrm_local_runqueue(clsptr, this_cpu);
-               clsptr->stat.total_ns += class_queue->uncounted_ns;
-               class_queue->uncounted_ns = 0;
-       }
+       if (jiffies % PID_SAMPLE_T)
+               return;
+
+       adjust_local_weight();  
+
+       load = ckrm_cpu_load(cpu);
+       err = load - pid->load_p;
+       pid->load_d = err;
+       pid->load_p = load;
+       pid->load_i *= 9;
+       pid->load_i += load;
+       pid->load_i /= 10;
+}
+
+long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group)
+{
+       long pressure;
+       pressure = ckrm_load->load_p * PID_KP;
+       pressure += ckrm_load->load_i * PID_KI;
+       pressure += ckrm_load->load_d * PID_KD;
+       pressure /= 100;
+       return pressure;
 }
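
ckrm_load_sample()/pid_get_pressure() form a small PID-style filter: every PID_SAMPLE_T jiffies the raw per-cpu load becomes the proportional term, its change the derivative, and a 90/10 running average the integral; the pressure is then the weighted sum divided by 100. A self-contained rendition of the same filter fed with a synthetic load series, illustrative only:

    #include <stdio.h>

    #define PID_KP 20
    #define PID_KI 60
    #define PID_KD 20

    struct ckrm_load {
            long load_p;    /* proportional: latest sample */
            long load_i;    /* integral: 90/10 running average */
            long load_d;    /* derivative: change since last sample */
    };

    static void load_sample(struct ckrm_load *l, long load)
    {
            l->load_d = load - l->load_p;
            l->load_p = load;
            l->load_i = (l->load_i * 9 + load) / 10;
    }

    static long get_pressure(const struct ckrm_load *l)
    {
            return (l->load_p * PID_KP + l->load_i * PID_KI + l->load_d * PID_KD) / 100;
    }

    int main(void)
    {
            struct ckrm_load l = { 0, 0, 0 };
            long samples[] = { 100, 100, 400, 400, 100 };
            unsigned i;

            for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                    load_sample(&l, samples[i]);
                    printf("sample %ld -> pressure %ld\n", samples[i], get_pressure(&l));
            }
            return 0;
    }

The derivative term makes the pressure spike when load jumps, while the integral term keeps it elevated for a while afterwards, which is exactly what the balancer wants to react to.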
index fa04c39..f0e2dce 100644 (file)
 #define LOW_CREDIT(p) \
        ((p)->interactive_credit < -CREDIT_LIMIT)
 
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+/*
+ *  if the tasks belong to different classes, compare class priority;
+ *  otherwise compare task priority
+ */
+#define TASK_PREEMPTS_CURR(p, rq) \
+       ((((p)->cpu_class != (rq)->curr->cpu_class) && ((rq)->curr != (rq)->idle)) ? class_preempts_curr((p),(rq)->curr) : ((p)->prio < (rq)->curr->prio))
+
+#else
+
+#define TASK_PREEMPTS_CURR(p, rq) \
+       ((p)->prio < (rq)->curr->prio)
+#endif
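
The macro above switches between two comparisons: across classes (and not against the idle task) the class-level test class_preempts_curr() decides, within a class the usual prio comparison does. A userspace sketch of that decision; the class-level test here is a stand-in ("higher class weight wins"), which is an assumption for the example, not the kernel's actual rule:

    #include <stdio.h>

    struct task { int class_id; int class_weight; int prio; int is_idle; };

    /* stand-in for the kernel's class_preempts_curr(): higher weight wins */
    static int class_preempts_curr(const struct task *p, const struct task *curr)
    {
            return p->class_weight > curr->class_weight;
    }

    /* same decision shape as TASK_PREEMPTS_CURR: class comparison across classes,
     * priority comparison within a class (or against the idle task) */
    static int task_preempts_curr(const struct task *p, const struct task *curr)
    {
            if (p->class_id != curr->class_id && !curr->is_idle)
                    return class_preempts_curr(p, curr);
            return p->prio < curr->prio;
    }

    int main(void)
    {
            struct task curr  = { 1, 100, 110, 0 };
            struct task wakee = { 2, 300, 120, 0 };

            printf("preempt: %d\n", task_preempts_curr(&wakee, &curr));
            return 0;
    }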
+
 /*
  * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
  * to time slice values.
                ((MAX_TIMESLICE - MIN_TIMESLICE) * \
                        (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1)))
 
-static unsigned int task_timeslice(task_t *p)
+unsigned int task_timeslice(task_t *p)
 {
        return BASE_TIMESLICE(p);
 }
@@ -185,32 +199,8 @@ static unsigned int task_timeslice(task_t *p)
  * These are the runqueue data structures:
  */
 typedef struct runqueue runqueue_t;
-
-#ifdef CONFIG_CKRM_CPU_SCHEDULE
 #include <linux/ckrm_classqueue.h>
-#endif
-
-#ifdef CONFIG_CKRM_CPU_SCHEDULE
-
-/**
- *  if belong to different class, compare class priority
- *  otherwise compare task priority 
- */
-#define TASK_PREEMPTS_CURR(p, rq) \
-       (((p)->cpu_class != (rq)->curr->cpu_class) && ((rq)->curr != (rq)->idle))? class_preempts_curr((p),(rq)->curr) : ((p)->prio < (rq)->curr->prio)
-#else
-#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
-struct prio_array {
-       unsigned int nr_active;
-       unsigned long bitmap[BITMAP_SIZE];
-       struct list_head queue[MAX_PRIO];
-};
-#define rq_active(p,rq)   (rq->active)
-#define rq_expired(p,rq)  (rq->expired)
-#define ckrm_rebalance_tick(j,this_cpu) do {} while (0)
-#define TASK_PREEMPTS_CURR(p, rq) \
-       ((p)->prio < (rq)->curr->prio)
-#endif
+#include <linux/ckrm_sched.h>
 
 /*
  * This is the main, per-CPU runqueue data structure.
@@ -227,7 +217,7 @@ struct runqueue {
         * remote CPUs use both these fields when doing load calculation.
         */
        unsigned long nr_running;
-#if defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
        unsigned long cpu_load;
 #endif
        unsigned long long nr_switches, nr_preempt;
@@ -236,8 +226,8 @@ struct runqueue {
        task_t *curr, *idle;
        struct mm_struct *prev_mm;
 #ifdef CONFIG_CKRM_CPU_SCHEDULE
-       unsigned long ckrm_cpu_load;
        struct classqueue_struct classqueue;   
+       ckrm_load_t ckrm_load;
 #else
         prio_array_t *active, *expired, arrays[2];
 #endif
@@ -277,77 +267,52 @@ static DEFINE_PER_CPU(struct runqueue, runqueues);
 # define task_running(rq, p)           ((rq)->curr == (p))
 #endif
 
-#ifdef CONFIG_CKRM_CPU_SCHEDULE
-#include <linux/ckrm_sched.h>
-spinlock_t cvt_lock        = SPIN_LOCK_UNLOCKED;
-rwlock_t   class_list_lock = RW_LOCK_UNLOCKED;
-LIST_HEAD(active_cpu_classes);   // list of active cpu classes; anchor
-struct ckrm_cpu_class default_cpu_class_obj;
-
 /*
- * the minimum CVT allowed is the base_cvt
- * otherwise, it will starve others
+ * task_rq_lock - lock the runqueue a given task resides on and disable
+ * interrupts.  Note the ordering: we can safely lookup the task_rq without
+ * explicitly disabling preemption.
  */
-CVT_t get_min_cvt(int cpu)
+static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
 {
-       cq_node_t *node;
-       struct ckrm_local_runqueue * lrq;
-       CVT_t min_cvt;
+       struct runqueue *rq;
 
-       node = classqueue_get_head(bpt_queue(cpu));
-       lrq =  (node) ? class_list_entry(node) : NULL;
-       
-       if (lrq) 
-               min_cvt = lrq->local_cvt;
-       else 
-               min_cvt = 0;
-               
-       return min_cvt;
+repeat_lock_task:
+       local_irq_save(*flags);
+       rq = task_rq(p);
+       spin_lock(&rq->lock);
+       if (unlikely(rq != task_rq(p))) {
+               spin_unlock_irqrestore(&rq->lock, *flags);
+               goto repeat_lock_task;
+       }
+       return rq;
+}
+
+static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
+{
+       spin_unlock_irqrestore(&rq->lock, *flags);
 }
 
 /*
- * update the classueue base for all the runqueues
- * TODO: we can only update half of the min_base to solve the movebackward issue
+ * rq_lock - lock a given runqueue and disable interrupts.
  */
-static inline void check_update_class_base(int this_cpu) {
-       unsigned long min_base = 0xFFFFFFFF; 
-       cq_node_t *node;
-       int i;
+static runqueue_t *this_rq_lock(void)
+{
+       runqueue_t *rq;
 
-       if (! cpu_online(this_cpu)) return;
+       local_irq_disable();
+       rq = this_rq();
+       spin_lock(&rq->lock);
 
-       /*
-        * find the min_base across all the processors
-        */
-       for_each_online_cpu(i) {
-               /*
-                * I should change it to directly use bpt->base
-                */
-               node = classqueue_get_head(bpt_queue(i));
-               if (node && node->prio < min_base) {
-                       min_base = node->prio;
-               }
-       }
-       if (min_base != 0xFFFFFFFF) 
-               classqueue_update_base(bpt_queue(this_cpu),min_base);
+       return rq;
 }
 
-static inline void ckrm_rebalance_tick(int j,int this_cpu)
+static inline void rq_unlock(runqueue_t *rq)
 {
-#ifdef CONFIG_CKRM_CPU_SCHEDULE
-       read_lock(&class_list_lock);
-       if (!(j % CVT_UPDATE_TICK))
-               update_global_cvts(this_cpu);
-
-#define CKRM_BASE_UPDATE_RATE 400
-       if (! (jiffies % CKRM_BASE_UPDATE_RATE))
-               check_update_class_base(this_cpu);
-
-       read_unlock(&class_list_lock);
-#endif
+       spin_unlock_irq(&rq->lock);
 }
 
-static inline struct ckrm_local_runqueue *rq_get_next_class(struct runqueue *rq)
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+static inline ckrm_lrq_t *rq_get_next_class(struct runqueue *rq)
 {
        cq_node_t *node = classqueue_get_head(&rq->classqueue);
        return ((node) ? class_list_entry(node) : NULL);
@@ -357,7 +322,8 @@ static inline struct task_struct * rq_get_next_task(struct runqueue* rq)
 {
        prio_array_t               *array;
        struct task_struct         *next;
-       struct ckrm_local_runqueue *queue;
+       ckrm_lrq_t *queue;
+       int idx;
        int cpu = smp_processor_id();
        
        next = rq->idle;
@@ -365,7 +331,7 @@ static inline struct task_struct * rq_get_next_task(struct runqueue* rq)
        if ((queue = rq_get_next_class(rq))) {
                array = queue->active;
                //check switch active/expired queue
-               if (unlikely(!queue->active->nr_active)) {
+               if (unlikely(!array->nr_active)) {
                        queue->active = queue->expired;
                        queue->expired = array;
                        queue->expired_timestamp = 0;
@@ -378,20 +344,20 @@ static inline struct task_struct * rq_get_next_task(struct runqueue* rq)
                                                   &queue->classqueue_linkobj);
                                cpu_demand_event(get_rq_local_stat(queue,cpu),CPU_DEMAND_DEQUEUE,0);
                        }
-
                        goto retry_next_class;                          
                }
-               BUG_ON(!queue->active->nr_active);
-               next = task_list_entry(array->queue[queue->top_priority].next);
+               BUG_ON(!array->nr_active);
+
+               idx = queue->top_priority;
+               if (queue->top_priority == MAX_PRIO) {
+                       BUG_ON(1);
+               }
+
+               next = task_list_entry(array->queue[idx].next);
        }
        return next;
 }
-
-static inline void rq_load_inc(runqueue_t *rq, struct task_struct *p) { rq->ckrm_cpu_load += cpu_class_weight(p->cpu_class); }
-static inline void rq_load_dec(runqueue_t *rq, struct task_struct *p) { rq->ckrm_cpu_load -= cpu_class_weight(p->cpu_class); }
-
-#else /*CONFIG_CKRM_CPU_SCHEDULE*/
-
+#else /*! CONFIG_CKRM_CPU_SCHEDULE*/
 static inline struct task_struct * rq_get_next_task(struct runqueue* rq) 
 {
        prio_array_t *array;
@@ -418,59 +384,14 @@ static inline struct task_struct * rq_get_next_task(struct runqueue* rq)
 static inline void class_enqueue_task(struct task_struct* p, prio_array_t *array) { }
 static inline void class_dequeue_task(struct task_struct* p, prio_array_t *array) { }
 static inline void init_cpu_classes(void) { }
-static inline void rq_load_inc(runqueue_t *rq, struct task_struct *p) { }
-static inline void rq_load_dec(runqueue_t *rq, struct task_struct *p) { }
+#define rq_ckrm_load(rq) NULL
+static inline void ckrm_sched_tick(int j,int this_cpu,void* name) {}
 #endif  /* CONFIG_CKRM_CPU_SCHEDULE */
 
-
-/*
- * task_rq_lock - lock the runqueue a given task resides on and disable
- * interrupts.  Note the ordering: we can safely lookup the task_rq without
- * explicitly disabling preemption.
- */
-runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
-{
-       struct runqueue *rq;
-
-repeat_lock_task:
-       local_irq_save(*flags);
-       rq = task_rq(p);
-       spin_lock(&rq->lock);
-       if (unlikely(rq != task_rq(p))) {
-               spin_unlock_irqrestore(&rq->lock, *flags);
-               goto repeat_lock_task;
-       }
-       return rq;
-}
-
-void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
-{
-       spin_unlock_irqrestore(&rq->lock, *flags);
-}
-
-/*
- * rq_lock - lock a given runqueue and disable interrupts.
- */
-static runqueue_t *this_rq_lock(void)
-{
-       runqueue_t *rq;
-
-       local_irq_disable();
-       rq = this_rq();
-       spin_lock(&rq->lock);
-
-       return rq;
-}
-
-static inline void rq_unlock(runqueue_t *rq)
-{
-       spin_unlock_irq(&rq->lock);
-}
-
 /*
  * Adding/removing a task to/from a priority array:
  */
-void dequeue_task(struct task_struct *p, prio_array_t *array)
+static void dequeue_task(struct task_struct *p, prio_array_t *array)
 {
        BUG_ON(! array);
        array->nr_active--;
@@ -480,7 +401,7 @@ void dequeue_task(struct task_struct *p, prio_array_t *array)
        class_dequeue_task(p,array);
 }
 
-void enqueue_task(struct task_struct *p, prio_array_t *array)
+static void enqueue_task(struct task_struct *p, prio_array_t *array)
 {
        list_add_tail(&p->run_list, array->queue + p->prio);
        __set_bit(p->prio, array->bitmap);
@@ -544,7 +465,6 @@ static inline void __activate_task(task_t *p, runqueue_t *rq)
 {
        enqueue_task(p, rq_active(p,rq));
        rq->nr_running++;
-       rq_load_inc(rq,p);
 }
 
 /*
@@ -554,7 +474,6 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 {
        enqueue_task_head(p, rq_active(p,rq));
        rq->nr_running++;
-       rq_load_inc(rq,p);
 }
 
 static void recalc_task_prio(task_t *p, unsigned long long now)
@@ -686,7 +605,6 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
 static void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
        rq->nr_running--;
-       rq_load_dec(rq,p);
        if (p->state == TASK_UNINTERRUPTIBLE)
                rq->nr_uninterruptible++;
        dequeue_task(p, p->array);
@@ -1060,6 +978,10 @@ void fastcall sched_fork(task_t *p)
        INIT_LIST_HEAD(&p->run_list);
        p->array = NULL;
        spin_lock_init(&p->switch_lock);
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+       cpu_demand_event(&p->demand_stat,CPU_DEMAND_INIT,0);
+#endif
+
 #ifdef CONFIG_PREEMPT
        /*
         * During context-switch we hold precisely one spinlock, which
@@ -1135,7 +1057,7 @@ void fastcall wake_up_forked_process(task_t * p)
                p->array = current->array;
                p->array->nr_active++;
                rq->nr_running++;
-               rq_load_inc(rq,p);
+               class_enqueue_task(p,p->array);
        }
        task_rq_unlock(rq, &flags);
 }
@@ -1468,7 +1390,7 @@ lock_again:
                        p->array = current->array;
                        p->array->nr_active++;
                        rq->nr_running++;
-                       rq_load_inc(rq,p);
+                       class_enqueue_task(p,p->array);
                }
        } else {
                /* Not the local CPU - must adjust timestamp */
@@ -1573,13 +1495,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
 {
        dequeue_task(p, src_array);
        src_rq->nr_running--;
-       rq_load_dec(src_rq,p);
-
        set_task_cpu(p, this_cpu);
        this_rq->nr_running++;
-       rq_load_inc(this_rq,p);
        enqueue_task(p, this_array);
-
        p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
                                + this_rq->timestamp_last_tick;
        /*
@@ -1619,133 +1537,61 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
 }
 
 #ifdef CONFIG_CKRM_CPU_SCHEDULE
-
-struct ckrm_cpu_class *find_unbalanced_class(int busiest_cpu, int this_cpu, unsigned long *cls_imbalance)
+static inline int ckrm_preferred_task(task_t *tmp,long min, long max, 
+                                     int phase, enum idle_type idle)
 {
-       struct ckrm_cpu_class *most_unbalanced_class = NULL;
-       struct ckrm_cpu_class *clsptr;
-       int max_unbalance = 0;
-
-       list_for_each_entry(clsptr,&active_cpu_classes,links) {
-               struct ckrm_local_runqueue *this_lrq    = get_ckrm_local_runqueue(clsptr,this_cpu);
-               struct ckrm_local_runqueue *busiest_lrq = get_ckrm_local_runqueue(clsptr,busiest_cpu);
-               int unbalance_degree;
-               
-               unbalance_degree = (local_queue_nr_running(busiest_lrq) - local_queue_nr_running(this_lrq)) * cpu_class_weight(clsptr);
-               if (unbalance_degree >= *cls_imbalance) 
-                       continue;  // already looked at this class
+       long pressure = task_load(tmp);
+       
+       if (pressure > max) 
+               return 0;
 
-               if (unbalance_degree > max_unbalance) {
-                       max_unbalance = unbalance_degree;
-                       most_unbalanced_class = clsptr;
-               }
-       }
-       *cls_imbalance = max_unbalance;
-       return most_unbalanced_class;
+       if ((idle == NOT_IDLE) && ! phase && (pressure <= min))
+               return 0;
+       return 1;
 }
 
-
 /*
- * find_busiest_queue - find the busiest runqueue among the cpus in cpumask.
+ * move tasks for a specific local class
+ * return number of tasks pulled
  */
-static int find_busiest_cpu(runqueue_t *this_rq, int this_cpu, int idle, 
-                           int *imbalance)
+static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq,
+                                     runqueue_t *this_rq,
+                                     runqueue_t *busiest,
+                                     struct sched_domain *sd,
+                                     int this_cpu,
+                                     enum idle_type idle,
+                                     long* pressure_imbalance) 
 {
-       int cpu_load, load, max_load, i, busiest_cpu;
-       runqueue_t *busiest, *rq_src;
-
-
-       /*Hubertus ... the concept of nr_running is replace with cpu_load */
-       cpu_load = this_rq->ckrm_cpu_load;
-
-       busiest = NULL;
-       busiest_cpu = -1;
-
-       max_load = -1;
-       for_each_online_cpu(i) {
-               rq_src = cpu_rq(i);
-               load = rq_src->ckrm_cpu_load;
-
-               if ((load > max_load) && (rq_src != this_rq)) {
-                       busiest = rq_src;
-                       busiest_cpu = i;
-                       max_load = load;
-               }
-       }
-
-       if (likely(!busiest))
-               goto out;
-
-       *imbalance = max_load - cpu_load;
-
-       /* It needs an at least ~25% imbalance to trigger balancing. */
-       if (!idle && ((*imbalance)*4 < max_load)) {
-               busiest = NULL;
-               goto out;
-       }
-
-       double_lock_balance(this_rq, busiest);
-       /*
-        * Make sure nothing changed since we checked the
-        * runqueue length.
-        */
-       if (busiest->ckrm_cpu_load <= cpu_load) {
-               spin_unlock(&busiest->lock);
-               busiest = NULL;
-       }
-out:
-       return (busiest ? busiest_cpu : -1);
-}
-
-static int load_balance(int this_cpu, runqueue_t *this_rq,
-                       struct sched_domain *sd, enum idle_type idle)
-{
-       int imbalance, idx;
-       int busiest_cpu;
-       runqueue_t *busiest;
-       prio_array_t *array;
+       prio_array_t *array, *dst_array;
        struct list_head *head, *curr;
        task_t *tmp;
-        struct ckrm_local_runqueue * busiest_local_queue;
-       struct ckrm_cpu_class *clsptr;
-       int weight;
-       unsigned long cls_imbalance;      // so we can retry other classes
-
-       // need to update global CVT based on local accumulated CVTs
-       read_lock(&class_list_lock);
-       busiest_cpu = find_busiest_cpu(this_rq, this_cpu, idle, &imbalance);
-       if (busiest_cpu == -1)
-               goto out;
-
-       busiest = cpu_rq(busiest_cpu);
-
-       /*
-        * We only want to steal a number of tasks equal to 1/2 the imbalance,
-        * otherwise we'll just shift the imbalance to the new queue:
-        */
-       imbalance /= 2;
-               
-       /* now find class on that runqueue with largest inbalance */
-       cls_imbalance = 0xFFFFFFFF; 
-
- retry_other_class:
-       clsptr = find_unbalanced_class(busiest_cpu, this_cpu, &cls_imbalance);
-       if (!clsptr) 
-               goto out_unlock;
-
-       busiest_local_queue = get_ckrm_local_runqueue(clsptr,busiest_cpu);
-       weight = cpu_class_weight(clsptr);
-
+       int idx;
+       int pulled = 0;
+       int phase = -1;
+       long pressure_min, pressure_max;
+       /*hzheng: magic : 90% balance is enough*/
+       long balance_min = *pressure_imbalance / 10; 
+       /*
+        * we don't want to migrate tasks that will reverse the balance
+        * or tasks that make too small a difference
+        */
+#define CKRM_BALANCE_MAX_RATIO 100
+#define CKRM_BALANCE_MIN_RATIO 1
+ start:
+       phase ++;
        /*
         * We first consider expired tasks. Those will likely not be
         * executed in the near future, and they are most likely to
         * be cache-cold, thus switching CPUs has the least effect
         * on them.
         */
-       if (busiest_local_queue->expired->nr_active)
-               array = busiest_local_queue->expired;
-       else
-               array = busiest_local_queue->active;
+       if (src_lrq->expired->nr_active) {
+               array = src_lrq->expired;
+               dst_array = dst_lrq->expired;
+       } else {
+               array = src_lrq->active;
+               dst_array = dst_lrq->active;
+       }
        
  new_array:
        /* Start searching at priority 0: */
@@ -1756,11 +1602,15 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
        else
                idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
        if (idx >= MAX_PRIO) {
-               if (array == busiest_local_queue->expired && busiest_local_queue->active->nr_active) {
-                       array = busiest_local_queue->active;
+               if (array == src_lrq->expired && src_lrq->active->nr_active) {
+                       array = src_lrq->active;
+                       dst_array = dst_lrq->active;
                        goto new_array;
                }
-               goto retry_other_class;
+               if ((! phase) && (! pulled) && (idle != IDLE))
+                       goto start; //try again
+               else 
+                       goto out; //finished search for this lrq
        }
        
        head = array->queue + idx;
@@ -1770,42 +1620,365 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
        
        curr = curr->prev;
        
-       if (!can_migrate_task(tmp, busiest, this_cpu, sd,idle)) {
+       if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
                if (curr != head)
                        goto skip_queue;
                idx++;
                goto skip_bitmap;
        }
-       pull_task(busiest, array, tmp, this_rq, rq_active(tmp,this_rq),this_cpu);
+
+       pressure_min = *pressure_imbalance * CKRM_BALANCE_MIN_RATIO/100;
+       pressure_max = *pressure_imbalance * CKRM_BALANCE_MAX_RATIO/100;
        /*
-        * tmp BUG FIX: hzheng
-        * load balancing can make the busiest local queue empty
-        * thus it should be removed from bpt
+        * skip the tasks that will reverse the balance too much
         */
-       if (! local_queue_nr_running(busiest_local_queue)) {
-               classqueue_dequeue(busiest_local_queue->classqueue,&busiest_local_queue->classqueue_linkobj);
-               cpu_demand_event(get_rq_local_stat(busiest_local_queue,busiest_cpu),CPU_DEMAND_DEQUEUE,0);              
+       if (ckrm_preferred_task(tmp,pressure_min,pressure_max,phase,idle)) {
+               *pressure_imbalance -= task_load(tmp);
+               pull_task(busiest, array, tmp, 
+                         this_rq, dst_array, this_cpu);
+               pulled++;
+
+               if (*pressure_imbalance <= balance_min)
+                       goto out;
        }
+               
+       if (curr != head)
+               goto skip_queue;
+       idx++;
+       goto skip_bitmap;
+ out:         
+       return pulled;
+}
 
-       imbalance -= weight;
-       if (!idle && (imbalance>0)) {
-               if (curr != head)
-                       goto skip_queue;
-               idx++;
-               goto skip_bitmap;
+static inline long ckrm_rq_imbalance(runqueue_t *this_rq,runqueue_t *dst_rq)
+{
+       long imbalance;
+       /*
+        * make sure after balance, imbalance' > - imbalance/2
+        * we don't want the imbalance to be reversed too much
+        */
+       imbalance = pid_get_pressure(rq_ckrm_load(dst_rq),0) 
+               - pid_get_pressure(rq_ckrm_load(this_rq),1);
+       imbalance /= 2;
+       return imbalance;
+}
+
+/*
+ * try to balance the two runqueues
+ *
+ * Called with both runqueues locked.
+ * if move_tasks is called, it will try to move at least one task over
+ */
+static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
+                     unsigned long max_nr_move, struct sched_domain *sd,
+                     enum idle_type idle)
+{
+       struct ckrm_cpu_class *clsptr,*vip_cls = NULL;
+       ckrm_lrq_t* src_lrq,*dst_lrq;
+       long pressure_imbalance, pressure_imbalance_old;
+       int src_cpu = task_cpu(busiest->curr);
+       struct list_head *list;
+       int pulled = 0;
+       long imbalance;
+
+       imbalance =  ckrm_rq_imbalance(this_rq,busiest);
+
+       if ((idle == NOT_IDLE && imbalance <= 0) || busiest->nr_running <= 1)
+               goto out;
+
+       //try to find the vip class
+        list_for_each_entry(clsptr,&active_cpu_classes,links) {
+               src_lrq = get_ckrm_lrq(clsptr,src_cpu);
+
+               if (! lrq_nr_running(src_lrq))
+                       continue;
+
+               if (!vip_cls ||
+                   cpu_class_weight(vip_cls) < cpu_class_weight(clsptr))
+                       vip_cls = clsptr;
        }
- out_unlock:
-       spin_unlock(&busiest->lock);
+
+       /*
+        * hopefully, fewer tasks will be migrated this way
+        * hopefully, less tasks will be migrated this way
+        */
+       clsptr = vip_cls;
+
+ move_class:
+       if (! clsptr)
+               goto out;
+       
+
+       src_lrq = get_ckrm_lrq(clsptr,src_cpu);
+       if (! lrq_nr_running(src_lrq))
+               goto other_class;
+       
+       dst_lrq = get_ckrm_lrq(clsptr,this_cpu);
+
+       //how much pressure for this class should be transferred
+       pressure_imbalance = src_lrq->lrq_load * imbalance/src_lrq->local_weight;
+       if (pulled && ! pressure_imbalance) 
+               goto other_class;
+       
+       pressure_imbalance_old = pressure_imbalance;
+       
+       //move tasks
+       pulled += 
+               ckrm_cls_move_tasks(src_lrq,dst_lrq,
+                                   this_rq,
+                                   busiest,
+                                   sd,this_cpu,idle,
+                                   &pressure_imbalance);
+
+       /* 
+        * hzheng: 2 is another magic number
+        * stop balancing if the imbalance is less than 25% of the orig
+        */
+       if (pressure_imbalance <= (pressure_imbalance_old >> 2))
+               goto out;
+               
+       //scale the remaining imbalance proportionally
+       imbalance = imbalance * pressure_imbalance / pressure_imbalance_old;
+ other_class:
+       //who is next?
+       list = clsptr->links.next;
+       if (list == &active_cpu_classes)
+               list = list->next;
+       clsptr = list_entry(list, typeof(*clsptr), links);
+       if (clsptr != vip_cls)
+               goto move_class;
  out:
-       read_unlock(&class_list_lock);
+       return pulled;
+}
+
+/**
+ * ckrm_check_balance - is load balancing necessary?
+ * return 0 if load balancing is not necessary
+ * otherwise return the average load of the system
+ * also, update nr_group
+ *
+ * heuristics: 
+ *   no load balancing if its load is over the average
+ *   no load balancing if its load is far more than the min
+ * task:
+ *   read the status of all the runqueues
+ */
+static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu,
+                                            enum idle_type idle, int* nr_group)
+{
+       struct sched_group *group = sd->groups;
+       unsigned long min_load, max_load, avg_load;
+       unsigned long total_load, this_load, total_pwr;
+
+       max_load = this_load = total_load = total_pwr = 0;
+       min_load = 0xFFFFFFFF;
+       *nr_group = 0;
+
+       do {
+               cpumask_t tmp;
+               unsigned long load;
+               int local_group;
+               int i, nr_cpus = 0;
+
+               /* Tally up the load of all CPUs in the group */
+               cpus_and(tmp, group->cpumask, cpu_online_map);
+               if (unlikely(cpus_empty(tmp)))
+                       goto nextgroup;
+
+               avg_load = 0;
+               local_group = cpu_isset(this_cpu, group->cpumask);
+
+               for_each_cpu_mask(i, tmp) {
+                       load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),local_group);
+                       nr_cpus++;
+                       avg_load += load;
+               }
+
+               if (!nr_cpus)
+                       goto nextgroup;
+
+               total_load += avg_load;
+               total_pwr += group->cpu_power;
+
+               /* Adjust by relative CPU power of the group */
+               avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+
+               if (local_group) {
+                       this_load = avg_load;
+                       goto nextgroup;
+               } else if (avg_load > max_load) {
+                       max_load = avg_load;
+               }      
+               if (avg_load < min_load) {
+                       min_load = avg_load;
+               }
+nextgroup:
+               group = group->next;
+               *nr_group = *nr_group + 1;
+       } while (group != sd->groups);
+
+       if (!max_load || this_load >= max_load)
+               goto out_balanced;
+
+       avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
+
+       /* hzheng: debugging: 105 is a magic number
+        * 100*max_load <= sd->imbalance_pct*this_load)
+        * should use imbalance_pct instead
+        */
+       if (this_load > avg_load 
+           || 100*max_load < 105*this_load
+           || 100*min_load < 70*this_load
+           )
+               goto out_balanced;
+
+       return avg_load;
+ out_balanced:
        return 0;
 }
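
The tail of ckrm_check_balance() is the whole go/no-go heuristic: never balance when this CPU's group is already at or above the heaviest group (or above the average), require the heaviest group to be at least roughly 5% heavier, and back off when some group is dramatically (below roughly 70%) lighter. Those checks in isolation, with hypothetical load figures:

    #include <stdio.h>

    /* returns nonzero if the 105%/70% heuristic would trigger balancing */
    static int should_balance(unsigned long this_load, unsigned long max_load,
                              unsigned long min_load, unsigned long avg_load)
    {
            if (!max_load || this_load >= max_load)
                    return 0;
            if (this_load > avg_load)
                    return 0;
            if (100 * max_load < 105 * this_load)
                    return 0;
            if (100 * min_load < 70 * this_load)
                    return 0;
            return 1;
    }

    int main(void)
    {
            printf("%d\n", should_balance(100, 160, 90, 120));   /* 1: worth balancing */
            printf("%d\n", should_balance(100, 104, 90, 120));   /* 0: within 5% */
            return 0;
    }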
 
+/**
+ * any group that has above average load is considered busy
+ * find the busiest queue from any busy group
+ */
+static runqueue_t *
+ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu,
+                    unsigned long avg_load, enum idle_type idle,
+                    int nr_group)
+{
+       struct sched_group *group;
+       runqueue_t * busiest=NULL;
+       unsigned long rand;
+       
+       group = sd->groups;
+       rand = get_ckrm_rand(nr_group);
+       nr_group = 0;
+
+       do {
+               unsigned long load,total_load,max_load;
+               cpumask_t tmp;
+               int i;
+               runqueue_t * grp_busiest;
+
+               cpus_and(tmp, group->cpumask, cpu_online_map);
+               if (unlikely(cpus_empty(tmp)))
+                       goto find_nextgroup;
+
+               total_load = 0;
+               max_load = 0;
+               grp_busiest = NULL;
+               for_each_cpu_mask(i, tmp) {
+                       load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),0);
+                       total_load += load;
+                       if (load > max_load) {
+                               max_load = load;
+                               grp_busiest = cpu_rq(i);
+                       }                               
+               }
+
+               total_load = (total_load * SCHED_LOAD_SCALE) / group->cpu_power;
+               if (total_load > avg_load) {
+                       busiest = grp_busiest;
+                       if (nr_group >= rand)
+                               break;
+               }
+       find_nextgroup:         
+               group = group->next;
+               nr_group ++;
+       } while (group != sd->groups);
+
+       return busiest;
+}
+
+/**
+ * ckrm_load_balance - pressure-based load balancing algorithm used by ckrm
+ */
+static int ckrm_load_balance(int this_cpu, runqueue_t *this_rq,
+                       struct sched_domain *sd, enum idle_type idle)
+{
+       runqueue_t *busiest;
+       unsigned long avg_load;
+       int nr_moved,nr_group;
 
-static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
+       avg_load = ckrm_check_balance(sd, this_cpu, idle, &nr_group);
+       if (! avg_load)
+               goto out_balanced;
+
+       busiest = ckrm_find_busy_queue(sd,this_cpu,avg_load,idle,nr_group);
+       if (! busiest)
+               goto out_balanced;
+       /*
+        * This should be "impossible", but since load
+        * balancing is inherently racy and statistical,
+        * it could happen in theory.
+        */
+       if (unlikely(busiest == this_rq)) {
+               WARN_ON(1);
+               goto out_balanced;
+       }
+
+       nr_moved = 0;
+       if (busiest->nr_running > 1) {
+               /*
+                * Attempt to move tasks. If find_busiest_group has found
+                * an imbalance but busiest->nr_running <= 1, the group is
+                * still unbalanced. nr_moved simply stays zero, so it is
+                * correctly treated as an imbalance.
+                */
+               double_lock_balance(this_rq, busiest);
+               nr_moved = move_tasks(this_rq, this_cpu, busiest,
+                                     0,sd, idle);              
+               spin_unlock(&busiest->lock);
+               if (nr_moved) {
+                       adjust_local_weight();
+               }
+       }
+
+       if (!nr_moved) 
+               sd->nr_balance_failed ++;
+       else
+               sd->nr_balance_failed  = 0;             
+
+       /* We were unbalanced, so reset the balancing interval */
+       sd->balance_interval = sd->min_interval;
+
+       return nr_moved;
+
+out_balanced:
+       /* tune up the balancing interval */
+       if (sd->balance_interval < sd->max_interval)
+               sd->balance_interval *= 2;
+
+       return 0;
+}
+
+/*
+ * this_rq->lock is already held
+ */
+static inline int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
+                                      struct sched_domain *sd)
 {
+       int ret;
+       read_lock(&class_list_lock);
+       ret = ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE);
+       read_unlock(&class_list_lock);
+       return ret;
 }
-#else /* CONFIG_CKRM_CPU_SCHEDULE */
+
+static inline int load_balance(int this_cpu, runqueue_t *this_rq,
+                       struct sched_domain *sd, enum idle_type idle)
+{
+       int ret;
+
+       spin_lock(&this_rq->lock);
+       read_lock(&class_list_lock);
+       ret = ckrm_load_balance(this_cpu,this_rq,sd,idle);
+       read_unlock(&class_list_lock);
+       spin_unlock(&this_rq->lock);
+       return ret;
+}
+#else /*! CONFIG_CKRM_CPU_SCHEDULE */
 /*
  * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
  * as part of a balancing operation within "domain". Returns the number of
@@ -2170,6 +2343,8 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
 out:
        return nr_moved;
 }
+#endif /* CONFIG_CKRM_CPU_SCHEDULE*/
+
 
 /*
  * idle_balance is called by schedule() if this_cpu is about to become
@@ -2255,7 +2430,6 @@ next_group:
                group = group->next;
        } while (group != sd->groups);
 }
-#endif /* CONFIG_CKRM_CPU_SCHEDULE*/
 
 /*
  * rebalance_tick will get called every timer tick, on every CPU.
@@ -2276,8 +2450,6 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
        unsigned long j = jiffies + CPU_OFFSET(this_cpu);
        struct sched_domain *sd;
 
-       ckrm_rebalance_tick(j,this_cpu);
-
        /* Update our load */
        old_load = this_rq->cpu_load;
        this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
@@ -2316,9 +2488,7 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
  */
 static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
 {
-       ckrm_rebalance_tick(jiffies,cpu);
 }
-
 static inline void idle_balance(int cpu, runqueue_t *rq)
 {
 }
@@ -2340,7 +2510,6 @@ static inline int wake_priority_sleeper(runqueue_t *rq)
 }
 
 DEFINE_PER_CPU(struct kernel_stat, kstat) = { { 0 } };
-
 EXPORT_PER_CPU_SYMBOL(kstat);
 
 /*
@@ -2364,7 +2533,7 @@ EXPORT_PER_CPU_SYMBOL(kstat);
 #define EXPIRED_STARVING(rq) \
                (STARVATION_LIMIT && ((rq)->expired_timestamp && \
                (jiffies - (rq)->expired_timestamp >= \
-                       STARVATION_LIMIT * (local_queue_nr_running(rq)) + 1)))
+                       STARVATION_LIMIT * (lrq_nr_running(rq)) + 1)))
 #endif
 
 /*
@@ -2447,7 +2616,7 @@ void scheduler_tick(int user_ticks, int sys_ticks)
        if (vx_need_resched(p)) {
 #ifdef CONFIG_CKRM_CPU_SCHEDULE
                /* Hubertus ... we can abstract this out */
-               struct ckrm_local_runqueue* rq = get_task_class_queue(p);
+               ckrm_lrq_t* rq = get_task_lrq(p);
 #endif
                dequeue_task(p, rq->active);
                set_tsk_need_resched(p);
@@ -2494,6 +2663,7 @@ void scheduler_tick(int user_ticks, int sys_ticks)
 out_unlock:
        spin_unlock(&rq->lock);
 out:
+       ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq));
        rebalance_tick(cpu, rq, NOT_IDLE);
 }
 
@@ -2634,6 +2804,19 @@ need_resched:
 
        spin_lock_irq(&rq->lock);
 
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+       if (prev != rq->idle) {
+               unsigned long long run = now - prev->timestamp;
+               ckrm_lrq_t * lrq = get_task_lrq(prev);
+
+               /* task_load() depends on demand_stat, so drop the old
+                * contribution, update the demand, then add it back */
+               lrq->lrq_load -= task_load(prev);
+               cpu_demand_event(&prev->demand_stat,CPU_DEMAND_DESCHEDULE,run);
+               lrq->lrq_load += task_load(prev);
+
+               cpu_demand_event(get_task_lrq_stat(prev),CPU_DEMAND_DESCHEDULE,run);
+               update_local_cvt(prev, run);
+       }
+#endif
        /*
         * if entering off of a kernel preemption go straight
         * to picking the next task.
@@ -2682,17 +2865,14 @@ pick_next:
 #endif
        if (unlikely(!rq->nr_running)) {
                idle_balance(cpu, rq);
-               if (!rq->nr_running) {
-                       next = rq->idle;
-                       rq->expired_timestamp = 0;
-                       wake_sleeping_dependent(cpu, rq);
-                       goto switch_tasks;
-               }
        }
 
        next = rq_get_next_task(rq);
-       if (next == rq->idle) 
+       if (next == rq->idle) {
+               rq->expired_timestamp = 0;
+               wake_sleeping_dependent(cpu, rq);
                goto switch_tasks;
+       }
 
        if (dependent_sleeper(cpu, rq, next)) {
                next = rq->idle;
@@ -2734,14 +2914,6 @@ switch_tasks:
                rq->nr_preempt++;
        RCU_qsctr(task_cpu(prev))++;
 
-#ifdef CONFIG_CKRM_CPU_SCHEDULE
-       if (prev != rq->idle) {
-               unsigned long long run = now - prev->timestamp;
-               cpu_demand_event(get_task_local_stat(prev),CPU_DEMAND_DESCHEDULE,run);
-               update_local_cvt(prev, run);
-       }
-#endif
-
        prev->sleep_avg -= run_time;
        if ((long)prev->sleep_avg <= 0) {
                prev->sleep_avg = 0;
@@ -2774,7 +2946,6 @@ switch_tasks:
 }
 
 EXPORT_SYMBOL(schedule);
-
 #ifdef CONFIG_PREEMPT
 /*
  * this is is the entry point to schedule() from in-kernel preemption
@@ -3924,9 +4095,7 @@ static int migration_thread(void * data)
                }
 
                if (rq->active_balance) {
-#ifndef CONFIG_CKRM_CPU_SCHEDULE
                        active_load_balance(rq, cpu);
-#endif
                        rq->active_balance = 0;
                }
 
@@ -4401,9 +4570,6 @@ void __init sched_init(void)
 {
        runqueue_t *rq;
        int i;
-#ifndef CONFIG_CKRM_CPU_SCHEDULE
-       int j, k;
-#endif
 
 #ifdef CONFIG_SMP
        /* Set up an initial dummy domain for early boot */
@@ -4426,22 +4592,35 @@ void __init sched_init(void)
 
        for (i = 0; i < NR_CPUS; i++) {
 #ifndef CONFIG_CKRM_CPU_SCHEDULE
+               int j, k;
                prio_array_t *array;
-#endif
+
                rq = cpu_rq(i);
                spin_lock_init(&rq->lock);
 
-#ifndef CONFIG_CKRM_CPU_SCHEDULE
+               for (j = 0; j < 2; j++) {
+                       array = rq->arrays + j;
+                       for (k = 0; k < MAX_PRIO; k++) {
+                               INIT_LIST_HEAD(array->queue + k);
+                               __clear_bit(k, array->bitmap);
+                       }
+                       // delimiter for bitsearch
+                       __set_bit(MAX_PRIO, array->bitmap);
+               }
                rq->active = rq->arrays;
                rq->expired = rq->arrays + 1;
 #else
-               rq->ckrm_cpu_load = 0;
+               rq = cpu_rq(i);
+               spin_lock_init(&rq->lock);
 #endif
                rq->best_expired_prio = MAX_PRIO;
 
 #ifdef CONFIG_SMP
                rq->sd = &sched_domain_init;
                rq->cpu_load = 0;
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+               ckrm_load_init(rq_ckrm_load(rq));
+#endif
                rq->active_balance = 0;
                rq->push_cpu = 0;
                rq->migration_thread = NULL;
@@ -4450,17 +4629,6 @@ void __init sched_init(void)
                INIT_LIST_HEAD(&rq->hold_queue);
                atomic_set(&rq->nr_iowait, 0);
 
-#ifndef CONFIG_CKRM_CPU_SCHEDULE
-               for (j = 0; j < 2; j++) {
-                       array = rq->arrays + j;
-                       for (k = 0; k < MAX_PRIO; k++) {
-                               INIT_LIST_HEAD(array->queue + k);
-                               __clear_bit(k, array->bitmap);
-                       }
-                       // delimiter for bitsearch
-                       __set_bit(MAX_PRIO, array->bitmap);
-               }
-#endif
        }
 
        /*
@@ -4472,7 +4640,7 @@ void __init sched_init(void)
        rq->idle = current;
        set_task_cpu(current, smp_processor_id());
 #ifdef CONFIG_CKRM_CPU_SCHEDULE
-       current->cpu_class = default_cpu_class;
+       current->cpu_class = get_default_cpu_class();
        current->array = NULL;
 #endif
        wake_up_forked_process(current);
@@ -4566,10 +4734,30 @@ EXPORT_SYMBOL(task_running_sys);
 #ifdef CONFIG_CKRM_CPU_SCHEDULE
 /**
  * return the classqueue object of a certain processor
- * Note: not supposed to be used in performance sensitive functions
  */
 struct classqueue_struct * get_cpu_classqueue(int cpu)
 {
        return (& (cpu_rq(cpu)->classqueue) );
 }
+
+/**
+ * _ckrm_cpu_change_class - change the class of a task
+ */
+void _ckrm_cpu_change_class(task_t *tsk, struct ckrm_cpu_class *newcls)
+{
+       prio_array_t *array;
+       struct runqueue *rq;
+       unsigned long flags;
+
+       rq = task_rq_lock(tsk,&flags); 
+       array = tsk->array;
+       if (array) {
+               dequeue_task(tsk,array);
+               tsk->cpu_class = newcls;
+               enqueue_task(tsk,rq_active(tsk,rq));
+       } else
+               tsk->cpu_class = newcls;
+
+       task_rq_unlock(rq,&flags);
+}
 #endif
index c7cbe7d..3f58b73 100644 (file)
@@ -437,3 +437,7 @@ no_blim:
        return; 
 }
 
+#include <linux/module.h>
+
+EXPORT_SYMBOL_GPL(locate_dl_info);
+EXPORT_SYMBOL_GPL(rcu_free_dl_info);
index 562fc0e..79df3cf 100644 (file)
 enum {
         CTL_DEBUG_SWITCH = 1,
         CTL_DEBUG_LIMIT,
+        CTL_DEBUG_DLIMIT,
 };
 
 
 unsigned int vx_debug_switch = 0;
 unsigned int vx_debug_limit = 0;
+unsigned int vx_debug_dlimit = 0;
 
 
 static struct ctl_table_header *vserver_table_header;
@@ -135,6 +137,14 @@ static ctl_table debug_table[] = {
                 .mode           = 0644,
                 .proc_handler   = &proc_dodebug
         },
+        {
+                .ctl_name       = CTL_DEBUG_DLIMIT,
+                .procname       = "debug_dlimit",
+                .data           = &vx_debug_dlimit,
+                .maxlen         = sizeof(int),
+                .mode           = 0644,
+                .proc_handler   = &proc_dodebug
+        },
         { .ctl_name = 0 }
 };
 
@@ -148,3 +158,6 @@ static ctl_table vserver_table[] = {
         { .ctl_name = 0 }
 };
 
+#include <linux/module.h>
+
+EXPORT_SYMBOL_GPL(vx_debug_dlimit);