Merge LKCD 2.6 tree at :pserver:anonymous@cvs.sourceforge.net:/cvsroot/lkcd/2.6 as...

[linux-2.6.git] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index 53c92f1..e9c48e4 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -17,7 +17,6 @@
   *  2003-09-03 Interactivity tuning by Con Kolivas.
   *  2004-04-02 Scheduler domains code by Nick Piggin
   */
-
  #include <linux/mm.h>
  #include <linux/module.h>
  #include <linux/nmi.h>
@@ -42,7 +41,8 @@
  #include <linux/percpu.h>
  #include <linux/kthread.h>
  #include <linux/vserver/sched.h>
-#include <linux/vinline.h>
+#include <linux/vs_base.h>
+#include <asm/tlb.h>
  
  #include <asm/unistd.h>
  
@@ -52,6 +52,10 @@
  #define cpu_to_node_mask(cpu) (cpu_online_map)
  #endif
  
+/* used to soft spin in sched while dump is in progress */
+unsigned long dump_oncpu;
+EXPORT_SYMBOL(dump_oncpu);
+
  /*
   * Convert user-nice values [ -20 ... 0 ... 19 ]
   * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -159,9 +163,6 @@
  #define LOW_CREDIT(p) \
         ((p)->interactive_credit < -CREDIT_LIMIT)
  
-#define TASK_PREEMPTS_CURR(p, rq) \
-       ((p)->prio < (rq)->curr->prio)
-
  /*
   * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
   * to time slice values.
@@ -184,86 +185,177 @@ static unsigned int task_timeslice(task_t *p)
  
  #define task_hot(p, now, sd) ((now) - (p)->timestamp < (sd)->cache_hot_time)
  
+DEFINE_PER_CPU(struct runqueue, runqueues);
+
+#define for_each_domain(cpu, domain) \
+       for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
+
+#define cpu_rq(cpu)            (&per_cpu(runqueues, (cpu)))
+#define this_rq()              (&__get_cpu_var(runqueues))
+#define task_rq(p)             cpu_rq(task_cpu(p))
+#define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
+
  /*
- * These are the runqueue data structures:
+ * Default context-switch locking:
   */
+#ifndef prepare_arch_switch
+# define prepare_arch_switch(rq, next) do { } while (0)
+# define finish_arch_switch(rq, next)  spin_unlock_irq(&(rq)->lock)
+# define task_running(rq, p)           ((rq)->curr == (p))
+#endif
  
-#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
-
-typedef struct runqueue runqueue_t;
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+#include <linux/ckrm_sched.h>
+spinlock_t cvt_lock        = SPIN_LOCK_UNLOCKED;
+rwlock_t   class_list_lock = RW_LOCK_UNLOCKED;
+LIST_HEAD(active_cpu_classes);   // list of active cpu classes; anchor
+struct ckrm_cpu_class default_cpu_class_obj;
  
-struct prio_array {
-       unsigned int nr_active;
-       unsigned long bitmap[BITMAP_SIZE];
-       struct list_head queue[MAX_PRIO];
-};
+/*
+ * the minimum CVT allowed is the base_cvt
+ * otherwise, it will starve others
+ */
+CVT_t get_min_cvt(int cpu)
+{
+       cq_node_t *node;
+       struct ckrm_local_runqueue * lrq;
+       CVT_t min_cvt;
+
+       node = classqueue_get_head(bpt_queue(cpu));
+       lrq =  (node) ? class_list_entry(node) : NULL;
+       
+       if (lrq) 
+               min_cvt = lrq->local_cvt;
+       else 
+               min_cvt = 0;
+               
+       return min_cvt;
+}
  
  /*
- * This is the main, per-CPU runqueue data structure.
- *
- * Locking rule: those places that want to lock multiple runqueues
- * (such as the load balancing or the thread migration code), lock
- * acquire operations must be ordered by ascending &runqueue.
+ * update this cpu's classqueue base to the lowest head prio found on any online cpu
+ * TODO: we can only update half of the min_base to solve the movebackward issue
   */
-struct runqueue {
-       spinlock_t lock;
+static inline void check_update_class_base(int this_cpu) {
+       unsigned long min_base = 0xFFFFFFFF; 
+       cq_node_t *node;
+       int i;
+
+       if (! cpu_online(this_cpu)) return;
  
         /*
-        * nr_running and cpu_load should be in the same cacheline because
-        * remote CPUs use both these fields when doing load calculation.
+        * find the min_base across all the processors
          */
-       unsigned long nr_running;
-#ifdef CONFIG_SMP
-       unsigned long cpu_load;
-#endif
-       unsigned long long nr_switches;
-       unsigned long expired_timestamp, nr_uninterruptible;
-       unsigned long long timestamp_last_tick;
-       task_t *curr, *idle;
-       struct mm_struct *prev_mm;
-       prio_array_t *active, *expired, arrays[2];
-       int best_expired_prio;
-       atomic_t nr_iowait;
+       for_each_online_cpu(i) {
+               /*
+                * I should change it to directly use bpt->base
+                */
+               node = classqueue_get_head(bpt_queue(i));
+               if (node && node->prio < min_base) {
+                       min_base = node->prio;
+               }
+       }
+       if (min_base != 0xFFFFFFFF) 
+               classqueue_update_base(bpt_queue(this_cpu),min_base);
+}
  
-#ifdef CONFIG_SMP
-       struct sched_domain *sd;
+static inline void ckrm_rebalance_tick(int j,int this_cpu)
+{
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+       read_lock(&class_list_lock);
+       if (!(j % CVT_UPDATE_TICK))
+               update_global_cvts(this_cpu);
  
-       /* For active balancing */
-       int active_balance;
-       int push_cpu;
+#define CKRM_BASE_UPDATE_RATE 400
+       if (! (jiffies % CKRM_BASE_UPDATE_RATE))
+               check_update_class_base(this_cpu);
  
-       task_t *migration_thread;
-       struct list_head migration_queue;
+       read_unlock(&class_list_lock);
  #endif
-       struct list_head hold_queue;
-       int idle_tokens;
-};
+}
  
-static DEFINE_PER_CPU(struct runqueue, runqueues);
+static inline struct ckrm_local_runqueue *rq_get_next_class(struct runqueue *rq)
+{
+       cq_node_t *node = classqueue_get_head(&rq->classqueue);
+       return ((node) ? class_list_entry(node) : NULL);
+}
  
-#define for_each_domain(cpu, domain) \
-       for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
+static inline struct task_struct * rq_get_next_task(struct runqueue* rq) 
+{
+       prio_array_t               *array;
+       struct task_struct         *next;
+       struct ckrm_local_runqueue *queue;
+       int cpu = smp_processor_id();
+       
+       next = rq->idle;
+ retry_next_class:
+       if ((queue = rq_get_next_class(rq))) {
+               array = queue->active;
+               //check switch active/expired queue
+               if (unlikely(!queue->active->nr_active)) {
+                       queue->active = queue->expired;
+                       queue->expired = array;
+                       queue->expired_timestamp = 0;
+
+                       if (queue->active->nr_active)
+                               set_top_priority(queue,
+                                                find_first_bit(queue->active->bitmap, MAX_PRIO));
+                       else {
+                               classqueue_dequeue(queue->classqueue,
+                                                  &queue->classqueue_linkobj);
+                               cpu_demand_event(get_rq_local_stat(queue,cpu),CPU_DEMAND_DEQUEUE,0);
+                       }
  
-#define cpu_rq(cpu)            (&per_cpu(runqueues, (cpu)))
-#define this_rq()              (&__get_cpu_var(runqueues))
-#define task_rq(p)             cpu_rq(task_cpu(p))
-#define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
+                       goto retry_next_class;                          
+               }
+               BUG_ON(!queue->active->nr_active);
+               next = task_list_entry(array->queue[queue->top_priority].next);
+       }
+       return next;
+}
+
+static inline void rq_load_inc(runqueue_t *rq, struct task_struct *p) { rq->ckrm_cpu_load += cpu_class_weight(p->cpu_class); }
+static inline void rq_load_dec(runqueue_t *rq, struct task_struct *p) { rq->ckrm_cpu_load -= cpu_class_weight(p->cpu_class); }
+
+#else /*CONFIG_CKRM_CPU_SCHEDULE*/
+
+static inline struct task_struct * rq_get_next_task(struct runqueue* rq) 
+{
+       prio_array_t *array;
+        struct list_head *queue;
+       int idx;
+
+       array = rq->active;
+       if (unlikely(!array->nr_active)) {
+               /*
+                * Switch the active and expired arrays.
+                */
+               rq->active = rq->expired;
+               rq->expired = array;
+               array = rq->active;
+               rq->expired_timestamp = 0;
+               rq->best_expired_prio = MAX_PRIO;
+       }
+
+       idx = sched_find_first_bit(array->bitmap);
+       queue = array->queue + idx;
+       return list_entry(queue->next, task_t, run_list);
+}
+
+static inline void class_enqueue_task(struct task_struct* p, prio_array_t *array) { }
+static inline void class_dequeue_task(struct task_struct* p, prio_array_t *array) { }
+static inline void init_cpu_classes(void) { }
+static inline void rq_load_inc(runqueue_t *rq, struct task_struct *p) { }
+static inline void rq_load_dec(runqueue_t *rq, struct task_struct *p) { }
+#endif  /* CONFIG_CKRM_CPU_SCHEDULE */
  
-/*
- * Default context-switch locking:
- */
-#ifndef prepare_arch_switch
-# define prepare_arch_switch(rq, next) do { } while (0)
-# define finish_arch_switch(rq, next)  spin_unlock_irq(&(rq)->lock)
-# define task_running(rq, p)           ((rq)->curr == (p))
-#endif
  
  /*
   * task_rq_lock - lock the runqueue a given task resides on and disable
   * interrupts.  Note the ordering: we can safely lookup the task_rq without
   * explicitly disabling preemption.
   */
-static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
+runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
  {
         struct runqueue *rq;
  
@@ -278,7 +370,7 @@ repeat_lock_task:
         return rq;
  }
  
-static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
+void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
  {
         spin_unlock_irqrestore(&rq->lock, *flags);
  }
@@ -305,20 +397,23 @@ static inline void rq_unlock(runqueue_t *rq)
  /*
   * Adding/removing a task to/from a priority array:
   */
-static void dequeue_task(struct task_struct *p, prio_array_t *array)
+void dequeue_task(struct task_struct *p, prio_array_t *array)
  {
+       BUG_ON(! array);
         array->nr_active--;
         list_del(&p->run_list);
         if (list_empty(array->queue + p->prio))
                 __clear_bit(p->prio, array->bitmap);
+       class_dequeue_task(p,array);
  }
  
-static void enqueue_task(struct task_struct *p, prio_array_t *array)
+void enqueue_task(struct task_struct *p, prio_array_t *array)
  {
         list_add_tail(&p->run_list, array->queue + p->prio);
         __set_bit(p->prio, array->bitmap);
         array->nr_active++;
         p->array = array;
+       class_enqueue_task(p,array);
  }
  
  /*
@@ -332,6 +427,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
         __set_bit(p->prio, array->bitmap);
         array->nr_active++;
         p->array = array;
+       class_enqueue_task(p,array);
  }
  
  /*
@@ -373,8 +469,9 @@ static int effective_prio(task_t *p)
   */
  static inline void __activate_task(task_t *p, runqueue_t *rq)
  {
-       enqueue_task(p, rq->active);
+       enqueue_task(p, rq_active(p,rq));
         rq->nr_running++;
+       rq_load_inc(rq,p);
  }
  
  /*
@@ -382,8 +479,9 @@ static inline void __activate_task(task_t *p, runqueue_t *rq)
   */
  static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
  {
-       enqueue_task_head(p, rq->active);
+       enqueue_task_head(p, rq_active(p,rq));
         rq->nr_running++;
+       rq_load_inc(rq,p);
  }
  
  static void recalc_task_prio(task_t *p, unsigned long long now)
@@ -515,6 +613,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
  static void deactivate_task(struct task_struct *p, runqueue_t *rq)
  {
         rq->nr_running--;
+       rq_load_dec(rq,p);
         if (p->state == TASK_UNINTERRUPTIBLE)
                 rq->nr_uninterruptible++;
         dequeue_task(p, p->array);
@@ -554,7 +653,7 @@ static inline void resched_task(task_t *p)
   * task_curr - is this task currently executing on a CPU?
   * @p: the task in question.
   */
-inline int task_curr(task_t *p)
+inline int task_curr(const task_t *p)
  {
         return cpu_curr(task_cpu(p)) == p;
  }
@@ -704,10 +803,9 @@ static int wake_idle(int cpu, task_t *p)
                 return cpu;
  
         cpus_and(tmp, sd->span, cpu_online_map);
-       for_each_cpu_mask(i, tmp) {
-               if (!cpu_isset(i, p->cpus_allowed))
-                       continue;
+       cpus_and(tmp, tmp, p->cpus_allowed);
  
+       for_each_cpu_mask(i, tmp) {
                 if (idle_cpu(i))
                         return i;
         }
@@ -770,6 +868,13 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
         load = source_load(cpu);
         this_load = target_load(this_cpu);
  
+       /*
+        * If sync wakeup then subtract the (maximum possible) effect of
+        * the currently running task from the load of the current CPU:
+        */
+       if (sync)
+               this_load -= SCHED_LOAD_SCALE;
+
         /* Don't pull the task off an idle CPU to a busy one */
         if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
                 goto out_set_cpu;
@@ -957,6 +1062,7 @@ void fastcall wake_up_forked_process(task_t * p)
                 p->array = current->array;
                 p->array->nr_active++;
                 rq->nr_running++;
+               rq_load_inc(rq,p);
         }
         task_rq_unlock(rq, &flags);
  }
@@ -1156,6 +1262,16 @@ static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
                 spin_unlock(&rq2->lock);
  }
  
+unsigned long long nr_preempt(void)
+{
+       unsigned long long i, sum = 0;
+
+       for_each_online_cpu(i)
+               sum += cpu_rq(i)->nr_preempt;
+
+       return sum;
+}
+
  enum idle_type
  {
         IDLE,
@@ -1279,6 +1395,7 @@ lock_again:
                         p->array = current->array;
                         p->array->nr_active++;
                         rq->nr_running++;
+                       rq_load_inc(rq,p);
                 }
         } else {
                 /* Not the local CPU - must adjust timestamp */
@@ -1383,9 +1500,13 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
  {
         dequeue_task(p, src_array);
         src_rq->nr_running--;
+       rq_load_dec(src_rq,p);
+
         set_task_cpu(p, this_cpu);
         this_rq->nr_running++;
+       rq_load_inc(this_rq,p);
         enqueue_task(p, this_array);
+
         p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
                                 + this_rq->timestamp_last_tick;
         /*
@@ -1424,6 +1545,194 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
         return 1;
  }
  
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+
+struct ckrm_cpu_class *find_unbalanced_class(int busiest_cpu, int this_cpu, unsigned long *cls_imbalance)
+{
+       struct ckrm_cpu_class *most_unbalanced_class = NULL;
+       struct ckrm_cpu_class *clsptr;
+       int max_unbalance = 0;
+
+       list_for_each_entry(clsptr,&active_cpu_classes,links) {
+               struct ckrm_local_runqueue *this_lrq    = get_ckrm_local_runqueue(clsptr,this_cpu);
+               struct ckrm_local_runqueue *busiest_lrq = get_ckrm_local_runqueue(clsptr,busiest_cpu);
+               int unbalance_degree;
+               
+               unbalance_degree = (local_queue_nr_running(busiest_lrq) - local_queue_nr_running(this_lrq)) * cpu_class_weight(clsptr);
+               if (unbalance_degree >= *cls_imbalance) 
+                       continue;  // already looked at this class
+
+               if (unbalance_degree > max_unbalance) {
+                       max_unbalance = unbalance_degree;
+                       most_unbalanced_class = clsptr;
+               }
+       }
+       *cls_imbalance = max_unbalance;
+       return most_unbalanced_class;
+}
+
+
+/*
+ * find_busiest_cpu - find the busiest runqueue among the other online cpus.
+ */
+static int find_busiest_cpu(runqueue_t *this_rq, int this_cpu, int idle, 
+                           int *imbalance)
+{
+       int cpu_load, load, max_load, i, busiest_cpu;
+       runqueue_t *busiest, *rq_src;
+
+
+       /*Hubertus ... the concept of nr_running is replaced with cpu_load */
+       cpu_load = this_rq->ckrm_cpu_load;
+
+       busiest = NULL;
+       busiest_cpu = -1;
+
+       max_load = -1;
+       for_each_online_cpu(i) {
+               rq_src = cpu_rq(i);
+               load = rq_src->ckrm_cpu_load;
+
+               if ((load > max_load) && (rq_src != this_rq)) {
+                       busiest = rq_src;
+                       busiest_cpu = i;
+                       max_load = load;
+               }
+       }
+
+       if (likely(!busiest))
+               goto out;
+
+       *imbalance = max_load - cpu_load;
+
+       /* It needs an at least ~25% imbalance to trigger balancing. */
+       if (!idle && ((*imbalance)*4 < max_load)) {
+               busiest = NULL;
+               goto out;
+       }
+
+       double_lock_balance(this_rq, busiest);
+       /*
+        * Make sure nothing changed since we checked the
+        * runqueue length.
+        */
+       if (busiest->ckrm_cpu_load <= cpu_load) {
+               spin_unlock(&busiest->lock);
+               busiest = NULL;
+       }
+out:
+       return (busiest ? busiest_cpu : -1);
+}
+
+static int load_balance(int this_cpu, runqueue_t *this_rq,
+                       struct sched_domain *sd, enum idle_type idle)
+{
+       int imbalance, idx;
+       int busiest_cpu;
+       runqueue_t *busiest;
+       prio_array_t *array;
+       struct list_head *head, *curr;
+       task_t *tmp;
+        struct ckrm_local_runqueue * busiest_local_queue;
+       struct ckrm_cpu_class *clsptr;
+       int weight;
+       unsigned long cls_imbalance;      // so we can retry other classes
+
+       // need to update global CVT based on local accumulated CVTs
+       read_lock(&class_list_lock);
+       busiest_cpu = find_busiest_cpu(this_rq, this_cpu, idle, &imbalance);
+       if (busiest_cpu == -1)
+               goto out;
+
+       busiest = cpu_rq(busiest_cpu);
+
+       /*
+        * We only want to steal a number of tasks equal to 1/2 the imbalance,
+        * otherwise we'll just shift the imbalance to the new queue:
+        */
+       imbalance /= 2;
+               
+       /* now find class on that runqueue with largest imbalance */
+       cls_imbalance = 0xFFFFFFFF; 
+
+ retry_other_class:
+       clsptr = find_unbalanced_class(busiest_cpu, this_cpu, &cls_imbalance);
+       if (!clsptr) 
+               goto out_unlock;
+
+       busiest_local_queue = get_ckrm_local_runqueue(clsptr,busiest_cpu);
+       weight = cpu_class_weight(clsptr);
+
+       /*
+        * We first consider expired tasks. Those will likely not be
+        * executed in the near future, and they are most likely to
+        * be cache-cold, thus switching CPUs has the least effect
+        * on them.
+        */
+       if (busiest_local_queue->expired->nr_active)
+               array = busiest_local_queue->expired;
+       else
+               array = busiest_local_queue->active;
+       
+ new_array:
+       /* Start searching at priority 0: */
+       idx = 0;
+ skip_bitmap:
+       if (!idx)
+               idx = sched_find_first_bit(array->bitmap);
+       else
+               idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
+       if (idx >= MAX_PRIO) {
+               if (array == busiest_local_queue->expired && busiest_local_queue->active->nr_active) {
+                       array = busiest_local_queue->active;
+                       goto new_array;
+               }
+               goto retry_other_class;
+       }
+       
+       head = array->queue + idx;
+       curr = head->prev;
+ skip_queue:
+       tmp = list_entry(curr, task_t, run_list);
+       
+       curr = curr->prev;
+       
+       if (!can_migrate_task(tmp, busiest, this_cpu, sd,idle)) {
+               if (curr != head)
+                       goto skip_queue;
+               idx++;
+               goto skip_bitmap;
+       }
+       pull_task(busiest, array, tmp, this_rq, rq_active(tmp,this_rq),this_cpu);
+       /*
+        * tmp BUG FIX: hzheng
+        * load balancing can make the busiest local queue empty
+        * thus it should be removed from bpt
+        */
+       if (! local_queue_nr_running(busiest_local_queue)) {
+               classqueue_dequeue(busiest_local_queue->classqueue,&busiest_local_queue->classqueue_linkobj);
+               cpu_demand_event(get_rq_local_stat(busiest_local_queue,busiest_cpu),CPU_DEMAND_DEQUEUE,0);              
+       }
+
+       imbalance -= weight;
+       if (!idle && (imbalance>0)) {
+               if (curr != head)
+                       goto skip_queue;
+               idx++;
+               goto skip_bitmap;
+       }
+ out_unlock:
+       spin_unlock(&busiest->lock);
+ out:
+       read_unlock(&class_list_lock);
+       return 0;
+}
+
+
+static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
+{
+}
+#else /* CONFIG_CKRM_CPU_SCHEDULE */
  /*
   * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
   * as part of a balancing operation within "domain". Returns the number of
@@ -1570,6 +1879,15 @@ nextgroup:
                         100*max_load <= sd->imbalance_pct*this_load)
                 goto out_balanced;
  
+       /*
+        * If a crash dump is in progress, the other cpus
+        * need to wait until it completes.
+        * NB: this code is optimized away for kernels without
+        * dumping enabled.
+        */
+       if (unlikely(dump_oncpu))
+               goto dump_scheduling_disabled;
+
         /*
          * We're trying to get all the cpus to the average_load, so we don't
          * want to push ourselves above the average load, nor do we wish to
@@ -1671,11 +1989,8 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
   * tasks if there is an imbalance.
   *
   * Called with this_rq unlocked.
- *
- * This function is marked noinline to work around a compiler
- * bug with gcc 3.3.3-hammer on x86-64.
   */
-static int noinline load_balance(int this_cpu, runqueue_t *this_rq,
+static int load_balance(int this_cpu, runqueue_t *this_rq,
                         struct sched_domain *sd, enum idle_type idle)
  {
         struct sched_group *group;
@@ -1692,6 +2007,11 @@ static int noinline load_balance(int this_cpu, runqueue_t *this_rq,
         busiest = find_busiest_queue(group);
         if (!busiest)
                 goto out_balanced;
+       /*
+        * This should be "impossible", but since load
+        * balancing is inherently racy and statistical,
+        * it could happen in theory.
+        */
         if (unlikely(busiest == this_rq)) {
                 WARN_ON(1);
                 goto out_balanced;
@@ -1855,6 +2175,15 @@ static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
                 }
  
                 rq = cpu_rq(push_cpu);
+
+               /*
+                * This condition is "impossible", but since load
+                * balancing is inherently a bit racy and statistical,
+                * it can trigger.. Reported by Bjorn Helgaas on a
+                * 128-cpu setup.
+                */
+               if (unlikely(busiest == rq))
+                       goto next_group;
                 double_lock_balance(busiest, rq);
                 move_tasks(rq, push_cpu, busiest, 1, sd, IDLE);
                 spin_unlock(&rq->lock);
@@ -1862,6 +2191,7 @@ next_group:
                 group = group->next;
         } while (group != sd->groups);
  }
+#endif /* CONFIG_CKRM_CPU_SCHEDULE*/
  
  /*
   * rebalance_tick will get called every timer tick, on every CPU.
@@ -1882,6 +2212,8 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
         unsigned long j = jiffies + CPU_OFFSET(this_cpu);
         struct sched_domain *sd;
  
+       ckrm_rebalance_tick(j,this_cpu);
+
         /* Update our load */
         old_load = this_rq->cpu_load;
         this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
@@ -1914,13 +2246,15 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
                 }
         }
  }
-#else
+#else /* SMP*/
  /*
   * on UP we do not need to balance between CPUs:
   */
  static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
  {
+       ckrm_rebalance_tick(jiffies,cpu);
  }
+
  static inline void idle_balance(int cpu, runqueue_t *rq)
  {
  }
@@ -1941,7 +2275,7 @@ static inline int wake_priority_sleeper(runqueue_t *rq)
         return 0;
  }
  
-DEFINE_PER_CPU(struct kernel_stat, kstat);
+DEFINE_PER_CPU(struct kernel_stat, kstat) = { { 0 } };
  
  EXPORT_PER_CPU_SYMBOL(kstat);
  
@@ -1955,11 +2289,19 @@ EXPORT_PER_CPU_SYMBOL(kstat);
   * increasing number of running tasks. We also ignore the interactivity
   * if a better static_prio task has expired:
   */
+
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
  #define EXPIRED_STARVING(rq) \
         ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
                 (jiffies - (rq)->expired_timestamp >= \
                         STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
                         ((rq)->curr->static_prio > (rq)->best_expired_prio))
+#else
+#define EXPIRED_STARVING(rq) \
+               (STARVATION_LIMIT && ((rq)->expired_timestamp && \
+               (jiffies - (rq)->expired_timestamp >= \
+                       STARVATION_LIMIT * (local_queue_nr_running(rq)) + 1)))
+#endif
  
  /*
   * This function gets called by the timer code, with HZ frequency.
@@ -2009,7 +2351,7 @@ void scheduler_tick(int user_ticks, int sys_ticks)
         cpustat->system += sys_ticks;
  
         /* Task might have expired already, but not scheduled off yet */
-       if (p->array != rq->active) {
+       if (p->array != rq_active(p,rq)) {
                 set_tsk_need_resched(p);
                 goto out;
         }
@@ -2032,12 +2374,17 @@ void scheduler_tick(int user_ticks, int sys_ticks)
                         set_tsk_need_resched(p);
  
                         /* put it at the end of the queue: */
-                       dequeue_task(p, rq->active);
-                       enqueue_task(p, rq->active);
+                       dequeue_task(p, rq_active(p,rq));
+                       enqueue_task(p, rq_active(p,rq));
                 }
                 goto out_unlock;
         }
+#warning MEF PLANETLAB: "if (vx_need_resched(p)) was if (!--p->time_slice) */"
         if (vx_need_resched(p)) {
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+               /* Hubertus ... we can abstract this out */
+               struct ckrm_local_runqueue* rq = get_task_class_queue(p);
+#endif
                 dequeue_task(p, rq->active);
                 set_tsk_need_resched(p);
                 p->prio = effective_prio(p);
@@ -2048,8 +2395,8 @@ void scheduler_tick(int user_ticks, int sys_ticks)
                         rq->expired_timestamp = jiffies;
                 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
                         enqueue_task(p, rq->expired);
-                       if (p->static_prio < rq->best_expired_prio)
-                               rq->best_expired_prio = p->static_prio;
+                       if (p->static_prio < this_rq()->best_expired_prio)
+                               this_rq()->best_expired_prio = p->static_prio;
                 } else
                         enqueue_task(p, rq->active);
         } else {
@@ -2072,12 +2419,12 @@ void scheduler_tick(int user_ticks, int sys_ticks)
                 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
                         p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
                         (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
-                       (p->array == rq->active)) {
+                       (p->array == rq_active(p,rq))) {
  
-                       dequeue_task(p, rq->active);
+                       dequeue_task(p, rq_active(p,rq));
                         set_tsk_need_resched(p);
                         p->prio = effective_prio(p);
-                       enqueue_task(p, rq->active);
+                       enqueue_task(p, rq_active(p,rq));
                 }
         }
  out_unlock:
@@ -2180,15 +2527,15 @@ asmlinkage void __sched schedule(void)
         task_t *prev, *next;
         runqueue_t *rq;
         prio_array_t *array;
-       struct list_head *queue;
         unsigned long long now;
         unsigned long run_time;
-       int cpu, idx;
+       int cpu;
  #ifdef CONFIG_VSERVER_HARDCPU          
         struct vx_info *vxi;
         int maxidle = -HZ;
  #endif
  
+       //WARN_ON(system_state == SYSTEM_BOOTING);
         /*
          * Test if we are atomic.  Since do_exit() needs to call into
          * schedule() atomically, we ignore that path for now.
@@ -2279,21 +2626,9 @@ pick_next:
                 }
         }
  
-       array = rq->active;
-       if (unlikely(!array->nr_active)) {
-               /*
-                * Switch the active and expired arrays.
-                */
-               rq->active = rq->expired;
-               rq->expired = array;
-               array = rq->active;
-               rq->expired_timestamp = 0;
-               rq->best_expired_prio = MAX_PRIO;
-       }
-
-       idx = sched_find_first_bit(array->bitmap);
-       queue = array->queue + idx;
-       next = list_entry(queue->next, task_t, run_list);
+       next = rq_get_next_task(rq);
+       if (next == rq->idle) 
+               goto switch_tasks;
  
         if (dependent_sleeper(cpu, rq, next)) {
                 next = rq->idle;
@@ -2331,18 +2666,30 @@ pick_next:
         next->activated = 0;
  switch_tasks:
         prefetch(next);
-       clear_tsk_need_resched(prev);
+       if (test_and_clear_tsk_thread_flag(prev,TIF_NEED_RESCHED))
+               rq->nr_preempt++;
         RCU_qsctr(task_cpu(prev))++;
  
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+       if (prev != rq->idle) {
+               unsigned long long run = now - prev->timestamp;
+               cpu_demand_event(get_task_local_stat(prev),CPU_DEMAND_DESCHEDULE,run);
+               update_local_cvt(prev, run);
+       }
+#endif
+
         prev->sleep_avg -= run_time;
         if ((long)prev->sleep_avg <= 0) {
                 prev->sleep_avg = 0;
                 if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev)))
                         prev->interactive_credit--;
         }
+       add_delay_ts(prev,runcpu_total,prev->timestamp,now);
         prev->timestamp = now;
  
         if (likely(prev != next)) {
+               add_delay_ts(next,waitcpu_total,next->timestamp,now);
+               inc_delay(next,runs);
                 next->timestamp = now;
                 rq->nr_switches++;
                 rq->curr = next;
@@ -2360,6 +2707,16 @@ switch_tasks:
         preempt_enable_no_resched();
         if (test_thread_flag(TIF_NEED_RESCHED))
                 goto need_resched;
+
+       return;
+
+ dump_scheduling_disabled:
+       /* allow scheduling only if this is the dumping cpu */
+       if (dump_oncpu != smp_processor_id()+1) {
+               while (dump_oncpu)
+                       cpu_relax();
+       }
+       return;
  }
  
  EXPORT_SYMBOL(schedule);
@@ -2705,7 +3062,7 @@ asmlinkage long sys_nice(int increment)
   * RT tasks are offset by -200. Normal tasks are centered
   * around 0, value goes from -16 to +15.
   */
-int task_prio(task_t *p)
+int task_prio(const task_t *p)
  {
         return p->prio - MAX_RT_PRIO;
  }
@@ -2714,7 +3071,7 @@ int task_prio(task_t *p)
   * task_nice - return the nice value of a given task.
   * @p: the task in question.
   */
-int task_nice(task_t *p)
+int task_nice(const task_t *p)
  {
         return TASK_NICE(p);
  }
@@ -2988,6 +3345,21 @@ out_unlock:
         return retval;
  }
  
+/*
+ * Represents all CPUs present in the system.
+ * In systems capable of hotplug, this map could dynamically grow
+ * as new CPUs are detected in the system via any platform-specific
+ * method, such as ACPI.
+ */
+
+cpumask_t cpu_present_map;
+EXPORT_SYMBOL(cpu_present_map);
+
+#ifndef CONFIG_SMP
+cpumask_t cpu_online_map = CPU_MASK_ALL;
+cpumask_t cpu_possible_map = CPU_MASK_ALL;
+#endif
+
  /**
   * sys_sched_getaffinity - get the cpu affinity of a process
   * @pid: pid of the process
@@ -3038,7 +3410,7 @@ asmlinkage long sys_sched_yield(void)
  {
         runqueue_t *rq = this_rq_lock();
         prio_array_t *array = current->array;
-       prio_array_t *target = rq->expired;
+       prio_array_t *target = rq_expired(current,rq);
  
         /*
          * We implement yielding by moving the task into the expired
@@ -3048,7 +3420,7 @@ asmlinkage long sys_sched_yield(void)
          *  array.)
          */
         if (unlikely(rt_task(current)))
-               target = rq->active;
+               target = rq_active(current,rq);
  
         dequeue_task(current, array);
         enqueue_task(current, target);
@@ -3067,12 +3439,34 @@ asmlinkage long sys_sched_yield(void)
  
  void __sched __cond_resched(void)
  {
-       set_current_state(TASK_RUNNING);
-       schedule();
+#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+       __might_sleep(__FILE__, __LINE__, 0);
+#endif
+       /*
+        * The system_state check is somewhat ugly but we might be
+        * called during early boot when we are not yet ready to reschedule.
+        */
+       if (need_resched() && system_state >= SYSTEM_BOOTING_SCHEDULER_OK) {
+               set_current_state(TASK_RUNNING);
+               schedule();
+       }
  }
  
  EXPORT_SYMBOL(__cond_resched);
  
+void __sched __cond_resched_lock(spinlock_t * lock)
+{
+        if (need_resched()) {  /* resched pending: drop @lock, schedule(), re-take it */
+                _raw_spin_unlock(lock);
+                preempt_enable_no_resched();
+               set_current_state(TASK_RUNNING);
+               schedule();
+                spin_lock(lock);
+        }
+}
+
+EXPORT_SYMBOL(__cond_resched_lock);
+
  /**
   * yield - yield the current processor to other threads.
   *
@@ -3097,10 +3491,13 @@ EXPORT_SYMBOL(yield);
  void __sched io_schedule(void)
  {
         struct runqueue *rq = this_rq();
+       def_delay_var(dstart);
  
+       start_delay_set(dstart,PF_IOWAIT);
         atomic_inc(&rq->nr_iowait);
         schedule();
         atomic_dec(&rq->nr_iowait);
+       add_io_delay(dstart);
  }
  
  EXPORT_SYMBOL(io_schedule);
@@ -3109,10 +3506,13 @@ long __sched io_schedule_timeout(long timeout)
  {
         struct runqueue *rq = this_rq();
         long ret;
+       def_delay_var(dstart);
  
+       start_delay_set(dstart,PF_IOWAIT);
         atomic_inc(&rq->nr_iowait);
         ret = schedule_timeout(timeout);
         atomic_dec(&rq->nr_iowait);
+       add_io_delay(dstart);
         return ret;
  }
  
@@ -3298,6 +3698,8 @@ void show_state(void)
         read_unlock(&tasklist_lock);
  }
  
+EXPORT_SYMBOL_GPL(show_state);
+
  void __devinit init_idle(task_t *idle, int cpu)
  {
         runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle));
@@ -3367,7 +3769,7 @@ int set_cpus_allowed(task_t *p, cpumask_t new_mask)
         runqueue_t *rq;
  
         rq = task_rq_lock(p, &flags);
-       if (any_online_cpu(new_mask) == NR_CPUS) {
+       if (!cpus_intersects(new_mask, cpu_online_map)) {
                 ret = -EINVAL;
                 goto out;
         }
@@ -3382,6 +3784,7 @@ int set_cpus_allowed(task_t *p, cpumask_t new_mask)
                 task_rq_unlock(rq, &flags);
                 wake_up_process(rq->migration_thread);
                 wait_for_completion(&req.done);
+               tlb_migrate_finish(p->mm);
                 return 0;
         }
  out:
@@ -3467,7 +3870,9 @@ static int migration_thread(void * data)
                 }
  
                 if (rq->active_balance) {
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
                         active_load_balance(rq, cpu);
+#endif
                         rq->active_balance = 0;
                 }
  
@@ -3542,8 +3947,7 @@ static void migrate_all_tasks(int src_cpu)
                 if (dest_cpu == NR_CPUS)
                         dest_cpu = any_online_cpu(tsk->cpus_allowed);
                 if (dest_cpu == NR_CPUS) {
-                       cpus_clear(tsk->cpus_allowed);
-                       cpus_complement(tsk->cpus_allowed);
+                       cpus_setall(tsk->cpus_allowed);
                         dest_cpu = any_online_cpu(tsk->cpus_allowed);
  
                         /* Don't tell them about moving exiting tasks
@@ -3605,6 +4009,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
                 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
                 if (IS_ERR(p))
                         return NOTIFY_BAD;
+               p->flags |= PF_NOFREEZE;
                 kthread_bind(p, cpu);
                 /* Must be high prio: stop_machine expects to yield to it. */
                 rq = task_rq_lock(p, &flags);
@@ -3859,7 +4264,7 @@ void sched_domain_debug(void)
                         int j;
                         char str[NR_CPUS];
                         struct sched_group *group = sd->groups;
-                       cpumask_t groupmask, tmp;
+                       cpumask_t groupmask;
  
                         cpumask_scnprintf(str, NR_CPUS, sd->span);
                         cpus_clear(groupmask);
@@ -3889,8 +4294,7 @@ void sched_domain_debug(void)
                                 if (!cpus_weight(group->cpumask))
                                         printk(" ERROR empty group:");
  
-                               cpus_and(tmp, groupmask, group->cpumask);
-                               if (cpus_weight(tmp) > 0)
+                               if (cpus_intersects(groupmask, group->cpumask))
                                         printk(" ERROR repeated CPUs:");
  
                                 cpus_or(groupmask, groupmask, group->cpumask);
@@ -3909,8 +4313,7 @@ void sched_domain_debug(void)
                         sd = sd->parent;
  
                         if (sd) {
-                               cpus_and(tmp, groupmask, sd->span);
-                               if (!cpus_equal(tmp, groupmask))
+                               if (!cpus_subset(groupmask, sd->span))
                                         printk(KERN_DEBUG "ERROR parent span is not a superset of domain->span\n");
                         }
  
@@ -3943,33 +4346,43 @@ int in_sched_functions(unsigned long addr)
  void __init sched_init(void)
  {
         runqueue_t *rq;
-       int i, j, k;
+       int i;
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
+       int j, k;
+#endif
  
  #ifdef CONFIG_SMP
         /* Set up an initial dummy domain for early boot */
         static struct sched_domain sched_domain_init;
         static struct sched_group sched_group_init;
-       cpumask_t cpu_mask_all = CPU_MASK_ALL;
  
         memset(&sched_domain_init, 0, sizeof(struct sched_domain));
-       sched_domain_init.span = cpu_mask_all;
+       sched_domain_init.span = CPU_MASK_ALL;
         sched_domain_init.groups = &sched_group_init;
         sched_domain_init.last_balance = jiffies;
         sched_domain_init.balance_interval = INT_MAX; /* Don't balance */
  
         memset(&sched_group_init, 0, sizeof(struct sched_group));
-       sched_group_init.cpumask = cpu_mask_all;
+       sched_group_init.cpumask = CPU_MASK_ALL;
         sched_group_init.next = &sched_group_init;
         sched_group_init.cpu_power = SCHED_LOAD_SCALE;
  #endif
  
+       init_cpu_classes();
+
         for (i = 0; i < NR_CPUS; i++) {
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
                 prio_array_t *array;
-
+#endif
                 rq = cpu_rq(i);
                 spin_lock_init(&rq->lock);
+
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
                 rq->active = rq->arrays;
                 rq->expired = rq->arrays + 1;
+#else
+               rq->ckrm_cpu_load = 0;
+#endif
                 rq->best_expired_prio = MAX_PRIO;
  
  #ifdef CONFIG_SMP
@@ -3983,6 +4396,7 @@ void __init sched_init(void)
                 INIT_LIST_HEAD(&rq->hold_queue);
                 atomic_set(&rq->nr_iowait, 0);
  
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
                 for (j = 0; j < 2; j++) {
                         array = rq->arrays + j;
                         for (k = 0; k < MAX_PRIO; k++) {
@@ -3992,7 +4406,9 @@ void __init sched_init(void)
                         // delimiter for bitsearch
                         __set_bit(MAX_PRIO, array->bitmap);
                 }
+#endif
         }
+
         /*
          * We have to do a little magic to get the first
          * thread right in SMP mode.
@@ -4001,6 +4417,10 @@ void __init sched_init(void)
         rq->curr = current;
         rq->idle = current;
         set_task_cpu(current, smp_processor_id());
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+       current->cpu_class = default_cpu_class;
+       current->array = NULL;
+#endif
         wake_up_forked_process(current);
  
         /*
@@ -4011,20 +4431,23 @@ void __init sched_init(void)
  }
  
  #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
-void __might_sleep(char *file, int line)
+void __might_sleep(char *file, int line, int atomic_depth)
  {
  #if defined(in_atomic)
         static unsigned long prev_jiffy;        /* ratelimiting */
  
-       if ((in_atomic() || irqs_disabled()) &&
+#ifndef CONFIG_PREEMPT
+       atomic_depth = 0;
+#endif
+       if (((in_atomic() != atomic_depth) || irqs_disabled()) &&
             system_state == SYSTEM_RUNNING) {
                 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
                         return;
                 prev_jiffy = jiffies;
                 printk(KERN_ERR "Debug: sleeping function called from invalid"
                                 " context at %s:%d\n", file, line);
-               printk("in_atomic():%d, irqs_disabled():%d\n",
-                       in_atomic(), irqs_disabled());
+               printk("in_atomic():%d[expected: %d], irqs_disabled():%d\n",
+                       in_atomic(), atomic_depth, irqs_disabled());
                 dump_stack();
         }
  #endif
@@ -4077,3 +4500,22 @@ void __sched __preempt_write_lock(rwlock_t *lock)
  
  EXPORT_SYMBOL(__preempt_write_lock);
  #endif /* defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) */
+
+#ifdef CONFIG_DELAY_ACCT
+int task_running_sys(struct task_struct *p)
+{
+       return task_running(task_rq(p),p);      /* uses the runqueue of p's cpu */
+}
+EXPORT_SYMBOL(task_running_sys);
+#endif
+
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+/**
+ * get_cpu_classqueue - return the classqueue object of processor @cpu
+ * Note: not supposed to be used in performance-sensitive functions.
+ */
+struct classqueue_struct * get_cpu_classqueue(int cpu)
+{
+       return (& (cpu_rq(cpu)->classqueue) );
+}
+#endif