* 2003-09-03 Interactivity tuning by Con Kolivas.
* 2004-04-02 Scheduler domains code by Nick Piggin
*/
-
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/percpu.h>
#include <linux/kthread.h>
#include <linux/vserver/sched.h>
-#include <linux/vinline.h>
+#include <linux/vs_base.h>
+#include <asm/tlb.h>
#include <asm/unistd.h>
#define cpu_to_node_mask(cpu) (cpu_online_map)
#endif
+/* used to make other CPUs soft-spin in schedule() while a crash dump is in progress */
+unsigned long dump_oncpu;
+EXPORT_SYMBOL(dump_oncpu);
+
/*
* Convert user-nice values [ -20 ... 0 ... 19 ]
* to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
#define LOW_CREDIT(p) \
((p)->interactive_credit < -CREDIT_LIMIT)
-#define TASK_PREEMPTS_CURR(p, rq) \
- ((p)->prio < (rq)->curr->prio)
-
/*
* BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
* to time slice values.
#define task_hot(p, now, sd) ((now) - (p)->timestamp < (sd)->cache_hot_time)
+DEFINE_PER_CPU(struct runqueue, runqueues);
+
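+/* walk a CPU's sched_domain hierarchy, from its base domain up through each parent */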
+#define for_each_domain(cpu, domain) \
+ for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
+
+#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
+#define this_rq() (&__get_cpu_var(runqueues))
+#define task_rq(p) cpu_rq(task_cpu(p))
+#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
+
/*
- * These are the runqueue data structures:
+ * Default context-switch locking:
*/
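+/*
+ * By default prepare_arch_switch() is a no-op, finish_arch_switch() drops the
+ * runqueue lock and re-enables interrupts, and task_running() checks whether
+ * the task is the one currently installed as rq->curr.
+ */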
+#ifndef prepare_arch_switch
+# define prepare_arch_switch(rq, next) do { } while (0)
+# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock)
+# define task_running(rq, p) ((rq)->curr == (p))
+#endif
-#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
-
-typedef struct runqueue runqueue_t;
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+#include <linux/ckrm_sched.h>
+spinlock_t cvt_lock = SPIN_LOCK_UNLOCKED;
+rwlock_t class_list_lock = RW_LOCK_UNLOCKED;
+LIST_HEAD(active_cpu_classes);   // anchor for the list of active cpu classes
+struct ckrm_cpu_class default_cpu_class_obj;
-struct prio_array {
- unsigned int nr_active;
- unsigned long bitmap[BITMAP_SIZE];
- struct list_head queue[MAX_PRIO];
-};
+/*
+ * the minimum CVT allowed is the base_cvt
+ * otherwise, it will starve others
+ */
+CVT_t get_min_cvt(int cpu)
+{
+ cq_node_t *node;
+ struct ckrm_local_runqueue * lrq;
+ CVT_t min_cvt;
+
+ node = classqueue_get_head(bpt_queue(cpu));
+ lrq = (node) ? class_list_entry(node) : NULL;
+
+ if (lrq)
+ min_cvt = lrq->local_cvt;
+ else
+ min_cvt = 0;
+
+ return min_cvt;
+}
/*
- * This is the main, per-CPU runqueue data structure.
- *
- * Locking rule: those places that want to lock multiple runqueues
- * (such as the load balancing or the thread migration code), lock
- * acquire operations must be ordered by ascending &runqueue.
+ * update the classqueue base for all the runqueues
+ * TODO: we can only update half of the min_base to solve the move-backward issue
*/
-struct runqueue {
- spinlock_t lock;
+static inline void check_update_class_base(int this_cpu) {
+ unsigned long min_base = 0xFFFFFFFF;
+ cq_node_t *node;
+ int i;
+
+ if (! cpu_online(this_cpu)) return;
/*
- * nr_running and cpu_load should be in the same cacheline because
- * remote CPUs use both these fields when doing load calculation.
+ * find the min_base across all the processors
*/
- unsigned long nr_running;
-#ifdef CONFIG_SMP
- unsigned long cpu_load;
-#endif
- unsigned long long nr_switches;
- unsigned long expired_timestamp, nr_uninterruptible;
- unsigned long long timestamp_last_tick;
- task_t *curr, *idle;
- struct mm_struct *prev_mm;
- prio_array_t *active, *expired, arrays[2];
- int best_expired_prio;
- atomic_t nr_iowait;
+ for_each_online_cpu(i) {
+ /*
+ * I should change it to directly use bpt->base
+ */
+ node = classqueue_get_head(bpt_queue(i));
+ if (node && node->prio < min_base) {
+ min_base = node->prio;
+ }
+ }
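+	/* min_base still at the 0xFFFFFFFF sentinel means no queued class was found on any CPU */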
+ if (min_base != 0xFFFFFFFF)
+ classqueue_update_base(bpt_queue(this_cpu),min_base);
+}
-#ifdef CONFIG_SMP
- struct sched_domain *sd;
+static inline void ckrm_rebalance_tick(int j,int this_cpu)
+{
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+ read_lock(&class_list_lock);
+ if (!(j % CVT_UPDATE_TICK))
+ update_global_cvts(this_cpu);
- /* For active balancing */
- int active_balance;
- int push_cpu;
+#define CKRM_BASE_UPDATE_RATE 400
+ if (! (jiffies % CKRM_BASE_UPDATE_RATE))
+ check_update_class_base(this_cpu);
- task_t *migration_thread;
- struct list_head migration_queue;
+ read_unlock(&class_list_lock);
#endif
- struct list_head hold_queue;
- int idle_tokens;
-};
+}
-static DEFINE_PER_CPU(struct runqueue, runqueues);
+static inline struct ckrm_local_runqueue *rq_get_next_class(struct runqueue *rq)
+{
+ cq_node_t *node = classqueue_get_head(&rq->classqueue);
+ return ((node) ? class_list_entry(node) : NULL);
+}
-#define for_each_domain(cpu, domain) \
- for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
+static inline struct task_struct * rq_get_next_task(struct runqueue* rq)
+{
+ prio_array_t *array;
+ struct task_struct *next;
+ struct ckrm_local_runqueue *queue;
+ int cpu = smp_processor_id();
+
+ next = rq->idle;
+ retry_next_class:
+ if ((queue = rq_get_next_class(rq))) {
+ array = queue->active;
+		/* if the active array is empty, switch the active and expired arrays */
+ if (unlikely(!queue->active->nr_active)) {
+ queue->active = queue->expired;
+ queue->expired = array;
+ queue->expired_timestamp = 0;
+
+ if (queue->active->nr_active)
+ set_top_priority(queue,
+ find_first_bit(queue->active->bitmap, MAX_PRIO));
+ else {
+ classqueue_dequeue(queue->classqueue,
+ &queue->classqueue_linkobj);
+ cpu_demand_event(get_rq_local_stat(queue,cpu),CPU_DEMAND_DEQUEUE,0);
+ }
-#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
-#define this_rq() (&__get_cpu_var(runqueues))
-#define task_rq(p) cpu_rq(task_cpu(p))
-#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
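+			/* after the array switch (or after dropping an empty class), re-pick from the head of the class queue */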
+ goto retry_next_class;
+ }
+ BUG_ON(!queue->active->nr_active);
+ next = task_list_entry(array->queue[queue->top_priority].next);
+ }
+ return next;
+}
+
+static inline void rq_load_inc(runqueue_t *rq, struct task_struct *p) { rq->ckrm_cpu_load += cpu_class_weight(p->cpu_class); }
+static inline void rq_load_dec(runqueue_t *rq, struct task_struct *p) { rq->ckrm_cpu_load -= cpu_class_weight(p->cpu_class); }
+
+#else /* CONFIG_CKRM_CPU_SCHEDULE */
+
+static inline struct task_struct * rq_get_next_task(struct runqueue* rq)
+{
+ prio_array_t *array;
+ struct list_head *queue;
+ int idx;
+
+ array = rq->active;
+ if (unlikely(!array->nr_active)) {
+ /*
+ * Switch the active and expired arrays.
+ */
+ rq->active = rq->expired;
+ rq->expired = array;
+ array = rq->active;
+ rq->expired_timestamp = 0;
+ rq->best_expired_prio = MAX_PRIO;
+ }
+
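+	/* O(1) pick: index of the highest-priority non-empty queue */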
+ idx = sched_find_first_bit(array->bitmap);
+ queue = array->queue + idx;
+ return list_entry(queue->next, task_t, run_list);
+}
+
+static inline void class_enqueue_task(struct task_struct* p, prio_array_t *array) { }
+static inline void class_dequeue_task(struct task_struct* p, prio_array_t *array) { }
+static inline void init_cpu_classes(void) { }
+static inline void rq_load_inc(runqueue_t *rq, struct task_struct *p) { }
+static inline void rq_load_dec(runqueue_t *rq, struct task_struct *p) { }
+#endif /* CONFIG_CKRM_CPU_SCHEDULE */
-/*
- * Default context-switch locking:
- */
-#ifndef prepare_arch_switch
-# define prepare_arch_switch(rq, next) do { } while (0)
-# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock)
-# define task_running(rq, p) ((rq)->curr == (p))
-#endif
/*
* task_rq_lock - lock the runqueue a given task resides on and disable
* interrupts. Note the ordering: we can safely lookup the task_rq without
* explicitly disabling preemption.
*/
-static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
+runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
{
struct runqueue *rq;
return rq;
}
-static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
+void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
{
spin_unlock_irqrestore(&rq->lock, *flags);
}
/*
* Adding/removing a task to/from a priority array:
*/
-static void dequeue_task(struct task_struct *p, prio_array_t *array)
+void dequeue_task(struct task_struct *p, prio_array_t *array)
{
+ BUG_ON(! array);
array->nr_active--;
list_del(&p->run_list);
if (list_empty(array->queue + p->prio))
__clear_bit(p->prio, array->bitmap);
+ class_dequeue_task(p,array);
}
-static void enqueue_task(struct task_struct *p, prio_array_t *array)
+void enqueue_task(struct task_struct *p, prio_array_t *array)
{
list_add_tail(&p->run_list, array->queue + p->prio);
__set_bit(p->prio, array->bitmap);
array->nr_active++;
p->array = array;
+ class_enqueue_task(p,array);
}
/*
__set_bit(p->prio, array->bitmap);
array->nr_active++;
p->array = array;
+ class_enqueue_task(p,array);
}
/*
*/
static inline void __activate_task(task_t *p, runqueue_t *rq)
{
- enqueue_task(p, rq->active);
+ enqueue_task(p, rq_active(p,rq));
rq->nr_running++;
+ rq_load_inc(rq,p);
}
/*
*/
static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
{
- enqueue_task_head(p, rq->active);
+ enqueue_task_head(p, rq_active(p,rq));
rq->nr_running++;
+ rq_load_inc(rq,p);
}
static void recalc_task_prio(task_t *p, unsigned long long now)
static void deactivate_task(struct task_struct *p, runqueue_t *rq)
{
rq->nr_running--;
+ rq_load_dec(rq,p);
if (p->state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible++;
dequeue_task(p, p->array);
* task_curr - is this task currently executing on a CPU?
* @p: the task in question.
*/
-inline int task_curr(task_t *p)
+inline int task_curr(const task_t *p)
{
return cpu_curr(task_cpu(p)) == p;
}
return cpu;
cpus_and(tmp, sd->span, cpu_online_map);
- for_each_cpu_mask(i, tmp) {
- if (!cpu_isset(i, p->cpus_allowed))
- continue;
+ cpus_and(tmp, tmp, p->cpus_allowed);
+ for_each_cpu_mask(i, tmp) {
if (idle_cpu(i))
return i;
}
load = source_load(cpu);
this_load = target_load(this_cpu);
+ /*
+ * If sync wakeup then subtract the (maximum possible) effect of
+ * the currently running task from the load of the current CPU:
+ */
+ if (sync)
+ this_load -= SCHED_LOAD_SCALE;
+
/* Don't pull the task off an idle CPU to a busy one */
if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
goto out_set_cpu;
p->array = current->array;
p->array->nr_active++;
rq->nr_running++;
+ rq_load_inc(rq,p);
}
task_rq_unlock(rq, &flags);
}
spin_unlock(&rq2->lock);
}
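+/* sum of the per-runqueue preemption counts across all online CPUs */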
+unsigned long long nr_preempt(void)
+{
+ unsigned long long i, sum = 0;
+
+ for_each_online_cpu(i)
+ sum += cpu_rq(i)->nr_preempt;
+
+ return sum;
+}
+
enum idle_type
{
IDLE,
p->array = current->array;
p->array->nr_active++;
rq->nr_running++;
+ rq_load_inc(rq,p);
}
} else {
/* Not the local CPU - must adjust timestamp */
{
dequeue_task(p, src_array);
src_rq->nr_running--;
+ rq_load_dec(src_rq,p);
+
set_task_cpu(p, this_cpu);
this_rq->nr_running++;
+ rq_load_inc(this_rq,p);
enqueue_task(p, this_array);
+
p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
+ this_rq->timestamp_last_tick;
/*
return 1;
}
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+
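+/*
+ * Pick the cpu class with the largest weighted difference in runnable tasks
+ * between busiest_cpu and this_cpu.  *cls_imbalance is lowered to the chosen
+ * degree on each call, so classes already tried are skipped on a retry.
+ */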
+struct ckrm_cpu_class *find_unbalanced_class(int busiest_cpu, int this_cpu, unsigned long *cls_imbalance)
+{
+ struct ckrm_cpu_class *most_unbalanced_class = NULL;
+ struct ckrm_cpu_class *clsptr;
+ int max_unbalance = 0;
+
+ list_for_each_entry(clsptr,&active_cpu_classes,links) {
+ struct ckrm_local_runqueue *this_lrq = get_ckrm_local_runqueue(clsptr,this_cpu);
+ struct ckrm_local_runqueue *busiest_lrq = get_ckrm_local_runqueue(clsptr,busiest_cpu);
+ int unbalance_degree;
+
+ unbalance_degree = (local_queue_nr_running(busiest_lrq) - local_queue_nr_running(this_lrq)) * cpu_class_weight(clsptr);
+ if (unbalance_degree >= *cls_imbalance)
+ continue; // already looked at this class
+
+ if (unbalance_degree > max_unbalance) {
+ max_unbalance = unbalance_degree;
+ most_unbalanced_class = clsptr;
+ }
+ }
+ *cls_imbalance = max_unbalance;
+ return most_unbalanced_class;
+}
+
+
+/*
+ * find_busiest_queue - find the busiest runqueue among the cpus in cpumask.
+ */
+static int find_busiest_cpu(runqueue_t *this_rq, int this_cpu, int idle,
+ int *imbalance)
+{
+ int cpu_load, load, max_load, i, busiest_cpu;
+ runqueue_t *busiest, *rq_src;
+
+
+	/* Hubertus ... the concept of nr_running is replaced with cpu_load */
+ cpu_load = this_rq->ckrm_cpu_load;
+
+ busiest = NULL;
+ busiest_cpu = -1;
+
+ max_load = -1;
+ for_each_online_cpu(i) {
+ rq_src = cpu_rq(i);
+ load = rq_src->ckrm_cpu_load;
+
+ if ((load > max_load) && (rq_src != this_rq)) {
+ busiest = rq_src;
+ busiest_cpu = i;
+ max_load = load;
+ }
+ }
+
+ if (likely(!busiest))
+ goto out;
+
+ *imbalance = max_load - cpu_load;
+
+	/* It needs at least a ~25% imbalance to trigger balancing. */
+ if (!idle && ((*imbalance)*4 < max_load)) {
+ busiest = NULL;
+ goto out;
+ }
+
+ double_lock_balance(this_rq, busiest);
+ /*
+ * Make sure nothing changed since we checked the
+ * runqueue length.
+ */
+ if (busiest->ckrm_cpu_load <= cpu_load) {
+ spin_unlock(&busiest->lock);
+ busiest = NULL;
+ }
+out:
+ return (busiest ? busiest_cpu : -1);
+}
+
+static int load_balance(int this_cpu, runqueue_t *this_rq,
+ struct sched_domain *sd, enum idle_type idle)
+{
+ int imbalance, idx;
+ int busiest_cpu;
+ runqueue_t *busiest;
+ prio_array_t *array;
+ struct list_head *head, *curr;
+ task_t *tmp;
+ struct ckrm_local_runqueue * busiest_local_queue;
+ struct ckrm_cpu_class *clsptr;
+ int weight;
+ unsigned long cls_imbalance; // so we can retry other classes
+
+ // need to update global CVT based on local accumulated CVTs
+ read_lock(&class_list_lock);
+ busiest_cpu = find_busiest_cpu(this_rq, this_cpu, idle, &imbalance);
+ if (busiest_cpu == -1)
+ goto out;
+
+ busiest = cpu_rq(busiest_cpu);
+
+ /*
+ * We only want to steal a number of tasks equal to 1/2 the imbalance,
+ * otherwise we'll just shift the imbalance to the new queue:
+ */
+ imbalance /= 2;
+
+	/* now find the class on that runqueue with the largest imbalance */
+ cls_imbalance = 0xFFFFFFFF;
+
+ retry_other_class:
+ clsptr = find_unbalanced_class(busiest_cpu, this_cpu, &cls_imbalance);
+ if (!clsptr)
+ goto out_unlock;
+
+ busiest_local_queue = get_ckrm_local_runqueue(clsptr,busiest_cpu);
+ weight = cpu_class_weight(clsptr);
+
+ /*
+ * We first consider expired tasks. Those will likely not be
+ * executed in the near future, and they are most likely to
+ * be cache-cold, thus switching CPUs has the least effect
+ * on them.
+ */
+ if (busiest_local_queue->expired->nr_active)
+ array = busiest_local_queue->expired;
+ else
+ array = busiest_local_queue->active;
+
+ new_array:
+ /* Start searching at priority 0: */
+ idx = 0;
+ skip_bitmap:
+ if (!idx)
+ idx = sched_find_first_bit(array->bitmap);
+ else
+ idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
+ if (idx >= MAX_PRIO) {
+ if (array == busiest_local_queue->expired && busiest_local_queue->active->nr_active) {
+ array = busiest_local_queue->active;
+ goto new_array;
+ }
+ goto retry_other_class;
+ }
+
+ head = array->queue + idx;
+ curr = head->prev;
+ skip_queue:
+ tmp = list_entry(curr, task_t, run_list);
+
+ curr = curr->prev;
+
+ if (!can_migrate_task(tmp, busiest, this_cpu, sd,idle)) {
+ if (curr != head)
+ goto skip_queue;
+ idx++;
+ goto skip_bitmap;
+ }
+ pull_task(busiest, array, tmp, this_rq, rq_active(tmp,this_rq),this_cpu);
+ /*
+ * tmp BUG FIX: hzheng
+ * load balancing can make the busiest local queue empty
+ * thus it should be removed from bpt
+ */
+ if (! local_queue_nr_running(busiest_local_queue)) {
+ classqueue_dequeue(busiest_local_queue->classqueue,&busiest_local_queue->classqueue_linkobj);
+ cpu_demand_event(get_rq_local_stat(busiest_local_queue,busiest_cpu),CPU_DEMAND_DEQUEUE,0);
+ }
+
+ imbalance -= weight;
+ if (!idle && (imbalance>0)) {
+ if (curr != head)
+ goto skip_queue;
+ idx++;
+ goto skip_bitmap;
+ }
+ out_unlock:
+ spin_unlock(&busiest->lock);
+ out:
+ read_unlock(&class_list_lock);
+ return 0;
+}
+
+
+static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
+{
+}
+#else /* CONFIG_CKRM_CPU_SCHEDULE */
/*
* move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
* as part of a balancing operation within "domain". Returns the number of
100*max_load <= sd->imbalance_pct*this_load)
goto out_balanced;
+ /*
+	 * If a crash dump is in progress, the other CPUs
+	 * need to wait until it completes.
+ * NB: this code is optimized away for kernels without
+ * dumping enabled.
+ */
+ if (unlikely(dump_oncpu))
+ goto dump_scheduling_disabled;
+
/*
* We're trying to get all the cpus to the average_load, so we don't
* want to push ourselves above the average load, nor do we wish to
* tasks if there is an imbalance.
*
* Called with this_rq unlocked.
- *
- * This function is marked noinline to work around a compiler
- * bug with gcc 3.3.3-hammer on x86-64.
*/
-static int noinline load_balance(int this_cpu, runqueue_t *this_rq,
+static int load_balance(int this_cpu, runqueue_t *this_rq,
struct sched_domain *sd, enum idle_type idle)
{
struct sched_group *group;
busiest = find_busiest_queue(group);
if (!busiest)
goto out_balanced;
+ /*
+ * This should be "impossible", but since load
+ * balancing is inherently racy and statistical,
+ * it could happen in theory.
+ */
if (unlikely(busiest == this_rq)) {
WARN_ON(1);
goto out_balanced;
}
rq = cpu_rq(push_cpu);
+
+ /*
+ * This condition is "impossible", but since load
+ * balancing is inherently a bit racy and statistical,
+	 * it can trigger. Reported by Bjorn Helgaas on a
+ * 128-cpu setup.
+ */
+ if (unlikely(busiest == rq))
+ goto next_group;
double_lock_balance(busiest, rq);
move_tasks(rq, push_cpu, busiest, 1, sd, IDLE);
spin_unlock(&rq->lock);
group = group->next;
} while (group != sd->groups);
}
+#endif /* CONFIG_CKRM_CPU_SCHEDULE */
/*
* rebalance_tick will get called every timer tick, on every CPU.
unsigned long j = jiffies + CPU_OFFSET(this_cpu);
struct sched_domain *sd;
+ ckrm_rebalance_tick(j,this_cpu);
+
/* Update our load */
old_load = this_rq->cpu_load;
this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
}
}
}
-#else
+#else /* SMP */
/*
* on UP we do not need to balance between CPUs:
*/
static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
{
+ ckrm_rebalance_tick(jiffies,cpu);
}
+
static inline void idle_balance(int cpu, runqueue_t *rq)
{
}
return 0;
}
-DEFINE_PER_CPU(struct kernel_stat, kstat);
+DEFINE_PER_CPU(struct kernel_stat, kstat) = { { 0 } };
EXPORT_PER_CPU_SYMBOL(kstat);
* increasing number of running tasks. We also ignore the interactivity
* if a better static_prio task has expired:
*/
+
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
#define EXPIRED_STARVING(rq) \
((STARVATION_LIMIT && ((rq)->expired_timestamp && \
(jiffies - (rq)->expired_timestamp >= \
STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
((rq)->curr->static_prio > (rq)->best_expired_prio))
+#else
+#define EXPIRED_STARVING(rq) \
+ (STARVATION_LIMIT && ((rq)->expired_timestamp && \
+ (jiffies - (rq)->expired_timestamp >= \
+ STARVATION_LIMIT * (local_queue_nr_running(rq)) + 1)))
+#endif
/*
* This function gets called by the timer code, with HZ frequency.
cpustat->system += sys_ticks;
/* Task might have expired already, but not scheduled off yet */
- if (p->array != rq->active) {
+ if (p->array != rq_active(p,rq)) {
set_tsk_need_resched(p);
goto out;
}
set_tsk_need_resched(p);
/* put it at the end of the queue: */
- dequeue_task(p, rq->active);
- enqueue_task(p, rq->active);
+ dequeue_task(p, rq_active(p,rq));
+ enqueue_task(p, rq_active(p,rq));
}
goto out_unlock;
}
+#warning MEF PLANETLAB: "if (vx_need_resched(p)) was if (!--p->time_slice) */"
if (vx_need_resched(p)) {
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+ /* Hubertus ... we can abstract this out */
+ struct ckrm_local_runqueue* rq = get_task_class_queue(p);
+#endif
dequeue_task(p, rq->active);
set_tsk_need_resched(p);
p->prio = effective_prio(p);
rq->expired_timestamp = jiffies;
if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
enqueue_task(p, rq->expired);
- if (p->static_prio < rq->best_expired_prio)
- rq->best_expired_prio = p->static_prio;
+ if (p->static_prio < this_rq()->best_expired_prio)
+ this_rq()->best_expired_prio = p->static_prio;
} else
enqueue_task(p, rq->active);
} else {
if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
(p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
- (p->array == rq->active)) {
+ (p->array == rq_active(p,rq))) {
- dequeue_task(p, rq->active);
+ dequeue_task(p, rq_active(p,rq));
set_tsk_need_resched(p);
p->prio = effective_prio(p);
- enqueue_task(p, rq->active);
+ enqueue_task(p, rq_active(p,rq));
}
}
out_unlock:
task_t *prev, *next;
runqueue_t *rq;
prio_array_t *array;
- struct list_head *queue;
unsigned long long now;
unsigned long run_time;
- int cpu, idx;
+ int cpu;
#ifdef CONFIG_VSERVER_HARDCPU
struct vx_info *vxi;
int maxidle = -HZ;
#endif
+ //WARN_ON(system_state == SYSTEM_BOOTING);
/*
* Test if we are atomic. Since do_exit() needs to call into
* schedule() atomically, we ignore that path for now.
}
}
- array = rq->active;
- if (unlikely(!array->nr_active)) {
- /*
- * Switch the active and expired arrays.
- */
- rq->active = rq->expired;
- rq->expired = array;
- array = rq->active;
- rq->expired_timestamp = 0;
- rq->best_expired_prio = MAX_PRIO;
- }
-
- idx = sched_find_first_bit(array->bitmap);
- queue = array->queue + idx;
- next = list_entry(queue->next, task_t, run_list);
+ next = rq_get_next_task(rq);
+ if (next == rq->idle)
+ goto switch_tasks;
if (dependent_sleeper(cpu, rq, next)) {
next = rq->idle;
next->activated = 0;
switch_tasks:
prefetch(next);
- clear_tsk_need_resched(prev);
+ if (test_and_clear_tsk_thread_flag(prev,TIF_NEED_RESCHED))
+ rq->nr_preempt++;
RCU_qsctr(task_cpu(prev))++;
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+ if (prev != rq->idle) {
+ unsigned long long run = now - prev->timestamp;
+ cpu_demand_event(get_task_local_stat(prev),CPU_DEMAND_DESCHEDULE,run);
+ update_local_cvt(prev, run);
+ }
+#endif
+
prev->sleep_avg -= run_time;
if ((long)prev->sleep_avg <= 0) {
prev->sleep_avg = 0;
if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev)))
prev->interactive_credit--;
}
+ add_delay_ts(prev,runcpu_total,prev->timestamp,now);
prev->timestamp = now;
if (likely(prev != next)) {
+ add_delay_ts(next,waitcpu_total,next->timestamp,now);
+ inc_delay(next,runs);
next->timestamp = now;
rq->nr_switches++;
rq->curr = next;
preempt_enable_no_resched();
if (test_thread_flag(TIF_NEED_RESCHED))
goto need_resched;
+
+ return;
+
+ dump_scheduling_disabled:
+ /* allow scheduling only if this is the dumping cpu */
+ if (dump_oncpu != smp_processor_id()+1) {
+ while (dump_oncpu)
+ cpu_relax();
+ }
+ return;
}
EXPORT_SYMBOL(schedule);
* RT tasks are offset by -200. Normal tasks are centered
* around 0, value goes from -16 to +15.
*/
-int task_prio(task_t *p)
+int task_prio(const task_t *p)
{
return p->prio - MAX_RT_PRIO;
}
* task_nice - return the nice value of a given task.
* @p: the task in question.
*/
-int task_nice(task_t *p)
+int task_nice(const task_t *p)
{
return TASK_NICE(p);
}
return retval;
}
+/*
+ * Represents all CPUs present in the system.
+ * In systems capable of hotplug, this map could dynamically grow
+ * as new CPUs are detected in the system via any platform-specific
+ * method, e.g. ACPI.
+ */
+
+cpumask_t cpu_present_map;
+EXPORT_SYMBOL(cpu_present_map);
+
+#ifndef CONFIG_SMP
+cpumask_t cpu_online_map = CPU_MASK_ALL;
+cpumask_t cpu_possible_map = CPU_MASK_ALL;
+#endif
+
/**
* sys_sched_getaffinity - get the cpu affinity of a process
* @pid: pid of the process
{
runqueue_t *rq = this_rq_lock();
prio_array_t *array = current->array;
- prio_array_t *target = rq->expired;
+ prio_array_t *target = rq_expired(current,rq);
/*
* We implement yielding by moving the task into the expired
* array.)
*/
if (unlikely(rt_task(current)))
- target = rq->active;
+ target = rq_active(current,rq);
dequeue_task(current, array);
enqueue_task(current, target);
void __sched __cond_resched(void)
{
- set_current_state(TASK_RUNNING);
- schedule();
+#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+ __might_sleep(__FILE__, __LINE__, 0);
+#endif
+ /*
+ * The system_state check is somewhat ugly but we might be
+ * called during early boot when we are not yet ready to reschedule.
+ */
+ if (need_resched() && system_state >= SYSTEM_BOOTING_SCHEDULER_OK) {
+ set_current_state(TASK_RUNNING);
+ schedule();
+ }
}
EXPORT_SYMBOL(__cond_resched);
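+
+/*
+ * If a reschedule is pending, drop the given spinlock, schedule, and
+ * re-acquire the lock before returning.
+ */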
+void __sched __cond_resched_lock(spinlock_t * lock)
+{
+ if (need_resched()) {
+ _raw_spin_unlock(lock);
+ preempt_enable_no_resched();
+ set_current_state(TASK_RUNNING);
+ schedule();
+ spin_lock(lock);
+ }
+}
+
+EXPORT_SYMBOL(__cond_resched_lock);
+
/**
* yield - yield the current processor to other threads.
*
void __sched io_schedule(void)
{
struct runqueue *rq = this_rq();
+ def_delay_var(dstart);
+ start_delay_set(dstart,PF_IOWAIT);
atomic_inc(&rq->nr_iowait);
schedule();
atomic_dec(&rq->nr_iowait);
+ add_io_delay(dstart);
}
EXPORT_SYMBOL(io_schedule);
{
struct runqueue *rq = this_rq();
long ret;
+ def_delay_var(dstart);
+ start_delay_set(dstart,PF_IOWAIT);
atomic_inc(&rq->nr_iowait);
ret = schedule_timeout(timeout);
atomic_dec(&rq->nr_iowait);
+ add_io_delay(dstart);
return ret;
}
read_unlock(&tasklist_lock);
}
+EXPORT_SYMBOL_GPL(show_state);
+
void __devinit init_idle(task_t *idle, int cpu)
{
runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle));
runqueue_t *rq;
rq = task_rq_lock(p, &flags);
- if (any_online_cpu(new_mask) == NR_CPUS) {
+ if (!cpus_intersects(new_mask, cpu_online_map)) {
ret = -EINVAL;
goto out;
}
task_rq_unlock(rq, &flags);
wake_up_process(rq->migration_thread);
wait_for_completion(&req.done);
+ tlb_migrate_finish(p->mm);
return 0;
}
out:
}
if (rq->active_balance) {
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
active_load_balance(rq, cpu);
+#endif
rq->active_balance = 0;
}
if (dest_cpu == NR_CPUS)
dest_cpu = any_online_cpu(tsk->cpus_allowed);
if (dest_cpu == NR_CPUS) {
- cpus_clear(tsk->cpus_allowed);
- cpus_complement(tsk->cpus_allowed);
+ cpus_setall(tsk->cpus_allowed);
dest_cpu = any_online_cpu(tsk->cpus_allowed);
/* Don't tell them about moving exiting tasks
p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
if (IS_ERR(p))
return NOTIFY_BAD;
+ p->flags |= PF_NOFREEZE;
kthread_bind(p, cpu);
/* Must be high prio: stop_machine expects to yield to it. */
rq = task_rq_lock(p, &flags);
int j;
char str[NR_CPUS];
struct sched_group *group = sd->groups;
- cpumask_t groupmask, tmp;
+ cpumask_t groupmask;
cpumask_scnprintf(str, NR_CPUS, sd->span);
cpus_clear(groupmask);
if (!cpus_weight(group->cpumask))
printk(" ERROR empty group:");
- cpus_and(tmp, groupmask, group->cpumask);
- if (cpus_weight(tmp) > 0)
+ if (cpus_intersects(groupmask, group->cpumask))
printk(" ERROR repeated CPUs:");
cpus_or(groupmask, groupmask, group->cpumask);
sd = sd->parent;
if (sd) {
- cpus_and(tmp, groupmask, sd->span);
- if (!cpus_equal(tmp, groupmask))
+ if (!cpus_subset(groupmask, sd->span))
printk(KERN_DEBUG "ERROR parent span is not a superset of domain->span\n");
}
void __init sched_init(void)
{
runqueue_t *rq;
- int i, j, k;
+ int i;
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
+ int j, k;
+#endif
#ifdef CONFIG_SMP
/* Set up an initial dummy domain for early boot */
static struct sched_domain sched_domain_init;
static struct sched_group sched_group_init;
- cpumask_t cpu_mask_all = CPU_MASK_ALL;
memset(&sched_domain_init, 0, sizeof(struct sched_domain));
- sched_domain_init.span = cpu_mask_all;
+ sched_domain_init.span = CPU_MASK_ALL;
sched_domain_init.groups = &sched_group_init;
sched_domain_init.last_balance = jiffies;
sched_domain_init.balance_interval = INT_MAX; /* Don't balance */
memset(&sched_group_init, 0, sizeof(struct sched_group));
- sched_group_init.cpumask = cpu_mask_all;
+ sched_group_init.cpumask = CPU_MASK_ALL;
sched_group_init.next = &sched_group_init;
sched_group_init.cpu_power = SCHED_LOAD_SCALE;
#endif
+ init_cpu_classes();
+
for (i = 0; i < NR_CPUS; i++) {
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
prio_array_t *array;
-
+#endif
rq = cpu_rq(i);
spin_lock_init(&rq->lock);
+
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
rq->active = rq->arrays;
rq->expired = rq->arrays + 1;
+#else
+ rq->ckrm_cpu_load = 0;
+#endif
rq->best_expired_prio = MAX_PRIO;
#ifdef CONFIG_SMP
INIT_LIST_HEAD(&rq->hold_queue);
atomic_set(&rq->nr_iowait, 0);
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
for (j = 0; j < 2; j++) {
array = rq->arrays + j;
for (k = 0; k < MAX_PRIO; k++) {
// delimiter for bitsearch
__set_bit(MAX_PRIO, array->bitmap);
}
+#endif
}
+
/*
* We have to do a little magic to get the first
* thread right in SMP mode.
rq->curr = current;
rq->idle = current;
set_task_cpu(current, smp_processor_id());
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+ current->cpu_class = default_cpu_class;
+ current->array = NULL;
+#endif
wake_up_forked_process(current);
/*
}
#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
-void __might_sleep(char *file, int line)
+void __might_sleep(char *file, int line, int atomic_depth)
{
#if defined(in_atomic)
static unsigned long prev_jiffy; /* ratelimiting */
- if ((in_atomic() || irqs_disabled()) &&
+#ifndef CONFIG_PREEMPT
+ atomic_depth = 0;
+#endif
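+	/* compare in_atomic() against the preempt count the caller expects to hold at this point */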
+ if (((in_atomic() != atomic_depth) || irqs_disabled()) &&
system_state == SYSTEM_RUNNING) {
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return;
prev_jiffy = jiffies;
printk(KERN_ERR "Debug: sleeping function called from invalid"
" context at %s:%d\n", file, line);
- printk("in_atomic():%d, irqs_disabled():%d\n",
- in_atomic(), irqs_disabled());
+ printk("in_atomic():%d[expected: %d], irqs_disabled():%d\n",
+ in_atomic(), atomic_depth, irqs_disabled());
dump_stack();
}
#endif
EXPORT_SYMBOL(__preempt_write_lock);
#endif /* defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) */
+
+#ifdef CONFIG_DELAY_ACCT
+int task_running_sys(struct task_struct *p)
+{
+ return task_running(task_rq(p),p);
+}
+EXPORT_SYMBOL(task_running_sys);
+#endif
+
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+/**
+ * return the classqueue object of a given processor
+ * Note: not supposed to be used in performance-sensitive functions
+ */
+struct classqueue_struct * get_cpu_classqueue(int cpu)
+{
+ return (& (cpu_rq(cpu)->classqueue) );
+}
+#endif