X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=kernel%2Fsched.c;h=f9be596767c25d71ea1b12b5ae20a1d545a8872d;hb=f9296eb00ed30209424102d3c920e69617eea853;hp=20b09215e320c31b1157758078099c42f86f42aa;hpb=a91482bdcc2e0f6035702e46f1b99043a0893346;p=linux-2.6.git diff --git a/kernel/sched.c b/kernel/sched.c index 20b09215e..f9be59676 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -17,6 +17,7 @@ * 2003-09-03 Interactivity tuning by Con Kolivas. * 2004-04-02 Scheduler domains code by Nick Piggin */ + #include #include #include @@ -42,7 +43,6 @@ #include #include #include -#include #include @@ -52,10 +52,6 @@ #define cpu_to_node_mask(cpu) (cpu_online_map) #endif -/* used to soft spin in sched while dump is in progress */ -unsigned long dump_oncpu; -EXPORT_SYMBOL(dump_oncpu); - /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -163,20 +159,8 @@ EXPORT_SYMBOL(dump_oncpu); #define LOW_CREDIT(p) \ ((p)->interactive_credit < -CREDIT_LIMIT) -#ifdef CONFIG_CKRM_CPU_SCHEDULE -/* - * if belong to different class, compare class priority - * otherwise compare task priority - */ -#define TASK_PREEMPTS_CURR(p, rq) \ - ( ((p)->cpu_class != (rq)->curr->cpu_class) \ - && ((rq)->curr != (rq)->idle) && ((p) != (rq)->idle )) \ - ? class_preempts_curr((p),(rq)->curr) \ - : ((p)->prio < (rq)->curr->prio) -#else #define TASK_PREEMPTS_CURR(p, rq) \ ((p)->prio < (rq)->curr->prio) -#endif /* * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ] @@ -193,7 +177,7 @@ EXPORT_SYMBOL(dump_oncpu); ((MAX_TIMESLICE - MIN_TIMESLICE) * \ (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1))) -unsigned int task_timeslice(task_t *p) +static unsigned int task_timeslice(task_t *p) { return BASE_TIMESLICE(p); } @@ -204,9 +188,15 @@ unsigned int task_timeslice(task_t *p) * These are the runqueue data structures: */ +#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) + typedef struct runqueue runqueue_t; -#include -#include + +struct prio_array { + unsigned int nr_active; + unsigned long bitmap[BITMAP_SIZE]; + struct list_head queue[MAX_PRIO]; +}; /* * This is the main, per-CPU runqueue data structure. @@ -223,20 +213,15 @@ struct runqueue { * remote CPUs use both these fields when doing load calculation. */ unsigned long nr_running; -#if defined(CONFIG_SMP) +#ifdef CONFIG_SMP unsigned long cpu_load; #endif - unsigned long long nr_switches, nr_preempt; + unsigned long long nr_switches; unsigned long expired_timestamp, nr_uninterruptible; unsigned long long timestamp_last_tick; task_t *curr, *idle; struct mm_struct *prev_mm; -#ifdef CONFIG_CKRM_CPU_SCHEDULE - struct classqueue_struct classqueue; - ckrm_load_t ckrm_load; -#else - prio_array_t *active, *expired, arrays[2]; -#endif + prio_array_t *active, *expired, arrays[2]; int best_expired_prio; atomic_t nr_iowait; @@ -250,11 +235,8 @@ struct runqueue { task_t *migration_thread; struct list_head migration_queue; #endif - -#ifdef CONFIG_VSERVER_HARDCPU struct list_head hold_queue; int idle_tokens; -#endif }; static DEFINE_PER_CPU(struct runqueue, runqueues); @@ -320,97 +302,6 @@ static inline void rq_unlock(runqueue_t *rq) spin_unlock_irq(&rq->lock); } -#ifdef CONFIG_CKRM_CPU_SCHEDULE -static inline ckrm_lrq_t *rq_get_next_class(struct runqueue *rq) -{ - cq_node_t *node = classqueue_get_head(&rq->classqueue); - return ((node) ? 
class_list_entry(node) : NULL); -} - -/* - * return the cvt of the current running class - * if no current running class, return 0 - * assume cpu is valid (cpu_online(cpu) == 1) - */ -CVT_t get_local_cur_cvt(int cpu) -{ - ckrm_lrq_t * lrq = rq_get_next_class(cpu_rq(cpu)); - - if (lrq) - return lrq->local_cvt; - else - return 0; -} - -static inline struct task_struct * rq_get_next_task(struct runqueue* rq) -{ - prio_array_t *array; - struct task_struct *next; - ckrm_lrq_t *queue; - int idx; - int cpu = smp_processor_id(); - - // it is guaranteed be the ( rq->nr_running > 0 ) check in - // schedule that a task will be found. - - retry_next_class: - queue = rq_get_next_class(rq); - // BUG_ON( !queue ); - - array = queue->active; - if (unlikely(!array->nr_active)) { - queue->active = queue->expired; - queue->expired = array; - queue->expired_timestamp = 0; - - if (queue->active->nr_active) - set_top_priority(queue, - find_first_bit(queue->active->bitmap, MAX_PRIO)); - else { - classqueue_dequeue(queue->classqueue, - &queue->classqueue_linkobj); - cpu_demand_event(get_rq_local_stat(queue,cpu),CPU_DEMAND_DEQUEUE,0); - } - goto retry_next_class; - } - // BUG_ON(!array->nr_active); - - idx = queue->top_priority; - // BUG_ON (idx == MAX_PRIO); - next = task_list_entry(array->queue[idx].next); - return next; -} -#else /*! CONFIG_CKRM_CPU_SCHEDULE*/ -static inline struct task_struct * rq_get_next_task(struct runqueue* rq) -{ - prio_array_t *array; - struct list_head *queue; - int idx; - - array = rq->active; - if (unlikely(!array->nr_active)) { - /* - * Switch the active and expired arrays. - */ - rq->active = rq->expired; - rq->expired = array; - array = rq->active; - rq->expired_timestamp = 0; - rq->best_expired_prio = MAX_PRIO; - } - - idx = sched_find_first_bit(array->bitmap); - queue = array->queue + idx; - return list_entry(queue->next, task_t, run_list); -} - -static inline void class_enqueue_task(struct task_struct* p, prio_array_t *array) { } -static inline void class_dequeue_task(struct task_struct* p, prio_array_t *array) { } -static inline void init_cpu_classes(void) { } -#define rq_ckrm_load(rq) NULL -static inline void ckrm_sched_tick(int j,int this_cpu,void* name) {} -#endif /* CONFIG_CKRM_CPU_SCHEDULE */ - /* * Adding/removing a task to/from a priority array: */ @@ -420,7 +311,6 @@ static void dequeue_task(struct task_struct *p, prio_array_t *array) list_del(&p->run_list); if (list_empty(array->queue + p->prio)) __clear_bit(p->prio, array->bitmap); - class_dequeue_task(p,array); } static void enqueue_task(struct task_struct *p, prio_array_t *array) @@ -429,7 +319,6 @@ static void enqueue_task(struct task_struct *p, prio_array_t *array) __set_bit(p->prio, array->bitmap); array->nr_active++; p->array = array; - class_enqueue_task(p,array); } /* @@ -443,7 +332,6 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) __set_bit(p->prio, array->bitmap); array->nr_active++; p->array = array; - class_enqueue_task(p,array); } /* @@ -485,7 +373,7 @@ static int effective_prio(task_t *p) */ static inline void __activate_task(task_t *p, runqueue_t *rq) { - enqueue_task(p, rq_active(p,rq)); + enqueue_task(p, rq->active); rq->nr_running++; } @@ -494,7 +382,7 @@ static inline void __activate_task(task_t *p, runqueue_t *rq) */ static inline void __activate_idle_task(task_t *p, runqueue_t *rq) { - enqueue_task_head(p, rq_active(p,rq)); + enqueue_task_head(p, rq->active); rq->nr_running++; } @@ -816,9 +704,10 @@ static int wake_idle(int cpu, task_t *p) return cpu; 
cpus_and(tmp, sd->span, cpu_online_map); - cpus_and(tmp, tmp, p->cpus_allowed); - for_each_cpu_mask(i, tmp) { + if (!cpu_isset(i, p->cpus_allowed)) + continue; + if (idle_cpu(i)) return i; } @@ -1000,10 +889,6 @@ void fastcall sched_fork(task_t *p) INIT_LIST_HEAD(&p->run_list); p->array = NULL; spin_lock_init(&p->switch_lock); -#ifdef CONFIG_CKRM_CPU_SCHEDULE - cpu_demand_event(&p->demand_stat,CPU_DEMAND_INIT,0); -#endif - #ifdef CONFIG_PREEMPT /* * During context-switch we hold precisely one spinlock, which @@ -1079,7 +964,6 @@ void fastcall wake_up_forked_process(task_t * p) p->array = current->array; p->array->nr_active++; rq->nr_running++; - class_enqueue_task(p,p->array); } task_rq_unlock(rq, &flags); } @@ -1219,7 +1103,7 @@ unsigned long nr_uninterruptible(void) { unsigned long i, sum = 0; - for_each_cpu(i) + for_each_online_cpu(i) sum += cpu_rq(i)->nr_uninterruptible; return sum; @@ -1229,7 +1113,7 @@ unsigned long long nr_context_switches(void) { unsigned long long i, sum = 0; - for_each_cpu(i) + for_each_online_cpu(i) sum += cpu_rq(i)->nr_switches; return sum; @@ -1239,7 +1123,7 @@ unsigned long nr_iowait(void) { unsigned long i, sum = 0; - for_each_cpu(i) + for_each_online_cpu(i) sum += atomic_read(&cpu_rq(i)->nr_iowait); return sum; @@ -1279,16 +1163,6 @@ static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) spin_unlock(&rq2->lock); } -unsigned long long nr_preempt(void) -{ - unsigned long long i, sum = 0; - - for_each_online_cpu(i) - sum += cpu_rq(i)->nr_preempt; - - return sum; -} - enum idle_type { IDLE, @@ -1412,7 +1286,6 @@ lock_again: p->array = current->array; p->array->nr_active++; rq->nr_running++; - class_enqueue_task(p,p->array); } } else { /* Not the local CPU - must adjust timestamp */ @@ -1558,449 +1431,6 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, return 1; } -#ifdef CONFIG_CKRM_CPU_SCHEDULE -static inline int ckrm_preferred_task(task_t *tmp,long min, long max, - int phase, enum idle_type idle) -{ - long pressure = task_load(tmp); - - if (pressure > max) - return 0; - - if ((idle == NOT_IDLE) && ! phase && (pressure <= min)) - return 0; - return 1; -} - -/* - * move tasks for a specic local class - * return number of tasks pulled - */ -static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq, - runqueue_t *this_rq, - runqueue_t *busiest, - struct sched_domain *sd, - int this_cpu, - enum idle_type idle, - long* pressure_imbalance) -{ - prio_array_t *array, *dst_array; - struct list_head *head, *curr; - task_t *tmp; - int idx; - int pulled = 0; - int phase = -1; - long pressure_min, pressure_max; - /*hzheng: magic : 90% balance is enough*/ - long balance_min = *pressure_imbalance / 10; -/* - * we don't want to migrate tasks that will reverse the balance - * or the tasks that make too small difference - */ -#define CKRM_BALANCE_MAX_RATIO 100 -#define CKRM_BALANCE_MIN_RATIO 1 - start: - phase ++; - /* - * We first consider expired tasks. Those will likely not be - * executed in the near future, and they are most likely to - * be cache-cold, thus switching CPUs has the least effect - * on them. 
- */ - if (src_lrq->expired->nr_active) { - array = src_lrq->expired; - dst_array = dst_lrq->expired; - } else { - array = src_lrq->active; - dst_array = dst_lrq->active; - } - - new_array: - /* Start searching at priority 0: */ - idx = 0; - skip_bitmap: - if (!idx) - idx = sched_find_first_bit(array->bitmap); - else - idx = find_next_bit(array->bitmap, MAX_PRIO, idx); - if (idx >= MAX_PRIO) { - if (array == src_lrq->expired && src_lrq->active->nr_active) { - array = src_lrq->active; - dst_array = dst_lrq->active; - goto new_array; - } - if ((! phase) && (! pulled) && (idle != IDLE)) - goto start; //try again - else - goto out; //finished search for this lrq - } - - head = array->queue + idx; - curr = head->prev; - skip_queue: - tmp = list_entry(curr, task_t, run_list); - - curr = curr->prev; - - if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; - } - - pressure_min = *pressure_imbalance * CKRM_BALANCE_MIN_RATIO/100; - pressure_max = *pressure_imbalance * CKRM_BALANCE_MAX_RATIO/100; - /* - * skip the tasks that will reverse the balance too much - */ - if (ckrm_preferred_task(tmp,pressure_min,pressure_max,phase,idle)) { - *pressure_imbalance -= task_load(tmp); - pull_task(busiest, array, tmp, - this_rq, dst_array, this_cpu); - pulled++; - - if (*pressure_imbalance <= balance_min) - goto out; - } - - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; - out: - return pulled; -} - -static inline long ckrm_rq_imbalance(runqueue_t *this_rq,runqueue_t *dst_rq) -{ - long imbalance; - /* - * make sure after balance, imbalance' > - imbalance/2 - * we don't want the imbalance be reversed too much - */ - imbalance = pid_get_pressure(rq_ckrm_load(dst_rq),0) - - pid_get_pressure(rq_ckrm_load(this_rq),1); - imbalance /= 2; - return imbalance; -} - -/* - * try to balance the two runqueues - * - * Called with both runqueues locked. - * if move_tasks is called, it will try to move at least one task over - */ -static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, - unsigned long max_nr_move, struct sched_domain *sd, - enum idle_type idle) -{ - struct ckrm_cpu_class *clsptr,*vip_cls = NULL; - ckrm_lrq_t* src_lrq,*dst_lrq; - long pressure_imbalance, pressure_imbalance_old; - int src_cpu = task_cpu(busiest->curr); - struct list_head *list; - int pulled = 0; - long imbalance; - - imbalance = ckrm_rq_imbalance(this_rq,busiest); - - if ((idle == NOT_IDLE && imbalance <= 0) || busiest->nr_running <= 1) - goto out; - - //try to find the vip class - list_for_each_entry(clsptr,&active_cpu_classes,links) { - src_lrq = get_ckrm_lrq(clsptr,src_cpu); - - if (! lrq_nr_running(src_lrq)) - continue; - - if (! vip_cls || cpu_class_weight(vip_cls) < cpu_class_weight(clsptr) ) - { - vip_cls = clsptr; - } - } - - /* - * do search from the most significant class - * hopefully, less tasks will be migrated this way - */ - clsptr = vip_cls; - - move_class: - if (! clsptr) - goto out; - - - src_lrq = get_ckrm_lrq(clsptr,src_cpu); - if (! lrq_nr_running(src_lrq)) - goto other_class; - - dst_lrq = get_ckrm_lrq(clsptr,this_cpu); - - //how much pressure for this class should be transferred - pressure_imbalance = src_lrq->lrq_load * imbalance/src_lrq->local_weight; - if (pulled && ! 
pressure_imbalance) - goto other_class; - - pressure_imbalance_old = pressure_imbalance; - - //move tasks - pulled += - ckrm_cls_move_tasks(src_lrq,dst_lrq, - this_rq, - busiest, - sd,this_cpu,idle, - &pressure_imbalance); - - /* - * hzheng: 2 is another magic number - * stop balancing if the imbalance is less than 25% of the orig - */ - if (pressure_imbalance <= (pressure_imbalance_old >> 2)) - goto out; - - //update imbalance - imbalance *= pressure_imbalance / pressure_imbalance_old; - other_class: - //who is next? - list = clsptr->links.next; - if (list == &active_cpu_classes) - list = list->next; - clsptr = list_entry(list, typeof(*clsptr), links); - if (clsptr != vip_cls) - goto move_class; - out: - return pulled; -} - -/** - * ckrm_check_balance - is load balancing necessary? - * return 0 if load balancing is not necessary - * otherwise return the average load of the system - * also, update nr_group - * - * heuristics: - * no load balancing if it's load is over average - * no load balancing if it's load is far more than the min - * task: - * read the status of all the runqueues - */ -static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu, - enum idle_type idle, int* nr_group) -{ - struct sched_group *group = sd->groups; - unsigned long min_load, max_load, avg_load; - unsigned long total_load, this_load, total_pwr; - - max_load = this_load = total_load = total_pwr = 0; - min_load = 0xFFFFFFFF; - *nr_group = 0; - - do { - cpumask_t tmp; - unsigned long load; - int local_group; - int i, nr_cpus = 0; - - /* Tally up the load of all CPUs in the group */ - cpus_and(tmp, group->cpumask, cpu_online_map); - if (unlikely(cpus_empty(tmp))) - goto nextgroup; - - avg_load = 0; - local_group = cpu_isset(this_cpu, group->cpumask); - - for_each_cpu_mask(i, tmp) { - load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),local_group); - nr_cpus++; - avg_load += load; - } - - if (!nr_cpus) - goto nextgroup; - - total_load += avg_load; - total_pwr += group->cpu_power; - - /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; - - if (local_group) { - this_load = avg_load; - goto nextgroup; - } else if (avg_load > max_load) { - max_load = avg_load; - } - if (avg_load < min_load) { - min_load = avg_load; - } -nextgroup: - group = group->next; - *nr_group = *nr_group + 1; - } while (group != sd->groups); - - if (!max_load || this_load >= max_load) - goto out_balanced; - - avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; - - /* hzheng: debugging: 105 is a magic number - * 100*max_load <= sd->imbalance_pct*this_load) - * should use imbalance_pct instead - */ - if (this_load > avg_load - || 100*max_load < 105*this_load - || 100*min_load < 70*this_load - ) - goto out_balanced; - - return avg_load; - out_balanced: - return 0; -} - -/** - * any group that has above average load is considered busy - * find the busiest queue from any of busy group - */ -static runqueue_t * -ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu, - unsigned long avg_load, enum idle_type idle, - int nr_group) -{ - struct sched_group *group; - runqueue_t * busiest=NULL; - unsigned long rand; - - group = sd->groups; - rand = get_ckrm_rand(nr_group); - nr_group = 0; - - do { - unsigned long load,total_load,max_load; - cpumask_t tmp; - int i; - runqueue_t * grp_busiest; - - cpus_and(tmp, group->cpumask, cpu_online_map); - if (unlikely(cpus_empty(tmp))) - goto find_nextgroup; - - total_load = 0; - max_load = 0; - grp_busiest = NULL; - for_each_cpu_mask(i, 
tmp) { - load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),0); - total_load += load; - if (load > max_load) { - max_load = load; - grp_busiest = cpu_rq(i); - } - } - - total_load = (total_load * SCHED_LOAD_SCALE) / group->cpu_power; - if (total_load > avg_load) { - busiest = grp_busiest; - if (nr_group >= rand) - break; - } - find_nextgroup: - group = group->next; - nr_group ++; - } while (group != sd->groups); - - return busiest; -} - -/** - * load_balance - pressure based load balancing algorithm used by ckrm - */ -static int ckrm_load_balance(int this_cpu, runqueue_t *this_rq, - struct sched_domain *sd, enum idle_type idle) -{ - runqueue_t *busiest; - unsigned long avg_load; - int nr_moved,nr_group; - - avg_load = ckrm_check_balance(sd, this_cpu, idle, &nr_group); - if (! avg_load) - goto out_balanced; - - busiest = ckrm_find_busy_queue(sd,this_cpu,avg_load,idle,nr_group); - if (! busiest) - goto out_balanced; - /* - * This should be "impossible", but since load - * balancing is inherently racy and statistical, - * it could happen in theory. - */ - if (unlikely(busiest == this_rq)) { - WARN_ON(1); - goto out_balanced; - } - - nr_moved = 0; - if (busiest->nr_running > 1) { - /* - * Attempt to move tasks. If find_busiest_group has found - * an imbalance but busiest->nr_running <= 1, the group is - * still unbalanced. nr_moved simply stays zero, so it is - * correctly treated as an imbalance. - */ - double_lock_balance(this_rq, busiest); - nr_moved = move_tasks(this_rq, this_cpu, busiest, - 0,sd, idle); - spin_unlock(&busiest->lock); - if (nr_moved) { - adjust_local_weight(); - } - } - - if (!nr_moved) - sd->nr_balance_failed ++; - else - sd->nr_balance_failed = 0; - - /* We were unbalanced, so reset the balancing interval */ - sd->balance_interval = sd->min_interval; - - return nr_moved; - -out_balanced: - /* tune up the balancing interval */ - if (sd->balance_interval < sd->max_interval) - sd->balance_interval *= 2; - - return 0; -} - -/* - * this_rq->lock is already held - */ -static inline int load_balance_newidle(int this_cpu, runqueue_t *this_rq, - struct sched_domain *sd) -{ - int ret; - read_lock(&class_list_lock); - ret = ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE); - read_unlock(&class_list_lock); - return ret; -} - -static inline int load_balance(int this_cpu, runqueue_t *this_rq, - struct sched_domain *sd, enum idle_type idle) -{ - int ret; - - spin_lock(&this_rq->lock); - read_lock(&class_list_lock); - ret= ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE); - read_unlock(&class_list_lock); - spin_unlock(&this_rq->lock); - return ret; -} -#else /*! CONFIG_CKRM_CPU_SCHEDULE */ /* * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, * as part of a balancing operation within "domain". Returns the number of @@ -2365,8 +1795,6 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, out: return nr_moved; } -#endif /* CONFIG_CKRM_CPU_SCHEDULE*/ - /* * idle_balance is called by schedule() if this_cpu is about to become @@ -2504,7 +1932,7 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, } } } -#else /* SMP*/ +#else /* * on UP we do not need to balance between CPUs: */ @@ -2532,6 +1960,7 @@ static inline int wake_priority_sleeper(runqueue_t *rq) } DEFINE_PER_CPU(struct kernel_stat, kstat); + EXPORT_PER_CPU_SYMBOL(kstat); /* @@ -2544,19 +1973,11 @@ EXPORT_PER_CPU_SYMBOL(kstat); * increasing number of running tasks. 
We also ignore the interactivity * if a better static_prio task has expired: */ - -#ifndef CONFIG_CKRM_CPU_SCHEDULE #define EXPIRED_STARVING(rq) \ ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ (jiffies - (rq)->expired_timestamp >= \ STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ ((rq)->curr->static_prio > (rq)->best_expired_prio)) -#else -#define EXPIRED_STARVING(rq) \ - (STARVATION_LIMIT && ((rq)->expired_timestamp && \ - (jiffies - (rq)->expired_timestamp >= \ - STARVATION_LIMIT * (lrq_nr_running(rq)) + 1))) -#endif /* * This function gets called by the timer code, with HZ frequency. @@ -2587,10 +2008,8 @@ void scheduler_tick(int user_ticks, int sys_ticks) } if (p == rq->idle) { -#ifdef CONFIG_VSERVER_HARDCPU if (!--rq->idle_tokens && !list_empty(&rq->hold_queue)) set_need_resched(); -#endif if (atomic_read(&rq->nr_iowait) > 0) cpustat->iowait += sys_ticks; @@ -2598,7 +2017,6 @@ void scheduler_tick(int user_ticks, int sys_ticks) cpustat->idle += sys_ticks; if (wake_priority_sleeper(rq)) goto out; - ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq)); rebalance_tick(cpu, rq, IDLE); return; } @@ -2609,7 +2027,7 @@ void scheduler_tick(int user_ticks, int sys_ticks) cpustat->system += sys_ticks; /* Task might have expired already, but not scheduled off yet */ - if (p->array != rq_active(p,rq)) { + if (p->array != rq->active) { set_tsk_need_resched(p); goto out; } @@ -2632,16 +2050,12 @@ void scheduler_tick(int user_ticks, int sys_ticks) set_tsk_need_resched(p); /* put it at the end of the queue: */ - dequeue_task(p, rq_active(p,rq)); - enqueue_task(p, rq_active(p,rq)); + dequeue_task(p, rq->active); + enqueue_task(p, rq->active); } goto out_unlock; } if (vx_need_resched(p)) { -#ifdef CONFIG_CKRM_CPU_SCHEDULE - /* Hubertus ... we can abstract this out */ - ckrm_lrq_t* rq = get_task_lrq(p); -#endif dequeue_task(p, rq->active); set_tsk_need_resched(p); p->prio = effective_prio(p); @@ -2652,8 +2066,8 @@ void scheduler_tick(int user_ticks, int sys_ticks) rq->expired_timestamp = jiffies; if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { enqueue_task(p, rq->expired); - if (p->static_prio < this_rq()->best_expired_prio) - this_rq()->best_expired_prio = p->static_prio; + if (p->static_prio < rq->best_expired_prio) + rq->best_expired_prio = p->static_prio; } else enqueue_task(p, rq->active); } else { @@ -2676,18 +2090,17 @@ void scheduler_tick(int user_ticks, int sys_ticks) if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - p->time_slice) % TIMESLICE_GRANULARITY(p)) && (p->time_slice >= TIMESLICE_GRANULARITY(p)) && - (p->array == rq_active(p,rq))) { + (p->array == rq->active)) { - dequeue_task(p, rq_active(p,rq)); + dequeue_task(p, rq->active); set_tsk_need_resched(p); p->prio = effective_prio(p); - enqueue_task(p, rq_active(p,rq)); + enqueue_task(p, rq->active); } } out_unlock: spin_unlock(&rq->lock); out: - ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq)); rebalance_tick(cpu, rq, NOT_IDLE); } @@ -2785,24 +2198,15 @@ asmlinkage void __sched schedule(void) task_t *prev, *next; runqueue_t *rq; prio_array_t *array; + struct list_head *queue; unsigned long long now; unsigned long run_time; - int cpu; + int cpu, idx; #ifdef CONFIG_VSERVER_HARDCPU struct vx_info *vxi; int maxidle = -HZ; #endif - /* - * If crash dump is in progress, this other cpu's - * need to wait until it completes. - * NB: this code is optimized away for kernels without - * dumping enabled. - */ - if (unlikely(dump_oncpu)) - goto dump_scheduling_disabled; - - //WARN_ON(system_state == SYSTEM_BOOTING); /* * Test if we are atomic. 
Since do_exit() needs to call into * schedule() atomically, we ignore that path for now. @@ -2837,19 +2241,6 @@ need_resched: spin_lock_irq(&rq->lock); -#ifdef CONFIG_CKRM_CPU_SCHEDULE - if (prev != rq->idle) { - unsigned long long run = now - prev->timestamp; - ckrm_lrq_t * lrq = get_task_lrq(prev); - - lrq->lrq_load -= task_load(prev); - cpu_demand_event(&prev->demand_stat,CPU_DEMAND_DESCHEDULE,run); - lrq->lrq_load += task_load(prev); - - cpu_demand_event(get_task_lrq_stat(prev),CPU_DEMAND_DESCHEDULE,run); - update_local_cvt(prev, run); - } -#endif /* * if entering off of a kernel preemption go straight * to picking the next task. @@ -2898,17 +2289,29 @@ pick_next: #endif if (unlikely(!rq->nr_running)) { idle_balance(cpu, rq); - if (!rq->nr_running) { - next = rq->idle; -#ifdef CONFIG_CKRM_CPU_SCHEDULE - rq->expired_timestamp = 0; -#endif - wake_sleeping_dependent(cpu, rq); - goto switch_tasks; - } + if (!rq->nr_running) { + next = rq->idle; + rq->expired_timestamp = 0; + wake_sleeping_dependent(cpu, rq); + goto switch_tasks; + } + } + + array = rq->active; + if (unlikely(!array->nr_active)) { + /* + * Switch the active and expired arrays. + */ + rq->active = rq->expired; + rq->expired = array; + array = rq->active; + rq->expired_timestamp = 0; + rq->best_expired_prio = MAX_PRIO; } - next = rq_get_next_task(rq); + idx = sched_find_first_bit(array->bitmap); + queue = array->queue + idx; + next = list_entry(queue->next, task_t, run_list); if (dependent_sleeper(cpu, rq, next)) { next = rq->idle; @@ -2946,8 +2349,7 @@ pick_next: next->activated = 0; switch_tasks: prefetch(next); - if (test_and_clear_tsk_thread_flag(prev,TIF_NEED_RESCHED)) - rq->nr_preempt++; + clear_tsk_need_resched(prev); RCU_qsctr(task_cpu(prev))++; prev->sleep_avg -= run_time; @@ -2979,19 +2381,10 @@ switch_tasks: preempt_enable_no_resched(); if (test_thread_flag(TIF_NEED_RESCHED)) goto need_resched; - - return; - - dump_scheduling_disabled: - /* allow scheduling only if this is the dumping cpu */ - if (dump_oncpu != smp_processor_id()+1) { - while (dump_oncpu) - cpu_relax(); - } - return; } EXPORT_SYMBOL(schedule); + #ifdef CONFIG_PREEMPT /* * this is is the entry point to schedule() from in-kernel preemption @@ -3616,21 +3009,6 @@ out_unlock: return retval; } -/* - * Represents all cpu's present in the system - * In systems capable of hotplug, this map could dynamically grow - * as new cpu's are detected in the system via any platform specific - * method, such as ACPI for e.g. - */ - -cpumask_t cpu_present_map; -EXPORT_SYMBOL(cpu_present_map); - -#ifndef CONFIG_SMP -cpumask_t cpu_online_map = CPU_MASK_ALL; -cpumask_t cpu_possible_map = CPU_MASK_ALL; -#endif - /** * sys_sched_getaffinity - get the cpu affinity of a process * @pid: pid of the process @@ -3681,7 +3059,7 @@ asmlinkage long sys_sched_yield(void) { runqueue_t *rq = this_rq_lock(); prio_array_t *array = current->array; - prio_array_t *target = rq_expired(current,rq); + prio_array_t *target = rq->expired; /* * We implement yielding by moving the task into the expired @@ -3691,7 +3069,7 @@ asmlinkage long sys_sched_yield(void) * array.) 
*/ if (unlikely(rt_task(current))) - target = rq_active(current,rq); + target = rq->active; dequeue_task(current, array); enqueue_task(current, target); @@ -3710,34 +3088,12 @@ asmlinkage long sys_sched_yield(void) void __sched __cond_resched(void) { -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP - __might_sleep(__FILE__, __LINE__, 0); -#endif - /* - * The system_state check is somewhat ugly but we might be - * called during early boot when we are not yet ready to reschedule. - */ - if (need_resched() && system_state >= SYSTEM_BOOTING_SCHEDULER_OK) { - set_current_state(TASK_RUNNING); - schedule(); - } + set_current_state(TASK_RUNNING); + schedule(); } EXPORT_SYMBOL(__cond_resched); -void __sched __cond_resched_lock(spinlock_t * lock) -{ - if (need_resched()) { - _raw_spin_unlock(lock); - preempt_enable_no_resched(); - set_current_state(TASK_RUNNING); - schedule(); - spin_lock(lock); - } -} - -EXPORT_SYMBOL(__cond_resched_lock); - /** * yield - yield the current processor to other threads. * @@ -3969,8 +3325,6 @@ void show_state(void) read_unlock(&tasklist_lock); } -EXPORT_SYMBOL_GPL(show_state); - void __devinit init_idle(task_t *idle, int cpu) { runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle)); @@ -4040,7 +3394,7 @@ int set_cpus_allowed(task_t *p, cpumask_t new_mask) runqueue_t *rq; rq = task_rq_lock(p, &flags); - if (!cpus_intersects(new_mask, cpu_online_map)) { + if (any_online_cpu(new_mask) == NR_CPUS) { ret = -EINVAL; goto out; } @@ -4055,7 +3409,6 @@ int set_cpus_allowed(task_t *p, cpumask_t new_mask) task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); wait_for_completion(&req.done); - tlb_migrate_finish(p->mm); return 0; } out: @@ -4092,6 +3445,7 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (!cpu_isset(dest_cpu, p->cpus_allowed)) goto out; + set_task_cpu(p, dest_cpu); if (p->array) { /* * Sync timestamp with rq_dest's before activating. @@ -4102,12 +3456,10 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) p->timestamp = p->timestamp - rq_src->timestamp_last_tick + rq_dest->timestamp_last_tick; deactivate_task(p, rq_src); - set_task_cpu(p, dest_cpu); activate_task(p, rq_dest, 0); if (TASK_PREEMPTS_CURR(p, rq_dest)) resched_task(rq_dest->curr); - } else - set_task_cpu(p, dest_cpu); + } out: double_rq_unlock(rq_src, rq_dest); @@ -4217,7 +3569,8 @@ static void migrate_all_tasks(int src_cpu) if (dest_cpu == NR_CPUS) dest_cpu = any_online_cpu(tsk->cpus_allowed); if (dest_cpu == NR_CPUS) { - cpus_setall(tsk->cpus_allowed); + cpus_clear(tsk->cpus_allowed); + cpus_complement(tsk->cpus_allowed); dest_cpu = any_online_cpu(tsk->cpus_allowed); /* Don't tell them about moving exiting tasks @@ -4279,7 +3632,6 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); if (IS_ERR(p)) return NOTIFY_BAD; - p->flags |= PF_NOFREEZE; kthread_bind(p, cpu); /* Must be high prio: stop_machine expects to yield to it. */ rq = task_rq_lock(p, &flags); @@ -4527,14 +3879,14 @@ void sched_domain_debug(void) sd = rq->sd; - printk(KERN_DEBUG "CPU%d: %s\n", + printk(KERN_WARNING "CPU%d: %s\n", i, (cpu_online(i) ? 
" online" : "offline")); do { int j; char str[NR_CPUS]; struct sched_group *group = sd->groups; - cpumask_t groupmask; + cpumask_t groupmask, tmp; cpumask_scnprintf(str, NR_CPUS, sd->span); cpus_clear(groupmask); @@ -4545,13 +3897,13 @@ void sched_domain_debug(void) printk("domain %d: span %s\n", level, str); if (!cpu_isset(i, sd->span)) - printk(KERN_DEBUG "ERROR domain->span does not contain CPU%d\n", i); + printk(KERN_WARNING "ERROR domain->span does not contain CPU%d\n", i); if (!cpu_isset(i, group->cpumask)) - printk(KERN_DEBUG "ERROR domain->groups does not contain CPU%d\n", i); + printk(KERN_WARNING "ERROR domain->groups does not contain CPU%d\n", i); if (!group->cpu_power) - printk(KERN_DEBUG "ERROR domain->cpu_power not set\n"); + printk(KERN_WARNING "ERROR domain->cpu_power not set\n"); - printk(KERN_DEBUG); + printk(KERN_WARNING); for (j = 0; j < level + 2; j++) printk(" "); printk("groups:"); @@ -4564,7 +3916,8 @@ void sched_domain_debug(void) if (!cpus_weight(group->cpumask)) printk(" ERROR empty group:"); - if (cpus_intersects(groupmask, group->cpumask)) + cpus_and(tmp, groupmask, group->cpumask); + if (cpus_weight(tmp) > 0) printk(" ERROR repeated CPUs:"); cpus_or(groupmask, groupmask, group->cpumask); @@ -4583,8 +3936,9 @@ void sched_domain_debug(void) sd = sd->parent; if (sd) { - if (!cpus_subset(groupmask, sd->span)) - printk(KERN_DEBUG "ERROR parent span is not a superset of domain->span\n"); + cpus_and(tmp, groupmask, sd->span); + if (!cpus_equal(tmp, groupmask)) + printk(KERN_WARNING "ERROR parent span is not a superset of domain->span\n"); } } while (sd); @@ -4616,71 +3970,56 @@ int in_sched_functions(unsigned long addr) void __init sched_init(void) { runqueue_t *rq; - int i; + int i, j, k; #ifdef CONFIG_SMP /* Set up an initial dummy domain for early boot */ static struct sched_domain sched_domain_init; static struct sched_group sched_group_init; + cpumask_t cpu_mask_all = CPU_MASK_ALL; memset(&sched_domain_init, 0, sizeof(struct sched_domain)); - sched_domain_init.span = CPU_MASK_ALL; + sched_domain_init.span = cpu_mask_all; sched_domain_init.groups = &sched_group_init; sched_domain_init.last_balance = jiffies; sched_domain_init.balance_interval = INT_MAX; /* Don't balance */ - sched_domain_init.busy_factor = 1; memset(&sched_group_init, 0, sizeof(struct sched_group)); - sched_group_init.cpumask = CPU_MASK_ALL; + sched_group_init.cpumask = cpu_mask_all; sched_group_init.next = &sched_group_init; sched_group_init.cpu_power = SCHED_LOAD_SCALE; #endif - init_cpu_classes(); for (i = 0; i < NR_CPUS; i++) { -#ifndef CONFIG_CKRM_CPU_SCHEDULE - int j, k; prio_array_t *array; rq = cpu_rq(i); spin_lock_init(&rq->lock); - - for (j = 0; j < 2; j++) { - array = rq->arrays + j; - for (k = 0; k < MAX_PRIO; k++) { - INIT_LIST_HEAD(array->queue + k); - __clear_bit(k, array->bitmap); - } - // delimiter for bitsearch - __set_bit(MAX_PRIO, array->bitmap); - } - rq->active = rq->arrays; rq->expired = rq->arrays + 1; -#else - rq = cpu_rq(i); - spin_lock_init(&rq->lock); -#endif - rq->best_expired_prio = MAX_PRIO; #ifdef CONFIG_SMP rq->sd = &sched_domain_init; rq->cpu_load = 0; -#ifdef CONFIG_CKRM_CPU_SCHEDULE - ckrm_load_init(rq_ckrm_load(rq)); -#endif rq->active_balance = 0; rq->push_cpu = 0; rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); #endif -#ifdef CONFIG_VSERVER_HARDCPU INIT_LIST_HEAD(&rq->hold_queue); -#endif atomic_set(&rq->nr_iowait, 0); - } + for (j = 0; j < 2; j++) { + array = rq->arrays + j; + for (k = 0; k < MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue 
+ k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(MAX_PRIO, array->bitmap); + } + } /* * We have to do a little magic to get the first * thread right in SMP mode. @@ -4689,11 +4028,6 @@ void __init sched_init(void) rq->curr = current; rq->idle = current; set_task_cpu(current, smp_processor_id()); -#ifdef CONFIG_CKRM_CPU_SCHEDULE - cpu_demand_event(&(current)->demand_stat,CPU_DEMAND_INIT,0); - current->cpu_class = get_default_cpu_class(); - current->array = NULL; -#endif wake_up_forked_process(current); /* @@ -4704,23 +4038,20 @@ void __init sched_init(void) } #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP -void __might_sleep(char *file, int line, int atomic_depth) +void __might_sleep(char *file, int line) { #if defined(in_atomic) static unsigned long prev_jiffy; /* ratelimiting */ -#ifndef CONFIG_PREEMPT - atomic_depth = 0; -#endif - if (((in_atomic() != atomic_depth) || irqs_disabled()) && + if ((in_atomic() || irqs_disabled()) && system_state == SYSTEM_RUNNING) { if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; prev_jiffy = jiffies; printk(KERN_ERR "Debug: sleeping function called from invalid" " context at %s:%d\n", file, line); - printk("in_atomic():%d[expected: %d], irqs_disabled():%d\n", - in_atomic(), atomic_depth, irqs_disabled()); + printk("in_atomic():%d, irqs_disabled():%d\n", + in_atomic(), irqs_disabled()); dump_stack(); } #endif @@ -4782,33 +4113,3 @@ int task_running_sys(struct task_struct *p) EXPORT_SYMBOL(task_running_sys); #endif -#ifdef CONFIG_CKRM_CPU_SCHEDULE -/** - * return the classqueue object of a certain processor - */ -struct classqueue_struct * get_cpu_classqueue(int cpu) -{ - return (& (cpu_rq(cpu)->classqueue) ); -} - -/** - * _ckrm_cpu_change_class - change the class of a task - */ -void _ckrm_cpu_change_class(task_t *tsk, struct ckrm_cpu_class *newcls) -{ - prio_array_t *array; - struct runqueue *rq; - unsigned long flags; - - rq = task_rq_lock(tsk,&flags); - array = tsk->array; - if (array) { - dequeue_task(tsk,array); - tsk->cpu_class = newcls; - enqueue_task(tsk,rq_active(tsk,rq)); - } else - tsk->cpu_class = newcls; - - task_rq_unlock(rq,&flags); -} -#endif
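
The bulk of this patch removes the CKRM class-aware scheduler and restores the vanilla O(1) design: two priority arrays per runqueue, a bitmap of non-empty priority levels, and an active/expired swap when the active array drains. As a reference for the pick-next path the patch inlines back into schedule(), here is a minimal userspace sketch, not part of the patch itself; the list and bitmap helpers are simplified stand-ins for the kernel primitives (struct list_head, sched_find_first_bit()), and the constants are the usual 2.6 values, assumed rather than read from this diff.

/*
 * Userspace sketch of the O(1) runqueue this patch restores.
 * Simplified: singly-linked LIFO per priority (the kernel uses a
 * doubly-linked FIFO), naive bit scan instead of
 * sched_find_first_bit(), no locking, no interactivity logic.
 */
#include <stdio.h>
#include <string.h>

#define MAX_PRIO	140	/* 0..99 RT, 100..139 nice levels */
#define BITS_PER_LONG	(8 * (int)sizeof(long))
#define BITMAP_SIZE	((MAX_PRIO + 1 + BITS_PER_LONG - 1) / BITS_PER_LONG)

struct task {
	int prio;
	struct task *next;
};

struct prio_array {
	unsigned int nr_active;
	unsigned long bitmap[BITMAP_SIZE];
	struct task *queue[MAX_PRIO];
};

struct runqueue {
	struct prio_array *active, *expired, arrays[2];
};

static void enqueue_task(struct task *p, struct prio_array *array)
{
	p->next = array->queue[p->prio];
	array->queue[p->prio] = p;
	array->bitmap[p->prio / BITS_PER_LONG] |= 1UL << (p->prio % BITS_PER_LONG);
	array->nr_active++;
}

/* naive stand-in for sched_find_first_bit(): lowest set bit index */
static int first_set_prio(const unsigned long *bitmap)
{
	for (int i = 0; i < MAX_PRIO; i++)
		if (bitmap[i / BITS_PER_LONG] & (1UL << (i % BITS_PER_LONG)))
			return i;
	return MAX_PRIO;
}

/* the restored core of schedule(): swap arrays when active drains,
 * then run the head of the highest-priority (lowest-index) list */
static struct task *pick_next(struct runqueue *rq)
{
	struct prio_array *array = rq->active;

	if (!array->nr_active) {
		rq->active = rq->expired;	/* the O(1) array switch */
		rq->expired = array;
		array = rq->active;
		if (!array->nr_active)
			return NULL;		/* nothing runnable: idle */
	}
	return array->queue[first_set_prio(array->bitmap)];
}

int main(void)
{
	struct runqueue rq = { .active = &rq.arrays[0], .expired = &rq.arrays[1] };
	struct task a = { .prio = 120 }, b = { .prio = 110 };

	enqueue_task(&a, rq.active);
	enqueue_task(&b, rq.expired);	/* b already used its timeslice */

	printf("picked prio %d\n", pick_next(&rq)->prio);	/* 120 (a) */

	/* pretend a ran out too: empty the active array */
	memset(rq.active, 0, sizeof(*rq.active));
	printf("after swap: prio %d\n", pick_next(&rq)->prio);	/* 110 (b) */
	return 0;
}

The swap is what makes the design O(1): expiring an entire array costs one pointer exchange instead of any re-sort, and the per-runqueue best_expired_prio / expired_timestamp fields the patch reinstates exist only to decide, via EXPIRED_STARVING(), when to stop feeding the active array and force such a swap.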
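The timeslice side of the same design is the BASE_TIMESLICE() macro this patch keeps: it maps static priority linearly onto [MIN_TIMESLICE, MAX_TIMESLICE], so nice -20 gets the longest slice and nice +19 the shortest. A quick check of that mapping, with the millisecond constants assumed to be the common 2.6 defaults rather than read from this diff (the kernel computes them in jiffies from HZ):

/* Check of the BASE_TIMESLICE() scaling; MIN/MAX_TIMESLICE are
 * assumed 2.6-era defaults expressed in milliseconds. */
#include <stdio.h>

#define MIN_TIMESLICE	5	/* ms, assumed */
#define MAX_TIMESLICE	200	/* ms, assumed */
#define MAX_PRIO	140
#define MAX_USER_PRIO	40	/* nice -20..19 -> static_prio 100..139 */

static unsigned int base_timeslice(int static_prio)
{
	return MIN_TIMESLICE + (MAX_TIMESLICE - MIN_TIMESLICE) *
		(MAX_PRIO - 1 - static_prio) / (MAX_USER_PRIO - 1);
}

int main(void)
{
	const int samples[] = { 100, 120, 139 };	/* nice -20, 0, +19 */

	for (int i = 0; i < 3; i++)
		printf("static_prio %d -> %u ms\n",
		       samples[i], base_timeslice(samples[i]));
	/* expected: 200 ms, 100 ms, 5 ms */
	return 0;
}

With the CKRM class comparison gone, the restored TASK_PREEMPTS_CURR(p, rq) likewise reduces to the plain priority test (p)->prio < (rq)->curr->prio, with no per-class tie-break.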