X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=kernel%2Fsched.c;h=74624e037d06c24049a7565c397fbe17249c58a8;hb=97bf2856c6014879bd04983a3e9dfcdac1e7fe85;hp=e43f1a76dd437aee45bae1e1c1f4d28628a078a2;hpb=16c70f8c1b54b61c3b951b6fb220df250fe09b32;p=linux-2.6.git diff --git a/kernel/sched.c b/kernel/sched.c index e43f1a76d..74624e037 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include #include @@ -49,15 +49,14 @@ #include #include #include -#include +#include #include #include #include #include -#include -#include #include +#include /* * Convert user-nice values [ -20 ... 0 ... 19 ] @@ -163,15 +162,6 @@ #define TASK_PREEMPTS_CURR(p, rq) \ ((p)->prio < (rq)->curr->prio) -/* - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] - * to time slice values: [800ms ... 100ms ... 5ms] - * - * The higher a thread's priority, the bigger timeslices - * it gets during one round of execution. But even the lowest - * priority thread gets MIN_TIMESLICE worth of execution time. - */ - #define SCALE_PRIO(x, prio) \ max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) @@ -183,6 +173,15 @@ static unsigned int static_prio_timeslice(int static_prio) return SCALE_PRIO(DEF_TIMESLICE, static_prio); } +/* + * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] + * to time slice values: [800ms ... 100ms ... 5ms] + * + * The higher a thread's priority, the bigger timeslices + * it gets during one round of execution. But even the lowest + * priority thread gets MIN_TIMESLICE worth of execution time. + */ + static inline unsigned int task_timeslice(struct task_struct *p) { return static_prio_timeslice(p->static_prio); @@ -228,8 +227,10 @@ struct rq { unsigned long nr_uninterruptible; unsigned long expired_timestamp; - unsigned long long timestamp_last_tick; + /* Cached timestamp set by update_cpu_clock() */ + unsigned long long most_recent_timestamp; struct task_struct *curr, *idle; + unsigned long next_balance; struct mm_struct *prev_mm; struct prio_array *active, *expired, arrays[2]; int best_expired_prio; @@ -245,9 +246,15 @@ struct rq { struct task_struct *migration_thread; struct list_head migration_queue; +#endif + unsigned long norm_time; + unsigned long idle_time; +#ifdef CONFIG_VSERVER_IDLETIME + int idle_skip; #endif #ifdef CONFIG_VSERVER_HARDCPU struct list_head hold_queue; + unsigned long nr_onhold; int idle_tokens; #endif @@ -273,7 +280,7 @@ struct rq { struct lock_class_key rq_lock_key; }; -static DEFINE_PER_CPU(struct rq, runqueues); +static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; static inline int cpu_of(struct rq *rq) { @@ -433,7 +440,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) * bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ -#define SCHEDSTAT_VERSION 12 +#define SCHEDSTAT_VERSION 14 static int show_schedstat(struct seq_file *seq, void *v) { @@ -471,7 +478,8 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "domain%d %s", dcnt++, mask_str); for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; itype++) { - seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " + "%lu", sd->lb_cnt[itype], sd->lb_balanced[itype], sd->lb_failed[itype], @@ -481,11 +489,13 @@ static int show_schedstat(struct seq_file *seq, void *v) sd->lb_nobusyq[itype], sd->lb_nobusyg[itype]); } - seq_printf(seq, " %lu %lu %lu %lu 
%lu %lu %lu %lu %lu %lu %lu %lu\n", + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" + " %lu %lu %lu\n", sd->alb_cnt, sd->alb_failed, sd->alb_pushed, sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, - sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); + sd->ttwu_wake_remote, sd->ttwu_move_affine, + sd->ttwu_move_balance); } preempt_enable(); #endif @@ -512,7 +522,7 @@ static int schedstat_open(struct inode *inode, struct file *file) return res; } -struct file_operations proc_schedstat_operations = { +const struct file_operations proc_schedstat_operations = { .open = schedstat_open, .read = seq_read, .llseek = seq_lseek, @@ -554,7 +564,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) #endif /* - * rq_lock - lock a given runqueue and disable interrupts. + * this_rq_lock - lock this runqueue and disable interrupts. */ static inline struct rq *this_rq_lock(void) __acquires(rq->lock) @@ -734,15 +744,13 @@ enqueue_task_head(struct task_struct *p, struct prio_array *array) static inline int __normal_prio(struct task_struct *p) { int bonus, prio; - struct vx_info *vxi; bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; prio = p->static_prio - bonus; - if ((vxi = p->vx_info) && - vx_info_flags(vxi, VXF_SCHED_PRIO, 0)) - prio += vx_effective_vavavoom(vxi, MAX_USER_PRIO); + /* adjust effective priority */ + prio = vx_adjust_prio(p, prio, MAX_USER_PRIO); if (prio < MAX_RT_PRIO) prio = MAX_RT_PRIO; @@ -853,6 +861,9 @@ static int effective_prio(struct task_struct *p) return p->prio; } +#include "sched_mon.h" + + /* * __activate_task - move a task to the runqueue. */ @@ -862,6 +873,7 @@ static void __activate_task(struct task_struct *p, struct rq *rq) if (batch_task(p)) target = rq->expired; + vxm_activate_task(p, rq); enqueue_task(p, target); inc_nr_running(p, rq); } @@ -871,6 +883,7 @@ static void __activate_task(struct task_struct *p, struct rq *rq) */ static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) { + vxm_activate_idle(p, rq); enqueue_task_head(p, rq->active); inc_nr_running(p, rq); } @@ -955,18 +968,31 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) { unsigned long long now; + if (rt_task(p)) + goto out; + now = sched_clock(); #ifdef CONFIG_SMP if (!local) { /* Compensate for drifting sched_clock */ struct rq *this_rq = this_rq(); - now = (now - this_rq->timestamp_last_tick) - + rq->timestamp_last_tick; + now = (now - this_rq->most_recent_timestamp) + + rq->most_recent_timestamp; } #endif - if (!rt_task(p)) - p->prio = recalc_task_prio(p, now); + /* + * Sleep time is in units of nanosecs, so shift by 20 to get a + * milliseconds-range estimation of the amount of time that the task + * spent sleeping: + */ + if (unlikely(prof_on == SLEEP_PROFILING)) { + if (p->state == TASK_UNINTERRUPTIBLE) + profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), + (now - p->timestamp) >> 20); + } + + p->prio = recalc_task_prio(p, now); /* * This checks to make sure it's not an uninterruptible task @@ -991,18 +1017,19 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) } } p->timestamp = now; - +out: vx_activate_task(p); __activate_task(p, rq); } /* - * deactivate_task - remove a task from the runqueue. + * __deactivate_task - remove a task from the runqueue. 
*/ static void __deactivate_task(struct task_struct *p, struct rq *rq) { dec_nr_running(p, rq); dequeue_task(p, p->array); + vxm_deactivate_task(p, rq); p->array = NULL; } @@ -1013,55 +1040,7 @@ void deactivate_task(struct task_struct *p, struct rq *rq) __deactivate_task(p, rq); } - -#ifdef CONFIG_VSERVER_HARDCPU -/* - * vx_hold_task - put a task on the hold queue - */ -static inline -void vx_hold_task(struct vx_info *vxi, - struct task_struct *p, struct rq *rq) -{ - __deactivate_task(p, rq); - p->state |= TASK_ONHOLD; - /* a new one on hold */ - vx_onhold_inc(vxi); - list_add_tail(&p->run_list, &rq->hold_queue); -} - -/* - * vx_unhold_task - put a task back to the runqueue - */ -static inline -void vx_unhold_task(struct vx_info *vxi, - struct task_struct *p, struct rq *rq) -{ - list_del(&p->run_list); - /* one less waiting */ - vx_onhold_dec(vxi); - p->state &= ~TASK_ONHOLD; - enqueue_task(p, rq->expired); - inc_nr_running(p, rq); - - if (p->static_prio < rq->best_expired_prio) - rq->best_expired_prio = p->static_prio; -} -#else -static inline -void vx_hold_task(struct vx_info *vxi, - struct task_struct *p, struct rq *rq) -{ - return; -} - -static inline -void vx_unhold_task(struct vx_info *vxi, - struct task_struct *p, struct rq *rq) -{ - return; -} -#endif /* CONFIG_VSERVER_HARDCPU */ - +#include "sched_hard.h" /* * resched_task - mark a task 'to be rescheduled now'. @@ -1138,6 +1117,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) { struct rq *rq = task_rq(p); + vxm_migrate_task(p, rq, dest_cpu); /* * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. @@ -1307,7 +1287,7 @@ nextgroup: } /* - * find_idlest_queue - find the idlest runqueue among the cpus in group. + * find_idlest_cpu - find the idlest cpu among the cpus in group. 
*/ static int find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) @@ -1361,21 +1341,29 @@ static int sched_balance_self(int cpu, int flag) while (sd) { cpumask_t span; struct sched_group *group; - int new_cpu; - int weight; + int new_cpu, weight; + + if (!(sd->flags & flag)) { + sd = sd->child; + continue; + } span = sd->span; group = find_idlest_group(sd, t, cpu); - if (!group) - goto nextlevel; + if (!group) { + sd = sd->child; + continue; + } new_cpu = find_idlest_cpu(group, t, cpu); - if (new_cpu == -1 || new_cpu == cpu) - goto nextlevel; + if (new_cpu == -1 || new_cpu == cpu) { + /* Now try balancing at a lower domain level of cpu */ + sd = sd->child; + continue; + } - /* Now try balancing at a lower domain level */ + /* Now try balancing at a lower domain level of new_cpu */ cpu = new_cpu; -nextlevel: sd = NULL; weight = cpus_weight(span); for_each_domain(cpu, tmp) { @@ -1461,7 +1449,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) /* we need to unhold suspended tasks */ if (old_state & TASK_ONHOLD) { - vx_unhold_task(p->vx_info, p, rq); + vx_unhold_task(p, rq); old_state = p->state; } if (!(old_state & state)) @@ -1512,7 +1500,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (this_sd->flags & SD_WAKE_AFFINE) { unsigned long tl = this_load; - unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu); + unsigned long tl_per_task; + + tl_per_task = cpu_avg_load_per_task(this_cpu); /* * If sync wakeup then subtract the (maximum possible) @@ -1621,6 +1611,7 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state) return try_to_wake_up(p, state, 0); } +static void task_running_tick(struct rq *rq, struct task_struct *p, int cpu); /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. @@ -1681,7 +1672,7 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags) * runqueue lock is not a problem. */ current->time_slice = 1; - scheduler_tick(); + task_running_tick(cpu_rq(cpu), current, cpu); } local_irq_enable(); put_cpu(); @@ -1753,8 +1744,8 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * Not the local CPU - must adjust timestamp. This should * get optimised away in the !CONFIG_SMP case. */ - p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) - + rq->timestamp_last_tick; + p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) + + rq->most_recent_timestamp; __activate_task(p, rq); if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); @@ -1839,27 +1830,27 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) __releases(rq->lock) { struct mm_struct *mm = rq->prev_mm; - unsigned long prev_task_flags; + long prev_state; rq->prev_mm = NULL; /* * A task struct has one reference for the use as "current". - * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and - * calls schedule one last time. The schedule call will never return, - * and the scheduled task must drop that reference. - * The test for EXIT_ZOMBIE must occur while the runqueue locks are + * If a task dies, then it sets TASK_DEAD in tsk->state and calls + * schedule one last time. The schedule call will never return, and + * the scheduled task must drop that reference. 
+ * The test for TASK_DEAD must occur while the runqueue locks are * still held, otherwise prev could be scheduled on another cpu, die * there before we look at prev->state, and then the reference would * be dropped twice. * Manfred Spraul */ - prev_task_flags = prev->flags; + prev_state = prev->state; finish_arch_switch(prev); finish_lock_switch(rq, prev); if (mm) mmdrop(mm); - if (unlikely(prev_task_flags & PF_DEAD)) { + if (unlikely(prev_state == TASK_DEAD)) { /* * Remove function-return probe instances associated with this * task and put them back on the free list. @@ -1898,14 +1889,14 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm = next->mm; struct mm_struct *oldmm = prev->active_mm; - if (unlikely(!mm)) { + if (!mm) { next->active_mm = oldmm; atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else switch_mm(oldmm, mm, next); - if (unlikely(!prev->mm)) { + if (!prev->mm) { prev->active_mm = NULL; WARN_ON(rq->prev_mm); rq->prev_mm = oldmm; @@ -2017,6 +2008,7 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2) __acquires(rq1->lock) __acquires(rq2->lock) { + BUG_ON(!irqs_disabled()); if (rq1 == rq2) { spin_lock(&rq1->lock); __acquire(rq2->lock); /* Fake it out ;) */ @@ -2056,6 +2048,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest) __acquires(busiest->lock) __acquires(this_rq->lock) { + if (unlikely(!irqs_disabled())) { + /* printk() doesn't work good under rq->lock */ + spin_unlock(&this_rq->lock); + BUG_ON(1); + } if (unlikely(!spin_trylock(&busiest->lock))) { if (busiest < this_rq) { spin_unlock(&this_rq->lock); @@ -2126,8 +2123,8 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array, set_task_cpu(p, this_cpu); inc_nr_running(p, this_rq); enqueue_task(p, this_array); - p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) - + this_rq->timestamp_last_tick; + p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) + + this_rq->most_recent_timestamp; /* * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. @@ -2163,10 +2160,15 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) too many balance attempts have failed. 
*/ - if (sd->nr_balance_failed > sd->cache_nice_tries) + if (sd->nr_balance_failed > sd->cache_nice_tries) { +#ifdef CONFIG_SCHEDSTATS + if (task_hot(p, rq->most_recent_timestamp, sd)) + schedstat_inc(sd, lb_hot_gained[idle]); +#endif return 1; + } - if (task_hot(p, rq->timestamp_last_tick, sd)) + if (task_hot(p, rq->most_recent_timestamp, sd)) return 0; return 1; } @@ -2264,11 +2266,6 @@ skip_queue: goto skip_bitmap; } -#ifdef CONFIG_SCHEDSTATS - if (task_hot(tmp, busiest->timestamp_last_tick, sd)) - schedstat_inc(sd, lb_hot_gained[idle]); -#endif - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); pulled++; rem_load_move -= tmp->load_weight; @@ -2306,7 +2303,7 @@ out: static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum idle_type idle, int *sd_idle, - cpumask_t *cpus) + cpumask_t *cpus, int *balance) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; @@ -2335,10 +2332,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long load, group_capacity; int local_group; int i; + unsigned int balance_cpu = -1, first_idle_cpu = 0; unsigned long sum_nr_running, sum_weighted_load; local_group = cpu_isset(this_cpu, group->cpumask); + if (local_group) + balance_cpu = first_cpu(group->cpumask); + /* Tally up the load of all CPUs in the group */ sum_weighted_load = sum_nr_running = avg_load = 0; @@ -2354,9 +2355,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, *sd_idle = 0; /* Bias balancing toward cpus of our domain */ - if (local_group) + if (local_group) { + if (idle_cpu(i) && !first_idle_cpu) { + first_idle_cpu = 1; + balance_cpu = i; + } + load = target_load(i, load_idx); - else + } else load = source_load(i, load_idx); avg_load += load; @@ -2364,6 +2370,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, sum_weighted_load += rq->raw_weighted_load; } + /* + * First idle cpu or the first cpu(busiest) in this sched group + * is eligible for doing load balancing at this and above + * domains. 
+ */ + if (local_group && balance_cpu != this_cpu && balance) { + *balance = 0; + goto ret; + } + total_load += avg_load; total_pwr += group->cpu_power; @@ -2523,18 +2539,21 @@ small_imbalance: pwr_now /= SCHED_LOAD_SCALE; /* Amount of load we'd subtract */ - tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power; + tmp = busiest_load_per_task * SCHED_LOAD_SCALE / + busiest->cpu_power; if (max_load > tmp) pwr_move += busiest->cpu_power * min(busiest_load_per_task, max_load - tmp); /* Amount of load we'd add */ - if (max_load*busiest->cpu_power < - busiest_load_per_task*SCHED_LOAD_SCALE) - tmp = max_load*busiest->cpu_power/this->cpu_power; + if (max_load * busiest->cpu_power < + busiest_load_per_task * SCHED_LOAD_SCALE) + tmp = max_load * busiest->cpu_power / this->cpu_power; else - tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power; - pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp); + tmp = busiest_load_per_task * SCHED_LOAD_SCALE / + this->cpu_power; + pwr_move += this->cpu_power * + min(this_load_per_task, this_load + tmp); pwr_move /= SCHED_LOAD_SCALE; /* Move if we gain throughput */ @@ -2555,8 +2574,8 @@ out_balanced: *imbalance = min_load_per_task; return group_min; } -ret: #endif +ret: *imbalance = 0; return NULL; } @@ -2605,27 +2624,37 @@ static inline unsigned long minus_1_or_zero(unsigned long n) /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. - * - * Called with this_rq unlocked. */ static int load_balance(int this_cpu, struct rq *this_rq, - struct sched_domain *sd, enum idle_type idle) + struct sched_domain *sd, enum idle_type idle, + int *balance) { int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; cpumask_t cpus = CPU_MASK_ALL; + unsigned long flags; + /* + * When power savings policy is enabled for the parent domain, idle + * sibling can pick up load irrespective of busy siblings. In this case, + * let the state of idle sibling percolate up as IDLE, instead of + * portraying it as NOT_IDLE. + */ if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && - !sched_smt_power_savings) + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) sd_idle = 1; schedstat_inc(sd, lb_cnt[idle]); redo: group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, - &cpus); + &cpus, balance); + + if (*balance == 0) + goto out_balanced; + if (!group) { schedstat_inc(sd, lb_nobusyg[idle]); goto out_balanced; @@ -2649,11 +2678,13 @@ redo: * still unbalanced. nr_moved simply stays zero, so it is * correctly treated as an imbalance. 
*/ + local_irq_save(flags); double_rq_lock(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, minus_1_or_zero(busiest->nr_running), imbalance, sd, idle, &all_pinned); double_rq_unlock(this_rq, busiest); + local_irq_restore(flags); /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(all_pinned)) { @@ -2670,13 +2701,13 @@ redo: if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { - spin_lock(&busiest->lock); + spin_lock_irqsave(&busiest->lock, flags); /* don't kick the migration_thread, if the curr * task on busiest cpu can't be moved to this_cpu */ if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { - spin_unlock(&busiest->lock); + spin_unlock_irqrestore(&busiest->lock, flags); all_pinned = 1; goto out_one_pinned; } @@ -2686,7 +2717,7 @@ redo: busiest->push_cpu = this_cpu; active_balance = 1; } - spin_unlock(&busiest->lock); + spin_unlock_irqrestore(&busiest->lock, flags); if (active_balance) wake_up_process(busiest->migration_thread); @@ -2714,7 +2745,7 @@ redo: } if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !sched_smt_power_savings) + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; return nr_moved; @@ -2730,7 +2761,7 @@ out_one_pinned: sd->balance_interval *= 2; if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !sched_smt_power_savings) + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; return 0; } @@ -2752,13 +2783,20 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) int sd_idle = 0; cpumask_t cpus = CPU_MASK_ALL; - if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) + /* + * When power savings policy is enabled for the parent domain, idle + * sibling can pick up load irrespective of busy siblings. In this case, + * let the state of idle sibling percolate up as IDLE, instead of + * portraying it as NOT_IDLE. + */ + if (sd->flags & SD_SHARE_CPUPOWER && + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) sd_idle = 1; schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); redo: group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, - &sd_idle, &cpus); + &sd_idle, &cpus, NULL); if (!group) { schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); goto out_balanced; @@ -2793,7 +2831,8 @@ redo: if (!nr_moved) { schedstat_inc(sd, lb_failed[NEWLY_IDLE]); - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; } else sd->nr_balance_failed = 0; @@ -2803,7 +2842,7 @@ redo: out_balanced: schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !sched_smt_power_savings) + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; sd->nr_balance_failed = 0; @@ -2817,14 +2856,28 @@ out_balanced: static void idle_balance(int this_cpu, struct rq *this_rq) { struct sched_domain *sd; + int pulled_task = 0; + unsigned long next_balance = jiffies + 60 * HZ; for_each_domain(this_cpu, sd) { if (sd->flags & SD_BALANCE_NEWIDLE) { /* If we've pulled tasks over stop searching: */ - if (load_balance_newidle(this_cpu, this_rq, sd)) + pulled_task = load_balance_newidle(this_cpu, + this_rq, sd); + if (time_after(next_balance, + sd->last_balance + sd->balance_interval)) + next_balance = sd->last_balance + + sd->balance_interval; + if (pulled_task) break; } } + if (!pulled_task) + /* + * We are going idle. next_balance may be set based on + * a busy processor. So reset next_balance. 
+ */ + this_rq->next_balance = next_balance; } /* @@ -2877,26 +2930,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) spin_unlock(&target_rq->lock); } -/* - * rebalance_tick will get called every timer tick, on every CPU. - * - * It checks each scheduling domain to see if it is due to be balanced, - * and initiates a balancing operation if so. - * - * Balancing parameters are set up in arch_init_sched_domains. - */ - -/* Don't have all balancing operations going off at once: */ -static inline unsigned long cpu_offset(int cpu) -{ - return jiffies + cpu * HZ / NR_CPUS; -} - -static void -rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) +static void update_load(struct rq *this_rq) { - unsigned long this_load, interval, j = cpu_offset(this_cpu); - struct sched_domain *sd; + unsigned long this_load; int i, scale; this_load = this_rq->raw_weighted_load; @@ -2916,6 +2952,32 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) new_load += scale-1; this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; } +} + +/* + * run_rebalance_domains is triggered when needed from the scheduler tick. + * + * It checks each scheduling domain to see if it is due to be balanced, + * and initiates a balancing operation if so. + * + * Balancing parameters are set up in arch_init_sched_domains. + */ +static DEFINE_SPINLOCK(balancing); + +static void run_rebalance_domains(struct softirq_action *h) +{ + int this_cpu = smp_processor_id(), balance = 1; + struct rq *this_rq = cpu_rq(this_cpu); + unsigned long interval; + struct sched_domain *sd; + /* + * We are idle if there are no processes running. This + * is valid even if we are the idle process (SMT). + */ + enum idle_type idle = !this_rq->nr_running ? + SCHED_IDLE : NOT_IDLE; + /* Earliest time when we have to call run_rebalance_domains again */ + unsigned long next_balance = jiffies + 60*HZ; for_each_domain(this_cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -2930,8 +2992,13 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) if (unlikely(!interval)) interval = 1; - if (j - sd->last_balance >= interval) { - if (load_balance(this_cpu, this_rq, sd, idle)) { + if (sd->flags & SD_SERIALIZE) { + if (!spin_trylock(&balancing)) + goto out; + } + + if (time_after_eq(jiffies, sd->last_balance + interval)) { + if (load_balance(this_cpu, this_rq, sd, idle, &balance)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is @@ -2939,39 +3006,48 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) */ idle = NOT_IDLE; } - sd->last_balance += interval; + sd->last_balance = jiffies; } + if (sd->flags & SD_SERIALIZE) + spin_unlock(&balancing); +out: + if (time_after(next_balance, sd->last_balance + interval)) + next_balance = sd->last_balance + interval; + + /* + * Stop the load balance at this level. There is another + * CPU in our sched group which is doing load balancing more + * actively. 
+ */ + if (!balance) + break; } + this_rq->next_balance = next_balance; } #else /* * on UP we do not need to balance between CPUs: */ -static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle) -{ -} static inline void idle_balance(int cpu, struct rq *rq) { } #endif -static inline int wake_priority_sleeper(struct rq *rq) +static inline void wake_priority_sleeper(struct rq *rq) { - int ret = 0; - #ifdef CONFIG_SCHED_SMT + if (!rq->nr_running) + return; + spin_lock(&rq->lock); /* * If an SMT sibling task has been put to sleep for priority * reasons reschedule the idle task to see if it can now run. */ - if (rq->nr_running) { + if (rq->nr_running) resched_task(rq->idle); - ret = 1; - } spin_unlock(&rq->lock); #endif - return ret; } DEFINE_PER_CPU(struct kernel_stat, kstat); @@ -2985,7 +3061,8 @@ EXPORT_PER_CPU_SYMBOL(kstat); static inline void update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) { - p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick); + p->sched_time += now - p->last_ran; + p->last_ran = rq->most_recent_timestamp = now; } /* @@ -2998,8 +3075,7 @@ unsigned long long current_sched_time(const struct task_struct *p) unsigned long flags; local_irq_save(flags); - ns = max(p->timestamp, task_rq(p)->timestamp_last_tick); - ns = p->sched_time + sched_clock() - ns; + ns = p->sched_time + sched_clock() - p->last_ran; local_irq_restore(flags); return ns; @@ -3104,39 +3180,12 @@ void account_steal_time(struct task_struct *p, cputime_t steal) cpustat->steal = cputime64_add(cpustat->steal, tmp); } -/* - * This function gets called by the timer code, with HZ frequency. - * We call it with interrupts disabled. - * - * It also gets called by the fork code, when changing the parent's - * timeslices. - */ -void scheduler_tick(void) +static void task_running_tick(struct rq *rq, struct task_struct *p, int cpu) { - unsigned long long now = sched_clock(); - struct task_struct *p = current; - int cpu = smp_processor_id(); - struct rq *rq = cpu_rq(cpu); - - update_cpu_clock(p, rq, now); - - rq->timestamp_last_tick = now; - - if (p == rq->idle) { - if (wake_priority_sleeper(rq)) - goto out; -#ifdef CONFIG_VSERVER_HARDCPU_IDLE - if (!--rq->idle_tokens && !list_empty(&rq->hold_queue)) - set_need_resched(); -#endif - rebalance_tick(cpu, rq, SCHED_IDLE); - return; - } - - /* Task might have expired already, but not scheduled off yet */ if (p->array != rq->active) { + /* Task has expired but was not scheduled yet */ set_tsk_need_resched(p); - goto out; + return; } spin_lock(&rq->lock); /* @@ -3161,7 +3210,7 @@ void scheduler_tick(void) } goto out_unlock; } - if (vx_need_resched(p)) { + if (vx_need_resched(p, --p->time_slice, cpu)) { dequeue_task(p, rq->active); set_tsk_need_resched(p); p->prio = effective_prio(p); @@ -3204,8 +3253,36 @@ void scheduler_tick(void) } out_unlock: spin_unlock(&rq->lock); -out: - rebalance_tick(cpu, rq, NOT_IDLE); +} + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. + * + * It also gets called by the fork code, when changing the parent's + * timeslices. 
+ */ +void scheduler_tick(void) +{ + unsigned long long now = sched_clock(); + struct task_struct *p = current; + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + + update_cpu_clock(p, rq, now); + vxm_sync(now, cpu); + + if (p == rq->idle) { + /* Task on the idle queue */ + wake_priority_sleeper(rq); + vx_idle_resched(rq); + } else + task_running_tick(rq, p, cpu); +#ifdef CONFIG_SMP + update_load(rq); + if (time_after_eq(jiffies, rq->next_balance)) + raise_softirq(SCHED_SOFTIRQ); +#endif } #ifdef CONFIG_SCHED_SMT @@ -3351,7 +3428,8 @@ void fastcall add_preempt_count(int val) /* * Spinlock count overflowing soon? */ - DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); + DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= + PREEMPT_MASK - 10); } EXPORT_SYMBOL(add_preempt_count); @@ -3394,10 +3472,6 @@ asmlinkage void __sched schedule(void) int cpu, idx, new_prio; long *switch_count; struct rq *rq; - struct vx_info *vxi; -#ifdef CONFIG_VSERVER_HARDCPU - int maxidle = -HZ; -#endif /* * Test if we are atomic. Since do_exit() needs to call into @@ -3408,6 +3482,9 @@ asmlinkage void __sched schedule(void) printk(KERN_ERR "BUG: scheduling while atomic: " "%s/0x%08x/%d\n", current->comm, preempt_count(), current->pid); + debug_show_held_locks(current); + if (irqs_disabled()) + print_irqtrace_events(current); dump_stack(); } profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -3445,9 +3522,6 @@ need_resched_nonpreemptible: spin_lock_irq(&rq->lock); - if (unlikely(prev->flags & PF_DEAD)) - prev->state = EXIT_DEAD; - switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { switch_count = &prev->nvcsw; @@ -3463,35 +3537,17 @@ need_resched_nonpreemptible: } } -#ifdef CONFIG_VSERVER_HARDCPU - if (!list_empty(&rq->hold_queue)) { - struct list_head *l, *n; - int ret; - - vxi = NULL; - list_for_each_safe(l, n, &rq->hold_queue) { - next = list_entry(l, struct task_struct, run_list); - if (vxi == next->vx_info) - continue; - - vxi = next->vx_info; - ret = vx_tokens_recalc(vxi); - - if (ret > 0) { - vx_unhold_task(vxi, next, rq); - break; - } - if ((ret < 0) && (maxidle < ret)) - maxidle = ret; - } - } - rq->idle_tokens = -maxidle; - + cpu = smp_processor_id(); + vx_set_rq_time(rq, jiffies); +try_unhold: + vx_try_unhold(rq, cpu); pick_next: -#endif - cpu = smp_processor_id(); if (unlikely(!rq->nr_running)) { + /* can we skip idle time? 
*/ + if (vx_try_skip(rq, cpu)) + goto try_unhold; + idle_balance(cpu, rq); if (!rq->nr_running) { next = rq->idle; @@ -3518,21 +3574,9 @@ pick_next: queue = array->queue + idx; next = list_entry(queue->next, struct task_struct, run_list); - vxi = next->vx_info; -#ifdef CONFIG_VSERVER_HARDCPU - if (vx_info_flags(vxi, VXF_SCHED_PAUSE|VXF_SCHED_HARD, 0)) { - int ret = vx_tokens_recalc(vxi); - - if (unlikely(ret <= 0)) { - if (ret && (rq->idle_tokens > -ret)) - rq->idle_tokens = -ret; - vx_hold_task(vxi, next, rq); - goto pick_next; - } - } else /* well, looks ugly but not as ugly as the ifdef-ed version */ -#endif - if (vx_info_flags(vxi, VXF_SCHED_PRIO, 0)) - vx_tokens_recalc(vxi); + /* check before we schedule this context */ + if (!vx_schedule(next, rq, cpu)) + goto pick_next; if (!rt_task(next) && interactive_sleep(next->sleep_type)) { unsigned long long delta = now - next->timestamp; @@ -3552,7 +3596,7 @@ pick_next: } } next->sleep_type = SLEEP_NORMAL; - if (dependent_sleeper(cpu, rq, next)) + if (rq->nr_running == 1 && dependent_sleeper(cpu, rq, next)) next = rq->idle; switch_tasks: if (next == rq->idle) @@ -3571,7 +3615,7 @@ switch_tasks: sched_info_switch(prev, next); if (likely(prev != next)) { - next->timestamp = now; + next->timestamp = next->last_ran = now; rq->nr_switches++; rq->curr = next; ++*switch_count; @@ -3614,7 +3658,7 @@ asmlinkage void __sched preempt_schedule(void) * If there is a non-zero preempt_count or interrupts are disabled, * we do not want to preempt the current task. Just return.. */ - if (unlikely(ti->preempt_count || irqs_disabled())) + if (likely(ti->preempt_count || irqs_disabled())) return; need_resched: @@ -4225,6 +4269,8 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) * @p: the task in question. * @policy: new policy. * @param: structure containing the new RT priority. 
+ * + * NOTE: the task may be already dead */ int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) @@ -4252,28 +4298,32 @@ recheck: (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) return -EINVAL; - if ((policy == SCHED_NORMAL || policy == SCHED_BATCH) - != (param->sched_priority == 0)) + if (is_rt_policy(policy) != (param->sched_priority != 0)) return -EINVAL; /* * Allow unprivileged RT tasks to decrease priority: */ if (!capable(CAP_SYS_NICE)) { - /* - * can't change policy, except between SCHED_NORMAL - * and SCHED_BATCH: - */ - if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) && - (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) && - !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) - return -EPERM; - /* can't increase priority */ - if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) && - param->sched_priority > p->rt_priority && - param->sched_priority > - p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) - return -EPERM; + if (is_rt_policy(policy)) { + unsigned long rlim_rtprio; + unsigned long flags; + + if (!lock_task_sighand(p, &flags)) + return -ESRCH; + rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; + unlock_task_sighand(p, &flags); + + /* can't set/change the rt policy */ + if (policy != p->policy && !rlim_rtprio) + return -EPERM; + + /* can't increase priority */ + if (param->sched_priority > p->rt_priority && + param->sched_priority > rlim_rtprio) + return -EPERM; + } + /* can't change other user's priorities */ if ((current->euid != p->euid) && (current->euid != p->uid)) @@ -4339,14 +4389,13 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) return -EINVAL; if (copy_from_user(&lparam, param, sizeof(struct sched_param))) return -EFAULT; - read_lock_irq(&tasklist_lock); + + rcu_read_lock(); + retval = -ESRCH; p = find_process_by_pid(pid); - if (!p) { - read_unlock_irq(&tasklist_lock); - return -ESRCH; - } - retval = sched_setscheduler(p, policy, &lparam); - read_unlock_irq(&tasklist_lock); + if (p != NULL) + retval = sched_setscheduler(p, policy, &lparam); + rcu_read_unlock(); return retval; } @@ -4528,7 +4577,10 @@ EXPORT_SYMBOL(cpu_present_map); #ifndef CONFIG_SMP cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; +EXPORT_SYMBOL(cpu_online_map); + cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; +EXPORT_SYMBOL(cpu_possible_map); #endif long sched_getaffinity(pid_t pid, cpumask_t *mask) @@ -4637,15 +4689,6 @@ asmlinkage long sys_sched_yield(void) return 0; } -static inline int __resched_legal(int expected_preempt_count) -{ - if (unlikely(preempt_count() != expected_preempt_count)) - return 0; - if (unlikely(system_state != SYSTEM_RUNNING)) - return 0; - return 1; -} - static void __cond_resched(void) { #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP @@ -4665,7 +4708,8 @@ static void __cond_resched(void) int __sched cond_resched(void) { - if (need_resched() && __resched_legal(0)) { + if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && + system_state == SYSTEM_RUNNING) { __cond_resched(); return 1; } @@ -4691,7 +4735,7 @@ int cond_resched_lock(spinlock_t *lock) ret = 1; spin_lock(lock); } - if (need_resched() && __resched_legal(1)) { + if (need_resched() && system_state == SYSTEM_RUNNING) { spin_release(&lock->dep_map, 1, _THIS_IP_); _raw_spin_unlock(lock); preempt_enable_no_resched(); @@ -4707,7 +4751,7 @@ int __sched cond_resched_softirq(void) { BUG_ON(!in_softirq()); - if (need_resched() && __resched_legal(0)) { + if (need_resched() && 
system_state == SYSTEM_RUNNING) { raw_local_irq_disable(); _local_bh_enable(); raw_local_irq_enable(); @@ -4923,18 +4967,18 @@ static void show_task(struct task_struct *p) show_stack(p, NULL); } -void show_state(void) +void show_state_filter(unsigned long state_filter) { struct task_struct *g, *p; #if (BITS_PER_LONG == 32) printk("\n" - " sibling\n"); - printk(" task PC pid father child younger older\n"); + " free sibling\n"); + printk(" task PC stack pid father child younger older\n"); #else printk("\n" - " sibling\n"); - printk(" task PC pid father child younger older\n"); + " free sibling\n"); + printk(" task PC stack pid father child younger older\n"); #endif read_lock(&tasklist_lock); do_each_thread(g, p) { @@ -4943,11 +4987,16 @@ void show_state(void) * console might take alot of time: */ touch_nmi_watchdog(); - show_task(p); + if (p->state & state_filter) + show_task(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); - debug_show_all_locks(); + /* + * Only show locks if all tasks are dumped: + */ + if (state_filter == -1) + debug_show_all_locks(); } /** @@ -4958,7 +5007,7 @@ void show_state(void) * NOTE: this function does not set the idle thread's NEED_RESCHED * flag, to make booting more robust. */ -void __devinit init_idle(struct task_struct *idle, int cpu) +void __cpuinit init_idle(struct task_struct *idle, int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -5092,8 +5141,8 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) * afterwards, and pretending it was a local activate. * This way is cleaner and logically correct. */ - p->timestamp = p->timestamp - rq_src->timestamp_last_tick - + rq_dest->timestamp_last_tick; + p->timestamp = p->timestamp - rq_src->most_recent_timestamp + + rq_dest->most_recent_timestamp; deactivate_task(p, rq_src); vx_activate_task(p); __activate_task(p, rq_dest); @@ -5170,7 +5219,10 @@ wait_to_die: } #ifdef CONFIG_HOTPLUG_CPU -/* Figure out where task on dead CPU should go, use force if neccessary. */ +/* + * Figure out where task on dead CPU should go, use force if neccessary. + * NOTE: interrupts should be disabled by the caller + */ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { unsigned long flags; @@ -5290,6 +5342,7 @@ void idle_task_exit(void) mmdrop(mm); } +/* called under rq->lock with disabled interrupts */ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) { struct rq *rq = cpu_rq(dead_cpu); @@ -5298,7 +5351,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD); /* Cannot have done final schedule yet: would have vanished. */ - BUG_ON(p->flags & PF_DEAD); + BUG_ON(p->state == TASK_DEAD); get_task_struct(p); @@ -5306,10 +5359,11 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) * Drop lock around migration; if someone else moves it, * that's OK. No task can be added to this CPU, so iteration is * fine. 
+ * NOTE: interrupts should be left disabled --dev@ */ - spin_unlock_irq(&rq->lock); + spin_unlock(&rq->lock); move_task_off_dead_cpu(dead_cpu, p); - spin_lock_irq(&rq->lock); + spin_lock(&rq->lock); put_task_struct(p); } @@ -5419,9 +5473,11 @@ static struct notifier_block __cpuinitdata migration_notifier = { int __init migration_init(void) { void *cpu = (void *)(long)smp_processor_id(); + int err; /* Start one for the boot CPU: */ - migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); + err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); + BUG_ON(err == NOTIFY_BAD); migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); @@ -5460,16 +5516,19 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) if (!(sd->flags & SD_LOAD_BALANCE)) { printk("does not load-balance\n"); if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); + printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" + " has parent"); break; } printk("span %s\n", str); if (!cpu_isset(cpu, sd->span)) - printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); + printk(KERN_ERR "ERROR: domain->span does not contain " + "CPU%d\n", cpu); if (!cpu_isset(cpu, group->cpumask)) - printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); + printk(KERN_ERR "ERROR: domain->groups does not contain" + " CPU%d\n", cpu); printk(KERN_DEBUG); for (i = 0; i < level + 2; i++) @@ -5484,7 +5543,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) if (!group->cpu_power) { printk("\n"); - printk(KERN_ERR "ERROR: domain->cpu_power not set\n"); + printk(KERN_ERR "ERROR: domain->cpu_power not " + "set\n"); } if (!cpus_weight(group->cpumask)) { @@ -5507,15 +5567,17 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) printk("\n"); if (!cpus_equal(sd->span, groupmask)) - printk(KERN_ERR "ERROR: groups don't span domain->span\n"); + printk(KERN_ERR "ERROR: groups don't span " + "domain->span\n"); level++; sd = sd->parent; + if (!sd) + continue; - if (sd) { - if (!cpus_subset(groupmask, sd->span)) - printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); - } + if (!cpus_subset(groupmask, sd->span)) + printk(KERN_ERR "ERROR: parent span is not a superset " + "of domain->span\n"); } while (sd); } @@ -5532,7 +5594,9 @@ static int sd_degenerate(struct sched_domain *sd) if (sd->flags & (SD_LOAD_BALANCE | SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | - SD_BALANCE_EXEC)) { + SD_BALANCE_EXEC | + SD_SHARE_CPUPOWER | + SD_SHARE_PKG_RESOURCES)) { if (sd->groups != sd->groups->next) return 0; } @@ -5566,7 +5630,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) pflags &= ~(SD_LOAD_BALANCE | SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | - SD_BALANCE_EXEC); + SD_BALANCE_EXEC | + SD_SHARE_CPUPOWER | + SD_SHARE_PKG_RESOURCES); } if (~cflags & pflags) return 0; @@ -5588,12 +5654,18 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu) struct sched_domain *parent = tmp->parent; if (!parent) break; - if (sd_parent_degenerate(tmp, parent)) + if (sd_parent_degenerate(tmp, parent)) { tmp->parent = parent->parent; + if (parent->parent) + parent->parent->child = tmp; + } } - if (sd && sd_degenerate(sd)) + if (sd && sd_degenerate(sd)) { sd = sd->parent; + if (sd) + sd->child = NULL; + } sched_domain_debug(sd, cpu); @@ -5601,7 +5673,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu) } /* cpus with isolated domains */ -static cpumask_t __devinitdata cpu_isolated_map = 
CPU_MASK_NONE; +static cpumask_t cpu_isolated_map = CPU_MASK_NONE; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) @@ -5619,26 +5691,27 @@ static int __init isolated_cpu_setup(char *str) __setup ("isolcpus=", isolated_cpu_setup); /* - * init_sched_build_groups takes an array of groups, the cpumask we wish - * to span, and a pointer to a function which identifies what group a CPU - * belongs to. The return value of group_fn must be a valid index into the - * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we - * keep track of groups covered with a cpumask_t). + * init_sched_build_groups takes the cpumask we wish to span, and a pointer + * to a function which identifies what group(along with sched group) a CPU + * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS + * (due to the fact that we keep track of groups covered with a cpumask_t). * * init_sched_build_groups will build a circular linked list of the groups * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. */ -static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, - int (*group_fn)(int cpu)) +static void +init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, + int (*group_fn)(int cpu, const cpumask_t *cpu_map, + struct sched_group **sg)) { struct sched_group *first = NULL, *last = NULL; cpumask_t covered = CPU_MASK_NONE; int i; for_each_cpu_mask(i, span) { - int group = group_fn(i); - struct sched_group *sg = &groups[group]; + struct sched_group *sg; + int group = group_fn(i, cpu_map, &sg); int j; if (cpu_isset(i, covered)) @@ -5648,7 +5721,7 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, sg->cpu_power = 0; for_each_cpu_mask(j, span) { - if (group_fn(j) != group) + if (group_fn(j, cpu_map, NULL) != group) continue; cpu_set(j, covered); @@ -5822,8 +5895,9 @@ __setup("max_cache_size=", setup_max_cache_size); */ static void touch_cache(void *__cache, unsigned long __size) { - unsigned long size = __size/sizeof(long), chunk1 = size/3, - chunk2 = 2*size/3; + unsigned long size = __size / sizeof(long); + unsigned long chunk1 = size / 3; + unsigned long chunk2 = 2 * size / 3; unsigned long *cache = __cache; int i; @@ -5932,11 +6006,11 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) */ measure_one(cache, size, cpu1, cpu2); for (i = 0; i < ITERATIONS; i++) - cost1 += measure_one(cache, size - i*1024, cpu1, cpu2); + cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2); measure_one(cache, size, cpu2, cpu1); for (i = 0; i < ITERATIONS; i++) - cost1 += measure_one(cache, size - i*1024, cpu2, cpu1); + cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1); /* * (We measure the non-migrating [cached] cost on both @@ -5946,17 +6020,17 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) measure_one(cache, size, cpu1, cpu1); for (i = 0; i < ITERATIONS; i++) - cost2 += measure_one(cache, size - i*1024, cpu1, cpu1); + cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1); measure_one(cache, size, cpu2, cpu2); for (i = 0; i < ITERATIONS; i++) - cost2 += measure_one(cache, size - i*1024, cpu2, cpu2); + cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2); /* * Get the per-iteration migration cost: */ - do_div(cost1, 2*ITERATIONS); - do_div(cost2, 2*ITERATIONS); + do_div(cost1, 2 * ITERATIONS); + do_div(cost2, 2 * ITERATIONS); return cost1 - cost2; } @@ -5994,7 +6068,7 @@ static unsigned 
long long measure_migration_cost(int cpu1, int cpu2) */ cache = vmalloc(max_size); if (!cache) { - printk("could not vmalloc %d bytes for cache!\n", 2*max_size); + printk("could not vmalloc %d bytes for cache!\n", 2 * max_size); return 1000000; /* return 1 msec on very small boxen */ } @@ -6019,7 +6093,8 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2) avg_fluct = (avg_fluct + fluct)/2; if (migration_debug) - printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n", + printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): " + "(%8Ld %8Ld)\n", cpu1, cpu2, size, (long)cost / 1000000, ((long)cost / 100000) % 10, @@ -6114,20 +6189,18 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map) -1 #endif ); - if (system_state == SYSTEM_BOOTING) { - if (num_online_cpus() > 1) { - printk("migration_cost="); - for (distance = 0; distance <= max_distance; distance++) { - if (distance) - printk(","); - printk("%ld", (long)migration_cost[distance] / 1000); - } - printk("\n"); + if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) { + printk("migration_cost="); + for (distance = 0; distance <= max_distance; distance++) { + if (distance) + printk(","); + printk("%ld", (long)migration_cost[distance] / 1000); } + printk("\n"); } j1 = jiffies; if (migration_debug) - printk("migration: %ld seconds\n", (j1-j0)/HZ); + printk("migration: %ld seconds\n", (j1-j0) / HZ); /* * Move back to the original CPU. NUMA-Q gets confused @@ -6224,10 +6297,13 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; */ #ifdef CONFIG_SCHED_SMT static DEFINE_PER_CPU(struct sched_domain, cpu_domains); -static struct sched_group sched_group_cpus[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); -static int cpu_to_cpu_group(int cpu) +static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, + struct sched_group **sg) { + if (sg) + *sg = &per_cpu(sched_group_cpus, cpu); return cpu; } #endif @@ -6237,34 +6313,52 @@ static int cpu_to_cpu_group(int cpu) */ #ifdef CONFIG_SCHED_MC static DEFINE_PER_CPU(struct sched_domain, core_domains); -static struct sched_group *sched_group_core_bycpu[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_group, sched_group_core); #endif #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) -static int cpu_to_core_group(int cpu) +static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, + struct sched_group **sg) { - return first_cpu(cpu_sibling_map[cpu]); + int group; + cpumask_t mask = cpu_sibling_map[cpu]; + cpus_and(mask, mask, *cpu_map); + group = first_cpu(mask); + if (sg) + *sg = &per_cpu(sched_group_core, group); + return group; } #elif defined(CONFIG_SCHED_MC) -static int cpu_to_core_group(int cpu) +static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, + struct sched_group **sg) { + if (sg) + *sg = &per_cpu(sched_group_core, cpu); return cpu; } #endif static DEFINE_PER_CPU(struct sched_domain, phys_domains); -static struct sched_group *sched_group_phys_bycpu[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_group, sched_group_phys); -static int cpu_to_phys_group(int cpu) +static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, + struct sched_group **sg) { + int group; #ifdef CONFIG_SCHED_MC cpumask_t mask = cpu_coregroup_map(cpu); - return first_cpu(mask); + cpus_and(mask, mask, *cpu_map); + group = first_cpu(mask); #elif defined(CONFIG_SCHED_SMT) - return first_cpu(cpu_sibling_map[cpu]); + cpumask_t mask = cpu_sibling_map[cpu]; + cpus_and(mask, mask, *cpu_map); + group = first_cpu(mask); #else - 
return cpu; + group = cpu; #endif + if (sg) + *sg = &per_cpu(sched_group_phys, group); + return group; } #ifdef CONFIG_NUMA @@ -6277,12 +6371,22 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains); static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); -static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; +static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); -static int cpu_to_allnodes_group(int cpu) +static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, + struct sched_group **sg) { - return cpu_to_node(cpu); + cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu)); + int group; + + cpus_and(nodemask, nodemask, *cpu_map); + group = first_cpu(nodemask); + + if (sg) + *sg = &per_cpu(sched_group_allnodes, group); + return group; } + static void init_numa_sched_groups_power(struct sched_group *group_head) { struct sched_group *sg = group_head; @@ -6311,24 +6415,16 @@ next_sg: } #endif +#ifdef CONFIG_NUMA /* Free memory allocated for various sched_group structures */ static void free_sched_groups(const cpumask_t *cpu_map) { - int cpu; -#ifdef CONFIG_NUMA - int i; + int cpu, i; for_each_cpu_mask(cpu, *cpu_map) { - struct sched_group *sched_group_allnodes - = sched_group_allnodes_bycpu[cpu]; struct sched_group **sched_group_nodes = sched_group_nodes_bycpu[cpu]; - if (sched_group_allnodes) { - kfree(sched_group_allnodes); - sched_group_allnodes_bycpu[cpu] = NULL; - } - if (!sched_group_nodes) continue; @@ -6353,19 +6449,63 @@ next_sg: kfree(sched_group_nodes); sched_group_nodes_bycpu[cpu] = NULL; } +} +#else +static void free_sched_groups(const cpumask_t *cpu_map) +{ +} #endif - for_each_cpu_mask(cpu, *cpu_map) { - if (sched_group_phys_bycpu[cpu]) { - kfree(sched_group_phys_bycpu[cpu]); - sched_group_phys_bycpu[cpu] = NULL; - } -#ifdef CONFIG_SCHED_MC - if (sched_group_core_bycpu[cpu]) { - kfree(sched_group_core_bycpu[cpu]); - sched_group_core_bycpu[cpu] = NULL; - } -#endif + +/* + * Initialize sched groups cpu_power. + * + * cpu_power indicates the capacity of sched group, which is used while + * distributing the load between different sched groups in a sched domain. + * Typically cpu_power for all the groups in a sched domain will be same unless + * there are asymmetries in the topology. If there are asymmetries, group + * having more cpu_power will pickup more load compared to the group having + * less cpu_power. + * + * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents + * the maximum number of tasks a group can handle in the presence of other idle + * or lightly loaded groups in the same sched domain. + */ +static void init_sched_groups_power(int cpu, struct sched_domain *sd) +{ + struct sched_domain *child; + struct sched_group *group; + + WARN_ON(!sd || !sd->groups); + + if (cpu != first_cpu(sd->groups->cpumask)) + return; + + child = sd->child; + + /* + * For perf policy, if the groups in child domain share resources + * (for example cores sharing some portions of the cache hierarchy + * or SMT), then set this domain groups cpu_power such that each group + * can handle only one task, when there are other idle groups in the + * same sched domain. 
+ */ + if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && + (child->flags & + (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { + sd->groups->cpu_power = SCHED_LOAD_SCALE; + return; } + + sd->groups->cpu_power = 0; + + /* + * add cpu_power of each child group to this groups cpu_power + */ + group = child->groups; + do { + sd->groups->cpu_power += group->cpu_power; + group = group->next; + } while (group != child->groups); } /* @@ -6375,13 +6515,10 @@ next_sg: static int build_sched_domains(const cpumask_t *cpu_map) { int i; - struct sched_group *sched_group_phys = NULL; -#ifdef CONFIG_SCHED_MC - struct sched_group *sched_group_core = NULL; -#endif + struct sched_domain *sd; #ifdef CONFIG_NUMA struct sched_group **sched_group_nodes = NULL; - struct sched_group *sched_group_allnodes = NULL; + int sd_allnodes = 0; /* * Allocate the per-node list of sched groups @@ -6399,7 +6536,6 @@ static int build_sched_domains(const cpumask_t *cpu_map) * Set up domains for cpus specified by the cpu_map. */ for_each_cpu_mask(i, *cpu_map) { - int group; struct sched_domain *sd = NULL, *p; cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); @@ -6408,25 +6544,12 @@ static int build_sched_domains(const cpumask_t *cpu_map) #ifdef CONFIG_NUMA if (cpus_weight(*cpu_map) > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { - if (!sched_group_allnodes) { - sched_group_allnodes - = kmalloc(sizeof(struct sched_group) - * MAX_NUMNODES, - GFP_KERNEL); - if (!sched_group_allnodes) { - printk(KERN_WARNING - "Can not alloc allnodes sched group\n"); - goto error; - } - sched_group_allnodes_bycpu[i] - = sched_group_allnodes; - } sd = &per_cpu(allnodes_domains, i); *sd = SD_ALLNODES_INIT; sd->span = *cpu_map; - group = cpu_to_allnodes_group(i); - sd->groups = &sched_group_allnodes[group]; + cpu_to_allnodes_group(i, cpu_map, &sd->groups); p = sd; + sd_allnodes = 1; } else p = NULL; @@ -6434,61 +6557,40 @@ static int build_sched_domains(const cpumask_t *cpu_map) *sd = SD_NODE_INIT; sd->span = sched_domain_node_span(cpu_to_node(i)); sd->parent = p; + if (p) + p->child = sd; cpus_and(sd->span, sd->span, *cpu_map); #endif - if (!sched_group_phys) { - sched_group_phys - = kmalloc(sizeof(struct sched_group) * NR_CPUS, - GFP_KERNEL); - if (!sched_group_phys) { - printk (KERN_WARNING "Can not alloc phys sched" - "group\n"); - goto error; - } - sched_group_phys_bycpu[i] = sched_group_phys; - } - p = sd; sd = &per_cpu(phys_domains, i); - group = cpu_to_phys_group(i); *sd = SD_CPU_INIT; sd->span = nodemask; sd->parent = p; - sd->groups = &sched_group_phys[group]; + if (p) + p->child = sd; + cpu_to_phys_group(i, cpu_map, &sd->groups); #ifdef CONFIG_SCHED_MC - if (!sched_group_core) { - sched_group_core - = kmalloc(sizeof(struct sched_group) * NR_CPUS, - GFP_KERNEL); - if (!sched_group_core) { - printk (KERN_WARNING "Can not alloc core sched" - "group\n"); - goto error; - } - sched_group_core_bycpu[i] = sched_group_core; - } - p = sd; sd = &per_cpu(core_domains, i); - group = cpu_to_core_group(i); *sd = SD_MC_INIT; sd->span = cpu_coregroup_map(i); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; - sd->groups = &sched_group_core[group]; + p->child = sd; + cpu_to_core_group(i, cpu_map, &sd->groups); #endif #ifdef CONFIG_SCHED_SMT p = sd; sd = &per_cpu(cpu_domains, i); - group = cpu_to_cpu_group(i); *sd = SD_SIBLING_INIT; sd->span = cpu_sibling_map[i]; cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; - sd->groups = &sched_group_cpus[group]; + p->child = sd; + cpu_to_cpu_group(i, cpu_map, &sd->groups); #endif } @@ -6500,8 
+6602,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) if (i != first_cpu(this_sibling_map)) continue; - init_sched_build_groups(sched_group_cpus, this_sibling_map, - &cpu_to_cpu_group); + init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group); } #endif @@ -6512,8 +6613,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) cpus_and(this_core_map, this_core_map, *cpu_map); if (i != first_cpu(this_core_map)) continue; - init_sched_build_groups(sched_group_core, this_core_map, - &cpu_to_core_group); + init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group); } #endif @@ -6526,15 +6626,13 @@ static int build_sched_domains(const cpumask_t *cpu_map) if (cpus_empty(nodemask)) continue; - init_sched_build_groups(sched_group_phys, nodemask, - &cpu_to_phys_group); + init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); } #ifdef CONFIG_NUMA /* Set up node groups */ - if (sched_group_allnodes) - init_sched_build_groups(sched_group_allnodes, *cpu_map, - &cpu_to_allnodes_group); + if (sd_allnodes) + init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group); for (i = 0; i < MAX_NUMNODES; i++) { /* Set up node groups */ @@ -6606,82 +6704,30 @@ static int build_sched_domains(const cpumask_t *cpu_map) /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT for_each_cpu_mask(i, *cpu_map) { - struct sched_domain *sd; sd = &per_cpu(cpu_domains, i); - sd->groups->cpu_power = SCHED_LOAD_SCALE; + init_sched_groups_power(i, sd); } #endif #ifdef CONFIG_SCHED_MC for_each_cpu_mask(i, *cpu_map) { - int power; - struct sched_domain *sd; sd = &per_cpu(core_domains, i); - if (sched_smt_power_savings) - power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); - else - power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) - * SCHED_LOAD_SCALE / 10; - sd->groups->cpu_power = power; + init_sched_groups_power(i, sd); } #endif for_each_cpu_mask(i, *cpu_map) { - struct sched_domain *sd; -#ifdef CONFIG_SCHED_MC - sd = &per_cpu(phys_domains, i); - if (i != first_cpu(sd->groups->cpumask)) - continue; - - sd->groups->cpu_power = 0; - if (sched_mc_power_savings || sched_smt_power_savings) { - int j; - - for_each_cpu_mask(j, sd->groups->cpumask) { - struct sched_domain *sd1; - sd1 = &per_cpu(core_domains, j); - /* - * for each core we will add once - * to the group in physical domain - */ - if (j != first_cpu(sd1->groups->cpumask)) - continue; - - if (sched_smt_power_savings) - sd->groups->cpu_power += sd1->groups->cpu_power; - else - sd->groups->cpu_power += SCHED_LOAD_SCALE; - } - } else - /* - * This has to be < 2 * SCHED_LOAD_SCALE - * Lets keep it SCHED_LOAD_SCALE, so that - * while calculating NUMA group's cpu_power - * we can simply do - * numa_group->cpu_power += phys_group->cpu_power; - * - * See "only add power once for each physical pkg" - * comment below - */ - sd->groups->cpu_power = SCHED_LOAD_SCALE; -#else - int power; sd = &per_cpu(phys_domains, i); - if (sched_smt_power_savings) - power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); - else - power = SCHED_LOAD_SCALE; - sd->groups->cpu_power = power; -#endif + init_sched_groups_power(i, sd); } #ifdef CONFIG_NUMA for (i = 0; i < MAX_NUMNODES; i++) init_numa_sched_groups_power(sched_group_nodes[i]); - if (sched_group_allnodes) { - int group = cpu_to_allnodes_group(first_cpu(*cpu_map)); - struct sched_group *sg = &sched_group_allnodes[group]; + if (sd_allnodes) { + struct sched_group *sg; + cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); 
init_numa_sched_groups_power(sg); } #endif @@ -6705,9 +6751,11 @@ static int build_sched_domains(const cpumask_t *cpu_map) return 0; +#ifdef CONFIG_NUMA error: free_sched_groups(cpu_map); return -ENOMEM; +#endif } /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. @@ -6851,8 +6899,6 @@ SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, sched_smt_power_savings_store); #endif - -#ifdef CONFIG_HOTPLUG_CPU /* * Force a reinitialization of the sched domains hierarchy. The domains * and groups cannot be updated in place without racing with the balancing @@ -6885,15 +6931,23 @@ static int update_sched_domains(struct notifier_block *nfb, return NOTIFY_OK; } -#endif void __init sched_init_smp(void) { + cpumask_t non_isolated_cpus; + lock_cpu_hotplug(); arch_init_sched_domains(&cpu_online_map); + cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); + if (cpus_empty(non_isolated_cpus)) + cpu_set(smp_processor_id(), non_isolated_cpus); unlock_cpu_hotplug(); /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); + + /* Move init over to a non-isolated CPU */ + if (set_cpus_allowed(current, non_isolated_cpus) < 0) + BUG(); } #else void __init sched_init_smp(void) @@ -6940,8 +6994,8 @@ void __init sched_init(void) atomic_set(&rq->nr_iowait, 0); #ifdef CONFIG_VSERVER_HARDCPU INIT_LIST_HEAD(&rq->hold_queue); + rq->nr_onhold = 0; #endif - for (j = 0; j < 2; j++) { array = rq->arrays + j; for (k = 0; k < MAX_PRIO; k++) { @@ -6955,6 +7009,10 @@ void __init sched_init(void) set_load_weight(&init_task); +#ifdef CONFIG_SMP + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); +#endif + #ifdef CONFIG_RT_MUTEXES plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); #endif @@ -6989,6 +7047,9 @@ void __might_sleep(char *file, int line) " context at %s:%d\n", file, line); printk("in_atomic():%d, irqs_disabled():%d\n", in_atomic(), irqs_disabled()); + debug_show_held_locks(current); + if (irqs_disabled()) + print_irqtrace_events(current); dump_stack(); } #endif
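
The hunks above relocate the task_timeslice() comment and keep the SCALE_PRIO()/static_prio_timeslice() pair that maps nice levels to timeslices ("[800ms ... 100ms ... 5ms]"). Below is a minimal standalone sketch of that arithmetic, illustrative only and not part of the patch: it assumes HZ = 1000 (so one jiffy is 1 ms), takes NICE_TO_PRIO() and the DEF_TIMESLICE * 4 branch for negative nice from the surrounding sched.c of this kernel generation (not shown in these hunks), and uses a local MAX() in place of the kernel's max().

/* timeslice_sketch.c -- illustrative user-space sketch, not part of the patch */
#include <stdio.h>

#define HZ		1000	/* assumed; gives millisecond jiffies */
#define MAX_RT_PRIO	100
#define MAX_PRIO	(MAX_RT_PRIO + 40)
#define MAX_USER_PRIO	40
#define NICE_TO_PRIO(n)	(MAX_RT_PRIO + (n) + 20)

#define MAX(a, b)	((a) > (b) ? (a) : (b))
#define MIN_TIMESLICE	MAX(5 * HZ / 1000, 1)
#define DEF_TIMESLICE	(100 * HZ / 1000)

/* same scaling as the SCALE_PRIO() macro kept by the patch */
#define SCALE_PRIO(x, prio) \
	MAX((x) * (MAX_PRIO - (prio)) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)

static unsigned int static_prio_timeslice(int static_prio)
{
	if (static_prio < NICE_TO_PRIO(0))	/* nice < 0: larger slices */
		return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
	return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}

int main(void)
{
	int nice;

	/* prints the nice -> timeslice table for the whole user-nice range */
	for (nice = -20; nice <= 19; nice++)
		printf("nice %3d -> %3u ms\n", nice,
		       static_prio_timeslice(NICE_TO_PRIO(nice)));
	return 0;
}

With those assumed constants the sketch reproduces the range quoted in the comment: 800 ms at nice -20, 100 ms at nice 0 and 5 ms at nice 19.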