+ local_group = cpu_isset(this_cpu, group->cpumask);
+
+ /* Tally up the load of all CPUs in the group */
+ avg_load = 0;
+
+ for_each_cpu_mask(i, group->cpumask) {
+ /* Bias balancing toward cpus of our domain */
+ if (local_group)
+ load = target_load(i);
+ else
+ load = source_load(i);
+
+ nr_cpus++;
+ avg_load += load;
+ }
+
+ if (!nr_cpus)
+ goto nextgroup;
+
+ total_load += avg_load;
+ total_pwr += group->cpu_power;
+
+ /* Adjust by relative CPU power of the group */
+ avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+
+ if (local_group) {
+ this_load = avg_load;
+ this = group;
+ goto nextgroup;
+ } else if (avg_load > max_load) {
+ max_load = avg_load;
+ busiest = group;
+ }
+nextgroup:
+ group = group->next;
+ } while (group != sd->groups);
+
+ if (!busiest || this_load >= max_load)
+ goto out_balanced;
+
+ avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
+
+ if (this_load >= avg_load ||
+ 100*max_load <= sd->imbalance_pct*this_load)
+ goto out_balanced;
+
+ /*
+ * We're trying to get all the cpus to the average_load, so we don't
+ * want to push ourselves above the average load, nor do we wish to
+ * reduce the max loaded cpu below the average load, as either of these
+ * actions would just result in more rebalancing later, and ping-pong
+ * tasks around. Thus we look for the minimum possible imbalance.
+ * Negative imbalances (*we* are more loaded than anyone else) will
+ * be counted as no imbalance for these purposes -- we can't fix that
+ * by pulling tasks to us. Be careful of negative numbers as they'll
+ * appear as very large values with unsigned longs.
+ */
+ *imbalance = min(max_load - avg_load, avg_load - this_load);
+
+ /* How much load to actually move to equalise the imbalance */
+ *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
+ / SCHED_LOAD_SCALE;
+
+ if (*imbalance < SCHED_LOAD_SCALE - 1) {
+ unsigned long pwr_now = 0, pwr_move = 0;
+ unsigned long tmp;
+
+ if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
+ *imbalance = 1;
+ return busiest;
+ }
+
+ /*
+ * OK, we don't have enough imbalance to justify moving tasks,
+ * however we may be able to increase total CPU power used by
+ * moving them.
+ */
+
+ pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
+ pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
+ pwr_now /= SCHED_LOAD_SCALE;
+
+ /* Amount of load we'd subtract */
+ tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
+ if (max_load > tmp)
+ pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
+ max_load - tmp);
+
+ /* Amount of load we'd add */
+ tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
+ if (max_load < tmp)
+ tmp = max_load;
+ pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
+ pwr_move /= SCHED_LOAD_SCALE;
+
+ /* Move if we gain another 8th of a CPU worth of throughput */
+ if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
+ goto out_balanced;
+
+ *imbalance = 1;
+ return busiest;
+ }
+
+ /* Get rid of the scaling factor, rounding down as we divide */
+ *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE;
+
+ return busiest;
+
+out_balanced:
+ if (busiest && (idle == NEWLY_IDLE ||
+ (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) {
+ *imbalance = 1;
+ return busiest;
+ }
+
+ *imbalance = 0;
+ return NULL;
+}
+
+/*
+ * find_busiest_queue - find the busiest runqueue among the cpus in group.
+ */
+static runqueue_t *find_busiest_queue(struct sched_group *group)
+{
+ unsigned long load, max_load = 0;
+ runqueue_t *busiest = NULL;
+ int i;
+
+ for_each_cpu_mask(i, group->cpumask) {
+ load = source_load(i);
+
+ if (load > max_load) {
+ max_load = load;
+ busiest = cpu_rq(i);
+ }
+ }
+
+ return busiest;
+}
+
+/*
+ * Check this_cpu to ensure it is balanced within domain. Attempt to move
+ * tasks if there is an imbalance.
+ *
+ * Called with this_rq unlocked.
+ */
+static int load_balance(int this_cpu, runqueue_t *this_rq,
+ struct sched_domain *sd, enum idle_type idle)
+{
+ struct sched_group *group;
+ runqueue_t *busiest;
+ unsigned long imbalance;
+ int nr_moved;
+
+ spin_lock(&this_rq->lock);
+ schedstat_inc(sd, lb_cnt[idle]);
+
+ group = find_busiest_group(sd, this_cpu, &imbalance, idle);
+ if (!group) {
+ schedstat_inc(sd, lb_nobusyg[idle]);
+ goto out_balanced;
+ }
+
+ busiest = find_busiest_queue(group);
+ if (!busiest) {
+ schedstat_inc(sd, lb_nobusyq[idle]);
+ goto out_balanced;
+ }
+
+ /*
+ * This should be "impossible", but since load
+ * balancing is inherently racy and statistical,
+ * it could happen in theory.
+ */
+ if (unlikely(busiest == this_rq)) {
+ WARN_ON(1);
+ goto out_balanced;
+ }
+
+ schedstat_add(sd, lb_imbalance[idle], imbalance);
+
+ nr_moved = 0;
+ if (busiest->nr_running > 1) {
+ /*
+ * Attempt to move tasks. If find_busiest_group has found
+ * an imbalance but busiest->nr_running <= 1, the group is
+ * still unbalanced. nr_moved simply stays zero, so it is
+ * correctly treated as an imbalance.
+ */
+ double_lock_balance(this_rq, busiest);
+ nr_moved = move_tasks(this_rq, this_cpu, busiest,
+ imbalance, sd, idle);
+ spin_unlock(&busiest->lock);
+ }
+ spin_unlock(&this_rq->lock);
+
+ if (!nr_moved) {
+ schedstat_inc(sd, lb_failed[idle]);
+ sd->nr_balance_failed++;
+
+ if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
+ int wake = 0;
+
+ spin_lock(&busiest->lock);
+ if (!busiest->active_balance) {
+ busiest->active_balance = 1;
+ busiest->push_cpu = this_cpu;
+ wake = 1;
+ }
+ spin_unlock(&busiest->lock);
+ if (wake)
+ wake_up_process(busiest->migration_thread);
+
+ /*
+ * We've kicked active balancing, reset the failure
+ * counter.
+ */
+ sd->nr_balance_failed = sd->cache_nice_tries;
+ }
+
+ /*
+ * We were unbalanced, but unsuccessful in move_tasks(),
+ * so bump the balance_interval to lessen the lock contention.
+ */
+ if (sd->balance_interval < sd->max_interval)
+ sd->balance_interval++;
+ } else {
+ sd->nr_balance_failed = 0;
+
+ /* We were unbalanced, so reset the balancing interval */
+ sd->balance_interval = sd->min_interval;
+ }
+
+ return nr_moved;
+
+out_balanced:
+ spin_unlock(&this_rq->lock);
+
+ /* tune up the balancing interval */
+ if (sd->balance_interval < sd->max_interval)
+ sd->balance_interval *= 2;
+
+ return 0;
+}
+
+/*
+ * Check this_cpu to ensure it is balanced within domain. Attempt to move
+ * tasks if there is an imbalance.
+ *
+ * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
+ * this_rq is locked.
+ */
+static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
+ struct sched_domain *sd)
+{
+ struct sched_group *group;
+ runqueue_t *busiest = NULL;
+ unsigned long imbalance;
+ int nr_moved = 0;
+
+ schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
+ group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
+ if (!group) {
+ schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
+ goto out;
+ }
+
+ busiest = find_busiest_queue(group);
+ if (!busiest || busiest == this_rq) {
+ schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
+ goto out;
+ }
+
+ /* Attempt to move tasks */
+ double_lock_balance(this_rq, busiest);
+
+ schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
+ nr_moved = move_tasks(this_rq, this_cpu, busiest,
+ imbalance, sd, NEWLY_IDLE);
+ if (!nr_moved)
+ schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
+
+ spin_unlock(&busiest->lock);
+
+out:
+ return nr_moved;
+}
+
+/*
+ * idle_balance is called by schedule() if this_cpu is about to become
+ * idle. Attempts to pull tasks from other CPUs.
+ */
+static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
+{
+ struct sched_domain *sd;
+
+ for_each_domain(this_cpu, sd) {
+ if (sd->flags & SD_BALANCE_NEWIDLE) {
+ if (load_balance_newidle(this_cpu, this_rq, sd)) {
+ /* We've pulled tasks over so stop searching */
+ break;
+ }
+ }
+ }
+}
+
+/*
+ * active_load_balance is run by migration threads. It pushes running tasks
+ * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
+ * running on each physical CPU where possible, and avoids physical /
+ * logical imbalances.
+ *
+ * Called with busiest_rq locked.
+ */
+static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
+{
+ struct sched_domain *sd;
+ struct sched_group *cpu_group;
+ runqueue_t *target_rq;
+ cpumask_t visited_cpus;
+ int cpu;
+
+ schedstat_inc(busiest_rq, alb_cnt);
+ /*
+ * Search for suitable CPUs to push tasks to in successively higher
+ * domains with SD_LOAD_BALANCE set.
+ */
+ visited_cpus = CPU_MASK_NONE;
+ for_each_domain(busiest_cpu, sd) {
+ if (!(sd->flags & SD_LOAD_BALANCE))
+ /* no more domains to search */
+ break;
+
+ cpu_group = sd->groups;
+ do {
+ for_each_cpu_mask(cpu, cpu_group->cpumask) {
+ if (busiest_rq->nr_running <= 1)
+ /* no more tasks left to move */
+ return;
+ if (cpu_isset(cpu, visited_cpus))
+ continue;
+ cpu_set(cpu, visited_cpus);
+ if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu)
+ continue;
+
+ target_rq = cpu_rq(cpu);
+ /*
+ * This condition is "impossible", if it occurs
+ * we need to fix it. Originally reported by
+ * Bjorn Helgaas on a 128-cpu setup.
+ */
+ BUG_ON(busiest_rq == target_rq);
+
+ /* move a task from busiest_rq to target_rq */
+ double_lock_balance(busiest_rq, target_rq);
+ if (move_tasks(target_rq, cpu, busiest_rq,
+ 1, sd, SCHED_IDLE)) {
+ schedstat_inc(busiest_rq, alb_lost);
+ schedstat_inc(target_rq, alb_gained);
+ } else {
+ schedstat_inc(busiest_rq, alb_failed);
+ }
+ spin_unlock(&target_rq->lock);
+ }
+ cpu_group = cpu_group->next;
+ } while (cpu_group != sd->groups);
+ }
+}
+
+/*
+ * rebalance_tick will get called every timer tick, on every CPU.
+ *
+ * It checks each scheduling domain to see if it is due to be balanced,
+ * and initiates a balancing operation if so.
+ *
+ * Balancing parameters are set up in arch_init_sched_domains.
+ */
+
+/* Don't have all balancing operations going off at once */
+#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
+
+static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
+ enum idle_type idle)
+{
+ unsigned long old_load, this_load;
+ unsigned long j = jiffies + CPU_OFFSET(this_cpu);
+ struct sched_domain *sd;
+
+ /* Update our load */
+ old_load = this_rq->cpu_load;
+ this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
+ /*
+ * Round up the averaging division if load is increasing. This
+ * prevents us from getting stuck on 9 if the load is 10, for
+ * example.
+ */
+ if (this_load > old_load)
+ old_load++;
+ this_rq->cpu_load = (old_load + this_load) / 2;
+
+ for_each_domain(this_cpu, sd) {
+ unsigned long interval;
+
+ if (!(sd->flags & SD_LOAD_BALANCE))
+ continue;
+
+ interval = sd->balance_interval;
+ if (idle != SCHED_IDLE)
+ interval *= sd->busy_factor;
+
+ /* scale ms to jiffies */
+ interval = msecs_to_jiffies(interval);
+ if (unlikely(!interval))
+ interval = 1;
+
+ if (j - sd->last_balance >= interval) {
+ if (load_balance(this_cpu, this_rq, sd, idle)) {
+ /* We've pulled tasks over so no longer idle */
+ idle = NOT_IDLE;
+ }
+ sd->last_balance += interval;