+
+ /* Get rid of the scaling factor, rounding *up* as we divide */
+ *imbalance = (*imbalance + SCHED_LOAD_SCALE - 1) / SCHED_LOAD_SCALE;
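+ /* For illustration (assuming SCHED_LOAD_SCALE is 128): a scaled
+ * imbalance of 300 descales to (300 + 127) / 128 == 3, i.e. roughly
+ * three tasks' worth of load to move. */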
+
+ return busiest;
+
+out_balanced:
+ if (busiest && (idle == NEWLY_IDLE ||
+ (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) {
+ *imbalance = 1;
+ return busiest;
+ }
+
+ *imbalance = 0;
+ return NULL;
+}
+
+/*
+ * find_busiest_queue - find the busiest runqueue among the cpus in group.
+ */
+static runqueue_t *find_busiest_queue(struct sched_group *group)
+{
+ unsigned long load, max_load = 0;
+ runqueue_t *busiest = NULL;
+ int i;
+
+ for_each_cpu_mask(i, group->cpumask) {
+ load = source_load(i);
+
+ if (load > max_load) {
+ max_load = load;
+ busiest = cpu_rq(i);
+ }
+ }
+
+ return busiest;
+}
+
+/*
+ * Check this_cpu to ensure it is balanced within domain. Attempt to move
+ * tasks if there is an imbalance.
+ *
+ * Called with this_rq unlocked.
+ */
+static int load_balance(int this_cpu, runqueue_t *this_rq,
+ struct sched_domain *sd, enum idle_type idle)
+{
+ struct sched_group *group;
+ runqueue_t *busiest;
+ unsigned long imbalance;
+ int nr_moved;
+
+ spin_lock(&this_rq->lock);
+ schedstat_inc(sd, lb_cnt[idle]);
+
+ group = find_busiest_group(sd, this_cpu, &imbalance, idle);
+ if (!group) {
+ schedstat_inc(sd, lb_nobusyg[idle]);
+ goto out_balanced;
+ }
+
+ busiest = find_busiest_queue(group);
+ if (!busiest) {
+ schedstat_inc(sd, lb_nobusyq[idle]);
+ goto out_balanced;
+ }
+
+ /*
+ * This should be "impossible", but since load
+ * balancing is inherently racy and statistical,
+ * it could happen in theory.
+ */
+ if (unlikely(busiest == this_rq)) {
+ WARN_ON(1);
+ goto out_balanced;
+ }
+
+ schedstat_add(sd, lb_imbalance[idle], imbalance);
+
+ nr_moved = 0;
+ if (busiest->nr_running > 1) {
+ /*
+ * Attempt to move tasks. If find_busiest_group has found
+ * an imbalance but busiest->nr_running <= 1, the group is
+ * still unbalanced. nr_moved simply stays zero, so it is
+ * correctly treated as an imbalance.
+ */
+ double_lock_balance(this_rq, busiest);
+ nr_moved = move_tasks(this_rq, this_cpu, busiest,
+ imbalance, sd, idle);
+ spin_unlock(&busiest->lock);
+ }
+ spin_unlock(&this_rq->lock);
+
+ if (!nr_moved) {
+ schedstat_inc(sd, lb_failed[idle]);
+ sd->nr_balance_failed++;
+
+ if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
+ int wake = 0;
+
+ spin_lock(&busiest->lock);
+ if (!busiest->active_balance) {
+ busiest->active_balance = 1;
+ busiest->push_cpu = this_cpu;
+ wake = 1;
+ }
+ spin_unlock(&busiest->lock);
+ if (wake)
+ wake_up_process(busiest->migration_thread);
+
+ /*
+ * We've kicked active balancing, reset the failure counter.
+ * Resetting to cache_nice_tries rather than 0 means only a few
+ * more consecutive failures are needed before the next kick.
+ */
+ sd->nr_balance_failed = sd->cache_nice_tries;
+ }
+
+ /*
+ * We were unbalanced, but unsuccessful in move_tasks(),
+ * so bump the balance_interval to lessen the lock contention.
+ */
+ if (sd->balance_interval < sd->max_interval)
+ sd->balance_interval++;
+ } else {
+ sd->nr_balance_failed = 0;
+
+ /* We were unbalanced, so reset the balancing interval */
+ sd->balance_interval = sd->min_interval;
+ }
+
+ return nr_moved;
+
+out_balanced:
+ spin_unlock(&this_rq->lock);
+
+ /*
+ * Tune up the balancing interval: the domain looked balanced, so back
+ * off exponentially (the failed-move path above only bumps it by one).
+ */
+ if (sd->balance_interval < sd->max_interval)
+ sd->balance_interval *= 2;
+
+ return 0;
+}
+
+/*
+ * Check this_cpu to ensure it is balanced within domain. Attempt to move
+ * tasks if there is an imbalance.
+ *
+ * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
+ * this_rq is locked.
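+ *
+ * Unlike load_balance(), this path neither kicks active balancing nor
+ * adjusts balance_interval when nothing can be moved.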
+ */
+static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
+ struct sched_domain *sd)
+{
+ struct sched_group *group;
+ runqueue_t *busiest = NULL;
+ unsigned long imbalance;
+ int nr_moved = 0;
+
+ schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
+ group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
+ if (!group) {
+ schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
+ goto out;
+ }
+
+ busiest = find_busiest_queue(group);
+ if (!busiest || busiest == this_rq) {
+ schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
+ goto out;
+ }
+
+ /* Attempt to move tasks */
+ double_lock_balance(this_rq, busiest);
+
+ schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
+ nr_moved = move_tasks(this_rq, this_cpu, busiest,
+ imbalance, sd, NEWLY_IDLE);
+ if (!nr_moved)
+ schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
+
+ spin_unlock(&busiest->lock);
+
+out:
+ return nr_moved;
+}
+
+/*
+ * idle_balance is called by schedule() if this_cpu is about to become
+ * idle. Attempts to pull tasks from other CPUs.
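+ * Only domains with SD_BALANCE_NEWIDLE set are considered, and the
+ * walk stops at the first level that yields any tasks.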
+ */
+static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
+{
+ struct sched_domain *sd;
+
+ for_each_domain(this_cpu, sd) {
+ if (sd->flags & SD_BALANCE_NEWIDLE) {
+ if (load_balance_newidle(this_cpu, this_rq, sd)) {
+ /* We've pulled tasks over so stop searching */
+ break;
+ }
+ }
+ }
+}
+
+/*
+ * active_load_balance is run by migration threads. It pushes running tasks
+ * off the busiest CPU onto idle CPUs. It is used to keep at least one task
+ * running on each physical CPU where possible, and to avoid physical /
+ * logical imbalances.
+ *
+ * Called with busiest_rq locked.
+ */
+static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
+{
+ struct sched_domain *sd;
+ struct sched_group *cpu_group;
+ runqueue_t *target_rq;
+ cpumask_t visited_cpus;
+ int cpu;
+
+ schedstat_inc(busiest_rq, alb_cnt);
+ /*
+ * Search for suitable CPUs to push tasks to in successively higher
+ * domains with SD_LOAD_BALANCE set.
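+ * visited_cpus ensures each candidate CPU is examined at most once
+ * across the successive domain levels.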
+ */
+ visited_cpus = CPU_MASK_NONE;
+ for_each_domain(busiest_cpu, sd) {
+ if (!(sd->flags & SD_LOAD_BALANCE))
+ /* no more domains to search */
+ break;
+
+ cpu_group = sd->groups;
+ do {
+ for_each_cpu_mask(cpu, cpu_group->cpumask) {
+ if (busiest_rq->nr_running <= 1)
+ /* no more tasks left to move */
+ return;
+ if (cpu_isset(cpu, visited_cpus))
+ continue;
+ cpu_set(cpu, visited_cpus);
+ if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu)
+ continue;
+
+ target_rq = cpu_rq(cpu);
+ /*
+ * This condition is "impossible", if it occurs
+ * we need to fix it. Originally reported by
+ * Bjorn Helgaas on a 128-cpu setup.
+ */
+ BUG_ON(busiest_rq == target_rq);
+
+ /* move a task from busiest_rq to target_rq */
+ double_lock_balance(busiest_rq, target_rq);
+ if (move_tasks(target_rq, cpu, busiest_rq,
+ 1, sd, SCHED_IDLE)) {
+ schedstat_inc(busiest_rq, alb_lost);
+ schedstat_inc(target_rq, alb_gained);
+ } else {
+ schedstat_inc(busiest_rq, alb_failed);
+ }
+ spin_unlock(&target_rq->lock);
+ }
+ cpu_group = cpu_group->next;
+ } while (cpu_group != sd->groups);
+ }
+}
+
+/*
+ * rebalance_tick will get called every timer tick, on every CPU.
+ *
+ * It checks each scheduling domain to see if it is due to be balanced,
+ * and initiates a balancing operation if so.
+ *
+ * Balancing parameters are set up in arch_init_sched_domains.
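+ *
+ * A busy CPU checks each domain less often than an idle one: the
+ * per-domain interval is multiplied by busy_factor unless the CPU is idle.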
+ */
+
+/* Don't have all balancing operations going off at once */
+#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
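+/* e.g. with HZ == 1000 and NR_CPUS == 128 (illustrative values), successive
+ * CPUs fire roughly 7 jiffies apart rather than all on the same tick */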
+
+static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
+ enum idle_type idle)
+{
+ unsigned long old_load, this_load;
+ unsigned long j = jiffies + CPU_OFFSET(this_cpu);
+ struct sched_domain *sd;
+
+ /* Update our load */
+ old_load = this_rq->cpu_load;
+ this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
+ /*
+ * Round up the averaging division if load is increasing. This
+ * prevents us from getting stuck on 9 if the load is 10, for
+ * example.
+ */
+ if (this_load > old_load)
+ old_load++;
+ this_rq->cpu_load = (old_load + this_load) / 2;
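+ /* Illustration: without the bump above, integer division would leave
+ * cpu_load stuck one unit below a steady this_load (e.g. at 1279 when
+ * this_load is 1280); with it, cpu_load can reach this_load exactly. */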
+
+ for_each_domain(this_cpu, sd) {
+ unsigned long interval;
+
+ if (!(sd->flags & SD_LOAD_BALANCE))
+ continue;
+
+ interval = sd->balance_interval;
+ if (idle != SCHED_IDLE)
+ interval *= sd->busy_factor;
+
+ /* scale ms to jiffies */
+ interval = msecs_to_jiffies(interval);
+ if (unlikely(!interval))
+ interval = 1;
+
+ if (j - sd->last_balance >= interval) {
+ if (load_balance(this_cpu, this_rq, sd, idle)) {
+ /* We've pulled tasks over so no longer idle */
+ idle = NOT_IDLE;
+ }
+ sd->last_balance += interval;
+ }