X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=kernel%2Fsched.c;h=f1780c9057b9552226d54e7ce17c8f3eaf12274f;hb=c7b5ebbddf7bcd3651947760f423e3783bbe6573;hp=1493acff560f9cc49d0a900fae8536cf2600b6e7;hpb=5273a3df6485dc2ad6aa7ddd441b9a21970f003b;p=linux-2.6.git diff --git a/kernel/sched.c b/kernel/sched.c index 1493acff5..f1780c905 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -15,6 +15,7 @@ * and per-CPU runqueues. Cleanups and useful suggestions * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. + * 2004-04-02 Scheduler domains code by Nick Piggin */ #include @@ -30,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -39,6 +41,15 @@ #include #include #include +#include +#include +#include +#include +#include +#include +#include + +#include #ifdef CONFIG_NUMA #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) @@ -63,8 +74,6 @@ #define USER_PRIO(p) ((p)-MAX_RT_PRIO) #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) -#define AVG_TIMESLICE (MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) *\ - (MAX_PRIO-1-NICE_TO_PRIO(0))/(MAX_USER_PRIO - 1))) /* * Some helpers for converting nanosecond timing to jiffy resolution @@ -75,12 +84,12 @@ /* * These are the 'tuning knobs' of the scheduler: * - * Minimum timeslice is 10 msecs, default timeslice is 100 msecs, - * maximum timeslice is 200 msecs. Timeslices get refilled after - * they expire. + * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), + * default timeslice is 100 msecs, maximum timeslice is 800 msecs. + * Timeslices get refilled after they expire. */ -#define MIN_TIMESLICE ( 10 * HZ / 1000) -#define MAX_TIMESLICE (200 * HZ / 1000) +#define MIN_TIMESLICE max(5 * HZ / 1000, 1) +#define DEF_TIMESLICE (100 * HZ / 1000) #define ON_RUNQUEUE_WEIGHT 30 #define CHILD_PENALTY 95 #define PARENT_PENALTY 100 @@ -88,10 +97,9 @@ #define PRIO_BONUS_RATIO 25 #define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) #define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (AVG_TIMESLICE * MAX_BONUS) +#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) #define STARVATION_LIMIT (MAX_SLEEP_AVG) #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) -#define NODE_THRESHOLD 125 #define CREDIT_LIMIT 100 /* @@ -139,8 +147,7 @@ (v1) * (v2_max) / (v1_max) #define DELTA(p) \ - (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \ - INTERACTIVE_DELTA) + (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) #define TASK_INTERACTIVE(p) \ ((p)->prio <= (p)->static_prio - DELTA(p)) @@ -159,24 +166,36 @@ ((p)->prio < (rq)->curr->prio) /* - * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ] - * to time slice values. + * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] + * to time slice values: [800ms ... 100ms ... 5ms] * * The higher a thread's priority, the bigger timeslices * it gets during one round of execution. But even the lowest * priority thread gets MIN_TIMESLICE worth of execution time. - * - * task_timeslice() is the interface that is used by the scheduler. */ -#define BASE_TIMESLICE(p) (MIN_TIMESLICE + \ - ((MAX_TIMESLICE - MIN_TIMESLICE) * \ - (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1))) +#define SCALE_PRIO(x, prio) \ + max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) -static inline unsigned int task_timeslice(task_t *p) +static unsigned int task_timeslice(task_t *p) { - return BASE_TIMESLICE(p); + if (p->static_prio < NICE_TO_PRIO(0)) + return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); + else + return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); } +#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ + < (long long) (sd)->cache_hot_time) + +enum idle_type +{ + IDLE, + NOT_IDLE, + NEWLY_IDLE, + MAX_IDLE_TYPES +}; + +struct sched_domain; /* * These are the runqueue data structures: @@ -187,7 +206,7 @@ static inline unsigned int task_timeslice(task_t *p) typedef struct runqueue runqueue_t; struct prio_array { - int nr_active; + unsigned int nr_active; unsigned long bitmap[BITMAP_SIZE]; struct list_head queue[MAX_PRIO]; }; @@ -201,97 +220,241 @@ struct prio_array { */ struct runqueue { spinlock_t lock; + + /* + * nr_running and cpu_load should be in the same cacheline because + * remote CPUs use both these fields when doing load calculation. + */ + unsigned long nr_running; +#ifdef CONFIG_SMP + unsigned long cpu_load; +#endif unsigned long long nr_switches; - unsigned long nr_running, expired_timestamp, nr_uninterruptible, - timestamp_last_tick; + unsigned long expired_timestamp, nr_uninterruptible; + unsigned long long timestamp_last_tick; task_t *curr, *idle; struct mm_struct *prev_mm; prio_array_t *active, *expired, arrays[2]; - int best_expired_prio, prev_cpu_load[NR_CPUS]; -#ifdef CONFIG_NUMA - atomic_t *node_nr_running; - int prev_node_load[MAX_NUMNODES]; -#endif + int best_expired_prio; + atomic_t nr_iowait; + +#ifdef CONFIG_SMP + struct sched_domain *sd; + + /* For active balancing */ + int active_balance; + int push_cpu; + task_t *migration_thread; struct list_head migration_queue; +#endif +#ifdef CONFIG_VSERVER_HARDCPU + struct list_head hold_queue; + int idle_tokens; +#endif - atomic_t nr_iowait; +#ifdef CONFIG_SCHEDSTATS + /* latency stats */ + struct sched_info rq_sched_info; + + /* sys_sched_yield() stats */ + unsigned long yld_exp_empty; + unsigned long yld_act_empty; + unsigned long yld_both_empty; + unsigned long yld_cnt; + + /* schedule() stats */ + unsigned long sched_noswitch; + unsigned long sched_switch; + unsigned long sched_cnt; + unsigned long sched_goidle; + + /* pull_task() stats */ + unsigned long pt_gained[MAX_IDLE_TYPES]; + unsigned long pt_lost[MAX_IDLE_TYPES]; + + /* active_load_balance() stats */ + unsigned long alb_cnt; + unsigned long alb_lost; + unsigned long alb_gained; + unsigned long alb_failed; + + /* try_to_wake_up() stats */ + unsigned long ttwu_cnt; + unsigned long ttwu_attempts; + unsigned long ttwu_moved; + + /* wake_up_new_task() stats */ + unsigned long wunt_cnt; + unsigned long wunt_moved; + + /* sched_migrate_task() stats */ + unsigned long smt_cnt; + + /* sched_balance_exec() stats */ + unsigned long sbe_cnt; +#endif }; static DEFINE_PER_CPU(struct runqueue, runqueues); -#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -#define this_rq() (&__get_cpu_var(runqueues)) -#define task_rq(p) cpu_rq(task_cpu(p)) -#define cpu_curr(cpu) (cpu_rq(cpu)->curr) - -extern unsigned long __scheduling_functions_start_here; -extern unsigned long __scheduling_functions_end_here; -const unsigned long scheduling_functions_start_here = - (unsigned long)&__scheduling_functions_start_here; -const unsigned long scheduling_functions_end_here = - (unsigned long)&__scheduling_functions_end_here; - /* - * Default context-switch locking: + * sched-domains (multiprocessor balancing) declarations: */ -#ifndef prepare_arch_switch -# define prepare_arch_switch(rq, next) do { } while (0) -# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) -# define task_running(rq, p) ((rq)->curr == (p)) -#endif +#ifdef CONFIG_SMP +#define SCHED_LOAD_SCALE 128UL /* increase resolution of load */ -#ifdef CONFIG_NUMA +#define SD_BALANCE_NEWIDLE 1 /* Balance when about to become idle */ +#define SD_BALANCE_EXEC 2 /* Balance on exec */ +#define SD_WAKE_IDLE 4 /* Wake to idle CPU on task wakeup */ +#define SD_WAKE_AFFINE 8 /* Wake task to waking CPU */ +#define SD_WAKE_BALANCE 16 /* Perform balancing at task wakeup */ +#define SD_SHARE_CPUPOWER 32 /* Domain members share cpu power */ -/* - * Keep track of running tasks. - */ +struct sched_group { + struct sched_group *next; /* Must be a circular list */ + cpumask_t cpumask; -static atomic_t node_nr_running[MAX_NUMNODES] ____cacheline_maxaligned_in_smp = - {[0 ...MAX_NUMNODES-1] = ATOMIC_INIT(0)}; + /* + * CPU power of this group, SCHED_LOAD_SCALE being max power for a + * single CPU. This should be read only (except for setup). Although + * it will need to be written to at cpu hot(un)plug time, perhaps the + * cpucontrol semaphore will provide enough exclusion? + */ + unsigned long cpu_power; +}; -static inline void nr_running_init(struct runqueue *rq) -{ - rq->node_nr_running = &node_nr_running[0]; -} +struct sched_domain { + /* These fields must be setup */ + struct sched_domain *parent; /* top domain must be null terminated */ + struct sched_group *groups; /* the balancing groups of the domain */ + cpumask_t span; /* span of all CPUs in this domain */ + unsigned long min_interval; /* Minimum balance interval ms */ + unsigned long max_interval; /* Maximum balance interval ms */ + unsigned int busy_factor; /* less balancing by factor if busy */ + unsigned int imbalance_pct; /* No balance until over watermark */ + unsigned long long cache_hot_time; /* Task considered cache hot (ns) */ + unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ + unsigned int per_cpu_gain; /* CPU % gained by adding domain cpus */ + int flags; /* See SD_* */ + + /* Runtime fields. */ + unsigned long last_balance; /* init to jiffies. units in jiffies */ + unsigned int balance_interval; /* initialise to 1. units in ms. */ + unsigned int nr_balance_failed; /* initialise to 0 */ + +#ifdef CONFIG_SCHEDSTATS + /* load_balance() stats */ + unsigned long lb_cnt[MAX_IDLE_TYPES]; + unsigned long lb_failed[MAX_IDLE_TYPES]; + unsigned long lb_imbalance[MAX_IDLE_TYPES]; + unsigned long lb_nobusyg[MAX_IDLE_TYPES]; + unsigned long lb_nobusyq[MAX_IDLE_TYPES]; + + /* sched_balance_exec() stats */ + unsigned long sbe_attempts; + unsigned long sbe_pushed; + + /* try_to_wake_up() stats */ + unsigned long ttwu_wake_affine; + unsigned long ttwu_wake_balance; +#endif +}; -static inline void nr_running_inc(runqueue_t *rq) -{ - atomic_inc(rq->node_nr_running); - rq->nr_running++; +#ifndef ARCH_HAS_SCHED_TUNE +#ifdef CONFIG_SCHED_SMT +#define ARCH_HAS_SCHED_WAKE_IDLE +/* Common values for SMT siblings */ +#define SD_SIBLING_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 1, \ + .max_interval = 2, \ + .busy_factor = 8, \ + .imbalance_pct = 110, \ + .cache_hot_time = 0, \ + .cache_nice_tries = 0, \ + .per_cpu_gain = 25, \ + .flags = SD_BALANCE_NEWIDLE \ + | SD_BALANCE_EXEC \ + | SD_WAKE_AFFINE \ + | SD_WAKE_IDLE \ + | SD_SHARE_CPUPOWER, \ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ } +#endif -static inline void nr_running_dec(runqueue_t *rq) -{ - atomic_dec(rq->node_nr_running); - rq->nr_running--; +/* Common values for CPUs */ +#define SD_CPU_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 1, \ + .max_interval = 4, \ + .busy_factor = 64, \ + .imbalance_pct = 125, \ + .cache_hot_time = cache_decay_ticks*1000000 ? : (5*1000000/2),\ + .cache_nice_tries = 1, \ + .per_cpu_gain = 100, \ + .flags = SD_BALANCE_NEWIDLE \ + | SD_BALANCE_EXEC \ + | SD_WAKE_AFFINE \ + | SD_WAKE_BALANCE, \ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ } -__init void node_nr_running_init(void) -{ - int i; - - for (i = 0; i < NR_CPUS; i++) { - if (cpu_possible(i)) - cpu_rq(i)->node_nr_running = - &node_nr_running[cpu_to_node(i)]; - } +/* Arch can override this macro in processor.h */ +#if defined(CONFIG_NUMA) && !defined(SD_NODE_INIT) +#define SD_NODE_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 8, \ + .max_interval = 32, \ + .busy_factor = 32, \ + .imbalance_pct = 125, \ + .cache_hot_time = (10*1000000), \ + .cache_nice_tries = 1, \ + .per_cpu_gain = 100, \ + .flags = SD_BALANCE_EXEC \ + | SD_WAKE_BALANCE, \ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ } +#endif +#endif /* ARCH_HAS_SCHED_TUNE */ +#endif + -#else /* !CONFIG_NUMA */ +#define for_each_domain(cpu, domain) \ + for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) -# define nr_running_init(rq) do { } while (0) -# define nr_running_inc(rq) do { (rq)->nr_running++; } while (0) -# define nr_running_dec(rq) do { (rq)->nr_running--; } while (0) +#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) +#define this_rq() (&__get_cpu_var(runqueues)) +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) -#endif /* CONFIG_NUMA */ +/* + * Default context-switch locking: + */ +#ifndef prepare_arch_switch +# define prepare_arch_switch(rq, next) do { } while (0) +# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) +# define task_running(rq, p) ((rq)->curr == (p)) +#endif /* * task_rq_lock - lock the runqueue a given task resides on and disable * interrupts. Note the ordering: we can safely lookup the task_rq without * explicitly disabling preemption. */ -static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) +static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) { struct runqueue *rq; @@ -311,10 +474,108 @@ static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) spin_unlock_irqrestore(&rq->lock, *flags); } +#ifdef CONFIG_SCHEDSTATS +/* + * bump this up when changing the output format or the meaning of an existing + * format, so that tools can adapt (or abort) + */ +#define SCHEDSTAT_VERSION 10 + +static int show_schedstat(struct seq_file *seq, void *v) +{ + int cpu; + enum idle_type itype; + + seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); + seq_printf(seq, "timestamp %lu\n", jiffies); + for_each_online_cpu(cpu) { + runqueue_t *rq = cpu_rq(cpu); +#ifdef CONFIG_SMP + struct sched_domain *sd; + int dcnt = 0; +#endif + + /* runqueue-specific stats */ + seq_printf(seq, + "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu " + "%lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", + cpu, rq->yld_both_empty, + rq->yld_act_empty, rq->yld_exp_empty, + rq->yld_cnt, rq->sched_noswitch, + rq->sched_switch, rq->sched_cnt, rq->sched_goidle, + rq->alb_cnt, rq->alb_gained, rq->alb_lost, + rq->alb_failed, + rq->ttwu_cnt, rq->ttwu_moved, rq->ttwu_attempts, + rq->wunt_cnt, rq->wunt_moved, + rq->smt_cnt, rq->sbe_cnt, rq->rq_sched_info.cpu_time, + rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); + + for (itype = IDLE; itype < MAX_IDLE_TYPES; itype++) + seq_printf(seq, " %lu %lu", rq->pt_gained[itype], + rq->pt_lost[itype]); + seq_printf(seq, "\n"); + +#ifdef CONFIG_SMP + /* domain-specific stats */ + for_each_domain(cpu, sd) { + char mask_str[NR_CPUS]; + + cpumask_scnprintf(mask_str, NR_CPUS, sd->span); + seq_printf(seq, "domain%d %s", dcnt++, mask_str); + for (itype = IDLE; itype < MAX_IDLE_TYPES; itype++) { + seq_printf(seq, " %lu %lu %lu %lu %lu", + sd->lb_cnt[itype], + sd->lb_failed[itype], + sd->lb_imbalance[itype], + sd->lb_nobusyq[itype], + sd->lb_nobusyg[itype]); + } + seq_printf(seq, " %lu %lu %lu %lu\n", + sd->sbe_pushed, sd->sbe_attempts, + sd->ttwu_wake_affine, sd->ttwu_wake_balance); + } +#endif + } + return 0; +} + +static int schedstat_open(struct inode *inode, struct file *file) +{ + unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); + char *buf = kmalloc(size, GFP_KERNEL); + struct seq_file *m; + int res; + + if (!buf) + return -ENOMEM; + res = single_open(file, show_schedstat, NULL); + if (!res) { + m = file->private_data; + m->buf = buf; + m->size = size; + } else + kfree(buf); + return res; +} + +struct file_operations proc_schedstat_operations = { + .open = schedstat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +# define schedstat_inc(rq, field) rq->field++; +# define schedstat_add(rq, field, amt) rq->field += amt; +#else /* !CONFIG_SCHEDSTATS */ +# define schedstat_inc(rq, field) do { } while (0); +# define schedstat_add(rq, field, amt) do { } while (0); +#endif + /* * rq_lock - lock a given runqueue and disable interrupts. */ -static inline runqueue_t *this_rq_lock(void) +static runqueue_t *this_rq_lock(void) { runqueue_t *rq; @@ -330,10 +591,116 @@ static inline void rq_unlock(runqueue_t *rq) spin_unlock_irq(&rq->lock); } +#ifdef CONFIG_SCHEDSTATS +/* + * Called when a process is dequeued from the active array and given + * the cpu. We should note that with the exception of interactive + * tasks, the expired queue will become the active queue after the active + * queue is empty, without explicitly dequeuing and requeuing tasks in the + * expired queue. (Interactive tasks may be requeued directly to the + * active queue, thus delaying tasks in the expired queue from running; + * see scheduler_tick()). + * + * This function is only called from sched_info_arrive(), rather than + * dequeue_task(). Even though a task may be queued and dequeued multiple + * times as it is shuffled about, we're really interested in knowing how + * long it was from the *first* time it was queued to the time that it + * finally hit a cpu. + */ +static inline void sched_info_dequeued(task_t *t) +{ + t->sched_info.last_queued = 0; +} + +/* + * Called when a task finally hits the cpu. We can now calculate how + * long it was waiting to run. We also note when it began so that we + * can keep stats on how long its timeslice is. + */ +static inline void sched_info_arrive(task_t *t) +{ + unsigned long now = jiffies, diff = 0; + struct runqueue *rq = task_rq(t); + + if (t->sched_info.last_queued) + diff = now - t->sched_info.last_queued; + sched_info_dequeued(t); + t->sched_info.run_delay += diff; + t->sched_info.last_arrival = now; + t->sched_info.pcnt++; + + if (!rq) + return; + + rq->rq_sched_info.run_delay += diff; + rq->rq_sched_info.pcnt++; +} + +/* + * Called when a process is queued into either the active or expired + * array. The time is noted and later used to determine how long we + * had to wait for us to reach the cpu. Since the expired queue will + * become the active queue after active queue is empty, without dequeuing + * and requeuing any tasks, we are interested in queuing to either. It + * is unusual but not impossible for tasks to be dequeued and immediately + * requeued in the same or another array: this can happen in sched_yield(), + * set_user_nice(), and even load_balance() as it moves tasks from runqueue + * to runqueue. + * + * This function is only called from enqueue_task(), but also only updates + * the timestamp if it is already not set. It's assumed that + * sched_info_dequeued() will clear that stamp when appropriate. + */ +static inline void sched_info_queued(task_t *t) +{ + if (!t->sched_info.last_queued) + t->sched_info.last_queued = jiffies; +} + +/* + * Called when a process ceases being the active-running process, either + * voluntarily or involuntarily. Now we can calculate how long we ran. + */ +static inline void sched_info_depart(task_t *t) +{ + struct runqueue *rq = task_rq(t); + unsigned long diff = jiffies - t->sched_info.last_arrival; + + t->sched_info.cpu_time += diff; + + if (rq) + rq->rq_sched_info.cpu_time += diff; +} + +/* + * Called when tasks are switched involuntarily due, typically, to expiring + * their time slice. (This may also be called when switching to or from + * the idle task.) We are only called when prev != next. + */ +static inline void sched_info_switch(task_t *prev, task_t *next) +{ + struct runqueue *rq = task_rq(prev); + + /* + * prev now departs the cpu. It's not interesting to record + * stats about how efficient we were at scheduling the idle + * process, however. + */ + if (prev != rq->idle) + sched_info_depart(prev); + + if (next != rq->idle) + sched_info_arrive(next); +} +#else +#define sched_info_queued(t) do { } while (0) +#define sched_info_switch(t, next) do { } while (0) +#endif /* CONFIG_SCHEDSTATS */ + /* * Adding/removing a task to/from a priority array: */ -static inline void dequeue_task(struct task_struct *p, prio_array_t *array) +static void dequeue_task(struct task_struct *p, prio_array_t *array) { array->nr_active--; list_del(&p->run_list); @@ -341,14 +708,28 @@ static inline void dequeue_task(struct task_struct *p, prio_array_t *array) __clear_bit(p->prio, array->bitmap); } -static inline void enqueue_task(struct task_struct *p, prio_array_t *array) +static void enqueue_task(struct task_struct *p, prio_array_t *array) { + sched_info_queued(p); list_add_tail(&p->run_list, array->queue + p->prio); __set_bit(p->prio, array->bitmap); array->nr_active++; p->array = array; } +/* + * Used by the migration code - we pull tasks from the head of the + * remote queue so we want these tasks to show up at the head of the + * local queue: + */ +static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) +{ + list_add(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->array = array; +} + /* * effective_prio - return the priority that is based on the static * priority but is modified by bonuses/penalties. @@ -373,6 +754,9 @@ static int effective_prio(task_t *p) bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; prio = p->static_prio - bonus; + if (task_vx_flags(p, VXF_SCHED_PRIO, 0)) + prio += effective_vavavoom(p, MAX_USER_PRIO); + if (prio < MAX_RT_PRIO) prio = MAX_RT_PRIO; if (prio > MAX_PRIO-1) @@ -386,7 +770,16 @@ static int effective_prio(task_t *p) static inline void __activate_task(task_t *p, runqueue_t *rq) { enqueue_task(p, rq->active); - nr_running_inc(rq); + rq->nr_running++; +} + +/* + * __activate_idle_task - move idle task to the _front_ of runqueue. + */ +static inline void __activate_idle_task(task_t *p, runqueue_t *rq) +{ + enqueue_task_head(p, rq->active); + rq->nr_running++; } static void recalc_task_prio(task_t *p, unsigned long long now) @@ -409,7 +802,7 @@ static void recalc_task_prio(task_t *p, unsigned long long now) if (p->mm && p->activated != -1 && sleep_time > INTERACTIVE_SLEEP(p)) { p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - - AVG_TIMESLICE); + DEF_TIMESLICE); if (!HIGH_CREDIT(p)) p->interactive_credit++; } else { @@ -469,9 +862,19 @@ static void recalc_task_prio(task_t *p, unsigned long long now) * Update all the scheduling statistics stuff. (sleep average * calculation, priority modifiers, etc.) */ -static inline void activate_task(task_t *p, runqueue_t *rq) +static void activate_task(task_t *p, runqueue_t *rq, int local) { - unsigned long long now = sched_clock(); + unsigned long long now; + + now = sched_clock(); +#ifdef CONFIG_SMP + if (!local) { + /* Compensate for drifting sched_clock */ + runqueue_t *this_rq = this_rq(); + now = (now - this_rq->timestamp_last_tick) + + rq->timestamp_last_tick; + } +#endif recalc_task_prio(p, now); @@ -499,21 +902,29 @@ static inline void activate_task(task_t *p, runqueue_t *rq) } p->timestamp = now; + vx_activate_task(p); __activate_task(p, rq); } /* * deactivate_task - remove a task from the runqueue. */ -static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) +static void __deactivate_task(struct task_struct *p, runqueue_t *rq) { - nr_running_dec(rq); + rq->nr_running--; if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible++; dequeue_task(p, p->array); + p->array = NULL; } +static void deactivate_task(struct task_struct *p, runqueue_t *rq) +{ + __deactivate_task(p, rq); + vx_deactivate_task(p); +} + /* * resched_task - mark a task 'to be rescheduled now'. * @@ -521,12 +932,13 @@ static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) * might also involve a cross-CPU call to trigger the scheduler on * the target CPU. */ -static inline void resched_task(task_t *p) -{ #ifdef CONFIG_SMP +static void resched_task(task_t *p) +{ int need_resched, nrpolling; - preempt_disable(); + BUG_ON(!spin_is_locked(&task_rq(p)->lock)); + /* minimise the chance of sending an interrupt to poll_idle() */ nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED); @@ -534,56 +946,64 @@ static inline void resched_task(task_t *p) if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id())) smp_send_reschedule(task_cpu(p)); - preempt_enable(); +} #else +static inline void resched_task(task_t *p) +{ set_tsk_need_resched(p); -#endif } +#endif /** * task_curr - is this task currently executing on a CPU? * @p: the task in question. */ -inline int task_curr(task_t *p) +inline int task_curr(const task_t *p) { return cpu_curr(task_cpu(p)) == p; } #ifdef CONFIG_SMP +enum request_type { + REQ_MOVE_TASK, + REQ_SET_DOMAIN, +}; + typedef struct { struct list_head list; + enum request_type type; + + /* For REQ_MOVE_TASK */ task_t *task; + int dest_cpu; + + /* For REQ_SET_DOMAIN */ + struct sched_domain *sd; + struct completion done; } migration_req_t; /* - * The task's runqueue lock must be held, and the new mask must be valid. + * The task's runqueue lock must be held. * Returns true if you have to wait for migration thread. */ -static int __set_cpus_allowed(task_t *p, cpumask_t new_mask, - migration_req_t *req) +static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) { runqueue_t *rq = task_rq(p); - p->cpus_allowed = new_mask; - /* - * Can the task run on the task's current CPU? If not then - * migrate the thread off to a proper CPU. - */ - if (cpu_isset(task_cpu(p), new_mask)) - return 0; - /* * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. */ if (!p->array && !task_running(rq, p)) { - set_task_cpu(p, any_online_cpu(p->cpus_allowed)); + set_task_cpu(p, dest_cpu); return 0; } init_completion(&req->done); + req->type = REQ_MOVE_TASK; req->task = p; + req->dest_cpu = dest_cpu; list_add(&req->list, &rq->migration_queue); return 1; } @@ -638,6 +1058,70 @@ void kick_process(task_t *p) EXPORT_SYMBOL_GPL(kick_process); +/* + * Return a low guess at the load of a migration-source cpu. + * + * We want to under-estimate the load of migration sources, to + * balance conservatively. + */ +static inline unsigned long source_load(int cpu) +{ + runqueue_t *rq = cpu_rq(cpu); + unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + + return min(rq->cpu_load, load_now); +} + +/* + * Return a high guess at the load of a migration-target cpu + */ +static inline unsigned long target_load(int cpu) +{ + runqueue_t *rq = cpu_rq(cpu); + unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + + return max(rq->cpu_load, load_now); +} + +#endif + +/* + * wake_idle() is useful especially on SMT architectures to wake a + * task onto an idle sibling if we would otherwise wake it onto a + * busy sibling. + * + * Returns the CPU we should wake onto. + */ +#if defined(ARCH_HAS_SCHED_WAKE_IDLE) +static int wake_idle(int cpu, task_t *p) +{ + cpumask_t tmp; + runqueue_t *rq = cpu_rq(cpu); + struct sched_domain *sd; + int i; + + if (idle_cpu(cpu)) + return cpu; + + sd = rq->sd; + if (!(sd->flags & SD_WAKE_IDLE)) + return cpu; + + cpus_and(tmp, sd->span, cpu_online_map); + cpus_and(tmp, tmp, p->cpus_allowed); + + for_each_cpu_mask(i, tmp) { + if (idle_cpu(i)) + return i; + } + + return cpu; +} +#else +static inline int wake_idle(int cpu, task_t *p) +{ + return cpu; +} #endif /*** @@ -656,65 +1140,160 @@ EXPORT_SYMBOL_GPL(kick_process); */ static int try_to_wake_up(task_t * p, unsigned int state, int sync) { + int cpu, this_cpu, success = 0; unsigned long flags; - int success = 0; long old_state; runqueue_t *rq; +#ifdef CONFIG_SMP + unsigned long load, this_load; + struct sched_domain *sd; + int new_cpu; +#endif -repeat_lock_task: rq = task_rq_lock(p, &flags); + schedstat_inc(rq, ttwu_cnt); old_state = p->state; - if (old_state & state) { - if (!p->array) { + if (!(old_state & state)) + goto out; + + if (p->array) + goto out_running; + + cpu = task_cpu(p); + this_cpu = smp_processor_id(); + +#ifdef CONFIG_SMP + if (unlikely(task_running(rq, p))) + goto out_activate; + + new_cpu = cpu; + + if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) + goto out_set_cpu; + + load = source_load(cpu); + this_load = target_load(this_cpu); + + /* + * If sync wakeup then subtract the (maximum possible) effect of + * the currently running task from the load of the current CPU: + */ + if (sync) + this_load -= SCHED_LOAD_SCALE; + + /* Don't pull the task off an idle CPU to a busy one */ + if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2) + goto out_set_cpu; + + new_cpu = this_cpu; /* Wake to this CPU if we can */ + + /* + * Scan domains for affine wakeup and passive balancing + * possibilities. + */ + for_each_domain(this_cpu, sd) { + unsigned int imbalance; + /* + * Start passive balancing when half the imbalance_pct + * limit is reached. + */ + imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2; + + if ((sd->flags & SD_WAKE_AFFINE) && + !task_hot(p, rq->timestamp_last_tick, sd)) { /* - * Fast-migrate the task if it's not running or runnable - * currently. Do not violate hard affinity. + * This domain has SD_WAKE_AFFINE and p is cache cold + * in this domain. */ - if (unlikely(sync && !task_running(rq, p) && - (task_cpu(p) != smp_processor_id()) && - cpu_isset(smp_processor_id(), - p->cpus_allowed) && - !cpu_is_offline(smp_processor_id()))) { - set_task_cpu(p, smp_processor_id()); - task_rq_unlock(rq, &flags); - goto repeat_lock_task; - } - if (old_state == TASK_UNINTERRUPTIBLE) { - rq->nr_uninterruptible--; - /* - * Tasks on involuntary sleep don't earn - * sleep_avg beyond just interactive state. - */ - p->activated = -1; + if (cpu_isset(cpu, sd->span)) { + schedstat_inc(sd, ttwu_wake_affine); + goto out_set_cpu; } - if (sync && (task_cpu(p) == smp_processor_id())) - __activate_task(p, rq); - else { - activate_task(p, rq); - if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); + } else if ((sd->flags & SD_WAKE_BALANCE) && + imbalance*this_load <= 100*load) { + /* + * This domain has SD_WAKE_BALANCE and there is + * an imbalance. + */ + if (cpu_isset(cpu, sd->span)) { + schedstat_inc(sd, ttwu_wake_balance); + goto out_set_cpu; } - success = 1; } - p->state = TASK_RUNNING; } - task_rq_unlock(rq, &flags); - - return success; -} -int fastcall wake_up_process(task_t * p) -{ - return try_to_wake_up(p, TASK_STOPPED | - TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); -} -EXPORT_SYMBOL(wake_up_process); + new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ +out_set_cpu: + schedstat_inc(rq, ttwu_attempts); + new_cpu = wake_idle(new_cpu, p); + if (new_cpu != cpu && cpu_isset(new_cpu, p->cpus_allowed)) { + schedstat_inc(rq, ttwu_moved); + set_task_cpu(p, new_cpu); + task_rq_unlock(rq, &flags); + /* might preempt at this point */ + rq = task_rq_lock(p, &flags); + old_state = p->state; + if (!(old_state & state)) + goto out; + if (p->array) + goto out_running; + + this_cpu = smp_processor_id(); + cpu = task_cpu(p); + } + +out_activate: +#endif /* CONFIG_SMP */ + if (old_state == TASK_UNINTERRUPTIBLE) { + rq->nr_uninterruptible--; + /* + * Tasks on involuntary sleep don't earn + * sleep_avg beyond just interactive state. + */ + p->activated = -1; + } + + /* + * Sync wakeups (i.e. those types of wakeups where the waker + * has indicated that it will leave the CPU in short order) + * don't trigger a preemption, if the woken up task will run on + * this cpu. (in this case the 'I will reschedule' promise of + * the waker guarantees that the freshly woken up task is going + * to be considered on this CPU.) + */ + activate_task(p, rq, cpu == this_cpu); + if (!sync || cpu != this_cpu) { + if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } + success = 1; + +out_running: + p->state = TASK_RUNNING; +out: + task_rq_unlock(rq, &flags); + + return success; +} + +int fastcall wake_up_process(task_t * p) +{ + return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | + TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); +} + +EXPORT_SYMBOL(wake_up_process); int fastcall wake_up_state(task_t *p, unsigned int state) { return try_to_wake_up(p, state, 0); } +#ifdef CONFIG_SMP +static int find_idlest_cpu(struct task_struct *p, int this_cpu, + struct sched_domain *sd); +#endif + /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. @@ -731,6 +1310,9 @@ void fastcall sched_fork(task_t *p) INIT_LIST_HEAD(&p->run_list); p->array = NULL; spin_lock_init(&p->switch_lock); +#ifdef CONFIG_SCHEDSTATS + memset(&p->sched_info, 0, sizeof(p->sched_info)); +#endif #ifdef CONFIG_PREEMPT /* * During context-switch we hold precisely one spinlock, which @@ -754,10 +1336,10 @@ void fastcall sched_fork(task_t *p) p->first_time_slice = 1; current->time_slice >>= 1; p->timestamp = sched_clock(); - if (!current->time_slice) { + if (unlikely(!current->time_slice)) { /* - * This case is rare, it happens when the parent has only - * a single jiffy left from its timeslice. Taking the + * This case is rare, it happens when the parent has only + * a single jiffy left from its timeslice. Taking the * runqueue lock is not a problem. */ current->time_slice = 1; @@ -770,44 +1352,90 @@ void fastcall sched_fork(task_t *p) } /* - * wake_up_forked_process - wake up a freshly forked process. + * wake_up_new_task - wake up a newly created task for the first time. * * This function will do some initial scheduler statistics housekeeping - * that must be done for every newly created process. + * that must be done for every newly created context, then puts the task + * on the runqueue and wakes it. */ -void fastcall wake_up_forked_process(task_t * p) +void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) { unsigned long flags; - runqueue_t *rq = task_rq_lock(current, &flags); + int this_cpu, cpu; + runqueue_t *rq, *this_rq; + + rq = task_rq_lock(p, &flags); + cpu = task_cpu(p); + this_cpu = smp_processor_id(); BUG_ON(p->state != TASK_RUNNING); + schedstat_inc(rq, wunt_cnt); /* * We decrease the sleep average of forking parents * and children as well, to keep max-interactive tasks - * from forking tasks that are max-interactive. + * from forking tasks that are max-interactive. The parent + * (current) is done further down, under its lock. */ - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); p->interactive_credit = 0; p->prio = effective_prio(p); - set_task_cpu(p, smp_processor_id()); - if (unlikely(!current->array)) + vx_activate_task(p); + if (likely(cpu == this_cpu)) { + if (!(clone_flags & CLONE_VM)) { + /* + * The VM isn't cloned, so we're in a good position to + * do child-runs-first in anticipation of an exec. This + * usually avoids a lot of COW overhead. + */ + if (unlikely(!current->array)) + __activate_task(p, rq); + else { + p->prio = current->prio; + list_add_tail(&p->run_list, ¤t->run_list); + p->array = current->array; + p->array->nr_active++; + rq->nr_running++; + } + set_need_resched(); + } else + /* Run child last */ + __activate_task(p, rq); + /* + * We skip the following code due to cpu == this_cpu + * + * task_rq_unlock(rq, &flags); + * this_rq = task_rq_lock(current, &flags); + */ + this_rq = rq; + } else { + this_rq = cpu_rq(this_cpu); + + /* + * Not the local CPU - must adjust timestamp. This should + * get optimised away in the !CONFIG_SMP case. + */ + p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) + + rq->timestamp_last_tick; __activate_task(p, rq); - else { - p->prio = current->prio; - list_add_tail(&p->run_list, ¤t->run_list); - p->array = current->array; - p->array->nr_active++; - nr_running_inc(rq); + if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + + schedstat_inc(rq, wunt_moved); + /* + * Parent and child are on different CPUs, now get the + * parent runqueue to update the parent's ->sleep_avg: + */ + task_rq_unlock(rq, &flags); + this_rq = task_rq_lock(current, &flags); } - task_rq_unlock(rq, &flags); + current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * + PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + task_rq_unlock(this_rq, &flags); } /* @@ -824,18 +1452,16 @@ void fastcall sched_exit(task_t * p) unsigned long flags; runqueue_t *rq; - local_irq_save(flags); - if (p->first_time_slice) { - p->parent->time_slice += p->time_slice; - if (unlikely(p->parent->time_slice > MAX_TIMESLICE)) - p->parent->time_slice = MAX_TIMESLICE; - } - local_irq_restore(flags); /* * If the child was a (relative-) CPU hog then decrease * the sleep_avg of the parent as well. */ rq = task_rq_lock(p->parent, &flags); + if (p->first_time_slice) { + p->parent->time_slice += p->time_slice; + if (unlikely(p->parent->time_slice > task_timeslice(p))) + p->parent->time_slice = task_timeslice(p); + } if (p->sleep_avg < p->parent->sleep_avg) p->parent->sleep_avg = p->parent->sleep_avg / (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / @@ -856,7 +1482,7 @@ void fastcall sched_exit(task_t * p) * with the lock held can cause deadlocks; see schedule() for * details.) */ -static inline void finish_task_switch(task_t *prev) +static void finish_task_switch(task_t *prev) { runqueue_t *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; @@ -873,7 +1499,7 @@ static inline void finish_task_switch(task_t *prev) * still held, otherwise prev could be scheduled on another cpu, die * there before we look at prev->state, and then the reference would * be dropped twice. - * Manfred Spraul + * Manfred Spraul */ prev_task_flags = prev->flags; finish_arch_switch(rq, prev); @@ -935,7 +1561,7 @@ unsigned long nr_running(void) { unsigned long i, sum = 0; - for (i = 0; i < NR_CPUS; i++) + for_each_online_cpu(i) sum += cpu_rq(i)->nr_running; return sum; @@ -971,13 +1597,15 @@ unsigned long nr_iowait(void) return sum; } +#ifdef CONFIG_SMP + /* * double_rq_lock - safely lock two runqueues * * Note this does not disable interrupts like task_rq_lock, * you need to do so manually before calling. */ -static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) +static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) { if (rq1 == rq2) spin_lock(&rq1->lock); @@ -998,252 +1626,140 @@ static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) * Note this does not restore interrupts like task_rq_unlock, * you need to do so manually after calling. */ -static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) +static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) { spin_unlock(&rq1->lock); if (rq1 != rq2) spin_unlock(&rq2->lock); } -#ifdef CONFIG_NUMA /* - * If dest_cpu is allowed for this process, migrate the task to it. - * This is accomplished by forcing the cpu_allowed mask to only - * allow dest_cpu, which will force the cpu onto dest_cpu. Then - * the cpu_allowed mask is restored. + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. */ -static void sched_migrate_task(task_t *p, int dest_cpu) +static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) { - runqueue_t *rq; - migration_req_t req; - unsigned long flags; - cpumask_t old_mask, new_mask = cpumask_of_cpu(dest_cpu); - - lock_cpu_hotplug(); - rq = task_rq_lock(p, &flags); - old_mask = p->cpus_allowed; - if (!cpu_isset(dest_cpu, old_mask) || !cpu_online(dest_cpu)) - goto out; - - /* force the process onto the specified CPU */ - if (__set_cpus_allowed(p, new_mask, &req)) { - /* Need to wait for migration thread. */ - task_rq_unlock(rq, &flags); - wake_up_process(rq->migration_thread); - wait_for_completion(&req.done); - - /* If we raced with sys_sched_setaffinity, don't - * restore mask. */ - rq = task_rq_lock(p, &flags); - if (likely(cpus_equal(p->cpus_allowed, new_mask))) { - /* Restore old mask: won't need migration - * thread, since current cpu is allowed. */ - BUG_ON(__set_cpus_allowed(p, old_mask, NULL)); - } + if (unlikely(!spin_trylock(&busiest->lock))) { + if (busiest < this_rq) { + spin_unlock(&this_rq->lock); + spin_lock(&busiest->lock); + spin_lock(&this_rq->lock); + } else + spin_lock(&busiest->lock); } -out: - task_rq_unlock(rq, &flags); - unlock_cpu_hotplug(); } /* - * Find the least loaded CPU. Slightly favor the current CPU by - * setting its runqueue length as the minimum to start. + * find_idlest_cpu - find the least busy runqueue. */ -static int sched_best_cpu(struct task_struct *p) +static int find_idlest_cpu(struct task_struct *p, int this_cpu, + struct sched_domain *sd) { - int i, minload, load, best_cpu, node = 0; - cpumask_t cpumask; + unsigned long load, min_load, this_load; + int i, min_cpu; + cpumask_t mask; - best_cpu = task_cpu(p); - if (cpu_rq(best_cpu)->nr_running <= 2) - return best_cpu; + min_cpu = UINT_MAX; + min_load = ULONG_MAX; - minload = 10000000; - for_each_node_with_cpus(i) { - /* - * Node load is always divided by nr_cpus_node to normalise - * load values in case cpu count differs from node to node. - * We first multiply node_nr_running by 10 to get a little - * better resolution. - */ - load = 10 * atomic_read(&node_nr_running[i]) / nr_cpus_node(i); - if (load < minload) { - minload = load; - node = i; - } - } + cpus_and(mask, sd->span, cpu_online_map); + cpus_and(mask, mask, p->cpus_allowed); - minload = 10000000; - cpumask = node_to_cpumask(node); - for (i = 0; i < NR_CPUS; ++i) { - if (!cpu_isset(i, cpumask)) - continue; - if (cpu_rq(i)->nr_running < minload) { - best_cpu = i; - minload = cpu_rq(i)->nr_running; + for_each_cpu_mask(i, mask) { + load = target_load(i); + + if (load < min_load) { + min_cpu = i; + min_load = load; + + /* break out early on an idle CPU: */ + if (!min_load) + break; } } - return best_cpu; -} -void sched_balance_exec(void) -{ - int new_cpu; + /* add +1 to account for the new task */ + this_load = source_load(this_cpu) + SCHED_LOAD_SCALE; - if (numnodes > 1) { - new_cpu = sched_best_cpu(current); - if (new_cpu != smp_processor_id()) - sched_migrate_task(current, new_cpu); - } + /* + * Would with the addition of the new task to the + * current CPU there be an imbalance between this + * CPU and the idlest CPU? + * + * Use half of the balancing threshold - new-context is + * a good opportunity to balance. + */ + if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100) + return min_cpu; + + return this_cpu; } /* - * Find the busiest node. All previous node loads contribute with a - * geometrically deccaying weight to the load measure: - * load_{t} = load_{t-1}/2 + nr_node_running_{t} - * This way sudden load peaks are flattened out a bit. - * Node load is divided by nr_cpus_node() in order to compare nodes - * of different cpu count but also [first] multiplied by 10 to - * provide better resolution. + * If dest_cpu is allowed for this process, migrate the task to it. + * This is accomplished by forcing the cpu_allowed mask to only + * allow dest_cpu, which will force the cpu onto dest_cpu. Then + * the cpu_allowed mask is restored. */ -static int find_busiest_node(int this_node) +static void sched_migrate_task(task_t *p, int dest_cpu) { - int i, node = -1, load, this_load, maxload; - - if (!nr_cpus_node(this_node)) - return node; - this_load = maxload = (this_rq()->prev_node_load[this_node] >> 1) - + (10 * atomic_read(&node_nr_running[this_node]) - / nr_cpus_node(this_node)); - this_rq()->prev_node_load[this_node] = this_load; - for_each_node_with_cpus(i) { - if (i == this_node) - continue; - load = (this_rq()->prev_node_load[i] >> 1) - + (10 * atomic_read(&node_nr_running[i]) - / nr_cpus_node(i)); - this_rq()->prev_node_load[i] = load; - if (load > maxload && (100*load > NODE_THRESHOLD*this_load)) { - maxload = load; - node = i; - } - } - return node; -} - -#endif /* CONFIG_NUMA */ + migration_req_t req; + runqueue_t *rq; + unsigned long flags; -#ifdef CONFIG_SMP + rq = task_rq_lock(p, &flags); + if (!cpu_isset(dest_cpu, p->cpus_allowed) + || unlikely(cpu_is_offline(dest_cpu))) + goto out; -/* - * double_lock_balance - lock the busiest runqueue - * - * this_rq is locked already. Recalculate nr_running if we have to - * drop the runqueue lock. - */ -static inline -unsigned int double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest, - int this_cpu, int idle, - unsigned int nr_running) -{ - if (unlikely(!spin_trylock(&busiest->lock))) { - if (busiest < this_rq) { - spin_unlock(&this_rq->lock); - spin_lock(&busiest->lock); - spin_lock(&this_rq->lock); - /* Need to recalculate nr_running */ - if (idle || (this_rq->nr_running > - this_rq->prev_cpu_load[this_cpu])) - nr_running = this_rq->nr_running; - else - nr_running = this_rq->prev_cpu_load[this_cpu]; - } else - spin_lock(&busiest->lock); + schedstat_inc(rq, smt_cnt); + /* force the process onto the specified CPU */ + if (migrate_task(p, dest_cpu, &req)) { + /* Need to wait for migration thread (might exit: take ref). */ + struct task_struct *mt = rq->migration_thread; + get_task_struct(mt); + task_rq_unlock(rq, &flags); + wake_up_process(mt); + put_task_struct(mt); + wait_for_completion(&req.done); + return; } - return nr_running; +out: + task_rq_unlock(rq, &flags); } /* - * find_busiest_queue - find the busiest runqueue among the cpus in cpumask. + * sched_exec(): find the highest-level, exec-balance-capable + * domain and try to migrate the task to the least loaded CPU. + * + * execve() is a valuable balancing opportunity, because at this point + * the task has the smallest effective memory and cache footprint. */ -static inline -runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle, - int *imbalance, cpumask_t cpumask) +void sched_exec(void) { - int nr_running, load, max_load, i; - runqueue_t *busiest, *rq_src; - - /* - * We search all runqueues to find the most busy one. - * We do this lockless to reduce cache-bouncing overhead, - * we re-check the 'best' source CPU later on again, with - * the lock held. - * - * We fend off statistical fluctuations in runqueue lengths by - * saving the runqueue length (as seen by the balancing CPU) during - * the previous load-balancing operation and using the smaller one - * of the current and saved lengths. If a runqueue is long enough - * for a longer amount of time then we recognize it and pull tasks - * from it. - * - * The 'current runqueue length' is a statistical maximum variable, - * for that one we take the longer one - to avoid fluctuations in - * the other direction. So for a load-balance to happen it needs - * stable long runqueue on the target CPU and stable short runqueue - * on the local runqueue. - * - * We make an exception if this CPU is about to become idle - in - * that case we are less picky about moving a task across CPUs and - * take what can be taken. - */ - if (idle || (this_rq->nr_running > this_rq->prev_cpu_load[this_cpu])) - nr_running = this_rq->nr_running; - else - nr_running = this_rq->prev_cpu_load[this_cpu]; - - busiest = NULL; - max_load = 1; - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_isset(i, cpumask)) - continue; - - rq_src = cpu_rq(i); - if (idle || (rq_src->nr_running < this_rq->prev_cpu_load[i])) - load = rq_src->nr_running; - else - load = this_rq->prev_cpu_load[i]; - this_rq->prev_cpu_load[i] = rq_src->nr_running; - - if ((load > max_load) && (rq_src != this_rq)) { - busiest = rq_src; - max_load = load; - } - } + struct sched_domain *tmp, *sd = NULL; + int new_cpu, this_cpu = get_cpu(); - if (likely(!busiest)) + schedstat_inc(this_rq(), sbe_cnt); + /* Prefer the current CPU if there's only this task running */ + if (this_rq()->nr_running <= 1) goto out; - *imbalance = max_load - nr_running; - - /* It needs an at least ~25% imbalance to trigger balancing. */ - if (!idle && ((*imbalance)*4 < max_load)) { - busiest = NULL; - goto out; - } + for_each_domain(this_cpu, tmp) + if (tmp->flags & SD_BALANCE_EXEC) + sd = tmp; - nr_running = double_lock_balance(this_rq, busiest, this_cpu, - idle, nr_running); - /* - * Make sure nothing changed since we checked the - * runqueue length. - */ - if (busiest->nr_running <= nr_running) { - spin_unlock(&busiest->lock); - busiest = NULL; + if (sd) { + schedstat_inc(sd, sbe_attempts); + new_cpu = find_idlest_cpu(current, this_cpu, sd); + if (new_cpu != this_cpu) { + schedstat_inc(sd, sbe_pushed); + put_cpu(); + sched_migrate_task(current, new_cpu); + return; + } } out: - return busiest; + put_cpu(); } /* @@ -1252,86 +1768,83 @@ out: */ static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, - runqueue_t *this_rq, int this_cpu) + runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) { dequeue_task(p, src_array); - nr_running_dec(src_rq); + src_rq->nr_running--; set_task_cpu(p, this_cpu); - nr_running_inc(this_rq); - enqueue_task(p, this_rq->active); - p->timestamp = sched_clock() - - (src_rq->timestamp_last_tick - p->timestamp); + this_rq->nr_running++; + enqueue_task(p, this_array); + p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + + this_rq->timestamp_last_tick; /* * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. */ if (TASK_PREEMPTS_CURR(p, this_rq)) - set_need_resched(); + resched_task(this_rq->curr); } /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static inline -int can_migrate_task(task_t *tsk, runqueue_t *rq, int this_cpu, int idle) +int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, + struct sched_domain *sd, enum idle_type idle) { - unsigned long delta = rq->timestamp_last_tick - tsk->timestamp; - /* * We do not migrate tasks that are: * 1) running (obviously), or * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. */ - if (task_running(rq, tsk)) + if (task_running(rq, p)) return 0; - if (!cpu_isset(this_cpu, tsk->cpus_allowed)) - return 0; - if (!idle && (delta <= JIFFIES_TO_NS(cache_decay_ticks))) + if (!cpu_isset(this_cpu, p->cpus_allowed)) return 0; + + /* Aggressive migration if we've failed balancing */ + if (idle == NEWLY_IDLE || + sd->nr_balance_failed < sd->cache_nice_tries) { + if (task_hot(p, rq->timestamp_last_tick, sd)) + return 0; + } + return 1; } /* - * Current runqueue is empty, or rebalance tick: if there is an - * inbalance (current runqueue is too short) then pull from - * busiest runqueue(s). + * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, + * as part of a balancing operation within "domain". Returns the number of + * tasks moved. * - * We call this with the current runqueue locked, - * irqs disabled. + * Called with both runqueues locked. */ -static void load_balance(runqueue_t *this_rq, int idle, cpumask_t cpumask) +static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, + unsigned long max_nr_move, struct sched_domain *sd, + enum idle_type idle) { - int imbalance, idx, this_cpu = smp_processor_id(); - runqueue_t *busiest; - prio_array_t *array; + prio_array_t *array, *dst_array; struct list_head *head, *curr; + int idx, pulled = 0; task_t *tmp; - if (cpu_is_offline(this_cpu)) - goto out; - - busiest = find_busiest_queue(this_rq, this_cpu, idle, - &imbalance, cpumask); - if (!busiest) + if (max_nr_move <= 0 || busiest->nr_running <= 1) goto out; - /* - * We only want to steal a number of tasks equal to 1/2 the imbalance, - * otherwise we'll just shift the imbalance to the new queue: - */ - imbalance /= 2; - /* * We first consider expired tasks. Those will likely not be * executed in the near future, and they are most likely to * be cache-cold, thus switching CPUs has the least effect * on them. */ - if (busiest->expired->nr_active) + if (busiest->expired->nr_active) { array = busiest->expired; - else + dst_array = this_rq->expired; + } else { array = busiest->active; + dst_array = this_rq->active; + } new_array: /* Start searching at priority 0: */ @@ -1342,11 +1855,12 @@ skip_bitmap: else idx = find_next_bit(array->bitmap, MAX_PRIO, idx); if (idx >= MAX_PRIO) { - if (array == busiest->expired) { + if (array == busiest->expired && busiest->active->nr_active) { array = busiest->active; + dst_array = this_rq->active; goto new_array; } - goto out_unlock; + goto out; } head = array->queue + idx; @@ -1356,113 +1870,519 @@ skip_queue: curr = curr->prev; - if (!can_migrate_task(tmp, busiest, this_cpu, idle)) { + if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { if (curr != head) goto skip_queue; idx++; goto skip_bitmap; } - pull_task(busiest, array, tmp, this_rq, this_cpu); - /* Only migrate one task if we are idle */ - if (!idle && --imbalance) { + /* + * Right now, this is the only place pull_task() is called, + * so we can safely collect pull_task() stats here rather than + * inside pull_task(). + */ + schedstat_inc(this_rq, pt_gained[idle]); + schedstat_inc(busiest, pt_lost[idle]); + + pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); + pulled++; + + /* We only want to steal up to the prescribed number of tasks. */ + if (pulled < max_nr_move) { if (curr != head) goto skip_queue; idx++; goto skip_bitmap; } -out_unlock: - spin_unlock(&busiest->lock); out: - ; + return pulled; } /* - * One of the idle_cpu_tick() and busy_cpu_tick() functions will - * get called every timer tick, on every CPU. Our balancing action - * frequency and balancing agressivity depends on whether the CPU is - * idle or not. - * - * busy-rebalance every 200 msecs. idle-rebalance every 1 msec. (or on - * systems with HZ=100, every 10 msecs.) - * - * On NUMA, do a node-rebalance every 400 msecs. + * find_busiest_group finds and returns the busiest CPU group within the + * domain. It calculates and returns the number of tasks which should be + * moved to restore balance via the imbalance parameter. */ -#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) -#define BUSY_REBALANCE_TICK (HZ/5 ?: 1) -#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 5) -#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2) - -#ifdef CONFIG_NUMA -static void balance_node(runqueue_t *this_rq, int idle, int this_cpu) +static struct sched_group * +find_busiest_group(struct sched_domain *sd, int this_cpu, + unsigned long *imbalance, enum idle_type idle) { - int node = find_busiest_node(cpu_to_node(this_cpu)); + struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; + unsigned long max_load, avg_load, total_load, this_load, total_pwr; - if (node >= 0) { - cpumask_t cpumask = node_to_cpumask(node); - cpu_set(this_cpu, cpumask); - spin_lock(&this_rq->lock); - load_balance(this_rq, idle, cpumask); - spin_unlock(&this_rq->lock); - } -} -#endif + max_load = this_load = total_load = total_pwr = 0; -static void rebalance_tick(runqueue_t *this_rq, int idle) -{ -#ifdef CONFIG_NUMA - int this_cpu = smp_processor_id(); -#endif - unsigned long j = jiffies; + do { + cpumask_t tmp; + unsigned long load; + int local_group; + int i, nr_cpus = 0; + + local_group = cpu_isset(this_cpu, group->cpumask); + + /* Tally up the load of all CPUs in the group */ + avg_load = 0; + cpus_and(tmp, group->cpumask, cpu_online_map); + if (unlikely(cpus_empty(tmp))) + goto nextgroup; + + for_each_cpu_mask(i, tmp) { + /* Bias balancing toward cpus of our domain */ + if (local_group) + load = target_load(i); + else + load = source_load(i); + + nr_cpus++; + avg_load += load; + } + + if (!nr_cpus) + goto nextgroup; + + total_load += avg_load; + total_pwr += group->cpu_power; + + /* Adjust by relative CPU power of the group */ + avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + + if (local_group) { + this_load = avg_load; + this = group; + goto nextgroup; + } else if (avg_load > max_load) { + max_load = avg_load; + busiest = group; + } +nextgroup: + group = group->next; + } while (group != sd->groups); + + if (!busiest || this_load >= max_load) + goto out_balanced; + + avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; + + if (this_load >= avg_load || + 100*max_load <= sd->imbalance_pct*this_load) + goto out_balanced; /* - * First do inter-node rebalancing, then intra-node rebalancing, - * if both events happen in the same tick. The inter-node - * rebalancing does not necessarily have to create a perfect - * balance within the node, since we load-balance the most loaded - * node with the current CPU. (ie. other CPUs in the local node - * are not balanced.) + * We're trying to get all the cpus to the average_load, so we don't + * want to push ourselves above the average load, nor do we wish to + * reduce the max loaded cpu below the average load, as either of these + * actions would just result in more rebalancing later, and ping-pong + * tasks around. Thus we look for the minimum possible imbalance. + * Negative imbalances (*we* are more loaded than anyone else) will + * be counted as no imbalance for these purposes -- we can't fix that + * by pulling tasks to us. Be careful of negative numbers as they'll + * appear as very large values with unsigned longs. */ - if (idle) { -#ifdef CONFIG_NUMA - if (!(j % IDLE_NODE_REBALANCE_TICK)) - balance_node(this_rq, idle, this_cpu); -#endif - if (!(j % IDLE_REBALANCE_TICK)) { - spin_lock(&this_rq->lock); - load_balance(this_rq, idle, cpu_to_node_mask(this_cpu)); - spin_unlock(&this_rq->lock); + *imbalance = min(max_load - avg_load, avg_load - this_load); + + /* How much load to actually move to equalise the imbalance */ + *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power)) + / SCHED_LOAD_SCALE; + + if (*imbalance < SCHED_LOAD_SCALE - 1) { + unsigned long pwr_now = 0, pwr_move = 0; + unsigned long tmp; + + if (max_load - this_load >= SCHED_LOAD_SCALE*2) { + *imbalance = 1; + return busiest; } - return; + + /* + * OK, we don't have enough imbalance to justify moving tasks, + * however we may be able to increase total CPU power used by + * moving them. + */ + + pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); + pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); + pwr_now /= SCHED_LOAD_SCALE; + + /* Amount of load we'd subtract */ + tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; + if (max_load > tmp) + pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, + max_load - tmp); + + /* Amount of load we'd add */ + tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; + if (max_load < tmp) + tmp = max_load; + pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); + pwr_move /= SCHED_LOAD_SCALE; + + /* Move if we gain another 8th of a CPU worth of throughput */ + if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8) + goto out_balanced; + + *imbalance = 1; + return busiest; } -#ifdef CONFIG_NUMA - if (!(j % BUSY_NODE_REBALANCE_TICK)) - balance_node(this_rq, idle, this_cpu); -#endif - if (!(j % BUSY_REBALANCE_TICK)) { - spin_lock(&this_rq->lock); - load_balance(this_rq, idle, cpu_to_node_mask(this_cpu)); - spin_unlock(&this_rq->lock); + + /* Get rid of the scaling factor, rounding down as we divide */ + *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE; + + return busiest; + +out_balanced: + if (busiest && (idle == NEWLY_IDLE || + (idle == IDLE && max_load > SCHED_LOAD_SCALE)) ) { + *imbalance = 1; + return busiest; } + + *imbalance = 0; + return NULL; } -#else + /* - * on UP we do not need to balance between CPUs: + * find_busiest_queue - find the busiest runqueue among the cpus in group. */ -static inline void rebalance_tick(runqueue_t *this_rq, int idle) +static runqueue_t *find_busiest_queue(struct sched_group *group) { -} -#endif + cpumask_t tmp; + unsigned long load, max_load = 0; + runqueue_t *busiest = NULL; + int i; -DEFINE_PER_CPU(struct kernel_stat, kstat); + cpus_and(tmp, group->cpumask, cpu_online_map); + for_each_cpu_mask(i, tmp) { + load = source_load(i); -EXPORT_PER_CPU_SYMBOL(kstat); + if (load > max_load) { + max_load = load; + busiest = cpu_rq(i); + } + } + + return busiest; +} /* - * We place interactive tasks back into the active array, if possible. + * Check this_cpu to ensure it is balanced within domain. Attempt to move + * tasks if there is an imbalance. * - * To guarantee that this does not starve expired tasks we ignore the - * interactivity of a task if the first expired task had to wait more + * Called with this_rq unlocked. + */ +static int load_balance(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd, enum idle_type idle) +{ + struct sched_group *group; + runqueue_t *busiest; + unsigned long imbalance; + int nr_moved; + + spin_lock(&this_rq->lock); + schedstat_inc(sd, lb_cnt[idle]); + + group = find_busiest_group(sd, this_cpu, &imbalance, idle); + if (!group) { + schedstat_inc(sd, lb_nobusyg[idle]); + goto out_balanced; + } + + busiest = find_busiest_queue(group); + if (!busiest) { + schedstat_inc(sd, lb_nobusyq[idle]); + goto out_balanced; + } + + /* + * This should be "impossible", but since load + * balancing is inherently racy and statistical, + * it could happen in theory. + */ + if (unlikely(busiest == this_rq)) { + WARN_ON(1); + goto out_balanced; + } + + schedstat_add(sd, lb_imbalance[idle], imbalance); + + nr_moved = 0; + if (busiest->nr_running > 1) { + /* + * Attempt to move tasks. If find_busiest_group has found + * an imbalance but busiest->nr_running <= 1, the group is + * still unbalanced. nr_moved simply stays zero, so it is + * correctly treated as an imbalance. + */ + double_lock_balance(this_rq, busiest); + nr_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, sd, idle); + spin_unlock(&busiest->lock); + } + spin_unlock(&this_rq->lock); + + if (!nr_moved) { + schedstat_inc(sd, lb_failed[idle]); + sd->nr_balance_failed++; + + if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { + int wake = 0; + + spin_lock(&busiest->lock); + if (!busiest->active_balance) { + busiest->active_balance = 1; + busiest->push_cpu = this_cpu; + wake = 1; + } + spin_unlock(&busiest->lock); + if (wake) + wake_up_process(busiest->migration_thread); + + /* + * We've kicked active balancing, reset the failure + * counter. + */ + sd->nr_balance_failed = sd->cache_nice_tries; + } + } else + sd->nr_balance_failed = 0; + + /* We were unbalanced, so reset the balancing interval */ + sd->balance_interval = sd->min_interval; + + return nr_moved; + +out_balanced: + spin_unlock(&this_rq->lock); + + /* tune up the balancing interval */ + if (sd->balance_interval < sd->max_interval) + sd->balance_interval *= 2; + + return 0; +} + +/* + * Check this_cpu to ensure it is balanced within domain. Attempt to move + * tasks if there is an imbalance. + * + * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). + * this_rq is locked. + */ +static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd) +{ + struct sched_group *group; + runqueue_t *busiest = NULL; + unsigned long imbalance; + int nr_moved = 0; + + schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); + group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); + if (!group) { + schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); + goto out; + } + + busiest = find_busiest_queue(group); + if (!busiest || busiest == this_rq) { + schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); + goto out; + } + + /* Attempt to move tasks */ + double_lock_balance(this_rq, busiest); + + schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); + nr_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, sd, NEWLY_IDLE); + if (!nr_moved) + schedstat_inc(sd, lb_failed[NEWLY_IDLE]); + + spin_unlock(&busiest->lock); + +out: + return nr_moved; +} + +/* + * idle_balance is called by schedule() if this_cpu is about to become + * idle. Attempts to pull tasks from other CPUs. + */ +static inline void idle_balance(int this_cpu, runqueue_t *this_rq) +{ + struct sched_domain *sd; + + for_each_domain(this_cpu, sd) { + if (sd->flags & SD_BALANCE_NEWIDLE) { + if (load_balance_newidle(this_cpu, this_rq, sd)) { + /* We've pulled tasks over so stop searching */ + break; + } + } + } +} + +/* + * active_load_balance is run by migration threads. It pushes a running + * task off the cpu. It can be required to correctly have at least 1 task + * running on each physical CPU where possible, and not have a physical / + * logical imbalance. + * + * Called with busiest locked. + */ +static void active_load_balance(runqueue_t *busiest, int busiest_cpu) +{ + struct sched_domain *sd; + struct sched_group *group, *busy_group; + int i; + + schedstat_inc(busiest, alb_cnt); + if (busiest->nr_running <= 1) + return; + + for_each_domain(busiest_cpu, sd) + if (cpu_isset(busiest->push_cpu, sd->span)) + break; + if (!sd) + return; + + group = sd->groups; + while (!cpu_isset(busiest_cpu, group->cpumask)) + group = group->next; + busy_group = group; + + group = sd->groups; + do { + cpumask_t tmp; + runqueue_t *rq; + int push_cpu = 0; + + if (group == busy_group) + goto next_group; + + cpus_and(tmp, group->cpumask, cpu_online_map); + if (!cpus_weight(tmp)) + goto next_group; + + for_each_cpu_mask(i, tmp) { + if (!idle_cpu(i)) + goto next_group; + push_cpu = i; + } + + rq = cpu_rq(push_cpu); + + /* + * This condition is "impossible", but since load + * balancing is inherently a bit racy and statistical, + * it can trigger.. Reported by Bjorn Helgaas on a + * 128-cpu setup. + */ + if (unlikely(busiest == rq)) + goto next_group; + double_lock_balance(busiest, rq); + if (move_tasks(rq, push_cpu, busiest, 1, sd, IDLE)) { + schedstat_inc(busiest, alb_lost); + schedstat_inc(rq, alb_gained); + } else { + schedstat_inc(busiest, alb_failed); + } + spin_unlock(&rq->lock); +next_group: + group = group->next; + } while (group != sd->groups); +} + +/* + * rebalance_tick will get called every timer tick, on every CPU. + * + * It checks each scheduling domain to see if it is due to be balanced, + * and initiates a balancing operation if so. + * + * Balancing parameters are set up in arch_init_sched_domains. + */ + +/* Don't have all balancing operations going off at once */ +#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) + +static void rebalance_tick(int this_cpu, runqueue_t *this_rq, + enum idle_type idle) +{ + unsigned long old_load, this_load; + unsigned long j = jiffies + CPU_OFFSET(this_cpu); + struct sched_domain *sd; + + /* Update our load */ + old_load = this_rq->cpu_load; + this_load = this_rq->nr_running * SCHED_LOAD_SCALE; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. + */ + if (this_load > old_load) + old_load++; + this_rq->cpu_load = (old_load + this_load) / 2; + + for_each_domain(this_cpu, sd) { + unsigned long interval = sd->balance_interval; + + if (idle != IDLE) + interval *= sd->busy_factor; + + /* scale ms to jiffies */ + interval = msecs_to_jiffies(interval); + if (unlikely(!interval)) + interval = 1; + + if (j - sd->last_balance >= interval) { + if (load_balance(this_cpu, this_rq, sd, idle)) { + /* We've pulled tasks over so no longer idle */ + idle = NOT_IDLE; + } + sd->last_balance += interval; + } + } +} +#else +/* + * on UP we do not need to balance between CPUs: + */ +static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) +{ +} +static inline void idle_balance(int cpu, runqueue_t *rq) +{ +} +#endif + +static inline int wake_priority_sleeper(runqueue_t *rq) +{ + int ret = 0; +#ifdef CONFIG_SCHED_SMT + spin_lock(&rq->lock); + /* + * If an SMT sibling task has been put to sleep for priority + * reasons reschedule the idle task to see if it can now run. + */ + if (rq->nr_running) { + resched_task(rq->idle); + ret = 1; + } + spin_unlock(&rq->lock); +#endif + return ret; +} + +DEFINE_PER_CPU(struct kernel_stat, kstat); + +EXPORT_PER_CPU_SYMBOL(kstat); + +/* + * We place interactive tasks back into the active array, if possible. + * + * To guarantee that this does not starve expired tasks we ignore the + * interactivity of a task if the first expired task had to wait more * than a 'reasonable' amount of time. This deadline timeout is * load-dependent, as the frequency of array switched decreases with * increasing number of running tasks. We also ignore the interactivity @@ -1487,12 +2407,18 @@ void scheduler_tick(int user_ticks, int sys_ticks) struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; runqueue_t *rq = this_rq(); task_t *p = current; + struct vx_info *vxi = p->vx_info; rq->timestamp_last_tick = sched_clock(); if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_ticks); + if (vxi) { + vxi->sched.cpu[cpu].user_ticks += user_ticks; + vxi->sched.cpu[cpu].sys_ticks += sys_ticks; + } + /* note: this timer irq context must be accounted for as well */ if (hardirq_count() - HARDIRQ_OFFSET) { cpustat->irq += sys_ticks; @@ -1505,9 +2431,19 @@ void scheduler_tick(int user_ticks, int sys_ticks) if (p == rq->idle) { if (atomic_read(&rq->nr_iowait) > 0) cpustat->iowait += sys_ticks; + // vx_cpustat_acc(vxi, iowait, cpu, cpustat, sys_ticks); else cpustat->idle += sys_ticks; - rebalance_tick(rq, 1); + // vx_cpustat_acc(vxi, idle, cpu, cpustat, sys_ticks); + + if (wake_priority_sleeper(rq)) + goto out; + +#ifdef CONFIG_VSERVER_HARDCPU_IDLE + if (!--rq->idle_tokens && !list_empty(&rq->hold_queue)) + set_need_resched(); +#endif + rebalance_tick(cpu, rq, IDLE); return; } if (TASK_NICE(p) > 0) @@ -1529,7 +2465,7 @@ void scheduler_tick(int user_ticks, int sys_ticks) * timeslice. This makes it possible for interactive tasks * to use up their timeslices at their highest priority levels. */ - if (unlikely(rt_task(p))) { + if (rt_task(p)) { /* * RR tasks need a special form of timeslice management. * FIFO tasks have no timeslices. @@ -1545,7 +2481,7 @@ void scheduler_tick(int user_ticks, int sys_ticks) } goto out_unlock; } - if (!--p->time_slice) { + if (vx_need_resched(p)) { dequeue_task(p, rq->active); set_tsk_need_resched(p); p->prio = effective_prio(p); @@ -1591,9 +2527,134 @@ void scheduler_tick(int user_ticks, int sys_ticks) out_unlock: spin_unlock(&rq->lock); out: - rebalance_tick(rq, 0); + rebalance_tick(cpu, rq, NOT_IDLE); +} + +#ifdef CONFIG_SCHED_SMT +static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) +{ + struct sched_domain *sd = this_rq->sd; + cpumask_t sibling_map; + int i; + + if (!(sd->flags & SD_SHARE_CPUPOWER)) + return; + + /* + * Unlock the current runqueue because we have to lock in + * CPU order to avoid deadlocks. Caller knows that we might + * unlock. We keep IRQs disabled. + */ + spin_unlock(&this_rq->lock); + + cpus_and(sibling_map, sd->span, cpu_online_map); + + for_each_cpu_mask(i, sibling_map) + spin_lock(&cpu_rq(i)->lock); + /* + * We clear this CPU from the mask. This both simplifies the + * inner loop and keps this_rq locked when we exit: + */ + cpu_clear(this_cpu, sibling_map); + + for_each_cpu_mask(i, sibling_map) { + runqueue_t *smt_rq = cpu_rq(i); + + /* + * If an SMT sibling task is sleeping due to priority + * reasons wake it up now. + */ + if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running) + resched_task(smt_rq->idle); + } + + for_each_cpu_mask(i, sibling_map) + spin_unlock(&cpu_rq(i)->lock); + /* + * We exit with this_cpu's rq still held and IRQs + * still disabled: + */ +} + +static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) +{ + struct sched_domain *sd = this_rq->sd; + cpumask_t sibling_map; + prio_array_t *array; + int ret = 0, i; + task_t *p; + + if (!(sd->flags & SD_SHARE_CPUPOWER)) + return 0; + + /* + * The same locking rules and details apply as for + * wake_sleeping_dependent(): + */ + spin_unlock(&this_rq->lock); + cpus_and(sibling_map, sd->span, cpu_online_map); + for_each_cpu_mask(i, sibling_map) + spin_lock(&cpu_rq(i)->lock); + cpu_clear(this_cpu, sibling_map); + + /* + * Establish next task to be run - it might have gone away because + * we released the runqueue lock above: + */ + if (!this_rq->nr_running) + goto out_unlock; + array = this_rq->active; + if (!array->nr_active) + array = this_rq->expired; + BUG_ON(!array->nr_active); + + p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, + task_t, run_list); + + for_each_cpu_mask(i, sibling_map) { + runqueue_t *smt_rq = cpu_rq(i); + task_t *smt_curr = smt_rq->curr; + + /* + * If a user task with lower static priority than the + * running task on the SMT sibling is trying to schedule, + * delay it till there is proportionately less timeslice + * left of the sibling task to prevent a lower priority + * task from using an unfair proportion of the + * physical cpu's resources. -ck + */ + if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > + task_timeslice(p) || rt_task(smt_curr)) && + p->mm && smt_curr->mm && !rt_task(p)) + ret = 1; + + /* + * Reschedule a lower priority task on the SMT sibling, + * or wake it up if it has been put to sleep for priority + * reasons. + */ + if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > + task_timeslice(smt_curr) || rt_task(p)) && + smt_curr->mm && p->mm && !rt_task(smt_curr)) || + (smt_curr == smt_rq->idle && smt_rq->nr_running)) + resched_task(smt_curr); + } +out_unlock: + for_each_cpu_mask(i, sibling_map) + spin_unlock(&cpu_rq(i)->lock); + return ret; +} +#else +static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) +{ } +static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) +{ + return 0; +} +#endif + /* * schedule() is the main scheduler function. */ @@ -1606,7 +2667,11 @@ asmlinkage void __sched schedule(void) struct list_head *queue; unsigned long long now; unsigned long run_time; - int idx; +#ifdef CONFIG_VSERVER_HARDCPU + struct vx_info *vxi; + int maxidle = -HZ; +#endif + int cpu, idx; /* * Test if we are atomic. Since do_exit() needs to call into @@ -1625,7 +2690,17 @@ need_resched: prev = current; rq = this_rq(); + /* + * The idle thread is not allowed to schedule! + * Remove this check after it has been exercised a bit. + */ + if (unlikely(current == rq->idle) && current->state != TASK_RUNNING) { + printk(KERN_ERR "bad: scheduling from the idle thread!\n"); + dump_stack(); + } + release_kernel_lock(prev); + schedstat_inc(rq, sched_cnt); now = sched_clock(); if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG)) run_time = now - prev->timestamp; @@ -1656,15 +2731,74 @@ need_resched: deactivate_task(prev, rq); } - if (unlikely(!rq->nr_running)) { -#ifdef CONFIG_SMP - load_balance(rq, 1, cpu_to_node_mask(smp_processor_id())); +#ifdef CONFIG_VSERVER_HARDCPU + if (!list_empty(&rq->hold_queue)) { + struct list_head *l, *n; + int ret; + + vxi = NULL; + list_for_each_safe(l, n, &rq->hold_queue) { + next = list_entry(l, task_t, run_list); + if (vxi == next->vx_info) + continue; + + vxi = next->vx_info; + ret = vx_tokens_recalc(vxi); + // tokens = vx_tokens_avail(next); + + if (ret > 0) { + list_del(&next->run_list); + next->state &= ~TASK_ONHOLD; + // one less waiting + vx_onhold_dec(vxi); + array = rq->expired; + next->prio = MAX_PRIO-1; + enqueue_task(next, array); + rq->nr_running++; + if (next->static_prio < rq->best_expired_prio) + rq->best_expired_prio = next->static_prio; + + // printk("··· %8lu unhold %p [%d]\n", jiffies, next, next->prio); + break; + } + if ((ret < 0) && (maxidle < ret)) + maxidle = ret; + } + } + rq->idle_tokens = -maxidle; + +pick_next: #endif + + cpu = smp_processor_id(); + if (unlikely(!rq->nr_running)) { +go_idle: + idle_balance(cpu, rq); if (!rq->nr_running) { next = rq->idle; rq->expired_timestamp = 0; + wake_sleeping_dependent(cpu, rq); + /* + * wake_sleeping_dependent() might have released + * the runqueue, so break out if we got new + * tasks meanwhile: + */ + if (!rq->nr_running) + goto switch_tasks; + } + } else { + if (dependent_sleeper(cpu, rq)) { + schedstat_inc(rq, sched_goidle); + next = rq->idle; goto switch_tasks; } + /* + * dependent_sleeper() releases and reacquires the runqueue + * lock, hence go into the idle loop if the rq went + * empty meanwhile: + */ + if (unlikely(!rq->nr_running)) + goto go_idle; } array = rq->active; @@ -1672,17 +2806,39 @@ need_resched: /* * Switch the active and expired arrays. */ + schedstat_inc(rq, sched_switch); rq->active = rq->expired; rq->expired = array; array = rq->active; rq->expired_timestamp = 0; rq->best_expired_prio = MAX_PRIO; - } + } else + schedstat_inc(rq, sched_noswitch); idx = sched_find_first_bit(array->bitmap); queue = array->queue + idx; next = list_entry(queue->next, task_t, run_list); +#ifdef CONFIG_VSERVER_HARDCPU + vxi = next->vx_info; + if (vx_info_flags(vxi, VXF_SCHED_PAUSE|VXF_SCHED_HARD, 0)) { + int ret = vx_tokens_recalc(vxi); + + if (unlikely(ret <= 0)) { + if (ret && (rq->idle_tokens > -ret)) + rq->idle_tokens = -ret; + __deactivate_task(next, rq); + recalc_task_prio(next, now); + // a new one on hold + vx_onhold_inc(vxi); + next->state |= TASK_ONHOLD; + list_add_tail(&next->run_list, &rq->hold_queue); + //printk("··· %8lu hold %p [%d]\n", jiffies, next, next->prio); + goto pick_next; + } + } +#endif + if (!rt_task(next) && next->activated > 0) { unsigned long long delta = now - next->timestamp; @@ -1698,7 +2854,7 @@ need_resched: switch_tasks: prefetch(next); clear_tsk_need_resched(prev); - RCU_qsctr(task_cpu(prev))++; + rcu_qsctr_inc(task_cpu(prev)); prev->sleep_avg -= run_time; if ((long)prev->sleep_avg <= 0) { @@ -1706,8 +2862,9 @@ switch_tasks: if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev))) prev->interactive_credit--; } - prev->timestamp = now; + prev->timestamp = prev->last_ran = now; + sched_info_switch(prev, next); if (likely(prev != next)) { next->timestamp = now; rq->nr_switches++; @@ -1724,7 +2881,7 @@ switch_tasks: reacquire_kernel_lock(current); preempt_enable_no_resched(); - if (test_thread_flag(TIF_NEED_RESCHED)) + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) goto need_resched; } @@ -1761,7 +2918,7 @@ need_resched: EXPORT_SYMBOL(preempt_schedule); #endif /* CONFIG_PREEMPT */ -int default_wake_function(wait_queue_t *curr, unsigned mode, int sync) +int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) { task_t *p = curr->task; return try_to_wake_up(p, mode, sync); @@ -1779,7 +2936,7 @@ EXPORT_SYMBOL(default_wake_function); * zero in this (rare) case, and we handle it by continuing to scan the queue. */ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, int sync) + int nr_exclusive, int sync, void *key) { struct list_head *tmp, *next; @@ -1788,7 +2945,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, unsigned flags; curr = list_entry(tmp, wait_queue_t, task_list); flags = curr->flags; - if (curr->func(curr, mode, sync) && + if (curr->func(curr, mode, sync, key) && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) break; @@ -1801,12 +2958,13 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, * @mode: which threads * @nr_exclusive: how many wake-one or wake-many threads to wake up */ -void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, void *key) { unsigned long flags; spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, 0); + __wake_up_common(q, mode, nr_exclusive, 0, key); spin_unlock_irqrestore(&q->lock, flags); } @@ -1817,7 +2975,7 @@ EXPORT_SYMBOL(__wake_up); */ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) { - __wake_up_common(q, mode, 1, 0); + __wake_up_common(q, mode, 1, 0, NULL); } /** @@ -1836,15 +2994,16 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) { unsigned long flags; + int sync = 1; if (unlikely(!q)) return; + if (unlikely(!nr_exclusive)) + sync = 0; + spin_lock_irqsave(&q->lock, flags); - if (likely(nr_exclusive)) - __wake_up_common(q, mode, nr_exclusive, 1); - else - __wake_up_common(q, mode, nr_exclusive, 0); + __wake_up_common(q, mode, nr_exclusive, sync, NULL); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ @@ -1856,7 +3015,7 @@ void fastcall complete(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); x->done++; __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, - 1, 0); + 1, 0, NULL); spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete); @@ -1868,7 +3027,7 @@ void fastcall complete_all(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); x->done += UINT_MAX/2; __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, - 0, 0); + 0, 0, NULL); spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete_all); @@ -2015,7 +3174,7 @@ out_unlock: EXPORT_SYMBOL(set_user_nice); -#ifndef __alpha__ +#ifdef __ARCH_WANT_SYS_NICE /* * sys_nice - change the priority of the current process. @@ -2035,6 +3194,8 @@ asmlinkage long sys_nice(int increment) * and we have a single winner. */ if (increment < 0) { + if (vx_flags(VXF_IGNEG_NICE, 0)) + return 0; if (!capable(CAP_SYS_NICE)) return -EPERM; if (increment < -40) @@ -2067,7 +3228,7 @@ asmlinkage long sys_nice(int increment) * RT tasks are offset by -200. Normal tasks are centered * around 0, value goes from -16 to +15. */ -int task_prio(task_t *p) +int task_prio(const task_t *p) { return p->prio - MAX_RT_PRIO; } @@ -2076,7 +3237,7 @@ int task_prio(task_t *p) * task_nice - return the nice value of a given task. * @p: the task in question. */ -int task_nice(task_t *p) +int task_nice(const task_t *p) { return TASK_NICE(p); } @@ -2160,6 +3321,7 @@ static int setscheduler(pid_t pid, int policy, struct sched_param __user *param) policy != SCHED_NORMAL) goto out_unlock; } + profile_hit(SCHED_PROFILING, __builtin_return_address(0)); /* * Valid priorities for SCHED_FIFO and SCHED_RR are @@ -2190,6 +3352,7 @@ static int setscheduler(pid_t pid, int policy, struct sched_param __user *param) oldprio = p->prio; __setscheduler(p, policy, lp.sched_priority); if (array) { + vx_activate_task(p); __activate_task(p, task_rq(p)); /* * Reschedule if we are currently running on this runqueue and @@ -2199,7 +3362,7 @@ static int setscheduler(pid_t pid, int policy, struct sched_param __user *param) if (task_running(rq, p)) { if (p->prio > oldprio) resched_task(rq->curr); - } else if (p->prio < rq->curr->prio) + } else if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); } @@ -2300,24 +3463,10 @@ out_unlock: return retval; } -/** - * sys_sched_setaffinity - set the cpu affinity of a process - * @pid: pid of the process - * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to the new cpu mask - */ -asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, - unsigned long __user *user_mask_ptr) +long sched_setaffinity(pid_t pid, cpumask_t new_mask) { - cpumask_t new_mask; - int retval; task_t *p; - - if (len < sizeof(new_mask)) - return -EINVAL; - - if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask))) - return -EFAULT; + int retval; lock_cpu_hotplug(); read_lock(&tasklist_lock); @@ -2350,24 +3499,57 @@ out_unlock: return retval; } +static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, + cpumask_t *new_mask) +{ + if (len < sizeof(cpumask_t)) { + memset(new_mask, 0, sizeof(cpumask_t)); + } else if (len > sizeof(cpumask_t)) { + len = sizeof(cpumask_t); + } + return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; +} + /** - * sys_sched_getaffinity - get the cpu affinity of a process + * sys_sched_setaffinity - set the cpu affinity of a process * @pid: pid of the process * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to hold the current cpu mask + * @user_mask_ptr: user-space pointer to the new cpu mask */ -asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, +asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, unsigned long __user *user_mask_ptr) { - unsigned int real_len; - cpumask_t mask; + cpumask_t new_mask; int retval; - task_t *p; - real_len = sizeof(mask); - if (len < real_len) - return -EINVAL; + retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); + if (retval) + return retval; + + return sched_setaffinity(pid, new_mask); +} + +/* + * Represents all cpu's present in the system + * In systems capable of hotplug, this map could dynamically grow + * as new cpu's are detected in the system via any platform specific + * method, such as ACPI for e.g. + */ + +cpumask_t cpu_present_map; +EXPORT_SYMBOL(cpu_present_map); + +#ifndef CONFIG_SMP +cpumask_t cpu_online_map = CPU_MASK_ALL; +cpumask_t cpu_possible_map = CPU_MASK_ALL; +#endif + +long sched_getaffinity(pid_t pid, cpumask_t *mask) +{ + int retval; + task_t *p; + lock_cpu_hotplug(); read_lock(&tasklist_lock); retval = -ESRCH; @@ -2376,15 +3558,40 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, goto out_unlock; retval = 0; - cpus_and(mask, p->cpus_allowed, cpu_possible_map); + cpus_and(*mask, p->cpus_allowed, cpu_possible_map); out_unlock: read_unlock(&tasklist_lock); + unlock_cpu_hotplug(); if (retval) return retval; - if (copy_to_user(user_mask_ptr, &mask, real_len)) + + return 0; +} + +/** + * sys_sched_getaffinity - get the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to hold the current cpu mask + */ +asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, + unsigned long __user *user_mask_ptr) +{ + int ret; + cpumask_t mask; + + if (len < sizeof(cpumask_t)) + return -EINVAL; + + ret = sched_getaffinity(pid, &mask); + if (ret < 0) + return ret; + + if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) return -EFAULT; - return real_len; + + return sizeof(cpumask_t); } /** @@ -2398,7 +3605,9 @@ asmlinkage long sys_sched_yield(void) { runqueue_t *rq = this_rq_lock(); prio_array_t *array = current->array; + prio_array_t *target = rq->expired; + schedstat_inc(rq, yld_cnt); /* * We implement yielding by moving the task into the expired * queue. @@ -2406,16 +3615,22 @@ asmlinkage long sys_sched_yield(void) * (special rule: RT tasks will just roundrobin in the active * array.) */ - if (likely(!rt_task(current))) { - dequeue_task(current, array); - enqueue_task(current, rq->expired); - } else { - list_del(¤t->run_list); - list_add_tail(¤t->run_list, array->queue + current->prio); - } + if (rt_task(current)) + target = rq->active; + + if (current->array->nr_active == 1) { + schedstat_inc(rq, yld_act_empty); + if (!rq->expired->nr_active) + schedstat_inc(rq, yld_both_empty); + } else if (!rq->expired->nr_active) + schedstat_inc(rq, yld_exp_empty); + + dequeue_task(current, array); + enqueue_task(current, target); + /* * Since we are going to call schedule() anyway, there's - * no need to preempt: + * no need to preempt or enable interrupts: */ _raw_spin_unlock(&rq->lock); preempt_enable_no_resched(); @@ -2583,7 +3798,7 @@ static void show_task(task_t * p) task_t *relative; unsigned state; unsigned long free = 0; - static const char *stat_nam[] = { "R", "S", "D", "T", "Z", "W" }; + static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; printk("%-13.13s ", p->comm); state = p->state ? __ffs(p->state) + 1 : 0; @@ -2658,23 +3873,22 @@ void show_state(void) read_unlock(&tasklist_lock); } -void __init init_idle(task_t *idle, int cpu) +void __devinit init_idle(task_t *idle, int cpu) { - runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle)); + runqueue_t *rq = cpu_rq(cpu); unsigned long flags; - local_irq_save(flags); - double_rq_lock(idle_rq, rq); - - idle_rq->curr = idle_rq->idle = idle; - deactivate_task(idle, rq); + idle->sleep_avg = 0; + idle->interactive_credit = 0; idle->array = NULL; idle->prio = MAX_PRIO; idle->state = TASK_RUNNING; set_task_cpu(idle, cpu); - double_rq_unlock(idle_rq, rq); + + spin_lock_irqsave(&rq->lock, flags); + rq->curr = rq->idle = idle; set_tsk_need_resched(idle); - local_irq_restore(flags); + spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! */ #ifdef CONFIG_PREEMPT @@ -2685,13 +3899,13 @@ void __init init_idle(task_t *idle, int cpu) } /* - * In a system that switches off the HZ timer idle_cpu_mask + * In a system that switches off the HZ timer nohz_cpu_mask * indicates which cpus entered this state. This is used * in the rcu update to wait only for active cpus. For system - * which do not switch off the HZ timer idle_cpu_mask should + * which do not switch off the HZ timer nohz_cpu_mask should * always be CPU_MASK_NONE. */ -cpumask_t idle_cpu_mask = CPU_MASK_NONE; +cpumask_t nohz_cpu_mask = CPU_MASK_NONE; #ifdef CONFIG_SMP /* @@ -2727,16 +3941,22 @@ int set_cpus_allowed(task_t *p, cpumask_t new_mask) runqueue_t *rq; rq = task_rq_lock(p, &flags); - if (any_online_cpu(new_mask) == NR_CPUS) { + if (!cpus_intersects(new_mask, cpu_online_map)) { ret = -EINVAL; goto out; } - if (__set_cpus_allowed(p, new_mask, &req)) { + p->cpus_allowed = new_mask; + /* Can the task run on the task's current CPU? If so, we're done */ + if (cpu_isset(task_cpu(p), new_mask)) + goto out; + + if (migrate_task(p, any_online_cpu(new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); wait_for_completion(&req.done); + tlb_migrate_finish(p->mm); return 0; } out: @@ -2746,28 +3966,51 @@ out: EXPORT_SYMBOL_GPL(set_cpus_allowed); -/* Move (not current) task off this cpu, onto dest cpu. */ -static void move_task_away(struct task_struct *p, int dest_cpu) +/* + * Move (not current) task off this cpu, onto dest cpu. We're doing + * this because either it can't run here any more (set_cpus_allowed() + * away from this CPU, or CPU going down), or because we're + * attempting to rebalance this task on exec (sched_exec). + * + * So we race with normal scheduler movements, but that's OK, as long + * as the task is no longer on this CPU. + */ +static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { - runqueue_t *rq_dest; + runqueue_t *rq_dest, *rq_src; + if (unlikely(cpu_is_offline(dest_cpu))) + return; + + rq_src = cpu_rq(src_cpu); rq_dest = cpu_rq(dest_cpu); - double_rq_lock(this_rq(), rq_dest); - if (task_cpu(p) != smp_processor_id()) - goto out; /* Already moved */ + double_rq_lock(rq_src, rq_dest); + /* Already moved. */ + if (task_cpu(p) != src_cpu) + goto out; + /* Affinity changed (again). */ + if (!cpu_isset(dest_cpu, p->cpus_allowed)) + goto out; set_task_cpu(p, dest_cpu); if (p->array) { - deactivate_task(p, this_rq()); - activate_task(p, rq_dest); - if (p->prio < rq_dest->curr->prio) + /* + * Sync timestamp with rq_dest's before activating. + * The same thing could be achieved by doing this step + * afterwards, and pretending it was a local activate. + * This way is cleaner and logically correct. + */ + p->timestamp = p->timestamp - rq_src->timestamp_last_tick + + rq_dest->timestamp_last_tick; + deactivate_task(p, rq_src); + activate_task(p, rq_dest, 0); + if (TASK_PREEMPTS_CURR(p, rq_dest)) resched_task(rq_dest->curr); } - p->timestamp = rq_dest->timestamp_last_tick; out: - double_rq_unlock(this_rq(), rq_dest); + double_rq_unlock(rq_src, rq_dest); } /* @@ -2783,6 +4026,7 @@ static int migration_thread(void * data) rq = cpu_rq(cpu); BUG_ON(rq->migration_thread != current); + set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { struct list_head *head; migration_req_t *req; @@ -2791,79 +4035,174 @@ static int migration_thread(void * data) refrigerator(PF_FREEZE); spin_lock_irq(&rq->lock); + + if (cpu_is_offline(cpu)) { + spin_unlock_irq(&rq->lock); + goto wait_to_die; + } + + if (rq->active_balance) { + active_load_balance(rq, cpu); + rq->active_balance = 0; + } + head = &rq->migration_queue; - current->state = TASK_INTERRUPTIBLE; + if (list_empty(head)) { spin_unlock_irq(&rq->lock); schedule(); + set_current_state(TASK_INTERRUPTIBLE); continue; } req = list_entry(head->next, migration_req_t, list); list_del_init(head->next); - spin_unlock(&rq->lock); - move_task_away(req->task, - any_online_cpu(req->task->cpus_allowed)); - local_irq_enable(); + if (req->type == REQ_MOVE_TASK) { + spin_unlock(&rq->lock); + __migrate_task(req->task, smp_processor_id(), + req->dest_cpu); + local_irq_enable(); + } else if (req->type == REQ_SET_DOMAIN) { + rq->sd = req->sd; + spin_unlock_irq(&rq->lock); + } else { + spin_unlock_irq(&rq->lock); + WARN_ON(1); + } + complete(&req->done); } + __set_current_state(TASK_RUNNING); + return 0; + +wait_to_die: + /* Wait for kthread_stop */ + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); return 0; } #ifdef CONFIG_HOTPLUG_CPU -/* migrate_all_tasks - function to migrate all the tasks from the - * current cpu caller must have already scheduled this to the target - * cpu via set_cpus_allowed. Machine is stopped. */ -void migrate_all_tasks(void) +/* Figure out where task on dead CPU should go, use force if neccessary. */ +static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) { - struct task_struct *tsk, *t; - int dest_cpu, src_cpu; - unsigned int node; + int dest_cpu; + cpumask_t mask; - /* We're nailed to this CPU. */ - src_cpu = smp_processor_id(); + /* On same node? */ + mask = node_to_cpumask(cpu_to_node(dead_cpu)); + cpus_and(mask, mask, tsk->cpus_allowed); + dest_cpu = any_online_cpu(mask); - /* Not required, but here for neatness. */ - write_lock(&tasklist_lock); + /* On any allowed CPU? */ + if (dest_cpu == NR_CPUS) + dest_cpu = any_online_cpu(tsk->cpus_allowed); - /* watch out for per node tasks, let's stay on this node */ - node = cpu_to_node(src_cpu); + /* No more Mr. Nice Guy. */ + if (dest_cpu == NR_CPUS) { + cpus_setall(tsk->cpus_allowed); + dest_cpu = any_online_cpu(tsk->cpus_allowed); + + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (tsk->mm && printk_ratelimit()) + printk(KERN_INFO "process %d (%s) no " + "longer affine to cpu%d\n", + tsk->pid, tsk->comm, dead_cpu); + } + __migrate_task(tsk, dead_cpu, dest_cpu); +} + +/* Run through task list and migrate tasks from the dead cpu. */ +static void migrate_live_tasks(int src_cpu) +{ + struct task_struct *tsk, *t; + + write_lock_irq(&tasklist_lock); do_each_thread(t, tsk) { - cpumask_t mask; if (tsk == current) continue; - if (task_cpu(tsk) != src_cpu) - continue; + if (task_cpu(tsk) == src_cpu) + move_task_off_dead_cpu(src_cpu, tsk); + } while_each_thread(t, tsk); - /* Figure out where this task should go (attempting to - * keep it on-node), and check if it can be migrated - * as-is. NOTE that kernel threads bound to more than - * one online cpu will be migrated. */ - mask = node_to_cpumask(node); - cpus_and(mask, mask, tsk->cpus_allowed); - dest_cpu = any_online_cpu(mask); - if (dest_cpu == NR_CPUS) - dest_cpu = any_online_cpu(tsk->cpus_allowed); - if (dest_cpu == NR_CPUS) { - cpus_clear(tsk->cpus_allowed); - cpus_complement(tsk->cpus_allowed); - dest_cpu = any_online_cpu(tsk->cpus_allowed); - - /* Don't tell them about moving exiting tasks - or kernel threads (both mm NULL), since - they never leave kernel. */ - if (tsk->mm && printk_ratelimit()) - printk(KERN_INFO "process %d (%s) no " - "longer affine to cpu%d\n", - tsk->pid, tsk->comm, src_cpu); - } + write_unlock_irq(&tasklist_lock); +} - move_task_away(tsk, dest_cpu); - } while_each_thread(t, tsk); +/* Schedules idle task to be the next runnable task on current CPU. + * It does so by boosting its priority to highest possible and adding it to + * the _front_ of runqueue. Used by CPU offline code. + */ +void sched_idle_next(void) +{ + int cpu = smp_processor_id(); + runqueue_t *rq = this_rq(); + struct task_struct *p = rq->idle; + unsigned long flags; + + /* cpu has to be offline */ + BUG_ON(cpu_online(cpu)); + + /* Strictly not necessary since rest of the CPUs are stopped by now + * and interrupts disabled on current cpu. + */ + spin_lock_irqsave(&rq->lock, flags); - write_unlock(&tasklist_lock); + __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); + /* Add idle task to _front_ of it's priority queue */ + __activate_idle_task(p, rq); + + spin_unlock_irqrestore(&rq->lock, flags); +} + +static void migrate_dead(unsigned int dead_cpu, task_t *tsk) +{ + struct runqueue *rq = cpu_rq(dead_cpu); + + /* Must be exiting, otherwise would be on tasklist. */ + BUG_ON(tsk->state != TASK_ZOMBIE && tsk->state != TASK_DEAD); + + /* Cannot have done final schedule yet: would have vanished. */ + BUG_ON(tsk->flags & PF_DEAD); + + get_task_struct(tsk); + + /* + * Drop lock around migration; if someone else moves it, + * that's OK. No task can be added to this CPU, so iteration is + * fine. + */ + spin_unlock_irq(&rq->lock); + move_task_off_dead_cpu(dead_cpu, tsk); + spin_lock_irq(&rq->lock); + + put_task_struct(tsk); +} + +/* release_task() removes task from tasklist, so we won't find dead tasks. */ +static void migrate_dead_tasks(unsigned int dead_cpu) +{ + unsigned arr, i; + struct runqueue *rq = cpu_rq(dead_cpu); + + for (arr = 0; arr < 2; arr++) { + for (i = 0; i < MAX_PRIO; i++) { + struct list_head *list = &rq->arrays[arr].queue[i]; + while (!list_empty(list)) + migrate_dead(dead_cpu, + list_entry(list->next, task_t, + run_list)); + } + } } #endif /* CONFIG_HOTPLUG_CPU */ @@ -2884,6 +4223,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); if (IS_ERR(p)) return NOTIFY_BAD; + p->flags |= PF_NOFREEZE; kthread_bind(p, cpu); /* Must be high prio: stop_machine expects to yield to it. */ rq = task_rq_lock(p, &flags); @@ -2899,18 +4239,48 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, case CPU_UP_CANCELED: /* Unbind it from offline cpu so it can run. Fall thru. */ kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id()); - case CPU_DEAD: kthread_stop(cpu_rq(cpu)->migration_thread); cpu_rq(cpu)->migration_thread = NULL; - BUG_ON(cpu_rq(cpu)->nr_running != 0); - break; + break; + case CPU_DEAD: + migrate_live_tasks(cpu); + rq = cpu_rq(cpu); + kthread_stop(rq->migration_thread); + rq->migration_thread = NULL; + /* Idle task back to normal (off runqueue, low prio) */ + rq = task_rq_lock(rq->idle, &flags); + deactivate_task(rq->idle, rq); + rq->idle->static_prio = MAX_PRIO; + __setscheduler(rq->idle, SCHED_NORMAL, 0); + migrate_dead_tasks(cpu); + task_rq_unlock(rq, &flags); + BUG_ON(rq->nr_running != 0); + + /* No need to migrate the tasks: it was best-effort if + * they didn't do lock_cpu_hotplug(). Just wake up + * the requestors. */ + spin_lock_irq(&rq->lock); + while (!list_empty(&rq->migration_queue)) { + migration_req_t *req; + req = list_entry(rq->migration_queue.next, + migration_req_t, list); + BUG_ON(req->type != REQ_MOVE_TASK); + list_del_init(&req->list); + complete(&req->done); + } + spin_unlock_irq(&rq->lock); + break; #endif } return NOTIFY_OK; } +/* Register at highest priority so that task migration (migrate_all_tasks) + * happens before everything else. + */ static struct notifier_block __devinitdata migration_notifier = { .notifier_call = migration_call, + .priority = 10 }; int __init migration_init(void) @@ -2939,23 +4309,519 @@ int __init migration_init(void) spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; EXPORT_SYMBOL(kernel_flag); +#ifdef CONFIG_SMP +/* Attach the domain 'sd' to 'cpu' as its base domain */ +static void cpu_attach_domain(struct sched_domain *sd, int cpu) +{ + migration_req_t req; + unsigned long flags; + runqueue_t *rq = cpu_rq(cpu); + int local = 1; + + lock_cpu_hotplug(); + + spin_lock_irqsave(&rq->lock, flags); + + if (cpu == smp_processor_id() || !cpu_online(cpu)) { + rq->sd = sd; + } else { + init_completion(&req.done); + req.type = REQ_SET_DOMAIN; + req.sd = sd; + list_add(&req.list, &rq->migration_queue); + local = 0; + } + + spin_unlock_irqrestore(&rq->lock, flags); + + if (!local) { + wake_up_process(rq->migration_thread); + wait_for_completion(&req.done); + } + + unlock_cpu_hotplug(); +} + +/* + * To enable disjoint top-level NUMA domains, define SD_NODES_PER_DOMAIN + * in arch code. That defines the number of nearby nodes in a node's top + * level scheduling domain. + */ +#if defined(CONFIG_NUMA) && defined(SD_NODES_PER_DOMAIN) +/** + * find_next_best_node - find the next node to include in a sched_domain + * @node: node whose sched_domain we're building + * @used_nodes: nodes already in the sched_domain + * + * Find the next node to include in a given scheduling domain. Simply + * finds the closest node not already in the @used_nodes map. + * + * Should use nodemask_t. + */ +static int __init find_next_best_node(int node, unsigned long *used_nodes) +{ + int i, n, val, min_val, best_node = 0; + + min_val = INT_MAX; + + for (i = 0; i < numnodes; i++) { + /* Start at @node */ + n = (node + i) % numnodes; + + /* Skip already used nodes */ + if (test_bit(n, used_nodes)) + continue; + + /* Simple min distance search */ + val = node_distance(node, i); + + if (val < min_val) { + min_val = val; + best_node = n; + } + } + + set_bit(best_node, used_nodes); + return best_node; +} + +/** + * sched_domain_node_span - get a cpumask for a node's sched_domain + * @node: node whose cpumask we're constructing + * @size: number of nodes to include in this span + * + * Given a node, construct a good cpumask for its sched_domain to span. It + * should be one that prevents unnecessary balancing, but also spreads tasks + * out optimally. + */ +cpumask_t __init sched_domain_node_span(int node) +{ + int i; + cpumask_t span; + DECLARE_BITMAP(used_nodes, MAX_NUMNODES); + + cpus_clear(span); + bitmap_zero(used_nodes, MAX_NUMNODES); + + for (i = 0; i < SD_NODES_PER_DOMAIN; i++) { + int next_node = find_next_best_node(node, used_nodes); + cpumask_t nodemask; + + nodemask = node_to_cpumask(next_node); + cpus_or(span, span, nodemask); + } + + return span; +} +#else /* CONFIG_NUMA && SD_NODES_PER_DOMAIN */ +cpumask_t __init sched_domain_node_span(int node) +{ + return cpu_possible_map; +} +#endif /* CONFIG_NUMA && SD_NODES_PER_DOMAIN */ + +#ifdef CONFIG_SCHED_SMT +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); +static struct sched_group sched_group_cpus[NR_CPUS]; +__init static int cpu_to_cpu_group(int cpu) +{ + return cpu; +} +#endif + +static DEFINE_PER_CPU(struct sched_domain, phys_domains); +static struct sched_group sched_group_phys[NR_CPUS]; +__init static int cpu_to_phys_group(int cpu) +{ +#ifdef CONFIG_SCHED_SMT + return first_cpu(cpu_sibling_map[cpu]); +#else + return cpu; +#endif +} + +#ifdef CONFIG_NUMA + +static DEFINE_PER_CPU(struct sched_domain, node_domains); +static struct sched_group sched_group_nodes[MAX_NUMNODES]; +__init static int cpu_to_node_group(int cpu) +{ + return cpu_to_node(cpu); +} +#endif + +/* Groups for isolated scheduling domains */ +static struct sched_group sched_group_isolated[NR_CPUS]; + +/* cpus with isolated domains */ +cpumask_t __initdata cpu_isolated_map = CPU_MASK_NONE; + +__init static int cpu_to_isolated_group(int cpu) +{ + return cpu; +} + +/* Setup the mask of cpus configured for isolated domains */ +static int __init isolated_cpu_setup(char *str) +{ + int ints[NR_CPUS], i; + + str = get_options(str, ARRAY_SIZE(ints), ints); + cpus_clear(cpu_isolated_map); + for (i = 1; i <= ints[0]; i++) + cpu_set(ints[i], cpu_isolated_map); + return 1; +} + +__setup ("isolcpus=", isolated_cpu_setup); + +/* + * init_sched_build_groups takes an array of groups, the cpumask we wish + * to span, and a pointer to a function which identifies what group a CPU + * belongs to. The return value of group_fn must be a valid index into the + * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we + * keep track of groups covered with a cpumask_t). + * + * init_sched_build_groups will build a circular linked list of the groups + * covered by the given span, and will set each group's ->cpumask correctly, + * and ->cpu_power to 0. + */ +__init static void init_sched_build_groups(struct sched_group groups[], + cpumask_t span, int (*group_fn)(int cpu)) +{ + struct sched_group *first = NULL, *last = NULL; + cpumask_t covered = CPU_MASK_NONE; + int i; + + for_each_cpu_mask(i, span) { + int group = group_fn(i); + struct sched_group *sg = &groups[group]; + int j; + + if (cpu_isset(i, covered)) + continue; + + sg->cpumask = CPU_MASK_NONE; + sg->cpu_power = 0; + + for_each_cpu_mask(j, span) { + if (group_fn(j) != group) + continue; + + cpu_set(j, covered); + cpu_set(j, sg->cpumask); + } + if (!first) + first = sg; + if (last) + last->next = sg; + last = sg; + } + last->next = first; +} + +__init static void arch_init_sched_domains(void) +{ + int i; + cpumask_t cpu_default_map; + + /* + * Setup mask for cpus without special case scheduling requirements. + * For now this just excludes isolated cpus, but could be used to + * exclude other special cases in the future. + */ + cpus_complement(cpu_default_map, cpu_isolated_map); + cpus_and(cpu_default_map, cpu_default_map, cpu_possible_map); + + /* Set up domains */ + for_each_cpu(i) { + int group; + struct sched_domain *sd = NULL, *p; + cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); + + cpus_and(nodemask, nodemask, cpu_default_map); + + /* + * Set up isolated domains. + * Unlike those of other cpus, the domains and groups are + * single level, and span a single cpu. + */ + if (cpu_isset(i, cpu_isolated_map)) { +#ifdef CONFIG_SCHED_SMT + sd = &per_cpu(cpu_domains, i); +#else + sd = &per_cpu(phys_domains, i); +#endif + group = cpu_to_isolated_group(i); + *sd = SD_CPU_INIT; + cpu_set(i, sd->span); + sd->balance_interval = INT_MAX; /* Don't balance */ + sd->flags = 0; /* Avoid WAKE_ */ + sd->groups = &sched_group_isolated[group]; + printk(KERN_INFO "Setting up cpu %d isolated.\n", i); + /* Single level, so continue with next cpu */ + continue; + } + +#ifdef CONFIG_NUMA + sd = &per_cpu(node_domains, i); + group = cpu_to_node_group(i); + *sd = SD_NODE_INIT; + /* FIXME: should be multilevel, in arch code */ + sd->span = sched_domain_node_span(i); + cpus_and(sd->span, sd->span, cpu_default_map); + sd->groups = &sched_group_nodes[group]; +#endif + + p = sd; + sd = &per_cpu(phys_domains, i); + group = cpu_to_phys_group(i); + *sd = SD_CPU_INIT; +#ifdef CONFIG_NUMA + sd->span = nodemask; +#else + sd->span = cpu_possible_map; +#endif + sd->parent = p; + sd->groups = &sched_group_phys[group]; + +#ifdef CONFIG_SCHED_SMT + p = sd; + sd = &per_cpu(cpu_domains, i); + group = cpu_to_cpu_group(i); + *sd = SD_SIBLING_INIT; + sd->span = cpu_sibling_map[i]; + cpus_and(sd->span, sd->span, cpu_default_map); + sd->parent = p; + sd->groups = &sched_group_cpus[group]; +#endif + } + +#ifdef CONFIG_SCHED_SMT + /* Set up CPU (sibling) groups */ + for_each_cpu(i) { + cpumask_t this_sibling_map = cpu_sibling_map[i]; + cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); + if (i != first_cpu(this_sibling_map)) + continue; + + init_sched_build_groups(sched_group_cpus, this_sibling_map, + &cpu_to_cpu_group); + } +#endif + + /* Set up isolated groups */ + for_each_cpu_mask(i, cpu_isolated_map) { + cpumask_t mask; + cpus_clear(mask); + cpu_set(i, mask); + init_sched_build_groups(sched_group_isolated, mask, + &cpu_to_isolated_group); + } + +#ifdef CONFIG_NUMA + /* Set up physical groups */ + for (i = 0; i < MAX_NUMNODES; i++) { + cpumask_t nodemask = node_to_cpumask(i); + + cpus_and(nodemask, nodemask, cpu_default_map); + if (cpus_empty(nodemask)) + continue; + + init_sched_build_groups(sched_group_phys, nodemask, + &cpu_to_phys_group); + } +#else + init_sched_build_groups(sched_group_phys, cpu_possible_map, + &cpu_to_phys_group); +#endif + +#ifdef CONFIG_NUMA + /* Set up node groups */ + init_sched_build_groups(sched_group_nodes, cpu_default_map, + &cpu_to_node_group); +#endif + + /* Calculate CPU power for physical packages and nodes */ + for_each_cpu_mask(i, cpu_default_map) { + int power; + struct sched_domain *sd; +#ifdef CONFIG_SCHED_SMT + sd = &per_cpu(cpu_domains, i); + power = SCHED_LOAD_SCALE; + sd->groups->cpu_power = power; +#endif + + sd = &per_cpu(phys_domains, i); + power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * + (cpus_weight(sd->groups->cpumask)-1) / 10; + sd->groups->cpu_power = power; + +#ifdef CONFIG_NUMA + if (i == first_cpu(sd->groups->cpumask)) { + /* Only add "power" once for each physical package. */ + sd = &per_cpu(node_domains, i); + sd->groups->cpu_power += power; + } +#endif + } + + /* Attach the domains */ + for_each_cpu(i) { + struct sched_domain *sd; +#ifdef CONFIG_SCHED_SMT + sd = &per_cpu(cpu_domains, i); +#else + sd = &per_cpu(phys_domains, i); +#endif + cpu_attach_domain(sd, i); + } +} + +#undef SCHED_DOMAIN_DEBUG +#ifdef SCHED_DOMAIN_DEBUG +void sched_domain_debug(void) +{ + int i; + + for_each_cpu(i) { + runqueue_t *rq = cpu_rq(i); + struct sched_domain *sd; + int level = 0; + + sd = rq->sd; + + printk(KERN_DEBUG "CPU%d: %s\n", + i, (cpu_online(i) ? " online" : "offline")); + + do { + int j; + char str[NR_CPUS]; + struct sched_group *group = sd->groups; + cpumask_t groupmask; + + cpumask_scnprintf(str, NR_CPUS, sd->span); + cpus_clear(groupmask); + + printk(KERN_DEBUG); + for (j = 0; j < level + 1; j++) + printk(" "); + printk("domain %d: span %s\n", level, str); + + if (!cpu_isset(i, sd->span)) + printk(KERN_DEBUG "ERROR domain->span does not contain CPU%d\n", i); + if (!cpu_isset(i, group->cpumask)) + printk(KERN_DEBUG "ERROR domain->groups does not contain CPU%d\n", i); + if (!group->cpu_power) + printk(KERN_DEBUG "ERROR domain->cpu_power not set\n"); + + printk(KERN_DEBUG); + for (j = 0; j < level + 2; j++) + printk(" "); + printk("groups:"); + do { + if (!group) { + printk(" ERROR: NULL"); + break; + } + + if (!cpus_weight(group->cpumask)) + printk(" ERROR empty group:"); + + if (cpus_intersects(groupmask, group->cpumask)) + printk(" ERROR repeated CPUs:"); + + cpus_or(groupmask, groupmask, group->cpumask); + + cpumask_scnprintf(str, NR_CPUS, group->cpumask); + printk(" %s", str); + + group = group->next; + } while (group != sd->groups); + printk("\n"); + + if (!cpus_equal(sd->span, groupmask)) + printk(KERN_DEBUG "ERROR groups don't span domain->span\n"); + + level++; + sd = sd->parent; + + if (sd) { + if (!cpus_subset(groupmask, sd->span)) + printk(KERN_DEBUG "ERROR parent span is not a superset of domain->span\n"); + } + + } while (sd); + } +} +#else +#define sched_domain_debug() {} +#endif + +void __init sched_init_smp(void) +{ + arch_init_sched_domains(); + sched_domain_debug(); +} +#else +void __init sched_init_smp(void) +{ +} +#endif /* CONFIG_SMP */ + +int in_sched_functions(unsigned long addr) +{ + /* Linker adds these: start and end of __sched functions */ + extern char __sched_text_start[], __sched_text_end[]; + return in_lock_functions(addr) || + (addr >= (unsigned long)__sched_text_start + && addr < (unsigned long)__sched_text_end); +} + void __init sched_init(void) { runqueue_t *rq; int i, j, k; +#ifdef CONFIG_SMP + /* Set up an initial dummy domain for early boot */ + static struct sched_domain sched_domain_init; + static struct sched_group sched_group_init; + + memset(&sched_domain_init, 0, sizeof(struct sched_domain)); + sched_domain_init.span = CPU_MASK_ALL; + sched_domain_init.groups = &sched_group_init; + sched_domain_init.last_balance = jiffies; + sched_domain_init.balance_interval = INT_MAX; /* Don't balance */ + sched_domain_init.busy_factor = 1; + + memset(&sched_group_init, 0, sizeof(struct sched_group)); + sched_group_init.cpumask = CPU_MASK_ALL; + sched_group_init.next = &sched_group_init; + sched_group_init.cpu_power = SCHED_LOAD_SCALE; +#endif + for (i = 0; i < NR_CPUS; i++) { prio_array_t *array; rq = cpu_rq(i); + spin_lock_init(&rq->lock); rq->active = rq->arrays; rq->expired = rq->arrays + 1; rq->best_expired_prio = MAX_PRIO; - spin_lock_init(&rq->lock); +#ifdef CONFIG_SMP + rq->sd = &sched_domain_init; + rq->cpu_load = 0; + rq->active_balance = 0; + rq->push_cpu = 0; + rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); +#endif +#ifdef CONFIG_VSERVER_HARDCPU + INIT_LIST_HEAD(&rq->hold_queue); +#endif atomic_set(&rq->nr_iowait, 0); - nr_running_init(rq); for (j = 0; j < 2; j++) { array = rq->arrays + j; @@ -2967,23 +4833,20 @@ void __init sched_init(void) __set_bit(MAX_PRIO, array->bitmap); } } - /* - * We have to do a little magic to get the first - * thread right in SMP mode. - */ - rq = this_rq(); - rq->curr = current; - rq->idle = current; - set_task_cpu(current, smp_processor_id()); - wake_up_forked_process(current); - - init_timers(); /* * The boot idle thread does lazy MMU switching as well: */ atomic_inc(&init_mm.mm_count); enter_lazy_tlb(&init_mm, current); + + /* + * Make us the idle thread. Technically, schedule() should not be + * called from this thread, however somewhere below it might be, + * but because we are the idle thread, we just pick up running again + * when this runqueue becomes "idle". + */ + init_idle(current, smp_processor_id()); } #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP @@ -3007,49 +4870,3 @@ void __might_sleep(char *file, int line) } EXPORT_SYMBOL(__might_sleep); #endif - - -#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) -/* - * This could be a long-held lock. If another CPU holds it for a long time, - * and that CPU is not asked to reschedule then *this* CPU will spin on the - * lock for a long time, even if *this* CPU is asked to reschedule. - * - * So what we do here, in the slow (contended) path is to spin on the lock by - * hand while permitting preemption. - * - * Called inside preempt_disable(). - */ -void __sched __preempt_spin_lock(spinlock_t *lock) -{ - if (preempt_count() > 1) { - _raw_spin_lock(lock); - return; - } - do { - preempt_enable(); - while (spin_is_locked(lock)) - cpu_relax(); - preempt_disable(); - } while (!_raw_spin_trylock(lock)); -} - -EXPORT_SYMBOL(__preempt_spin_lock); - -void __sched __preempt_write_lock(rwlock_t *lock) -{ - if (preempt_count() > 1) { - _raw_write_lock(lock); - return; - } - - do { - preempt_enable(); - while (rwlock_is_locked(lock)) - cpu_relax(); - preempt_disable(); - } while (!_raw_write_trylock(lock)); -} - -EXPORT_SYMBOL(__preempt_write_lock); -#endif /* defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) */