1 /*
2  *  kernel/sched.c
3  *
4  *  Kernel scheduler and related syscalls
5  *
6  *  Copyright (C) 1991-2002  Linus Torvalds
7  *
8  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
9  *              make semaphores SMP safe
10  *  1998-11-19  Implemented schedule_timeout() and related stuff
11  *              by Andrea Arcangeli
12  *  2002-01-04  New ultra-scalable O(1) scheduler by Ingo Molnar:
13  *              hybrid priority-list and round-robin design with
14  *              an array-switch method of distributing timeslices
15  *              and per-CPU runqueues.  Cleanups and useful suggestions
16  *              by Davide Libenzi, preemptible kernel bits by Robert Love.
17  *  2003-09-03  Interactivity tuning by Con Kolivas.
18  *  2004-04-02  Scheduler domains code by Nick Piggin
19  */
20
21 #include <linux/mm.h>
22 #include <linux/module.h>
23 #include <linux/nmi.h>
24 #include <linux/init.h>
25 #include <asm/uaccess.h>
26 #include <linux/highmem.h>
27 #include <linux/smp_lock.h>
28 #include <linux/pagemap.h>
29 #include <asm/mmu_context.h>
30 #include <linux/interrupt.h>
31 #include <linux/completion.h>
32 #include <linux/kernel_stat.h>
33 #include <linux/security.h>
34 #include <linux/notifier.h>
35 #include <linux/suspend.h>
36 #include <linux/blkdev.h>
37 #include <linux/delay.h>
38 #include <linux/smp.h>
39 #include <linux/timer.h>
40 #include <linux/rcupdate.h>
41 #include <linux/cpu.h>
42 #include <linux/percpu.h>
43 #include <linux/kthread.h>
44 #include <asm/tlb.h>
45
46 #include <asm/unistd.h>
47
48 #ifdef CONFIG_NUMA
49 #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
50 #else
51 #define cpu_to_node_mask(cpu) (cpu_online_map)
52 #endif
53
54 /*
55  * Convert user-nice values [ -20 ... 0 ... 19 ]
56  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
57  * and back.
58  */
59 #define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
60 #define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)
61 #define TASK_NICE(p)            PRIO_TO_NICE((p)->static_prio)
62
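/*
 * Illustrative values (assuming the usual MAX_RT_PRIO == 100 and
 * MAX_PRIO == 140 of this tree; other configurations may differ):
 *
 *   NICE_TO_PRIO(-20) == 100   (highest non-RT priority)
 *   NICE_TO_PRIO(  0) == 120   (the default)
 *   NICE_TO_PRIO( 19) == 139   (lowest priority, MAX_PRIO-1)
 */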
63 /*
64  * 'User priority' is the nice value converted to something we
65  * can work with better when scaling various scheduler parameters,
66  * it's a [ 0 ... 39 ] range.
67  */
68 #define USER_PRIO(p)            ((p)-MAX_RT_PRIO)
69 #define TASK_USER_PRIO(p)       USER_PRIO((p)->static_prio)
70 #define MAX_USER_PRIO           (USER_PRIO(MAX_PRIO))
71 #define AVG_TIMESLICE   (MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) *\
72                         (MAX_PRIO-1-NICE_TO_PRIO(0))/(MAX_USER_PRIO - 1)))
73
74 /*
75  * Some helpers for converting nanosecond timing to jiffy resolution
76  */
77 #define NS_TO_JIFFIES(TIME)     ((TIME) / (1000000000 / HZ))
78 #define JIFFIES_TO_NS(TIME)     ((TIME) * (1000000000 / HZ))
79
80 /*
81  * These are the 'tuning knobs' of the scheduler:
82  *
83  * Minimum timeslice is 10 msecs, default timeslice is 100 msecs,
84  * maximum timeslice is 200 msecs. Timeslices get refilled after
85  * they expire.
86  */
87 #define MIN_TIMESLICE           ( 10 * HZ / 1000)
88 #define MAX_TIMESLICE           (200 * HZ / 1000)
89 #define ON_RUNQUEUE_WEIGHT       30
90 #define CHILD_PENALTY            95
91 #define PARENT_PENALTY          100
92 #define EXIT_WEIGHT               3
93 #define PRIO_BONUS_RATIO         25
94 #define MAX_BONUS               (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
95 #define INTERACTIVE_DELTA         2
96 #define MAX_SLEEP_AVG           (AVG_TIMESLICE * MAX_BONUS)
97 #define STARVATION_LIMIT        (MAX_SLEEP_AVG)
98 #define NS_MAX_SLEEP_AVG        (JIFFIES_TO_NS(MAX_SLEEP_AVG))
99 #define CREDIT_LIMIT            100
100
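/*
 * Rough derived values, for orientation (assuming HZ == 1000,
 * MAX_RT_PRIO == 100 and MAX_PRIO == 140; other configs differ):
 *
 *   MIN_TIMESLICE    == 10 jiffies  (10 msecs)
 *   MAX_TIMESLICE    == 200 jiffies (200 msecs)
 *   MAX_USER_PRIO    == 40, MAX_BONUS == 40 * 25 / 100 == 10
 *   AVG_TIMESLICE    == 10 + 190 * 19 / 39 == 102 jiffies (~100 msecs)
 *   MAX_SLEEP_AVG    == 102 * 10 == 1020 jiffies (~1 second)
 *   NS_MAX_SLEEP_AVG == ~1.02e9 nsecs
 */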
101 /*
102  * If a task is 'interactive' then we reinsert it in the active
103  * array after it has expired its current timeslice. (it will not
104  * continue to run immediately; it will still round-robin with
105  * other interactive tasks.)
106  *
107  * This part scales the interactivity limit depending on niceness.
108  *
109  * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
110  * Here are a few examples of different nice levels:
111  *
112  *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
113  *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
114  *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
115  *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
116  *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
117  *
118  * (the X axis represents the possible -5 ... 0 ... +5 dynamic
119  *  priority range a task can explore; a value of '1' means the
120  *  task is rated interactive.)
121  *
122  * I.e. nice +19 tasks can never get 'interactive' enough to be
123  * reinserted into the active array, and only heavily CPU-bound nice -20
124  * tasks will be expired. Default nice 0 tasks are somewhere in between;
125  * it takes some effort for them to become interactive, but it's not
126  * too hard.
127  */
128
129 #define CURRENT_BONUS(p) \
130         (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
131                 MAX_SLEEP_AVG)
132
133 #ifdef CONFIG_SMP
134 #define TIMESLICE_GRANULARITY(p)        (MIN_TIMESLICE * \
135                 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
136                         num_online_cpus())
137 #else
138 #define TIMESLICE_GRANULARITY(p)        (MIN_TIMESLICE * \
139                 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
140 #endif
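/*
 * Rough effect of the granularity scaling above (assuming HZ == 1000
 * and MAX_BONUS == 10): a maximally interactive task (CURRENT_BONUS
 * == 10) gets MIN_TIMESLICE << 0 == 10 msec granules, one with
 * CURRENT_BONUS == 7 gets MIN_TIMESLICE << 2 == 40 msecs, and a pure
 * CPU hog (CURRENT_BONUS == 0) gets a granule larger than any
 * timeslice. I.e. the more interactive a task, the more often it
 * round-robins with queue-mates of equal priority; on SMP the granule
 * is further multiplied by num_online_cpus().
 */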
141
142 #define SCALE(v1,v1_max,v2_max) \
143         (v1) * (v2_max) / (v1_max)
144
145 #define DELTA(p) \
146         (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA)
147
148 #define TASK_INTERACTIVE(p) \
149         ((p)->prio <= (p)->static_prio - DELTA(p))
150
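/*
 * Worked example (nice 0, i.e. static_prio 120, assuming MAX_RT_PRIO
 * == 100 and MAX_BONUS == 10): DELTA(p) == 0 * 10 / 40 + 2 == 2, so
 * the task counts as interactive once p->prio <= 118, i.e. once it
 * has earned a sleep bonus of at least +2 out of the possible +5.
 */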
151 #define INTERACTIVE_SLEEP(p) \
152         (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
153                 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
154
155 #define HIGH_CREDIT(p) \
156         ((p)->interactive_credit > CREDIT_LIMIT)
157
158 #define LOW_CREDIT(p) \
159         ((p)->interactive_credit < -CREDIT_LIMIT)
160
161 #define TASK_PREEMPTS_CURR(p, rq) \
162         ((p)->prio < (rq)->curr->prio)
163
164 /*
165  * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
166  * to time slice values.
167  *
168  * The higher a thread's priority, the bigger timeslices
169  * it gets during one round of execution. But even the lowest
170  * priority thread gets MIN_TIMESLICE worth of execution time.
171  *
172  * task_timeslice() is the interface that is used by the scheduler.
173  */
174
175 #define BASE_TIMESLICE(p) (MIN_TIMESLICE + \
176                 ((MAX_TIMESLICE - MIN_TIMESLICE) * \
177                         (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1)))
178
179 static unsigned int task_timeslice(task_t *p)
180 {
181         return BASE_TIMESLICE(p);
182 }
183
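/*
 * With the same assumptions as above (HZ == 1000, MAX_RT_PRIO == 100,
 * MAX_PRIO == 140), task_timeslice() works out to roughly:
 *
 *   nice -20 (static_prio 100): 10 + 190 * 39 / 39 == 200 msecs
 *   nice   0 (static_prio 120): 10 + 190 * 19 / 39 == 102 msecs
 *   nice +19 (static_prio 139): 10 + 190 *  0 / 39 ==  10 msecs
 */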
184 #define task_hot(p, now, sd) ((now) - (p)->timestamp < (sd)->cache_hot_time)
185
186 /*
187  * These are the runqueue data structures:
188  */
189
190 #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
191
192 typedef struct runqueue runqueue_t;
193
194 struct prio_array {
195         unsigned int nr_active;
196         unsigned long bitmap[BITMAP_SIZE];
197         struct list_head queue[MAX_PRIO];
198 };
199
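/*
 * BITMAP_SIZE arithmetic, for reference (assuming MAX_PRIO == 140):
 * (140+1+7)/8 == 18 bytes of bitmap, rounded up to whole longs,
 * i.e. 5 longs on 32-bit and 3 longs on 64-bit.
 */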
200 /*
201  * This is the main, per-CPU runqueue data structure.
202  *
203  * Locking rule: in places that want to lock multiple runqueues
204  * (such as the load-balancing or the thread-migration code), lock
205  * acquisitions must be ordered by ascending runqueue address.
206  */
207 struct runqueue {
208         spinlock_t lock;
209
210         /*
211          * nr_running and cpu_load should be in the same cacheline because
212          * remote CPUs use both these fields when doing load calculation.
213          */
214         unsigned long nr_running;
215 #ifdef CONFIG_SMP
216         unsigned long cpu_load;
217 #endif
218         unsigned long long nr_switches, nr_preempt;
219         unsigned long expired_timestamp, nr_uninterruptible;
220         unsigned long long timestamp_last_tick;
221         task_t *curr, *idle;
222         struct mm_struct *prev_mm;
223         prio_array_t *active, *expired, arrays[2];
224         int best_expired_prio;
225         atomic_t nr_iowait;
226
227 #ifdef CONFIG_SMP
228         struct sched_domain *sd;
229
230         /* For active balancing */
231         int active_balance;
232         int push_cpu;
233
234         task_t *migration_thread;
235         struct list_head migration_queue;
236 #endif
237 };
238
239 static DEFINE_PER_CPU(struct runqueue, runqueues);
240
241 #define for_each_domain(cpu, domain) \
242         for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
243
244 #define cpu_rq(cpu)             (&per_cpu(runqueues, (cpu)))
245 #define this_rq()               (&__get_cpu_var(runqueues))
246 #define task_rq(p)              cpu_rq(task_cpu(p))
247 #define cpu_curr(cpu)           (cpu_rq(cpu)->curr)
248
249 /*
250  * Default context-switch locking:
251  */
252 #ifndef prepare_arch_switch
253 # define prepare_arch_switch(rq, next)  do { } while (0)
254 # define finish_arch_switch(rq, next)   spin_unlock_irq(&(rq)->lock)
255 # define task_running(rq, p)            ((rq)->curr == (p))
256 #endif
257
258 /*
259  * task_rq_lock - lock the runqueue a given task resides on and disable
260  * interrupts.  Note the ordering: we can safely look up the task_rq without
261  * explicitly disabling preemption.
262  */
263 static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
264 {
265         struct runqueue *rq;
266
267 repeat_lock_task:
268         local_irq_save(*flags);
269         rq = task_rq(p);
270         spin_lock(&rq->lock);
271         if (unlikely(rq != task_rq(p))) {
272                 spin_unlock_irqrestore(&rq->lock, *flags);
273                 goto repeat_lock_task;
274         }
275         return rq;
276 }
277
278 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
279 {
280         spin_unlock_irqrestore(&rq->lock, *flags);
281 }
282
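/*
 * Typical usage, as a sketch - the runqueue returned by task_rq_lock()
 * stays valid for the task for as long as the lock is held:
 *
 *      unsigned long flags;
 *      runqueue_t *rq = task_rq_lock(p, &flags);
 *      ... inspect or modify p's scheduling state ...
 *      task_rq_unlock(rq, &flags);
 */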
283 /*
284  * this_rq_lock - lock this CPU's runqueue and disable interrupts.
285  */
286 static runqueue_t *this_rq_lock(void)
287 {
288         runqueue_t *rq;
289
290         local_irq_disable();
291         rq = this_rq();
292         spin_lock(&rq->lock);
293
294         return rq;
295 }
296
297 static inline void rq_unlock(runqueue_t *rq)
298 {
299         spin_unlock_irq(&rq->lock);
300 }
301
302 /*
303  * Adding/removing a task to/from a priority array:
304  */
305 static void dequeue_task(struct task_struct *p, prio_array_t *array)
306 {
307         array->nr_active--;
308         list_del(&p->run_list);
309         if (list_empty(array->queue + p->prio))
310                 __clear_bit(p->prio, array->bitmap);
311 }
312
313 static void enqueue_task(struct task_struct *p, prio_array_t *array)
314 {
315         list_add_tail(&p->run_list, array->queue + p->prio);
316         __set_bit(p->prio, array->bitmap);
317         array->nr_active++;
318         p->array = array;
319 }
320
321 /*
322  * Used by the migration code - we pull tasks from the head of the
323  * remote queue so we want these tasks to show up at the head of the
324  * local queue:
325  */
326 static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
327 {
328         list_add(&p->run_list, array->queue + p->prio);
329         __set_bit(p->prio, array->bitmap);
330         array->nr_active++;
331         p->array = array;
332 }
333
334 /*
335  * effective_prio - return the priority that is based on the static
336  * priority but is modified by bonuses/penalties.
337  *
338  * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
339  * into the -5 ... 0 ... +5 bonus/penalty range.
340  *
341  * We use 25% of the full 0...39 priority range so that:
342  *
343  * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
344  * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
345  *
346  * Both properties are important to certain workloads.
347  */
348 static int effective_prio(task_t *p)
349 {
350         int bonus, prio;
351
352         if (rt_task(p))
353                 return p->prio;
354
355         bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
356
357         prio = p->static_prio - bonus;
358         if (prio < MAX_RT_PRIO)
359                 prio = MAX_RT_PRIO;
360         if (prio > MAX_PRIO-1)
361                 prio = MAX_PRIO-1;
362         return prio;
363 }
364
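/*
 * Example (nice 0, static_prio 120 assumed): a task whose sleep_avg
 * sits at NS_MAX_SLEEP_AVG has CURRENT_BONUS == 10, so bonus == +5
 * and prio == 115; a task that never sleeps has CURRENT_BONUS == 0,
 * bonus == -5 and prio == 125.
 */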
365 /*
366  * __activate_task - move a task to the runqueue.
367  */
368 static inline void __activate_task(task_t *p, runqueue_t *rq)
369 {
370         enqueue_task(p, rq->active);
371         rq->nr_running++;
372 }
373
374 /*
375  * __activate_idle_task - move idle task to the _front_ of runqueue.
376  */
377 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
378 {
379         enqueue_task_head(p, rq->active);
380         rq->nr_running++;
381 }
382
383 static void recalc_task_prio(task_t *p, unsigned long long now)
384 {
385         unsigned long long __sleep_time = now - p->timestamp;
386         unsigned long sleep_time;
387
388         if (__sleep_time > NS_MAX_SLEEP_AVG)
389                 sleep_time = NS_MAX_SLEEP_AVG;
390         else
391                 sleep_time = (unsigned long)__sleep_time;
392
393         if (likely(sleep_time > 0)) {
394                 /*
395                  * User tasks that sleep a long time are categorised as
396                  * idle; they get just enough interactive status to stay
397                  * active and to prevent them from suddenly becoming CPU
398                  * hogs and starving other processes.
399                  */
400                 if (p->mm && p->activated != -1 &&
401                         sleep_time > INTERACTIVE_SLEEP(p)) {
402                                 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
403                                                 AVG_TIMESLICE);
404                                 if (!HIGH_CREDIT(p))
405                                         p->interactive_credit++;
406                 } else {
407                         /*
408                          * The lower the sleep avg a task has the more
409                          * rapidly it will rise with sleep time.
410                          */
411                         sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
412
413                         /*
414                          * Tasks with low interactive_credit are limited to
415                          * one timeslice worth of sleep avg bonus.
416                          */
417                         if (LOW_CREDIT(p) &&
418                             sleep_time > JIFFIES_TO_NS(task_timeslice(p)))
419                                 sleep_time = JIFFIES_TO_NS(task_timeslice(p));
420
421                         /*
422                          * Tasks without high interactive_credit that wake from
423                          * uninterruptible sleep are limited in their sleep_avg
424                          * rise, as they are likely to be CPU hogs waiting on I/O.
425                          */
426                         if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm) {
427                                 if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
428                                         sleep_time = 0;
429                                 else if (p->sleep_avg + sleep_time >=
430                                                 INTERACTIVE_SLEEP(p)) {
431                                         p->sleep_avg = INTERACTIVE_SLEEP(p);
432                                         sleep_time = 0;
433                                 }
434                         }
435
436                         /*
437                          * This code gives a bonus to interactive tasks.
438                          *
439                          * The boost works by updating the 'average sleep time'
440                          * value here, based on ->timestamp. The more time a
441                          * task spends sleeping, the higher the average gets -
442                          * and the higher the priority boost gets as well.
443                          */
444                         p->sleep_avg += sleep_time;
445
446                         if (p->sleep_avg > NS_MAX_SLEEP_AVG) {
447                                 p->sleep_avg = NS_MAX_SLEEP_AVG;
448                                 if (!HIGH_CREDIT(p))
449                                         p->interactive_credit++;
450                         }
451                 }
452         }
453
454         p->prio = effective_prio(p);
455 }
456
457 /*
458  * activate_task - move a task to the runqueue and do priority recalculation
459  *
460  * Update all the scheduling statistics stuff. (sleep average
461  * calculation, priority modifiers, etc.)
462  */
463 static void activate_task(task_t *p, runqueue_t *rq, int local)
464 {
465         unsigned long long now;
466
467         now = sched_clock();
468 #ifdef CONFIG_SMP
469         if (!local) {
470                 /* Compensate for drifting sched_clock */
471                 runqueue_t *this_rq = this_rq();
472                 now = (now - this_rq->timestamp_last_tick)
473                         + rq->timestamp_last_tick;
474         }
475 #endif
476
477         recalc_task_prio(p, now);
478
479         /*
480          * This checks to make sure it's not an uninterruptible task
481          * that is now waking up.
482          */
483         if (!p->activated) {
484                 /*
485                  * Tasks which were woken up by interrupts (ie. hw events)
486                  * are most likely of interactive nature. So we give them
487                  * the credit of extending their sleep time to the period
488                  * of time they spend on the runqueue, waiting for execution
489                  * on a CPU, first time around:
490                  */
491                 if (in_interrupt())
492                         p->activated = 2;
493                 else {
494                         /*
495                          * Normal first-time wakeups get a credit too for
496                          * on-runqueue time, but it will be weighted down:
497                          */
498                         p->activated = 1;
499                 }
500         }
501         p->timestamp = now;
502
503         __activate_task(p, rq);
504 }
505
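/*
 * Summary of the ->activated encoding used above (and consumed later
 * in schedule()):
 *    0: default - no wakeup classification pending
 *    1: woken from (interruptible) sleep in process context
 *    2: woken by an interrupt / hardware event
 *   -1: woken from uninterruptible sleep (set in try_to_wake_up())
 */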
506 /*
507  * deactivate_task - remove a task from the runqueue.
508  */
509 static void deactivate_task(struct task_struct *p, runqueue_t *rq)
510 {
511         rq->nr_running--;
512         if (p->state == TASK_UNINTERRUPTIBLE)
513                 rq->nr_uninterruptible++;
514         dequeue_task(p, p->array);
515         p->array = NULL;
516 }
517
518 /*
519  * resched_task - mark a task 'to be rescheduled now'.
520  *
521  * On UP this means the setting of the need_resched flag, on SMP it
522  * might also involve a cross-CPU call to trigger the scheduler on
523  * the target CPU.
524  */
525 #ifdef CONFIG_SMP
526 static void resched_task(task_t *p)
527 {
528         int need_resched, nrpolling;
529
530         preempt_disable();
531         /* minimise the chance of sending an interrupt to poll_idle() */
532         nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
533         need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
534         nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
535
536         if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
537                 smp_send_reschedule(task_cpu(p));
538         preempt_enable();
539 }
540 #else
541 static inline void resched_task(task_t *p)
542 {
543         set_tsk_need_resched(p);
544 }
545 #endif
546
547 /**
548  * task_curr - is this task currently executing on a CPU?
549  * @p: the task in question.
550  */
551 inline int task_curr(const task_t *p)
552 {
553         return cpu_curr(task_cpu(p)) == p;
554 }
555
556 #ifdef CONFIG_SMP
557 enum request_type {
558         REQ_MOVE_TASK,
559         REQ_SET_DOMAIN,
560 };
561
562 typedef struct {
563         struct list_head list;
564         enum request_type type;
565
566         /* For REQ_MOVE_TASK */
567         task_t *task;
568         int dest_cpu;
569
570         /* For REQ_SET_DOMAIN */
571         struct sched_domain *sd;
572
573         struct completion done;
574 } migration_req_t;
575
576 /*
577  * The task's runqueue lock must be held.
578  * Returns true if you have to wait for migration thread.
579  */
580 static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
581 {
582         runqueue_t *rq = task_rq(p);
583
584         /*
585          * If the task is not on a runqueue (and not running), then
586          * it is sufficient to simply update the task's cpu field.
587          */
588         if (!p->array && !task_running(rq, p)) {
589                 set_task_cpu(p, dest_cpu);
590                 return 0;
591         }
592
593         init_completion(&req->done);
594         req->type = REQ_MOVE_TASK;
595         req->task = p;
596         req->dest_cpu = dest_cpu;
597         list_add(&req->list, &rq->migration_queue);
598         return 1;
599 }
600
601 /*
602  * wait_task_inactive - wait for a thread to unschedule.
603  *
604  * The caller must ensure that the task *will* unschedule sometime soon,
605  * else this function might spin for a *long* time. This function can't
606  * be called with interrupts off, or it may introduce deadlock with
607  * smp_call_function() if an IPI is sent by the same process we are
608  * waiting to become inactive.
609  */
610 void wait_task_inactive(task_t * p)
611 {
612         unsigned long flags;
613         runqueue_t *rq;
614         int preempted;
615
616 repeat:
617         rq = task_rq_lock(p, &flags);
618         /* Must be off runqueue entirely, not preempted. */
619         if (unlikely(p->array)) {
620                 /* If it's preempted, we yield.  It could be a while. */
621                 preempted = !task_running(rq, p);
622                 task_rq_unlock(rq, &flags);
623                 cpu_relax();
624                 if (preempted)
625                         yield();
626                 goto repeat;
627         }
628         task_rq_unlock(rq, &flags);
629 }
630
631 /***
632  * kick_process - kick a running thread to enter/exit the kernel
633  * @p: the to-be-kicked thread
634  *
635  * Cause a process which is running on another CPU to enter
636  * kernel-mode, without any delay. (to get signals handled.)
637  */
638 void kick_process(task_t *p)
639 {
640         int cpu;
641
642         preempt_disable();
643         cpu = task_cpu(p);
644         if ((cpu != smp_processor_id()) && task_curr(p))
645                 smp_send_reschedule(cpu);
646         preempt_enable();
647 }
648
649 EXPORT_SYMBOL_GPL(kick_process);
650
651 /*
652  * Return a low guess at the load of a migration-source cpu.
653  *
654  * We want to under-estimate the load of migration sources, to
655  * balance conservatively.
656  */
657 static inline unsigned long source_load(int cpu)
658 {
659         runqueue_t *rq = cpu_rq(cpu);
660         unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
661
662         return min(rq->cpu_load, load_now);
663 }
664
665 /*
666  * Return a high guess at the load of a migration-target cpu.
667  */
668 static inline unsigned long target_load(int cpu)
669 {
670         runqueue_t *rq = cpu_rq(cpu);
671         unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
672
673         return max(rq->cpu_load, load_now);
674 }
675
676 #endif
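/*
 * Example of the low/high estimates above, assuming SCHED_LOAD_SCALE
 * == 128 as in contemporary trees: a runqueue with 3 runnable tasks
 * has load_now == 384, so source_load() reports min(cpu_load, 384)
 * (a conservative figure for a migration source) while target_load()
 * reports max(cpu_load, 384) (a pessimistic figure for a target).
 */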
677
678 /*
679  * wake_idle() is useful especially on SMT architectures to wake a
680  * task onto an idle sibling if we would otherwise wake it onto a
681  * busy sibling.
682  *
683  * Returns the CPU we should wake onto.
684  */
685 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
686 static int wake_idle(int cpu, task_t *p)
687 {
688         cpumask_t tmp;
689         runqueue_t *rq = cpu_rq(cpu);
690         struct sched_domain *sd;
691         int i;
692
693         if (idle_cpu(cpu))
694                 return cpu;
695
696         sd = rq->sd;
697         if (!(sd->flags & SD_WAKE_IDLE))
698                 return cpu;
699
700         cpus_and(tmp, sd->span, cpu_online_map);
701         cpus_and(tmp, tmp, p->cpus_allowed);
702
703         for_each_cpu_mask(i, tmp) {
704                 if (idle_cpu(i))
705                         return i;
706         }
707
708         return cpu;
709 }
710 #else
711 static inline int wake_idle(int cpu, task_t *p)
712 {
713         return cpu;
714 }
715 #endif
716
717 /***
718  * try_to_wake_up - wake up a thread
719  * @p: the to-be-woken-up thread
720  * @state: the mask of task states that can be woken
721  * @sync: do a synchronous wakeup?
722  *
723  * Put it on the run-queue if it's not already there. The "current"
724  * thread is always on the run-queue (except when the actual
725  * re-schedule is in progress), and as such you're allowed to do
726  * the simpler "current->state = TASK_RUNNING" to mark yourself
727  * runnable without the overhead of this.
728  *
729  * returns failure only if the task is already active.
730  */
731 static int try_to_wake_up(task_t * p, unsigned int state, int sync)
732 {
733         int cpu, this_cpu, success = 0;
734         unsigned long flags;
735         long old_state;
736         runqueue_t *rq;
737 #ifdef CONFIG_SMP
738         unsigned long load, this_load;
739         struct sched_domain *sd;
740         int new_cpu;
741 #endif
742
743         rq = task_rq_lock(p, &flags);
744         old_state = p->state;
745         if (!(old_state & state))
746                 goto out;
747
748         if (p->array)
749                 goto out_running;
750
751         cpu = task_cpu(p);
752         this_cpu = smp_processor_id();
753
754 #ifdef CONFIG_SMP
755         if (unlikely(task_running(rq, p)))
756                 goto out_activate;
757
758         new_cpu = cpu;
759
760         if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
761                 goto out_set_cpu;
762
763         load = source_load(cpu);
764         this_load = target_load(this_cpu);
765
766         /*
767          * If sync wakeup then subtract the (maximum possible) effect of
768          * the currently running task from the load of the current CPU:
769          */
770         if (sync)
771                 this_load -= SCHED_LOAD_SCALE;
772
773         /* Don't pull the task off an idle CPU to a busy one */
774         if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
775                 goto out_set_cpu;
776
777         new_cpu = this_cpu; /* Wake to this CPU if we can */
778
779         /*
780          * Scan domains for affine wakeup and passive balancing
781          * possibilities.
782          */
783         for_each_domain(this_cpu, sd) {
784                 unsigned int imbalance;
785                 /*
786                  * Start passive balancing when half the imbalance_pct
787                  * limit is reached.
788                  */
789                 imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
790
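                /*
                 * E.g. with imbalance_pct == 125 this works out to 137,
                 * so the SD_WAKE_BALANCE test below only pulls the wakeup
                 * here while this_load is at most ~73% of the source load.
                 */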
791                 if ( ((sd->flags & SD_WAKE_AFFINE) &&
792                                 !task_hot(p, rq->timestamp_last_tick, sd))
793                         || ((sd->flags & SD_WAKE_BALANCE) &&
794                                 imbalance*this_load <= 100*load) ) {
795                         /*
796                          * Now sd has SD_WAKE_AFFINE and p is cache cold in sd
797                          * or sd has SD_WAKE_BALANCE and there is an imbalance
798                          */
799                         if (cpu_isset(cpu, sd->span))
800                                 goto out_set_cpu;
801                 }
802         }
803
804         new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
805 out_set_cpu:
806         new_cpu = wake_idle(new_cpu, p);
807         if (new_cpu != cpu && cpu_isset(new_cpu, p->cpus_allowed)) {
808                 set_task_cpu(p, new_cpu);
809                 task_rq_unlock(rq, &flags);
810                 /* might preempt at this point */
811                 rq = task_rq_lock(p, &flags);
812                 old_state = p->state;
813                 if (!(old_state & state))
814                         goto out;
815                 if (p->array)
816                         goto out_running;
817
818                 this_cpu = smp_processor_id();
819                 cpu = task_cpu(p);
820         }
821
822 out_activate:
823 #endif /* CONFIG_SMP */
824         if (old_state == TASK_UNINTERRUPTIBLE) {
825                 rq->nr_uninterruptible--;
826                 /*
827                  * Tasks on involuntary sleep don't earn
828                  * sleep_avg beyond just interactive state.
829                  */
830                 p->activated = -1;
831         }
832
833         /*
834          * Sync wakeups (i.e. those types of wakeups where the waker
835          * has indicated that it will leave the CPU in short order)
836          * don't trigger a preemption if the woken-up task will run on
837          * this CPU. (In this case the 'I will reschedule' promise of
838          * the waker guarantees that the freshly woken-up task is going
839          * to be considered on this CPU.)
840          */
841         activate_task(p, rq, cpu == this_cpu);
842         if (!sync || cpu != this_cpu) {
843                 if (TASK_PREEMPTS_CURR(p, rq))
844                         resched_task(rq->curr);
845         }
846         success = 1;
847
848 out_running:
849         p->state = TASK_RUNNING;
850 out:
851         task_rq_unlock(rq, &flags);
852
853         return success;
854 }
855
856 int fastcall wake_up_process(task_t * p)
857 {
858         return try_to_wake_up(p, TASK_STOPPED |
859                                  TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
860 }
861
862 EXPORT_SYMBOL(wake_up_process);
863
864 int fastcall wake_up_state(task_t *p, unsigned int state)
865 {
866         return try_to_wake_up(p, state, 0);
867 }
868
869 /*
870  * Perform scheduler related setup for a newly forked process p.
871  * p is forked by current.
872  */
873 void fastcall sched_fork(task_t *p)
874 {
875         /*
876          * We mark the process as running here, but have not actually
877          * inserted it onto the runqueue yet. This guarantees that
878          * nobody will actually run it, and a signal or other external
879          * event cannot wake it up and insert it on the runqueue either.
880          */
881         p->state = TASK_RUNNING;
882         INIT_LIST_HEAD(&p->run_list);
883         p->array = NULL;
884         spin_lock_init(&p->switch_lock);
885 #ifdef CONFIG_PREEMPT
886         /*
887          * During context-switch we hold precisely one spinlock, which
888          * schedule_tail drops. (in the common case it's this_rq()->lock,
889          * but it also can be p->switch_lock.) So we compensate with a count
890          * of 1. Also, we want to start with kernel preemption disabled.
891          */
892         p->thread_info->preempt_count = 1;
893 #endif
894         /*
895          * Share the timeslice between parent and child, so that the
896          * total amount of pending timeslices in the system doesn't
897          * change; this results in more scheduling fairness.
898          */
899         local_irq_disable();
900         p->time_slice = (current->time_slice + 1) >> 1;
901         /*
902          * The remainder of the first timeslice might be recovered by
903          * the parent if the child exits early enough.
904          */
905         p->first_time_slice = 1;
906         current->time_slice >>= 1;
907         p->timestamp = sched_clock();
908         if (!current->time_slice) {
909                 /*
910                  * This case is rare, it happens when the parent has only
911                  * a single jiffy left from its timeslice. Taking the
912                  * runqueue lock is not a problem.
913                  */
914                 current->time_slice = 1;
915                 preempt_disable();
916                 scheduler_tick(0, 0);
917                 local_irq_enable();
918                 preempt_enable();
919         } else
920                 local_irq_enable();
921 }
922
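/*
 * Example of the timeslice split above: a parent holding 7 ticks
 * forks; the child starts with (7 + 1) >> 1 == 4 ticks and the parent
 * keeps 7 >> 1 == 3, so the total number of pending ticks stays
 * exactly the same.
 */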
923 /*
924  * wake_up_forked_process - wake up a freshly forked process.
925  *
926  * This function will do some initial scheduler statistics housekeeping
927  * that must be done for every newly created process.
928  */
929 void fastcall wake_up_forked_process(task_t * p)
930 {
931         unsigned long flags;
932         runqueue_t *rq = task_rq_lock(current, &flags);
933
934         BUG_ON(p->state != TASK_RUNNING);
935
936         /*
937          * We decrease the sleep average of forking parents
938          * and children as well, to keep max-interactive tasks
939          * from forking tasks that are max-interactive.
940          */
941         current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
942                 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
943
944         p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
945                 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
946
947         p->interactive_credit = 0;
948
949         p->prio = effective_prio(p);
950         set_task_cpu(p, smp_processor_id());
951
952         if (unlikely(!current->array))
953                 __activate_task(p, rq);
954         else {
955                 p->prio = current->prio;
956                 list_add_tail(&p->run_list, &current->run_list);
957                 p->array = current->array;
958                 p->array->nr_active++;
959                 rq->nr_running++;
960         }
961         task_rq_unlock(rq, &flags);
962 }
963
964 /*
965  * Potentially available exiting-child timeslices are
966  * retrieved here - this way the parent does not get
967  * penalized for creating too many threads.
968  *
969  * (this cannot be used to 'generate' timeslices
970  * artificially, because any timeslice recovered here
971  * was given away by the parent in the first place.)
972  */
973 void fastcall sched_exit(task_t * p)
974 {
975         unsigned long flags;
976         runqueue_t *rq;
977
978         local_irq_save(flags);
979         if (p->first_time_slice) {
980                 p->parent->time_slice += p->time_slice;
981                 if (unlikely(p->parent->time_slice > MAX_TIMESLICE))
982                         p->parent->time_slice = MAX_TIMESLICE;
983         }
984         local_irq_restore(flags);
985         /*
986          * If the child was a (relative-) CPU hog then decrease
987          * the sleep_avg of the parent as well.
988          */
989         rq = task_rq_lock(p->parent, &flags);
990         if (p->sleep_avg < p->parent->sleep_avg)
991                 p->parent->sleep_avg = p->parent->sleep_avg /
992                 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
993                 (EXIT_WEIGHT + 1);
994         task_rq_unlock(rq, &flags);
995 }
996
997 /**
998  * finish_task_switch - clean up after a task-switch
999  * @prev: the thread we just switched away from.
1000  *
1001  * We enter this with the runqueue still locked, and finish_arch_switch()
1002  * will unlock it along with doing any other architecture-specific cleanup
1003  * actions.
1004  *
1005  * Note that we may have delayed dropping an mm in context_switch(). If
1006  * so, we finish that here outside of the runqueue lock.  (Doing it
1007  * with the lock held can cause deadlocks; see schedule() for
1008  * details.)
1009  */
1010 static void finish_task_switch(task_t *prev)
1011 {
1012         runqueue_t *rq = this_rq();
1013         struct mm_struct *mm = rq->prev_mm;
1014         unsigned long prev_task_flags;
1015
1016         rq->prev_mm = NULL;
1017
1018         /*
1019          * A task struct has one reference for the use as "current".
1020          * If a task dies, then it sets TASK_ZOMBIE in tsk->state and calls
1021          * schedule one last time. The schedule call will never return,
1022          * and the scheduled task must drop that reference.
1023          * The test for TASK_ZOMBIE must occur while the runqueue locks are
1024          * still held, otherwise prev could be scheduled on another cpu, die
1025          * there before we look at prev->state, and then the reference would
1026          * be dropped twice.
1027          *              Manfred Spraul <manfred@colorfullife.com>
1028          */
1029         prev_task_flags = prev->flags;
1030         finish_arch_switch(rq, prev);
1031         if (mm)
1032                 mmdrop(mm);
1033         if (unlikely(prev_task_flags & PF_DEAD))
1034                 put_task_struct(prev);
1035 }
1036
1037 /**
1038  * schedule_tail - first thing a freshly forked thread must call.
1039  * @prev: the thread we just switched away from.
1040  */
1041 asmlinkage void schedule_tail(task_t *prev)
1042 {
1043         finish_task_switch(prev);
1044
1045         if (current->set_child_tid)
1046                 put_user(current->pid, current->set_child_tid);
1047 }
1048
1049 /*
1050  * context_switch - switch to the new MM and the new
1051  * thread's register state.
1052  */
1053 static inline
1054 task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
1055 {
1056         struct mm_struct *mm = next->mm;
1057         struct mm_struct *oldmm = prev->active_mm;
1058
1059         if (unlikely(!mm)) {
1060                 next->active_mm = oldmm;
1061                 atomic_inc(&oldmm->mm_count);
1062                 enter_lazy_tlb(oldmm, next);
1063         } else
1064                 switch_mm(oldmm, mm, next);
1065
1066         if (unlikely(!prev->mm)) {
1067                 prev->active_mm = NULL;
1068                 WARN_ON(rq->prev_mm);
1069                 rq->prev_mm = oldmm;
1070         }
1071
1072         /* Here we just switch the register state and the stack. */
1073         switch_to(prev, next, prev);
1074
1075         return prev;
1076 }
1077
1078 /*
1079  * nr_running, nr_uninterruptible and nr_context_switches:
1080  *
1081  * externally visible scheduler statistics: current number of runnable
1082  * threads, current number of uninterruptible-sleeping threads, total
1083  * number of context switches performed since bootup.
1084  */
1085 unsigned long nr_running(void)
1086 {
1087         unsigned long i, sum = 0;
1088
1089         for_each_cpu(i)
1090                 sum += cpu_rq(i)->nr_running;
1091
1092         return sum;
1093 }
1094
1095 unsigned long nr_uninterruptible(void)
1096 {
1097         unsigned long i, sum = 0;
1098
1099         for_each_online_cpu(i)
1100                 sum += cpu_rq(i)->nr_uninterruptible;
1101
1102         return sum;
1103 }
1104
1105 unsigned long long nr_context_switches(void)
1106 {
1107         unsigned long long i, sum = 0;
1108
1109         for_each_online_cpu(i)
1110                 sum += cpu_rq(i)->nr_switches;
1111
1112         return sum;
1113 }
1114
1115 unsigned long nr_iowait(void)
1116 {
1117         unsigned long i, sum = 0;
1118
1119         for_each_online_cpu(i)
1120                 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1121
1122         return sum;
1123 }
1124
1125 /*
1126  * double_rq_lock - safely lock two runqueues
1127  *
1128  * Note this does not disable interrupts like task_rq_lock,
1129  * you need to do so manually before calling.
1130  */
1131 static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1132 {
1133         if (rq1 == rq2)
1134                 spin_lock(&rq1->lock);
1135         else {
1136                 if (rq1 < rq2) {
1137                         spin_lock(&rq1->lock);
1138                         spin_lock(&rq2->lock);
1139                 } else {
1140                         spin_lock(&rq2->lock);
1141                         spin_lock(&rq1->lock);
1142                 }
1143         }
1144 }
1145
1146 /*
1147  * double_rq_unlock - safely unlock two runqueues
1148  *
1149  * Note this does not restore interrupts like task_rq_unlock,
1150  * you need to do so manually after calling.
1151  */
1152 static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
1153 {
1154         spin_unlock(&rq1->lock);
1155         if (rq1 != rq2)
1156                 spin_unlock(&rq2->lock);
1157 }
1158
1159 unsigned long long nr_preempt(void)
1160 {
1161         unsigned long long i, sum = 0;
1162
1163         for_each_online_cpu(i)
1164                 sum += cpu_rq(i)->nr_preempt;
1165
1166         return sum;
1167 }
1168
1169 enum idle_type
1170 {
1171         IDLE,
1172         NOT_IDLE,
1173         NEWLY_IDLE,
1174 };
1175
1176 #ifdef CONFIG_SMP
1177
1178 /*
1179  * find_idlest_cpu - find the least busy runqueue.
1180  */
1181 static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1182                            struct sched_domain *sd)
1183 {
1184         unsigned long load, min_load, this_load;
1185         int i, min_cpu;
1186         cpumask_t mask;
1187
1188         min_cpu = UINT_MAX;
1189         min_load = ULONG_MAX;
1190
1191         cpus_and(mask, sd->span, cpu_online_map);
1192         cpus_and(mask, mask, p->cpus_allowed);
1193
1194         for_each_cpu_mask(i, mask) {
1195                 load = target_load(i);
1196
1197                 if (load < min_load) {
1198                         min_cpu = i;
1199                         min_load = load;
1200
1201                         /* break out early on an idle CPU: */
1202                         if (!min_load)
1203                                 break;
1204                 }
1205         }
1206
1207         /* charge this CPU for the new task (+1 task, i.e. SCHED_LOAD_SCALE) */
1208         this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
1209
1210         /*
1211          * With the addition of the new task to the current
1212          * CPU, would there be an imbalance between this CPU
1213          * and the idlest CPU?
1214          *
1215          * Use half of the balancing threshold - new-context is
1216          * a good opportunity to balance.
1217          */
1218         if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
1219                 return min_cpu;
1220
1221         return this_cpu;
1222 }
1223
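/*
 * Numeric feel for the threshold above, e.g. with imbalance_pct ==
 * 125: the factor is 112, so the idlest CPU is only picked when
 * min_load * 112 < this_load * 100, i.e. when it is at least ~11%
 * less loaded than this CPU even after this CPU has been charged
 * one SCHED_LOAD_SCALE for the new task.
 */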
1224 /*
1225  * wake_up_forked_thread - wake up a freshly forked thread.
1226  *
1227  * This function will do some initial scheduler statistics housekeeping
1228  * that must be done for every newly created context, and it also does
1229  * runqueue balancing.
1230  */
1231 void fastcall wake_up_forked_thread(task_t * p)
1232 {
1233         unsigned long flags;
1234         int this_cpu = get_cpu(), cpu;
1235         struct sched_domain *tmp, *sd = NULL;
1236         runqueue_t *this_rq = cpu_rq(this_cpu), *rq;
1237
1238         /*
1239          * Find the largest domain that this CPU is part of that
1240          * is willing to balance on clone:
1241          */
1242         for_each_domain(this_cpu, tmp)
1243                 if (tmp->flags & SD_BALANCE_CLONE)
1244                         sd = tmp;
1245         if (sd)
1246                 cpu = find_idlest_cpu(p, this_cpu, sd);
1247         else
1248                 cpu = this_cpu;
1249
1250         local_irq_save(flags);
1251 lock_again:
1252         rq = cpu_rq(cpu);
1253         double_rq_lock(this_rq, rq);
1254
1255         BUG_ON(p->state != TASK_RUNNING);
1256
1257         /*
1258          * We called find_idlest_cpu() without holding the lock, so in
1259          * theory the cpus_allowed mask could have changed - just don't
1260          * migrate in this case:
1261          */
1262         if (unlikely(!cpu_isset(cpu, p->cpus_allowed))) {
1263                 cpu = this_cpu;
1264                 double_rq_unlock(this_rq, rq);
1265                 goto lock_again;
1266         }
1267         /*
1268          * We decrease the sleep average of forking parents
1269          * and children as well, to keep max-interactive tasks
1270          * from forking tasks that are max-interactive.
1271          */
1272         current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
1273                 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1274
1275         p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
1276                 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1277
1278         p->interactive_credit = 0;
1279
1280         p->prio = effective_prio(p);
1281         set_task_cpu(p, cpu);
1282
1283         if (cpu == this_cpu) {
1284                 if (unlikely(!current->array))
1285                         __activate_task(p, rq);
1286                 else {
1287                         p->prio = current->prio;
1288                         list_add_tail(&p->run_list, &current->run_list);
1289                         p->array = current->array;
1290                         p->array->nr_active++;
1291                         rq->nr_running++;
1292                 }
1293         } else {
1294                 /* Not the local CPU - must adjust timestamp */
1295                 p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
1296                                         + rq->timestamp_last_tick;
1297                 __activate_task(p, rq);
1298                 if (TASK_PREEMPTS_CURR(p, rq))
1299                         resched_task(rq->curr);
1300         }
1301
1302         double_rq_unlock(this_rq, rq);
1303         local_irq_restore(flags);
1304         put_cpu();
1305 }
1306
1307 /*
1308  * If dest_cpu is allowed for this process, migrate the task to it.
1309  * This is accomplished by forcing the cpu_allowed mask to only
1310  * allow dest_cpu, which will force the cpu onto dest_cpu.  Then
1311  * the cpu_allowed mask is restored.
1312  */
1313 static void sched_migrate_task(task_t *p, int dest_cpu)
1314 {
1315         migration_req_t req;
1316         runqueue_t *rq;
1317         unsigned long flags;
1318
1319         rq = task_rq_lock(p, &flags);
1320         if (!cpu_isset(dest_cpu, p->cpus_allowed)
1321             || unlikely(cpu_is_offline(dest_cpu)))
1322                 goto out;
1323
1324         /* force the process onto the specified CPU */
1325         if (migrate_task(p, dest_cpu, &req)) {
1326                 /* Need to wait for migration thread (might exit: take ref). */
1327                 struct task_struct *mt = rq->migration_thread;
1328                 get_task_struct(mt);
1329                 task_rq_unlock(rq, &flags);
1330                 wake_up_process(mt);
1331                 put_task_struct(mt);
1332                 wait_for_completion(&req.done);
1333                 return;
1334         }
1335 out:
1336         task_rq_unlock(rq, &flags);
1337 }
1338
1339 /*
1340  * sched_balance_exec(): find the highest-level, exec-balance-capable
1341  * domain and try to migrate the task to the least loaded CPU.
1342  *
1343  * execve() is a valuable balancing opportunity, because at this point
1344  * the task has the smallest effective memory and cache footprint.
1345  */
1346 void sched_balance_exec(void)
1347 {
1348         struct sched_domain *tmp, *sd = NULL;
1349         int new_cpu, this_cpu = get_cpu();
1350
1351         /* Prefer the current CPU if there's only this task running */
1352         if (this_rq()->nr_running <= 1)
1353                 goto out;
1354
1355         for_each_domain(this_cpu, tmp)
1356                 if (tmp->flags & SD_BALANCE_EXEC)
1357                         sd = tmp;
1358
1359         if (sd) {
1360                 new_cpu = find_idlest_cpu(current, this_cpu, sd);
1361                 if (new_cpu != this_cpu) {
1362                         put_cpu();
1363                         sched_migrate_task(current, new_cpu);
1364                         return;
1365                 }
1366         }
1367 out:
1368         put_cpu();
1369 }
1370
1371 /*
1372  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1373  */
1374 static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1375 {
1376         if (unlikely(!spin_trylock(&busiest->lock))) {
1377                 if (busiest < this_rq) {
1378                         spin_unlock(&this_rq->lock);
1379                         spin_lock(&busiest->lock);
1380                         spin_lock(&this_rq->lock);
1381                 } else
1382                         spin_lock(&busiest->lock);
1383         }
1384 }
1385
1386 /*
1387  * pull_task - move a task from a remote runqueue to the local runqueue.
1388  * Both runqueues must be locked.
1389  */
1390 static inline
1391 void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1392                runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1393 {
1394         dequeue_task(p, src_array);
1395         src_rq->nr_running--;
1396         set_task_cpu(p, this_cpu);
1397         this_rq->nr_running++;
1398         enqueue_task(p, this_array);
1399         p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1400                                 + this_rq->timestamp_last_tick;
1401         /*
1402          * Note that idle threads have a prio of MAX_PRIO, so this test
1403          * is always true for them.
1404          */
1405         if (TASK_PREEMPTS_CURR(p, this_rq))
1406                 resched_task(this_rq->curr);
1407 }
1408
1409 /*
1410  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1411  */
1412 static inline
1413 int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1414                      struct sched_domain *sd, enum idle_type idle)
1415 {
1416         /*
1417          * We do not migrate tasks that are:
1418          * 1) running (obviously), or
1419          * 2) not allowed onto this CPU (cpus_allowed), or
1420          * 3) cache-hot on their current CPU.
1421          */
1422         if (task_running(rq, p))
1423                 return 0;
1424         if (!cpu_isset(this_cpu, p->cpus_allowed))
1425                 return 0;
1426
1427         /* Aggressive migration if we've failed balancing */
1428         if (idle == NEWLY_IDLE ||
1429                         sd->nr_balance_failed < sd->cache_nice_tries) {
1430                 if (task_hot(p, rq->timestamp_last_tick, sd))
1431                         return 0;
1432         }
1433
1434         return 1;
1435 }
1436
1437 /*
1438  * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
1439  * as part of a balancing operation within "domain". Returns the number of
1440  * tasks moved.
1441  *
1442  * Called with both runqueues locked.
1443  */
1444 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
1445                       unsigned long max_nr_move, struct sched_domain *sd,
1446                       enum idle_type idle)
1447 {
1448         prio_array_t *array, *dst_array;
1449         struct list_head *head, *curr;
1450         int idx, pulled = 0;
1451         task_t *tmp;
1452
1453         if (max_nr_move <= 0 || busiest->nr_running <= 1)
1454                 goto out;
1455
1456         /*
1457          * We first consider expired tasks. Those will likely not be
1458          * executed in the near future, and they are most likely to
1459          * be cache-cold, thus switching CPUs has the least effect
1460          * on them.
1461          */
1462         if (busiest->expired->nr_active) {
1463                 array = busiest->expired;
1464                 dst_array = this_rq->expired;
1465         } else {
1466                 array = busiest->active;
1467                 dst_array = this_rq->active;
1468         }
1469
1470 new_array:
1471         /* Start searching at priority 0: */
1472         idx = 0;
1473 skip_bitmap:
1474         if (!idx)
1475                 idx = sched_find_first_bit(array->bitmap);
1476         else
1477                 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1478         if (idx >= MAX_PRIO) {
1479                 if (array == busiest->expired && busiest->active->nr_active) {
1480                         array = busiest->active;
1481                         dst_array = this_rq->active;
1482                         goto new_array;
1483                 }
1484                 goto out;
1485         }
1486
1487         head = array->queue + idx;
1488         curr = head->prev;
1489 skip_queue:
1490         tmp = list_entry(curr, task_t, run_list);
1491
1492         curr = curr->prev;
1493
1494         if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
1495                 if (curr != head)
1496                         goto skip_queue;
1497                 idx++;
1498                 goto skip_bitmap;
1499         }
1500         pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
1501         pulled++;
1502
1503         /* We only want to steal up to the prescribed number of tasks. */
1504         if (pulled < max_nr_move) {
1505                 if (curr != head)
1506                         goto skip_queue;
1507                 idx++;
1508                 goto skip_bitmap;
1509         }
1510 out:
1511         return pulled;
1512 }
1513
1514 /*
1515  * find_busiest_group finds and returns the busiest CPU group within the
1516  * domain. It calculates and returns the number of tasks which should be
1517  * moved to restore balance via the imbalance parameter.
1518  */
1519 static struct sched_group *
1520 find_busiest_group(struct sched_domain *sd, int this_cpu,
1521                    unsigned long *imbalance, enum idle_type idle)
1522 {
1523         struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1524         unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1525
1526         max_load = this_load = total_load = total_pwr = 0;
1527
1528         do {
1529                 cpumask_t tmp;
1530                 unsigned long load;
1531                 int local_group;
1532                 int i, nr_cpus = 0;
1533
1534                 local_group = cpu_isset(this_cpu, group->cpumask);
1535
1536                 /* Tally up the load of all CPUs in the group */
1537                 avg_load = 0;
1538                 cpus_and(tmp, group->cpumask, cpu_online_map);
1539                 if (unlikely(cpus_empty(tmp)))
1540                         goto nextgroup;
1541
1542                 for_each_cpu_mask(i, tmp) {
1543                         /* Bias balancing toward cpus of our domain */
1544                         if (local_group)
1545                                 load = target_load(i);
1546                         else
1547                                 load = source_load(i);
1548
1549                         nr_cpus++;
1550                         avg_load += load;
1551                 }
1552
1553                 if (!nr_cpus)
1554                         goto nextgroup;
1555
1556                 total_load += avg_load;
1557                 total_pwr += group->cpu_power;
1558
1559                 /* Adjust by relative CPU power of the group */
1560                 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1561
1562                 if (local_group) {
1563                         this_load = avg_load;
1564                         this = group;
1565                         goto nextgroup;
1566                 } else if (avg_load > max_load) {
1567                         max_load = avg_load;
1568                         busiest = group;
1569                 }
1570 nextgroup:
1571                 group = group->next;
1572         } while (group != sd->groups);
1573
1574         if (!busiest || this_load >= max_load)
1575                 goto out_balanced;
1576
1577         avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
1578
1579         if (this_load >= avg_load ||
1580                         100*max_load <= sd->imbalance_pct*this_load)
1581                 goto out_balanced;
1582
1583         /*
1584          * We're trying to get all the cpus to the average_load, so we don't
1585          * want to push ourselves above the average load, nor do we wish to
1586          * reduce the max loaded cpu below the average load, as either of these
1587          * actions would just result in more rebalancing later, and ping-pong
1588          * tasks around. Thus we look for the minimum possible imbalance.
1589          * Negative imbalances (*we* are more loaded than anyone else) will
1590          * be counted as no imbalance for these purposes -- we can't fix that
1591          * by pulling tasks to us.  Be careful of negative numbers as they'll
1592          * appear as very large values with unsigned longs.
1593          */
1594         *imbalance = min(max_load - avg_load, avg_load - this_load);
1595
1596         /* How much load to actually move to equalise the imbalance */
1597         *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
1598                                 / SCHED_LOAD_SCALE;
1599
1600         if (*imbalance < SCHED_LOAD_SCALE - 1) {
1601                 unsigned long pwr_now = 0, pwr_move = 0;
1602                 unsigned long tmp;
1603
1604                 if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
1605                         *imbalance = 1;
1606                         return busiest;
1607                 }
1608
1609                 /*
1610                  * OK, we don't have enough imbalance to justify moving tasks,
1611                  * however we may be able to increase total CPU power used by
1612                  * moving them.
1613                  */
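                /*
                 * pwr_now and pwr_move estimate, in cpu_power units, how
                 * much CPU capacity is actually in use before and after a
                 * hypothetical move of one task's worth of load; the move
                 * is only considered worthwhile if it buys at least an
                 * extra eighth of a CPU of throughput (checked below).
                 */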
1614
1615                 pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
1616                 pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
1617                 pwr_now /= SCHED_LOAD_SCALE;
1618
1619                 /* Amount of load we'd subtract */
1620                 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
1621                 if (max_load > tmp)
1622                         pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
1623                                                         max_load - tmp);
1624
1625                 /* Amount of load we'd add */
1626                 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
1627                 if (max_load < tmp)
1628                         tmp = max_load;
1629                 pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
1630                 pwr_move /= SCHED_LOAD_SCALE;
1631
1632                 /* Move if we gain another 8th of a CPU worth of throughput */
1633                 if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
1634                         goto out_balanced;
1635
1636                 *imbalance = 1;
1637                 return busiest;
1638         }
1639
1640         /* Get rid of the scaling factor, rounding down as we divide */
1641         *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE;
1642
1643         return busiest;
1644
1645 out_balanced:
1646         if (busiest && (idle == NEWLY_IDLE ||
1647                         (idle == IDLE && max_load > SCHED_LOAD_SCALE)) ) {
1648                 *imbalance = 1;
1649                 return busiest;
1650         }
1651
1652         *imbalance = 0;
1653         return NULL;
1654 }
1655
1656 /*
1657  * find_busiest_queue - find the busiest runqueue among the cpus in group.
1658  */
1659 static runqueue_t *find_busiest_queue(struct sched_group *group)
1660 {
1661         cpumask_t tmp;
1662         unsigned long load, max_load = 0;
1663         runqueue_t *busiest = NULL;
1664         int i;
1665
1666         cpus_and(tmp, group->cpumask, cpu_online_map);
1667         for_each_cpu_mask(i, tmp) {
1668                 load = source_load(i);
1669
1670                 if (load > max_load) {
1671                         max_load = load;
1672                         busiest = cpu_rq(i);
1673                 }
1674         }
1675
1676         return busiest;
1677 }
1678
1679 /*
1680  * Check this_cpu to ensure it is balanced within domain. Attempt to move
1681  * tasks if there is an imbalance.
1682  *
1683  * Called with this_rq unlocked.
1684  */
1685 static int load_balance(int this_cpu, runqueue_t *this_rq,
1686                         struct sched_domain *sd, enum idle_type idle)
1687 {
1688         struct sched_group *group;
1689         runqueue_t *busiest;
1690         unsigned long imbalance;
1691         int nr_moved;
1692
1693         spin_lock(&this_rq->lock);
1694
1695         group = find_busiest_group(sd, this_cpu, &imbalance, idle);
1696         if (!group)
1697                 goto out_balanced;
1698
1699         busiest = find_busiest_queue(group);
1700         if (!busiest)
1701                 goto out_balanced;
1702         /*
1703          * This should be "impossible", but since load
1704          * balancing is inherently racy and statistical,
1705          * it could happen in theory.
1706          */
1707         if (unlikely(busiest == this_rq)) {
1708                 WARN_ON(1);
1709                 goto out_balanced;
1710         }
1711
1712         nr_moved = 0;
1713         if (busiest->nr_running > 1) {
1714                 /*
1715                  * Attempt to move tasks. If find_busiest_group has found
1716                  * an imbalance but busiest->nr_running <= 1, the group is
1717                  * still unbalanced. nr_moved simply stays zero, so it is
1718                  * correctly treated as an imbalance.
1719                  */
1720                 double_lock_balance(this_rq, busiest);
1721                 nr_moved = move_tasks(this_rq, this_cpu, busiest,
1722                                                 imbalance, sd, idle);
1723                 spin_unlock(&busiest->lock);
1724         }
1725         spin_unlock(&this_rq->lock);
1726
1727         if (!nr_moved) {
1728                 sd->nr_balance_failed++;
1729
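                /*
                 * Repeated failures usually mean that every candidate task
                 * on the busiest CPU was running, pinned by its affinity
                 * mask, or judged cache-hot.  In that case ask the busiest
                 * CPU's migration thread to actively push one of its
                 * running tasks over to us (see active_load_balance()).
                 */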
1730                 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
1731                         int wake = 0;
1732
1733                         spin_lock(&busiest->lock);
1734                         if (!busiest->active_balance) {
1735                                 busiest->active_balance = 1;
1736                                 busiest->push_cpu = this_cpu;
1737                                 wake = 1;
1738                         }
1739                         spin_unlock(&busiest->lock);
1740                         if (wake)
1741                                 wake_up_process(busiest->migration_thread);
1742
1743                         /*
1744                          * We've kicked active balancing, reset the failure
1745                          * counter.
1746                          */
1747                         sd->nr_balance_failed = sd->cache_nice_tries;
1748                 }
1749         } else
1750                 sd->nr_balance_failed = 0;
1751
1752         /* We were unbalanced, so reset the balancing interval */
1753         sd->balance_interval = sd->min_interval;
1754
1755         return nr_moved;
1756
1757 out_balanced:
1758         spin_unlock(&this_rq->lock);
1759
1760         /* tune up the balancing interval */
1761         if (sd->balance_interval < sd->max_interval)
1762                 sd->balance_interval *= 2;
1763
1764         return 0;
1765 }
1766
1767 /*
1768  * Check this_cpu to ensure it is balanced within domain. Attempt to move
1769  * tasks if there is an imbalance.
1770  *
1771  * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
1772  * this_rq is locked.
1773  */
1774 static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
1775                                 struct sched_domain *sd)
1776 {
1777         struct sched_group *group;
1778         runqueue_t *busiest = NULL;
1779         unsigned long imbalance;
1780         int nr_moved = 0;
1781
1782         group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
1783         if (!group)
1784                 goto out;
1785
1786         busiest = find_busiest_queue(group);
1787         if (!busiest || busiest == this_rq)
1788                 goto out;
1789
1790         /* Attempt to move tasks */
1791         double_lock_balance(this_rq, busiest);
1792
1793         nr_moved = move_tasks(this_rq, this_cpu, busiest,
1794                                         imbalance, sd, NEWLY_IDLE);
1795
1796         spin_unlock(&busiest->lock);
1797
1798 out:
1799         return nr_moved;
1800 }
1801
1802 /*
1803  * idle_balance is called by schedule() if this_cpu is about to become
1804  * idle. Attempts to pull tasks from other CPUs.
1805  */
1806 static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
1807 {
1808         struct sched_domain *sd;
1809
1810         for_each_domain(this_cpu, sd) {
1811                 if (sd->flags & SD_BALANCE_NEWIDLE) {
1812                         if (load_balance_newidle(this_cpu, this_rq, sd)) {
1813                                 /* We've pulled tasks over so stop searching */
1814                                 break;
1815                         }
1816                 }
1817         }
1818 }
1819
1820 /*
1821  * task off the cpu. It is needed to ensure that, where possible, at
1822  * least one task runs on each physical CPU, and that there is no
1823  * physical / logical imbalance.
1824  * logical imbalance.
1825  *
1826  * Called with busiest locked.
1827  */
1828 static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
1829 {
1830         struct sched_domain *sd;
1831         struct sched_group *group, *busy_group;
1832         int i;
1833
1834         if (busiest->nr_running <= 1)
1835                 return;
1836
1837         for_each_domain(busiest_cpu, sd)
1838                 if (cpu_isset(busiest->push_cpu, sd->span))
1839                         break;
1840         if (!sd) {
1841                 WARN_ON(1);
1842                 return;
1843         }
1844
1845         group = sd->groups;
1846         while (!cpu_isset(busiest_cpu, group->cpumask))
1847                 group = group->next;
1848         busy_group = group;
1849
1850         group = sd->groups;
1851         do {
1852                 cpumask_t tmp;
1853                 runqueue_t *rq;
1854                 int push_cpu = 0;
1855
1856                 if (group == busy_group)
1857                         goto next_group;
1858
1859                 cpus_and(tmp, group->cpumask, cpu_online_map);
1860                 if (!cpus_weight(tmp))
1861                         goto next_group;
1862
1863                 for_each_cpu_mask(i, tmp) {
1864                         if (!idle_cpu(i))
1865                                 goto next_group;
1866                         push_cpu = i;
1867                 }
1868
1869                 rq = cpu_rq(push_cpu);
1870
1871                 /*
1872                  * This condition is "impossible", but since load
1873                  * balancing is inherently a bit racy and statistical,
1874                  * it can trigger.. Reported by Bjorn Helgaas on a
1875                  * 128-cpu setup.
1876                  */
1877                 if (unlikely(busiest == rq))
1878                         goto next_group;
1879                 double_lock_balance(busiest, rq);
1880                 move_tasks(rq, push_cpu, busiest, 1, sd, IDLE);
1881                 spin_unlock(&rq->lock);
1882 next_group:
1883                 group = group->next;
1884         } while (group != sd->groups);
1885 }
1886
1887 /*
1888  * rebalance_tick will get called every timer tick, on every CPU.
1889  *
1890  * It checks each scheduling domain to see if it is due to be balanced,
1891  * and initiates a balancing operation if so.
1892  *
1893  * Balancing parameters are set up in arch_init_sched_domains.
1894  */
1895
1896 /* Don't have all balancing operations going off at once */
1897 #define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
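/*
 * For example, with HZ == 1000 and NR_CPUS == 8, CPU 3 sees its jiffy
 * counter offset by 375, so the per-CPU balancing passes are staggered
 * across the timer interval rather than all firing together.
 */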
1898
1899 static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
1900                            enum idle_type idle)
1901 {
1902         unsigned long old_load, this_load;
1903         unsigned long j = jiffies + CPU_OFFSET(this_cpu);
1904         struct sched_domain *sd;
1905
1906         /* Update our load */
1907         old_load = this_rq->cpu_load;
1908         this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
1909         /*
1910          * Round up the averaging division if load is increasing. This
1911          * prevents us from getting stuck on 9 if the load is 10, for
1912          * example.
1913          */
1914         if (this_load > old_load)
1915                 old_load++;
1916         this_rq->cpu_load = (old_load + this_load) / 2;
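        /*
         * Example of the rounding fix above: with old_load == 9 and
         * this_load == 10, (9 + 10) / 2 truncates back to 9 on every tick;
         * bumping old_load to 10 first lets cpu_load actually reach 10.
         */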
1917
1918         for_each_domain(this_cpu, sd) {
1919                 unsigned long interval = sd->balance_interval;
1920
1921                 if (idle != IDLE)
1922                         interval *= sd->busy_factor;
1923
1924                 /* scale ms to jiffies */
1925                 interval = msecs_to_jiffies(interval);
1926                 if (unlikely(!interval))
1927                         interval = 1;
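                /*
                 * For instance, a (purely illustrative) domain with a
                 * balance_interval of 8ms and a busy_factor of 32 gets
                 * rebalanced roughly every 8ms while this CPU is idle, but
                 * only about every 256ms while it is busy.
                 */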
1928
1929                 if (j - sd->last_balance >= interval) {
1930                         if (load_balance(this_cpu, this_rq, sd, idle)) {
1931                                 /* We've pulled tasks over so no longer idle */
1932                                 idle = NOT_IDLE;
1933                         }
1934                         sd->last_balance += interval;
1935                 }
1936         }
1937 }
1938 #else
1939 /*
1940  * on UP we do not need to balance between CPUs:
1941  */
1942 static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
1943 {
1944 }
1945 static inline void idle_balance(int cpu, runqueue_t *rq)
1946 {
1947 }
1948 #endif
1949
1950 static inline int wake_priority_sleeper(runqueue_t *rq)
1951 {
1952 #ifdef CONFIG_SCHED_SMT
1953         /*
1954          * If an SMT sibling task has been put to sleep for priority
1955          * reasons reschedule the idle task to see if it can now run.
1956          */
1957         if (rq->nr_running) {
1958                 resched_task(rq->idle);
1959                 return 1;
1960         }
1961 #endif
1962         return 0;
1963 }
1964
1965 DEFINE_PER_CPU(struct kernel_stat, kstat);
1966
1967 EXPORT_PER_CPU_SYMBOL(kstat);
1968
1969 /*
1970  * We place interactive tasks back into the active array, if possible.
1971  *
1972  * To guarantee that this does not starve expired tasks we ignore the
1973  * interactivity of a task if the first expired task had to wait more
1974  * than a 'reasonable' amount of time. This deadline timeout is
1975  * load-dependent, as the frequency of array switches decreases with
1976  * increasing number of running tasks. We also ignore the interactivity
1977  * if a better static_prio task has expired:
1978  */
1979 #define EXPIRED_STARVING(rq) \
1980         ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
1981                 (jiffies - (rq)->expired_timestamp >= \
1982                         STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
1983                         ((rq)->curr->static_prio > (rq)->best_expired_prio))
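/*
 * In other words: with, say, four runnable tasks, interactive reinsertion
 * is switched off once the oldest expired task has waited more than
 * 4 * STARVATION_LIMIT + 1 jiffies, or as soon as the current task has a
 * worse static priority than the best task already sitting in the expired
 * array.
 */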
1984
1985 /*
1986  * This function gets called by the timer code, with HZ frequency.
1987  * We call it with interrupts disabled.
1988  *
1989  * It also gets called by the fork code, when changing the parent's
1990  * timeslices.
1991  */
1992 void scheduler_tick(int user_ticks, int sys_ticks)
1993 {
1994         int cpu = smp_processor_id();
1995         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1996         runqueue_t *rq = this_rq();
1997         task_t *p = current;
1998
1999         rq->timestamp_last_tick = sched_clock();
2000
2001         if (rcu_pending(cpu))
2002                 rcu_check_callbacks(cpu, user_ticks);
2003
2004         /* note: this timer irq context must be accounted for as well */
2005         if (hardirq_count() - HARDIRQ_OFFSET) {
2006                 cpustat->irq += sys_ticks;
2007                 sys_ticks = 0;
2008         } else if (softirq_count()) {
2009                 cpustat->softirq += sys_ticks;
2010                 sys_ticks = 0;
2011         }
2012
2013         if (p == rq->idle) {
2014                 if (atomic_read(&rq->nr_iowait) > 0)
2015                         cpustat->iowait += sys_ticks;
2016                 else
2017                         cpustat->idle += sys_ticks;
2018                 if (wake_priority_sleeper(rq))
2019                         goto out;
2020                 rebalance_tick(cpu, rq, IDLE);
2021                 return;
2022         }
2023         if (TASK_NICE(p) > 0)
2024                 cpustat->nice += user_ticks;
2025         else
2026                 cpustat->user += user_ticks;
2027         cpustat->system += sys_ticks;
2028
2029         /* Task might have expired already, but not scheduled off yet */
2030         if (p->array != rq->active) {
2031                 set_tsk_need_resched(p);
2032                 goto out;
2033         }
2034         spin_lock(&rq->lock);
2035         /*
2036          * The task was running during this tick - update the
2037          * time slice counter. Note: we do not update a thread's
2038          * priority until it either goes to sleep or uses up its
2039          * timeslice. This makes it possible for interactive tasks
2040          * to use up their timeslices at their highest priority levels.
2041          */
2042         if (unlikely(rt_task(p))) {
2043                 /*
2044                  * RR tasks need a special form of timeslice management.
2045                  * FIFO tasks have no timeslices.
2046                  */
2047                 if ((p->policy == SCHED_RR) && !--p->time_slice) {
2048                         p->time_slice = task_timeslice(p);
2049                         p->first_time_slice = 0;
2050                         set_tsk_need_resched(p);
2051
2052                         /* put it at the end of the queue: */
2053                         dequeue_task(p, rq->active);
2054                         enqueue_task(p, rq->active);
2055                 }
2056                 goto out_unlock;
2057         }
2058         if (!--p->time_slice) {
2059                 dequeue_task(p, rq->active);
2060                 set_tsk_need_resched(p);
2061                 p->prio = effective_prio(p);
2062                 p->time_slice = task_timeslice(p);
2063                 p->first_time_slice = 0;
2064
2065                 if (!rq->expired_timestamp)
2066                         rq->expired_timestamp = jiffies;
2067                 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
2068                         enqueue_task(p, rq->expired);
2069                         if (p->static_prio < rq->best_expired_prio)
2070                                 rq->best_expired_prio = p->static_prio;
2071                 } else
2072                         enqueue_task(p, rq->active);
2073         } else {
2074                 /*
2075                  * Prevent a too long timeslice allowing a task to monopolize
2076                  * the CPU. We do this by splitting up the timeslice into
2077                  * smaller pieces.
2078                  *
2079                  * Note: this does not mean the task's timeslices expire or
2080                  * get lost in any way, they just might be preempted by
2081                  * another task of equal priority. (one with higher
2082                  * priority would have preempted this task already.) We
2083                  * requeue this task to the end of the list on this priority
2084                  * level, which is in essence a round-robin of tasks with
2085                  * equal priority.
2086                  *
2087                  * This only applies to tasks in the interactive
2088                  * delta range with at least TIMESLICE_GRANULARITY to requeue.
2089                  */
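                /*
                 * Concretely: an interactive task is moved to the back of
                 * its priority queue each time it has consumed another
                 * TIMESLICE_GRANULARITY worth of its timeslice, provided at
                 * least that much of the timeslice is still left.
                 */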
2090                 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
2091                         p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
2092                         (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
2093                         (p->array == rq->active)) {
2094
2095                         dequeue_task(p, rq->active);
2096                         set_tsk_need_resched(p);
2097                         p->prio = effective_prio(p);
2098                         enqueue_task(p, rq->active);
2099                 }
2100         }
2101 out_unlock:
2102         spin_unlock(&rq->lock);
2103 out:
2104         rebalance_tick(cpu, rq, NOT_IDLE);
2105 }
2106
2107 #ifdef CONFIG_SCHED_SMT
2108 static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq)
2109 {
2110         int i;
2111         struct sched_domain *sd = rq->sd;
2112         cpumask_t sibling_map;
2113
2114         if (!(sd->flags & SD_SHARE_CPUPOWER))
2115                 return;
2116
2117         cpus_and(sibling_map, sd->span, cpu_online_map);
2118         for_each_cpu_mask(i, sibling_map) {
2119                 runqueue_t *smt_rq;
2120
2121                 if (i == cpu)
2122                         continue;
2123
2124                 smt_rq = cpu_rq(i);
2125
2126                 /*
2127                  * If an SMT sibling task is sleeping due to priority
2128                  * reasons wake it up now.
2129                  */
2130                 if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
2131                         resched_task(smt_rq->idle);
2132         }
2133 }
2134
2135 static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p)
2136 {
2137         struct sched_domain *sd = rq->sd;
2138         cpumask_t sibling_map;
2139         int ret = 0, i;
2140
2141         if (!(sd->flags & SD_SHARE_CPUPOWER))
2142                 return 0;
2143
2144         cpus_and(sibling_map, sd->span, cpu_online_map);
2145         for_each_cpu_mask(i, sibling_map) {
2146                 runqueue_t *smt_rq;
2147                 task_t *smt_curr;
2148
2149                 if (i == cpu)
2150                         continue;
2151
2152                 smt_rq = cpu_rq(i);
2153                 smt_curr = smt_rq->curr;
2154
2155                 /*
2156                  * If a user task with lower static priority than the
2157                  * running task on the SMT sibling is trying to schedule,
2158                  * delay it until the sibling task has proportionately
2159                  * less timeslice left, to prevent a lower priority
2160                  * task from using an unfair share of the
2161                  * physical cpu's resources. -ck
2162                  */
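                /*
                 * With a per_cpu_gain of 25, say (a typical value for SMT
                 * sibling domains), a non-RT user task is held back for as
                 * long as 75% of the sibling task's remaining timeslice
                 * still exceeds the incoming task's full timeslice, or
                 * while the sibling is running an RT task.
                 */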
2163                 if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) >
2164                         task_timeslice(p) || rt_task(smt_curr)) &&
2165                         p->mm && smt_curr->mm && !rt_task(p))
2166                                 ret = 1;
2167
2168                 /*
2169                  * Reschedule a lower priority task on the SMT sibling,
2170                  * or wake it up if it has been put to sleep for priority
2171                  * reasons.
2172                  */
2173                 if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) >
2174                         task_timeslice(smt_curr) || rt_task(p)) &&
2175                         smt_curr->mm && p->mm && !rt_task(smt_curr)) ||
2176                         (smt_curr == smt_rq->idle && smt_rq->nr_running))
2177                                 resched_task(smt_curr);
2178         }
2179         return ret;
2180 }
2181 #else
2182 static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq)
2183 {
2184 }
2185
2186 static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p)
2187 {
2188         return 0;
2189 }
2190 #endif
2191
2192 /*
2193  * schedule() is the main scheduler function.
2194  */
2195 asmlinkage void __sched schedule(void)
2196 {
2197         long *switch_count;
2198         task_t *prev, *next;
2199         runqueue_t *rq;
2200         prio_array_t *array;
2201         struct list_head *queue;
2202         unsigned long long now;
2203         unsigned long run_time;
2204         int cpu, idx;
2205
2206         //WARN_ON(system_state == SYSTEM_BOOTING);
2207         /*
2208          * Test if we are atomic.  Since do_exit() needs to call into
2209          * schedule() atomically, we ignore that path for now.
2210          * Otherwise, whine if we are scheduling when we should not be.
2211          */
2212         if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) {
2213                 if (unlikely(in_atomic())) {
2214                         printk(KERN_ERR "bad: scheduling while atomic!\n");
2215                         dump_stack();
2216                 }
2217         }
2218
2219 need_resched:
2220         preempt_disable();
2221         prev = current;
2222         rq = this_rq();
2223
2224         release_kernel_lock(prev);
2225         now = sched_clock();
2226         if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
2227                 run_time = now - prev->timestamp;
2228         else
2229                 run_time = NS_MAX_SLEEP_AVG;
2230
2231         /*
2232          * Tasks with interactive credits get charged less run_time
2233          * at high sleep_avg to delay them losing their interactive
2234          * status
2235          */
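        /*
         * At the maximum bonus (MAX_BONUS, which works out to 10 with the
         * stock priority ranges) such a task is charged only about a tenth
         * of the CPU time it actually consumed, which is what lets it hang
         * on to its interactive status.
         */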
2236         if (HIGH_CREDIT(prev))
2237                 run_time /= (CURRENT_BONUS(prev) ? : 1);
2238
2239         spin_lock_irq(&rq->lock);
2240
2241         /*
2242          * if entering off of a kernel preemption go straight
2243          * to picking the next task.
2244          */
2245         switch_count = &prev->nivcsw;
2246         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2247                 switch_count = &prev->nvcsw;
2248                 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
2249                                 unlikely(signal_pending(prev))))
2250                         prev->state = TASK_RUNNING;
2251                 else
2252                         deactivate_task(prev, rq);
2253         }
2254
2255         cpu = smp_processor_id();
2256         if (unlikely(!rq->nr_running)) {
2257                 idle_balance(cpu, rq);
2258                 if (!rq->nr_running) {
2259                         next = rq->idle;
2260                         rq->expired_timestamp = 0;
2261                         wake_sleeping_dependent(cpu, rq);
2262                         goto switch_tasks;
2263                 }
2264         }
2265
2266         array = rq->active;
2267         if (unlikely(!array->nr_active)) {
2268                 /*
2269                  * Switch the active and expired arrays.
2270                  */
2271                 rq->active = rq->expired;
2272                 rq->expired = array;
2273                 array = rq->active;
2274                 rq->expired_timestamp = 0;
2275                 rq->best_expired_prio = MAX_PRIO;
2276         }
2277
2278         idx = sched_find_first_bit(array->bitmap);
2279         queue = array->queue + idx;
2280         next = list_entry(queue->next, task_t, run_list);
2281
2282         if (dependent_sleeper(cpu, rq, next)) {
2283                 next = rq->idle;
2284                 goto switch_tasks;
2285         }
2286
2287         if (!rt_task(next) && next->activated > 0) {
2288                 unsigned long long delta = now - next->timestamp;
2289
2290                 if (next->activated == 1)
2291                         delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
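                /*
                 * A task marked activated == 2 (typically one woken from
                 * interrupt context) keeps its full runqueue wait time as
                 * sleep credit in the recalculation below; the common
                 * activated == 1 case is scaled down above to roughly
                 * ON_RUNQUEUE_WEIGHT percent (about 30%) of that time.
                 */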
2292
2293                 array = next->array;
2294                 dequeue_task(next, array);
2295                 recalc_task_prio(next, next->timestamp + delta);
2296                 enqueue_task(next, array);
2297         }
2298         next->activated = 0;
2299 switch_tasks:
2300         prefetch(next);
2301         if (test_and_clear_tsk_thread_flag(prev,TIF_NEED_RESCHED))
2302                 rq->nr_preempt++;
2303         RCU_qsctr(task_cpu(prev))++;
2304
2305         prev->sleep_avg -= run_time;
2306         if ((long)prev->sleep_avg <= 0) {
2307                 prev->sleep_avg = 0;
2308                 if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev)))
2309                         prev->interactive_credit--;
2310         }
2311         prev->timestamp = now;
2312
2313         if (likely(prev != next)) {
2314                 next->timestamp = now;
2315                 rq->nr_switches++;
2316                 rq->curr = next;
2317                 ++*switch_count;
2318
2319                 prepare_arch_switch(rq, next);
2320                 prev = context_switch(rq, prev, next);
2321                 barrier();
2322
2323                 finish_task_switch(prev);
2324         } else
2325                 spin_unlock_irq(&rq->lock);
2326
2327         reacquire_kernel_lock(current);
2328         preempt_enable_no_resched();
2329         if (test_thread_flag(TIF_NEED_RESCHED))
2330                 goto need_resched;
2331 }
2332
2333 EXPORT_SYMBOL(schedule);
2334
2335 #ifdef CONFIG_PREEMPT
2336 /*
2337  * this is the entry point to schedule() from in-kernel preemption
2338  * off of preempt_enable.  Kernel preemptions off of the return-from-
2339  * interrupt path occur elsewhere and call schedule() directly.
2340  */
2341 asmlinkage void __sched preempt_schedule(void)
2342 {
2343         struct thread_info *ti = current_thread_info();
2344
2345         /*
2346          * If there is a non-zero preempt_count or interrupts are disabled,
2347          * we do not want to preempt the current task.  Just return..
2348          */
2349         if (unlikely(ti->preempt_count || irqs_disabled()))
2350                 return;
2351
2352 need_resched:
2353         ti->preempt_count = PREEMPT_ACTIVE;
2354         schedule();
2355         ti->preempt_count = 0;
2356
2357         /* we could miss a preemption opportunity between schedule and now */
2358         barrier();
2359         if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
2360                 goto need_resched;
2361 }
2362
2363 EXPORT_SYMBOL(preempt_schedule);
2364 #endif /* CONFIG_PREEMPT */
2365
2366 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
2367 {
2368         task_t *p = curr->task;
2369         return try_to_wake_up(p, mode, sync);
2370 }
2371
2372 EXPORT_SYMBOL(default_wake_function);
2373
2374 /*
2375  * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
2376  * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small +ve
2377  * number) then we wake all the non-exclusive tasks and one exclusive task.
2378  *
2379  * There are circumstances in which we can try to wake a task which has already
2380  * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
2381  * zero in this (rare) case, and we handle it by continuing to scan the queue.
2382  */
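/*
 * For instance, with nr_exclusive == 1 every non-exclusive waiter queued
 * ahead of the first successfully woken WQ_FLAG_EXCLUSIVE waiter is woken
 * as well, and any remaining exclusive waiters are left on the queue.
 */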
2383 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
2384                              int nr_exclusive, int sync, void *key)
2385 {
2386         struct list_head *tmp, *next;
2387
2388         list_for_each_safe(tmp, next, &q->task_list) {
2389                 wait_queue_t *curr;
2390                 unsigned flags;
2391                 curr = list_entry(tmp, wait_queue_t, task_list);
2392                 flags = curr->flags;
2393                 if (curr->func(curr, mode, sync, key) &&
2394                     (flags & WQ_FLAG_EXCLUSIVE) &&
2395                     !--nr_exclusive)
2396                         break;
2397         }
2398 }
2399
2400 /**
2401  * __wake_up - wake up threads blocked on a waitqueue.
2402  * @q: the waitqueue
2403  * @mode: which threads
2404  * @nr_exclusive: how many wake-one or wake-many threads to wake up
2405  */
2406 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
2407                                 int nr_exclusive, void *key)
2408 {
2409         unsigned long flags;
2410
2411         spin_lock_irqsave(&q->lock, flags);
2412         __wake_up_common(q, mode, nr_exclusive, 0, key);
2413         spin_unlock_irqrestore(&q->lock, flags);
2414 }
2415
2416 EXPORT_SYMBOL(__wake_up);
2417
2418 /*
2419  * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
2420  */
2421 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
2422 {
2423         __wake_up_common(q, mode, 1, 0, NULL);
2424 }
2425
2426 /**
2427  * __wake_up_sync - wake up threads blocked on a waitqueue (sync wakeup).
2428  * @q: the waitqueue
2429  * @mode: which threads
2430  * @nr_exclusive: how many wake-one or wake-many threads to wake up
2431  *
2432  * The sync wakeup differs in that the waker knows that it will schedule
2433  * away soon, so while the target thread will be woken up, it will not
2434  * be migrated to another CPU - ie. the two threads are 'synchronized'
2435  * with each other. This can prevent needless bouncing between CPUs.
2436  *
2437  * On UP it can prevent extra preemption.
2438  */
2439 void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
2440 {
2441         unsigned long flags;
2442         int sync = 1;
2443
2444         if (unlikely(!q))
2445                 return;
2446
2447         if (unlikely(!nr_exclusive))
2448                 sync = 0;
2449
2450         spin_lock_irqsave(&q->lock, flags);
2451         __wake_up_common(q, mode, nr_exclusive, sync, NULL);
2452         spin_unlock_irqrestore(&q->lock, flags);
2453 }
2454 EXPORT_SYMBOL_GPL(__wake_up_sync);      /* For internal use only */
2455
2456 void fastcall complete(struct completion *x)
2457 {
2458         unsigned long flags;
2459
2460         spin_lock_irqsave(&x->wait.lock, flags);
2461         x->done++;
2462         __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
2463                          1, 0, NULL);
2464         spin_unlock_irqrestore(&x->wait.lock, flags);
2465 }
2466 EXPORT_SYMBOL(complete);
2467
2468 void fastcall complete_all(struct completion *x)
2469 {
2470         unsigned long flags;
2471
2472         spin_lock_irqsave(&x->wait.lock, flags);
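        /*
         * Bumping ->done by UINT_MAX/2 records an effectively unbounded
         * number of completions, so every waiter - current or future -
         * that decrements ->done in wait_for_completion() gets through,
         * unlike complete() which releases exactly one.
         */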
2473         x->done += UINT_MAX/2;
2474         __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
2475                          0, 0, NULL);
2476         spin_unlock_irqrestore(&x->wait.lock, flags);
2477 }
2478 EXPORT_SYMBOL(complete_all);
2479
2480 void fastcall __sched wait_for_completion(struct completion *x)
2481 {
2482         might_sleep();
2483         spin_lock_irq(&x->wait.lock);
2484         if (!x->done) {
2485                 DECLARE_WAITQUEUE(wait, current);
2486
2487                 wait.flags |= WQ_FLAG_EXCLUSIVE;
2488                 __add_wait_queue_tail(&x->wait, &wait);
2489                 do {
2490                         __set_current_state(TASK_UNINTERRUPTIBLE);
2491                         spin_unlock_irq(&x->wait.lock);
2492                         schedule();
2493                         spin_lock_irq(&x->wait.lock);
2494                 } while (!x->done);
2495                 __remove_wait_queue(&x->wait, &wait);
2496         }
2497         x->done--;
2498         spin_unlock_irq(&x->wait.lock);
2499 }
2500 EXPORT_SYMBOL(wait_for_completion);
2501
2502 #define SLEEP_ON_VAR                                    \
2503         unsigned long flags;                            \
2504         wait_queue_t wait;                              \
2505         init_waitqueue_entry(&wait, current);
2506
2507 #define SLEEP_ON_HEAD                                   \
2508         spin_lock_irqsave(&q->lock,flags);              \
2509         __add_wait_queue(q, &wait);                     \
2510         spin_unlock(&q->lock);
2511
2512 #define SLEEP_ON_TAIL                                   \
2513         spin_lock_irq(&q->lock);                        \
2514         __remove_wait_queue(q, &wait);                  \
2515         spin_unlock_irqrestore(&q->lock, flags);
2516
2517 #define SLEEP_ON_BKLCHECK                               \
2518         if (unlikely(!kernel_locked()) &&               \
2519             sleep_on_bkl_warnings < 10) {               \
2520                 sleep_on_bkl_warnings++;                \
2521                 WARN_ON(1);                             \
2522         }
2523
2524 static int sleep_on_bkl_warnings;
2525
2526 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
2527 {
2528         SLEEP_ON_VAR
2529
2530         SLEEP_ON_BKLCHECK
2531
2532         current->state = TASK_INTERRUPTIBLE;
2533
2534         SLEEP_ON_HEAD
2535         schedule();
2536         SLEEP_ON_TAIL
2537 }
2538
2539 EXPORT_SYMBOL(interruptible_sleep_on);
2540
2541 long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
2542 {
2543         SLEEP_ON_VAR
2544
2545         SLEEP_ON_BKLCHECK
2546
2547         current->state = TASK_INTERRUPTIBLE;
2548
2549         SLEEP_ON_HEAD
2550         timeout = schedule_timeout(timeout);
2551         SLEEP_ON_TAIL
2552
2553         return timeout;
2554 }
2555
2556 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
2557
2558 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
2559 {
2560         SLEEP_ON_VAR
2561
2562         SLEEP_ON_BKLCHECK
2563
2564         current->state = TASK_UNINTERRUPTIBLE;
2565
2566         SLEEP_ON_HEAD
2567         timeout = schedule_timeout(timeout);
2568         SLEEP_ON_TAIL
2569
2570         return timeout;
2571 }
2572
2573 EXPORT_SYMBOL(sleep_on_timeout);
2574
2575 void set_user_nice(task_t *p, long nice)
2576 {
2577         unsigned long flags;
2578         prio_array_t *array;
2579         runqueue_t *rq;
2580         int old_prio, new_prio, delta;
2581
2582         if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
2583                 return;
2584         /*
2585          * We have to be careful, if called from sys_setpriority(),
2586          * the task might be in the middle of scheduling on another CPU.
2587          */
2588         rq = task_rq_lock(p, &flags);
2589         /*
2590          * The RT priorities are set via setscheduler(), but we still
2591          * allow the 'normal' nice value to be set - but as expected
2592          * it won't have any effect on scheduling until the task
2593          * goes back to SCHED_NORMAL:
2594          */
2595         if (rt_task(p)) {
2596                 p->static_prio = NICE_TO_PRIO(nice);
2597                 goto out_unlock;
2598         }
2599         array = p->array;
2600         if (array)
2601                 dequeue_task(p, array);
2602
2603         old_prio = p->prio;
2604         new_prio = NICE_TO_PRIO(nice);
2605         delta = new_prio - old_prio;
2606         p->static_prio = NICE_TO_PRIO(nice);
2607         p->prio += delta;
2608
2609         if (array) {
2610                 enqueue_task(p, array);
2611                 /*
2612                  * If the task increased its priority or is running and
2613                  * lowered its priority, then reschedule its CPU:
2614                  */
2615                 if (delta < 0 || (delta > 0 && task_running(rq, p)))
2616                         resched_task(rq->curr);
2617         }
2618 out_unlock:
2619         task_rq_unlock(rq, &flags);
2620 }
2621
2622 EXPORT_SYMBOL(set_user_nice);
2623
2624 #ifdef __ARCH_WANT_SYS_NICE
2625
2626 /*
2627  * sys_nice - change the priority of the current process.
2628  * @increment: priority increment
2629  *
2630  * sys_setpriority is a more generic, but much slower function that
2631  * does similar things.
2632  */
2633 asmlinkage long sys_nice(int increment)
2634 {
2635         int retval;
2636         long nice;
2637
2638         /*
2639          * Setpriority might change our priority at the same moment.
2640          * We don't have to worry. Conceptually one call occurs first
2641          * and we have a single winner.
2642          */
2643         if (increment < 0) {
2644                 if (!capable(CAP_SYS_NICE))
2645                         return -EPERM;
2646                 if (increment < -40)
2647                         increment = -40;
2648         }
2649         if (increment > 40)
2650                 increment = 40;
2651
2652         nice = PRIO_TO_NICE(current->static_prio) + increment;
2653         if (nice < -20)
2654                 nice = -20;
2655         if (nice > 19)
2656                 nice = 19;
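        /*
         * For example, an unprivileged caller at nice 10 asking for an
         * increment of +100 first has the increment clamped to +40, and the
         * resulting value 50 is then clamped to the final nice level of 19.
         */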
2657
2658         retval = security_task_setnice(current, nice);
2659         if (retval)
2660                 return retval;
2661
2662         set_user_nice(current, nice);
2663         return 0;
2664 }
2665
2666 #endif
2667
2668 /**
2669  * task_prio - return the priority value of a given task.
2670  * @p: the task in question.
2671  *
2672  * This is the priority value as seen by users in /proc.
2673  * RT tasks are offset by -200. Normal tasks are centered
2674  * around 0, value goes from -16 to +15.
2675  */
2676 int task_prio(const task_t *p)
2677 {
2678         return p->prio - MAX_RT_PRIO;
2679 }
2680
2681 /**
2682  * task_nice - return the nice value of a given task.
2683  * @p: the task in question.
2684  */
2685 int task_nice(const task_t *p)
2686 {
2687         return TASK_NICE(p);
2688 }
2689
2690 EXPORT_SYMBOL(task_nice);
2691
2692 /**
2693  * idle_cpu - is a given cpu idle currently?
2694  * @cpu: the processor in question.
2695  */
2696 int idle_cpu(int cpu)
2697 {
2698         return cpu_curr(cpu) == cpu_rq(cpu)->idle;
2699 }
2700
2701 EXPORT_SYMBOL_GPL(idle_cpu);
2702
2703 /**
2704  * find_process_by_pid - find a process with a matching PID value.
2705  * @pid: the pid in question.
2706  */
2707 static inline task_t *find_process_by_pid(pid_t pid)
2708 {
2709         return pid ? find_task_by_pid(pid) : current;
2710 }
2711
2712 /* Actually do priority change: must hold rq lock. */
2713 static void __setscheduler(struct task_struct *p, int policy, int prio)
2714 {
2715         BUG_ON(p->array);
2716         p->policy = policy;
2717         p->rt_priority = prio;
2718         if (policy != SCHED_NORMAL)
2719                 p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
2720         else
2721                 p->prio = p->static_prio;
2722 }
2723
2724 /*
2725  * setscheduler - change the scheduling policy and/or RT priority of a thread.
2726  */
2727 static int setscheduler(pid_t pid, int policy, struct sched_param __user *param)
2728 {
2729         struct sched_param lp;
2730         int retval = -EINVAL;
2731         int oldprio;
2732         prio_array_t *array;
2733         unsigned long flags;
2734         runqueue_t *rq;
2735         task_t *p;
2736
2737         if (!param || pid < 0)
2738                 goto out_nounlock;
2739
2740         retval = -EFAULT;
2741         if (copy_from_user(&lp, param, sizeof(struct sched_param)))
2742                 goto out_nounlock;
2743
2744         /*
2745          * We play safe to avoid deadlocks.
2746          */
2747         read_lock_irq(&tasklist_lock);
2748
2749         p = find_process_by_pid(pid);
2750
2751         retval = -ESRCH;
2752         if (!p)
2753                 goto out_unlock_tasklist;
2754
2755         /*
2756          * To be able to change p->policy safely, the appropriate
2757          * runqueue lock must be held.
2758          */
2759         rq = task_rq_lock(p, &flags);
2760
2761         if (policy < 0)
2762                 policy = p->policy;
2763         else {
2764                 retval = -EINVAL;
2765                 if (policy != SCHED_FIFO && policy != SCHED_RR &&
2766                                 policy != SCHED_NORMAL)
2767                         goto out_unlock;
2768         }
2769
2770         /*
2771          * Valid priorities for SCHED_FIFO and SCHED_RR are
2772          * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
2773          */
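        /*
         * The second test below uses != as a logical XOR: SCHED_NORMAL
         * requests must carry priority 0, while SCHED_FIFO/SCHED_RR
         * requests must carry a non-zero priority.
         */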
2774         retval = -EINVAL;
2775         if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1)
2776                 goto out_unlock;
2777         if ((policy == SCHED_NORMAL) != (lp.sched_priority == 0))
2778                 goto out_unlock;
2779
2780         retval = -EPERM;
2781         if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
2782             !capable(CAP_SYS_NICE))
2783                 goto out_unlock;
2784         if ((current->euid != p->euid) && (current->euid != p->uid) &&
2785             !capable(CAP_SYS_NICE))
2786                 goto out_unlock;
2787
2788         retval = security_task_setscheduler(p, policy, &lp);
2789         if (retval)
2790                 goto out_unlock;
2791
2792         array = p->array;
2793         if (array)
2794                 deactivate_task(p, task_rq(p));
2795         retval = 0;
2796         oldprio = p->prio;
2797         __setscheduler(p, policy, lp.sched_priority);
2798         if (array) {
2799                 __activate_task(p, task_rq(p));
2800                 /*
2801                  * Reschedule if we are currently running on this runqueue and
2802                  * our priority decreased, or if we are not currently running on
2803                  * this runqueue and our priority is higher than the current's
2804                  */
2805                 if (task_running(rq, p)) {
2806                         if (p->prio > oldprio)
2807                                 resched_task(rq->curr);
2808                 } else if (TASK_PREEMPTS_CURR(p, rq))
2809                         resched_task(rq->curr);
2810         }
2811
2812 out_unlock:
2813         task_rq_unlock(rq, &flags);
2814 out_unlock_tasklist:
2815         read_unlock_irq(&tasklist_lock);
2816
2817 out_nounlock:
2818         return retval;
2819 }
2820
2821 /**
2822  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
2823  * @pid: the pid in question.
2824  * @policy: new policy
2825  * @param: structure containing the new RT priority.
2826  */
2827 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
2828                                        struct sched_param __user *param)
2829 {
2830         return setscheduler(pid, policy, param);
2831 }
2832
2833 /**
2834  * sys_sched_setparam - set/change the RT priority of a thread
2835  * @pid: the pid in question.
2836  * @param: structure containing the new RT priority.
2837  */
2838 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
2839 {
2840         return setscheduler(pid, -1, param);
2841 }
2842
2843 /**
2844  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
2845  * @pid: the pid in question.
2846  */
2847 asmlinkage long sys_sched_getscheduler(pid_t pid)
2848 {
2849         int retval = -EINVAL;
2850         task_t *p;
2851
2852         if (pid < 0)
2853                 goto out_nounlock;
2854
2855         retval = -ESRCH;
2856         read_lock(&tasklist_lock);
2857         p = find_process_by_pid(pid);
2858         if (p) {
2859                 retval = security_task_getscheduler(p);
2860                 if (!retval)
2861                         retval = p->policy;
2862         }
2863         read_unlock(&tasklist_lock);
2864
2865 out_nounlock:
2866         return retval;
2867 }
2868
2869 /**
2870  * sys_sched_getparam - get the RT priority of a thread
2871  * @pid: the pid in question.
2872  * @param: structure containing the RT priority.
2873  */
2874 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
2875 {
2876         struct sched_param lp;
2877         int retval = -EINVAL;
2878         task_t *p;
2879
2880         if (!param || pid < 0)
2881                 goto out_nounlock;
2882
2883         read_lock(&tasklist_lock);
2884         p = find_process_by_pid(pid);
2885         retval = -ESRCH;
2886         if (!p)
2887                 goto out_unlock;
2888
2889         retval = security_task_getscheduler(p);
2890         if (retval)
2891                 goto out_unlock;
2892
2893         lp.sched_priority = p->rt_priority;
2894         read_unlock(&tasklist_lock);
2895
2896         /*
2897          * This one might sleep, we cannot do it with a spinlock held ...
2898          */
2899         retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
2900
2901 out_nounlock:
2902         return retval;
2903
2904 out_unlock:
2905         read_unlock(&tasklist_lock);
2906         return retval;
2907 }
2908
2909 /**
2910  * sys_sched_setaffinity - set the cpu affinity of a process
2911  * @pid: pid of the process
2912  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
2913  * @user_mask_ptr: user-space pointer to the new cpu mask
2914  */
2915 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
2916                                       unsigned long __user *user_mask_ptr)
2917 {
2918         cpumask_t new_mask;
2919         int retval;
2920         task_t *p;
2921
2922         if (len < sizeof(new_mask))
2923                 return -EINVAL;
2924
2925         if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask)))
2926                 return -EFAULT;
2927
2928         lock_cpu_hotplug();
2929         read_lock(&tasklist_lock);
2930
2931         p = find_process_by_pid(pid);
2932         if (!p) {
2933                 read_unlock(&tasklist_lock);
2934                 unlock_cpu_hotplug();
2935                 return -ESRCH;
2936         }
2937
2938         /*
2939          * It is not safe to call set_cpus_allowed with the
2940          * tasklist_lock held.  We will bump the task_struct's
2941          * usage count and then drop tasklist_lock.
2942          */
2943         get_task_struct(p);
2944         read_unlock(&tasklist_lock);
2945
2946         retval = -EPERM;
2947         if ((current->euid != p->euid) && (current->euid != p->uid) &&
2948                         !capable(CAP_SYS_NICE))
2949                 goto out_unlock;
2950
2951         retval = set_cpus_allowed(p, new_mask);
2952
2953 out_unlock:
2954         put_task_struct(p);
2955         unlock_cpu_hotplug();
2956         return retval;
2957 }
2958
2959 /*
2960  * Represents all CPUs present in the system.
2961  * In systems capable of hotplug, this map could dynamically grow
2962  * as new CPUs are detected in the system via any platform-specific
2963  * method, such as ACPI.
2964  */
2965
2966 cpumask_t cpu_present_map;
2967 EXPORT_SYMBOL(cpu_present_map);
2968
2969 #ifndef CONFIG_SMP
2970 cpumask_t cpu_online_map = CPU_MASK_ALL;
2971 cpumask_t cpu_possible_map = CPU_MASK_ALL;
2972 #endif
2973
2974 /**
2975  * sys_sched_getaffinity - get the cpu affinity of a process
2976  * @pid: pid of the process
2977  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
2978  * @user_mask_ptr: user-space pointer to hold the current cpu mask
2979  */
2980 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
2981                                       unsigned long __user *user_mask_ptr)
2982 {
2983         unsigned int real_len;
2984         cpumask_t mask;
2985         int retval;
2986         task_t *p;
2987
2988         real_len = sizeof(mask);
2989         if (len < real_len)
2990                 return -EINVAL;
2991
2992         lock_cpu_hotplug();
2993         read_lock(&tasklist_lock);
2994
2995         retval = -ESRCH;
2996         p = find_process_by_pid(pid);
2997         if (!p)
2998                 goto out_unlock;
2999
3000         retval = 0;
3001         cpus_and(mask, p->cpus_allowed, cpu_possible_map);
3002
3003 out_unlock:
3004         read_unlock(&tasklist_lock);
3005         unlock_cpu_hotplug();
3006         if (retval)
3007                 return retval;
3008         if (copy_to_user(user_mask_ptr, &mask, real_len))
3009                 return -EFAULT;
3010         return real_len;
3011 }
3012
3013 /**
3014  * sys_sched_yield - yield the current processor to other threads.
3015  *
3016  * this function yields the current CPU by moving the calling thread
3017  * to the expired array. If there are no other threads running on this
3018  * CPU then this function will return.
3019  */
3020 asmlinkage long sys_sched_yield(void)
3021 {
3022         runqueue_t *rq = this_rq_lock();
3023         prio_array_t *array = current->array;
3024         prio_array_t *target = rq->expired;
3025
3026         /*
3027          * We implement yielding by moving the task into the expired
3028          * queue.
3029          *
3030          * (special rule: RT tasks will just roundrobin in the active
3031          *  array.)
3032          */
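        /*
         * Note that for a SCHED_NORMAL task this is a fairly heavyweight
         * yield: the task will not run again until the arrays are switched,
         * i.e. until the other runnable tasks have used up their
         * timeslices.
         */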
3033         if (unlikely(rt_task(current)))
3034                 target = rq->active;
3035
3036         dequeue_task(current, array);
3037         enqueue_task(current, target);
3038
3039         /*
3040          * Since we are going to call schedule() anyway, there's
3041          * no need to preempt or enable interrupts:
3042          */
3043         _raw_spin_unlock(&rq->lock);
3044         preempt_enable_no_resched();
3045
3046         schedule();
3047
3048         return 0;
3049 }
3050
3051 void __sched __cond_resched(void)
3052 {
3053 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
3054         __might_sleep(__FILE__, __LINE__, 0);
3055 #endif
3056         /*
3057          * The system_state check is somewhat ugly but we might be
3058          * called during early boot when we are not yet ready to reschedule.
3059          */
3060         if (need_resched() && system_state >= SYSTEM_BOOTING_SCHEDULER_OK) {
3061                 set_current_state(TASK_RUNNING);
3062                 schedule();
3063         }
3064 }
3065
3066 EXPORT_SYMBOL(__cond_resched);
3067
3068 void __sched __cond_resched_lock(spinlock_t * lock)
3069 {
3070         if (need_resched()) {
3071                 _raw_spin_unlock(lock);
3072                 preempt_enable_no_resched();
3073                 set_current_state(TASK_RUNNING);
3074                 schedule();
3075                 spin_lock(lock);
3076         }
3077 }
3078
3079 EXPORT_SYMBOL(__cond_resched_lock);
3080
3081 /**
3082  * yield - yield the current processor to other threads.
3083  *
3084  * this is a shortcut for kernel-space yielding - it marks the
3085  * thread runnable and calls sys_sched_yield().
3086  */
3087 void __sched yield(void)
3088 {
3089         set_current_state(TASK_RUNNING);
3090         sys_sched_yield();
3091 }
3092
3093 EXPORT_SYMBOL(yield);
3094
3095 /*
3096  * This task is about to go to sleep on IO.  Increment rq->nr_iowait so
3097  * that process accounting knows that this is a task in IO wait state.
3098  *
3099  * But don't do that if it is a deliberate, throttling IO wait (this task
3100  * has set its backing_dev_info: the queue against which it should throttle)
3101  */
3102 void __sched io_schedule(void)
3103 {
3104         struct runqueue *rq = this_rq();
3105
3106         atomic_inc(&rq->nr_iowait);
3107         schedule();
3108         atomic_dec(&rq->nr_iowait);
3109 }
3110
3111 EXPORT_SYMBOL(io_schedule);
3112
3113 long __sched io_schedule_timeout(long timeout)
3114 {
3115         struct runqueue *rq = this_rq();
3116         long ret;
3117
3118         atomic_inc(&rq->nr_iowait);
3119         ret = schedule_timeout(timeout);
3120         atomic_dec(&rq->nr_iowait);
3121         return ret;
3122 }
3123
3124 /**
3125  * sys_sched_get_priority_max - return maximum RT priority.
3126  * @policy: scheduling class.
3127  *
3128  * this syscall returns the maximum rt_priority that can be used
3129  * by a given scheduling class.
3130  */
3131 asmlinkage long sys_sched_get_priority_max(int policy)
3132 {
3133         int ret = -EINVAL;
3134
3135         switch (policy) {
3136         case SCHED_FIFO:
3137         case SCHED_RR:
3138                 ret = MAX_USER_RT_PRIO-1;
3139                 break;
3140         case SCHED_NORMAL:
3141                 ret = 0;
3142                 break;
3143         }
3144         return ret;
3145 }
3146
3147 /**
3148  * sys_sched_get_priority_min - return minimum RT priority.
3149  * @policy: scheduling class.
3150  *
3151  * this syscall returns the minimum rt_priority that can be used
3152  * by a given scheduling class.
3153  */
3154 asmlinkage long sys_sched_get_priority_min(int policy)
3155 {
3156         int ret = -EINVAL;
3157
3158         switch (policy) {
3159         case SCHED_FIFO:
3160         case SCHED_RR:
3161                 ret = 1;
3162                 break;
3163         case SCHED_NORMAL:
3164                 ret = 0;
3165         }
3166         return ret;
3167 }
3168
3169 /**
3170  * sys_sched_rr_get_interval - return the default timeslice of a process.
3171  * @pid: pid of the process.
3172  * @interval: userspace pointer to the timeslice value.
3173  *
3174  * this syscall writes the default timeslice value of a given process
3175  * into the user-space timespec buffer. A value of '0' means infinity.
3176  */
3177 asmlinkage
3178 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
3179 {
3180         int retval = -EINVAL;
3181         struct timespec t;
3182         task_t *p;
3183
3184         if (pid < 0)
3185                 goto out_nounlock;
3186
3187         retval = -ESRCH;
3188         read_lock(&tasklist_lock);
3189         p = find_process_by_pid(pid);
3190         if (!p)
3191                 goto out_unlock;
3192
3193         retval = security_task_getscheduler(p);
3194         if (retval)
3195                 goto out_unlock;
3196
3197         jiffies_to_timespec(p->policy == SCHED_FIFO ?
3198                                 0 : task_timeslice(p), &t);
3199         read_unlock(&tasklist_lock);
3200         retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
3201 out_nounlock:
3202         return retval;
3203 out_unlock:
3204         read_unlock(&tasklist_lock);
3205         return retval;
3206 }
3207
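/*
 * Example (sketch): reading the calling process's timeslice from userspace
 * via the glibc wrapper (a pid of 0 means the calling process):
 *
 *      #include <sched.h>
 *      #include <stdio.h>
 *
 *      int main(void)
 *      {
 *              struct timespec ts;
 *
 *              if (sched_rr_get_interval(0, &ts) == 0)
 *                      printf("timeslice: %ld.%09ld s\n",
 *                             (long)ts.tv_sec, ts.tv_nsec);
 *              return 0;
 *      }
 */
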
3208 static inline struct task_struct *eldest_child(struct task_struct *p)
3209 {
3210         if (list_empty(&p->children)) return NULL;
3211         return list_entry(p->children.next,struct task_struct,sibling);
3212 }
3213
3214 static inline struct task_struct *older_sibling(struct task_struct *p)
3215 {
3216         if (p->sibling.prev==&p->parent->children) return NULL;
3217         return list_entry(p->sibling.prev,struct task_struct,sibling);
3218 }
3219
3220 static inline struct task_struct *younger_sibling(struct task_struct *p)
3221 {
3222         if (p->sibling.next==&p->parent->children) return NULL;
3223         return list_entry(p->sibling.next,struct task_struct,sibling);
3224 }
3225
3226 static void show_task(task_t * p)
3227 {
3228         task_t *relative;
3229         unsigned state;
3230         unsigned long free = 0;
3231         static const char *stat_nam[] = { "R", "S", "D", "T", "Z", "W" };
3232
3233         printk("%-13.13s ", p->comm);
3234         state = p->state ? __ffs(p->state) + 1 : 0;
3235         if (state < ARRAY_SIZE(stat_nam))
3236                 printk(stat_nam[state]);
3237         else
3238                 printk("?");
3239 #if (BITS_PER_LONG == 32)
3240         if (state == TASK_RUNNING)
3241                 printk(" running ");
3242         else
3243                 printk(" %08lX ", thread_saved_pc(p));
3244 #else
3245         if (state == TASK_RUNNING)
3246                 printk("  running task   ");
3247         else
3248                 printk(" %016lx ", thread_saved_pc(p));
3249 #endif
3250 #ifdef CONFIG_DEBUG_STACK_USAGE
3251         {
3252                 unsigned long * n = (unsigned long *) (p->thread_info+1);
3253                 while (!*n)
3254                         n++;
3255                 free = (unsigned long) n - (unsigned long)(p->thread_info+1);
3256         }
3257 #endif
3258         printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
3259         if ((relative = eldest_child(p)))
3260                 printk("%5d ", relative->pid);
3261         else
3262                 printk("      ");
3263         if ((relative = younger_sibling(p)))
3264                 printk("%7d", relative->pid);
3265         else
3266                 printk("       ");
3267         if ((relative = older_sibling(p)))
3268                 printk(" %5d", relative->pid);
3269         else
3270                 printk("      ");
3271         if (!p->mm)
3272                 printk(" (L-TLB)\n");
3273         else
3274                 printk(" (NOTLB)\n");
3275
3276         if (state != TASK_RUNNING)
3277                 show_stack(p, NULL);
3278 }
3279
3280 void show_state(void)
3281 {
3282         task_t *g, *p;
3283
3284 #if (BITS_PER_LONG == 32)
3285         printk("\n"
3286                "                                               sibling\n");
3287         printk("  task             PC      pid father child younger older\n");
3288 #else
3289         printk("\n"
3290                "                                                       sibling\n");
3291         printk("  task                 PC          pid father child younger older\n");
3292 #endif
3293         read_lock(&tasklist_lock);
3294         do_each_thread(g, p) {
3295                 /*
3296                  * reset the NMI-timeout; listing all tasks on a slow
3297                  * console might take a lot of time:
3298                  */
3299                 touch_nmi_watchdog();
3300                 show_task(p);
3301         } while_each_thread(g, p);
3302
3303         read_unlock(&tasklist_lock);
3304 }
3305
3306 EXPORT_SYMBOL_GPL(show_state);
3307
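/*
 * Example (sketch): show_state() is the backend of the SysRq "show tasks"
 * command, so with CONFIG_MAGIC_SYSRQ enabled the dump above can be
 * triggered from userspace with:
 *
 *      echo t > /proc/sysrq-trigger
 */
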
3308 void __devinit init_idle(task_t *idle, int cpu)
3309 {
3310         runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle));
3311         unsigned long flags;
3312
3313         local_irq_save(flags);
3314         double_rq_lock(idle_rq, rq);
3315
3316         idle_rq->curr = idle_rq->idle = idle;
3317         deactivate_task(idle, rq);
3318         idle->array = NULL;
3319         idle->prio = MAX_PRIO;
3320         idle->state = TASK_RUNNING;
3321         set_task_cpu(idle, cpu);
3322         double_rq_unlock(idle_rq, rq);
3323         set_tsk_need_resched(idle);
3324         local_irq_restore(flags);
3325
3326         /* Set the preempt count _outside_ the spinlocks! */
3327 #ifdef CONFIG_PREEMPT
3328         idle->thread_info->preempt_count = (idle->lock_depth >= 0);
3329 #else
3330         idle->thread_info->preempt_count = 0;
3331 #endif
3332 }
3333
3334 /*
3335  * On a system that switches off the HZ timer, nohz_cpu_mask
3336  * indicates which CPUs have entered this state. It is used by
3337  * the RCU update code to wait only for active CPUs. On systems
3338  * that do not switch off the HZ timer, nohz_cpu_mask should
3339  * always be CPU_MASK_NONE.
3340  */
3341 cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
3342
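/*
 * Example (sketch): an architecture that stops the HZ timer in its idle
 * loop would be expected to publish that fact roughly as below before
 * halting, and undo it on wakeup (the timer start/stop helpers are
 * hypothetical, arch-specific names):
 *
 *      cpu_set(smp_processor_id(), nohz_cpu_mask);
 *      arch_stop_hz_timer();
 *      ... halt until the next interrupt ...
 *      arch_start_hz_timer();
 *      cpu_clear(smp_processor_id(), nohz_cpu_mask);
 */
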
3343 #ifdef CONFIG_SMP
3344 /*
3345  * This is how migration works:
3346  *
3347  * 1) we queue a migration_req_t structure in the source CPU's
3348  *    runqueue and wake up that CPU's migration thread.
3349  * 2) we wait on the request's completion => thread blocks.
3350  * 3) migration thread wakes up (implicitly it forces the migrated
3351  *    thread off the CPU)
3352  * 4) it gets the migration request and checks whether the migrated
3353  *    task is still in the wrong runqueue.
3354  * 5) if it's in the wrong runqueue then the migration thread removes
3355  *    it and puts it into the right queue.
3356  * 6) migration thread completes the request.
3357  * 7) we wake up and the migration is done.
3358  */
3359
3360 /*
3361  * Change a given task's CPU affinity. Migrate the thread to a
3362  * proper CPU and schedule it away if the CPU it's executing on
3363  * is removed from the allowed bitmask.
3364  *
3365  * NOTE: the caller must have a valid reference to the task, the
3366  * task must not exit() & deallocate itself prematurely.  The
3367  * call is not atomic; no spinlocks may be held.
3368  */
3369 int set_cpus_allowed(task_t *p, cpumask_t new_mask)
3370 {
3371         unsigned long flags;
3372         int ret = 0;
3373         migration_req_t req;
3374         runqueue_t *rq;
3375
3376         rq = task_rq_lock(p, &flags);
3377         if (!cpus_intersects(new_mask, cpu_online_map)) {
3378                 ret = -EINVAL;
3379                 goto out;
3380         }
3381
3382         p->cpus_allowed = new_mask;
3383         /* Can the task run on the task's current CPU? If so, we're done */
3384         if (cpu_isset(task_cpu(p), new_mask))
3385                 goto out;
3386
3387         if (migrate_task(p, any_online_cpu(new_mask), &req)) {
3388                 /* Need help from migration thread: drop lock and wait. */
3389                 task_rq_unlock(rq, &flags);
3390                 wake_up_process(rq->migration_thread);
3391                 wait_for_completion(&req.done);
3392                 tlb_migrate_finish(p->mm);
3393                 return 0;
3394         }
3395 out:
3396         task_rq_unlock(rq, &flags);
3397         return ret;
3398 }
3399
3400 EXPORT_SYMBOL_GPL(set_cpus_allowed);
3401
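/*
 * Example (sketch): pinning the current task to a single CPU.  The helper
 * name is hypothetical; cpumask_of_cpu() builds a one-bit cpumask.  Note
 * that, per the comment above, this must not be called with spinlocks held.
 *
 *      static int pin_to_cpu(int cpu)
 *      {
 *              return set_cpus_allowed(current, cpumask_of_cpu(cpu));
 *      }
 */
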
3402 /*
3403  * Move a task (which must not be the current task) off this cpu, onto
3404  * the dest cpu.  We do this either because it can no longer run here
3405  * (set_cpus_allowed() moved it away from this CPU, or this CPU is going
3406  * down), or because we're rebalancing it on exec (sched_balance_exec).
3407  *
3408  * So we race with normal scheduler movements, but that's OK, as long
3409  * as the task is no longer on this CPU.
3410  */
3411 static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
3412 {
3413         runqueue_t *rq_dest, *rq_src;
3414
3415         if (unlikely(cpu_is_offline(dest_cpu)))
3416                 return;
3417
3418         rq_src  = cpu_rq(src_cpu);
3419         rq_dest = cpu_rq(dest_cpu);
3420
3421         double_rq_lock(rq_src, rq_dest);
3422         /* Already moved. */
3423         if (task_cpu(p) != src_cpu)
3424                 goto out;
3425         /* Affinity changed (again). */
3426         if (!cpu_isset(dest_cpu, p->cpus_allowed))
3427                 goto out;
3428
3429         set_task_cpu(p, dest_cpu);
3430         if (p->array) {
3431                 /*
3432                  * Sync timestamp with rq_dest's before activating.
3433                  * The same thing could be achieved by doing this step
3434                  * afterwards, and pretending it was a local activate.
3435                  * This way is cleaner and logically correct.
3436                  */
3437                 p->timestamp = p->timestamp - rq_src->timestamp_last_tick
3438                                 + rq_dest->timestamp_last_tick;
3439                 deactivate_task(p, rq_src);
3440                 activate_task(p, rq_dest, 0);
3441                 if (TASK_PREEMPTS_CURR(p, rq_dest))
3442                         resched_task(rq_dest->curr);
3443         }
3444
3445 out:
3446         double_rq_unlock(rq_src, rq_dest);
3447 }
3448
3449 /*
3450  * migration_thread - this is a highprio system thread that performs
3451  * thread migration by bumping thread off CPU then 'pushing' onto
3452  * another runqueue.
3453  */
3454 static int migration_thread(void * data)
3455 {
3456         runqueue_t *rq;
3457         int cpu = (long)data;
3458
3459         rq = cpu_rq(cpu);
3460         BUG_ON(rq->migration_thread != current);
3461
3462         set_current_state(TASK_INTERRUPTIBLE);
3463         while (!kthread_should_stop()) {
3464                 struct list_head *head;
3465                 migration_req_t *req;
3466
3467                 if (current->flags & PF_FREEZE)
3468                         refrigerator(PF_FREEZE);
3469
3470                 spin_lock_irq(&rq->lock);
3471
3472                 if (cpu_is_offline(cpu)) {
3473                         spin_unlock_irq(&rq->lock);
3474                         goto wait_to_die;
3475                 }
3476
3477                 if (rq->active_balance) {
3478                         active_load_balance(rq, cpu);
3479                         rq->active_balance = 0;
3480                 }
3481
3482                 head = &rq->migration_queue;
3483
3484                 if (list_empty(head)) {
3485                         spin_unlock_irq(&rq->lock);
3486                         schedule();
3487                         set_current_state(TASK_INTERRUPTIBLE);
3488                         continue;
3489                 }
3490                 req = list_entry(head->next, migration_req_t, list);
3491                 list_del_init(head->next);
3492
3493                 if (req->type == REQ_MOVE_TASK) {
3494                         spin_unlock(&rq->lock);
3495                         __migrate_task(req->task, smp_processor_id(),
3496                                         req->dest_cpu);
3497                         local_irq_enable();
3498                 } else if (req->type == REQ_SET_DOMAIN) {
3499                         rq->sd = req->sd;
3500                         spin_unlock_irq(&rq->lock);
3501                 } else {
3502                         spin_unlock_irq(&rq->lock);
3503                         WARN_ON(1);
3504                 }
3505
3506                 complete(&req->done);
3507         }
3508         __set_current_state(TASK_RUNNING);
3509         return 0;
3510
3511 wait_to_die:
3512         /* Wait for kthread_stop */
3513         set_current_state(TASK_INTERRUPTIBLE);
3514         while (!kthread_should_stop()) {
3515                 schedule();
3516                 set_current_state(TASK_INTERRUPTIBLE);
3517         }
3518         __set_current_state(TASK_RUNNING);
3519         return 0;
3520 }
3521
3522 #ifdef CONFIG_HOTPLUG_CPU
3523 /* migrate_all_tasks - function to migrate all tasks from the dead cpu.  */
3524 static void migrate_all_tasks(int src_cpu)
3525 {
3526         struct task_struct *tsk, *t;
3527         int dest_cpu;
3528         unsigned int node;
3529
3530         write_lock_irq(&tasklist_lock);
3531
3532         /* watch out for per node tasks, let's stay on this node */
3533         node = cpu_to_node(src_cpu);
3534
3535         do_each_thread(t, tsk) {
3536                 cpumask_t mask;
3537                 if (tsk == current)
3538                         continue;
3539
3540                 if (task_cpu(tsk) != src_cpu)
3541                         continue;
3542
3543                 /* Figure out where this task should go (attempting to
3544                  * keep it on-node), and check if it can be migrated
3545                  * as-is.  NOTE that kernel threads bound to more than
3546                  * one online cpu will be migrated. */
3547                 mask = node_to_cpumask(node);
3548                 cpus_and(mask, mask, tsk->cpus_allowed);
3549                 dest_cpu = any_online_cpu(mask);
3550                 if (dest_cpu == NR_CPUS)
3551                         dest_cpu = any_online_cpu(tsk->cpus_allowed);
3552                 if (dest_cpu == NR_CPUS) {
3553                         cpus_setall(tsk->cpus_allowed);
3554                         dest_cpu = any_online_cpu(tsk->cpus_allowed);
3555
3556                         /* Don't report the move for exiting tasks or
3557                            kernel threads (both have a NULL mm), since
3558                            they never leave the kernel. */
3559                         if (tsk->mm && printk_ratelimit())
3560                                 printk(KERN_INFO "process %d (%s) no "
3561                                        "longer affine to cpu%d\n",
3562                                        tsk->pid, tsk->comm, src_cpu);
3563                 }
3564
3565                 __migrate_task(tsk, src_cpu, dest_cpu);
3566         } while_each_thread(t, tsk);
3567
3568         write_unlock_irq(&tasklist_lock);
3569 }
3570
3571 /* Schedules the idle task to be the next runnable task on the current CPU.
3572  * It does so by boosting its priority to the highest possible value and
3573  * adding it to the _front_ of the runqueue. Used by the CPU offline code.
3574  */
3575 void sched_idle_next(void)
3576 {
3577         int cpu = smp_processor_id();
3578         runqueue_t *rq = this_rq();
3579         struct task_struct *p = rq->idle;
3580         unsigned long flags;
3581
3582         /* cpu has to be offline */
3583         BUG_ON(cpu_online(cpu));
3584
3585         /* Strictly not necessary since the rest of the CPUs are stopped by now
3586          * and interrupts are disabled on the current cpu.
3587          */
3588         spin_lock_irqsave(&rq->lock, flags);
3589
3590         __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
3591         /* Add the idle task to the _front_ of its priority queue */
3592         __activate_idle_task(p, rq);
3593
3594         spin_unlock_irqrestore(&rq->lock, flags);
3595 }
3596 #endif /* CONFIG_HOTPLUG_CPU */
3597
3598 /*
3599  * migration_call - callback that gets triggered when a CPU is added.
3600  * Here we can start up the necessary migration thread for the new CPU.
3601  */
3602 static int migration_call(struct notifier_block *nfb, unsigned long action,
3603                           void *hcpu)
3604 {
3605         int cpu = (long)hcpu;
3606         struct task_struct *p;
3607         struct runqueue *rq;
3608         unsigned long flags;
3609
3610         switch (action) {
3611         case CPU_UP_PREPARE:
3612                 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
3613                 if (IS_ERR(p))
3614                         return NOTIFY_BAD;
3615                 p->flags |= PF_NOFREEZE;
3616                 kthread_bind(p, cpu);
3617                 /* Must be high prio: stop_machine expects to yield to it. */
3618                 rq = task_rq_lock(p, &flags);
3619                 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
3620                 task_rq_unlock(rq, &flags);
3621                 cpu_rq(cpu)->migration_thread = p;
3622                 break;
3623         case CPU_ONLINE:
3624                 /* Strictly unnecessary, as the first user will wake it. */
3625                 wake_up_process(cpu_rq(cpu)->migration_thread);
3626                 break;
3627 #ifdef CONFIG_HOTPLUG_CPU
3628         case CPU_UP_CANCELED:
3629                 /* Unbind it from offline cpu so it can run.  Fall thru. */
3630                 kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
3631                 kthread_stop(cpu_rq(cpu)->migration_thread);
3632                 cpu_rq(cpu)->migration_thread = NULL;
3633                 break;
3634         case CPU_DEAD:
3635                 migrate_all_tasks(cpu);
3636                 rq = cpu_rq(cpu);
3637                 kthread_stop(rq->migration_thread);
3638                 rq->migration_thread = NULL;
3639                 /* Idle task back to normal (off runqueue, low prio) */
3640                 rq = task_rq_lock(rq->idle, &flags);
3641                 deactivate_task(rq->idle, rq);
3642                 rq->idle->static_prio = MAX_PRIO;
3643                 __setscheduler(rq->idle, SCHED_NORMAL, 0);
3644                 task_rq_unlock(rq, &flags);
3645                 BUG_ON(rq->nr_running != 0);
3646
3647                 /* No need to migrate the tasks: it was best-effort if
3648                  * they didn't do lock_cpu_hotplug().  Just wake up
3649                  * the requestors. */
3650                 spin_lock_irq(&rq->lock);
3651                 while (!list_empty(&rq->migration_queue)) {
3652                         migration_req_t *req;
3653                         req = list_entry(rq->migration_queue.next,
3654                                          migration_req_t, list);
3655                         BUG_ON(req->type != REQ_MOVE_TASK);
3656                         list_del_init(&req->list);
3657                         complete(&req->done);
3658                 }
3659                 spin_unlock_irq(&rq->lock);
3660                 break;
3661 #endif
3662         }
3663         return NOTIFY_OK;
3664 }
3665
3666 /* Register at highest priority so that task migration (migrate_all_tasks)
3667  * happens before everything else.
3668  */
3669 static struct notifier_block __devinitdata migration_notifier = {
3670         .notifier_call = migration_call,
3671         .priority = 10
3672 };
3673
3674 int __init migration_init(void)
3675 {
3676         void *cpu = (void *)(long)smp_processor_id();
3677         /* Start one for boot CPU. */
3678         migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
3679         migration_call(&migration_notifier, CPU_ONLINE, cpu);
3680         register_cpu_notifier(&migration_notifier);
3681         return 0;
3682 }
3683 #endif
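
/*
 * Example (sketch): other subsystems hook CPU hotplug events the same way,
 * usually at the default priority (0) so that migration_call above runs
 * first.  All names below are hypothetical.
 *
 *      static int my_cpu_callback(struct notifier_block *nfb,
 *                                 unsigned long action, void *hcpu)
 *      {
 *              if (action == CPU_ONLINE)
 *                      printk(KERN_INFO "cpu%ld is up\n", (long)hcpu);
 *              return NOTIFY_OK;
 *      }
 *
 *      static struct notifier_block my_cpu_notifier = {
 *              .notifier_call = my_cpu_callback,
 *      };
 *
 *      register_cpu_notifier(&my_cpu_notifier);
 */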
3684
3685 /*
3686  * The 'big kernel lock'
3687  *
3688  * This spinlock is taken and released recursively by lock_kernel()
3689  * and unlock_kernel().  It is transparently dropped and reaquired
3690  * and unlock_kernel().  It is transparently dropped and reacquired
3691  * been migrated to a proper locking design yet.
3692  *
3693  * Don't use in new code.
3694  *
3695  * Note: spinlock debugging needs this even on !CONFIG_SMP.
3696  */
3697 spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
3698 EXPORT_SYMBOL(kernel_flag);
3699
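/*
 * Example (sketch): legacy code that still relies on the BKL brackets an
 * entire operation with lock_kernel()/unlock_kernel().  The work in between
 * may sleep, because the lock is transparently dropped across schedule():
 *
 *      lock_kernel();
 *      ... touch BKL-protected legacy state, possibly sleeping ...
 *      unlock_kernel();
 */
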
3700 #ifdef CONFIG_SMP
3701 /* Attach the domain 'sd' to 'cpu' as its base domain */
3702 void cpu_attach_domain(struct sched_domain *sd, int cpu)
3703 {
3704         migration_req_t req;
3705         unsigned long flags;
3706         runqueue_t *rq = cpu_rq(cpu);
3707         int local = 1;
3708
3709         lock_cpu_hotplug();
3710
3711         spin_lock_irqsave(&rq->lock, flags);
3712
3713         if (cpu == smp_processor_id() || !cpu_online(cpu)) {
3714                 rq->sd = sd;
3715         } else {
3716                 init_completion(&req.done);
3717                 req.type = REQ_SET_DOMAIN;
3718                 req.sd = sd;
3719                 list_add(&req.list, &rq->migration_queue);
3720                 local = 0;
3721         }
3722
3723         spin_unlock_irqrestore(&rq->lock, flags);
3724
3725         if (!local) {
3726                 wake_up_process(rq->migration_thread);
3727                 wait_for_completion(&req.done);
3728         }
3729
3730         unlock_cpu_hotplug();
3731 }
3732
3733 #ifdef ARCH_HAS_SCHED_DOMAIN
3734 extern void __init arch_init_sched_domains(void);
3735 #else
3736 static struct sched_group sched_group_cpus[NR_CPUS];
3737 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
3738 #ifdef CONFIG_NUMA
3739 static struct sched_group sched_group_nodes[MAX_NUMNODES];
3740 static DEFINE_PER_CPU(struct sched_domain, node_domains);
3741 static void __init arch_init_sched_domains(void)
3742 {
3743         int i;
3744         struct sched_group *first_node = NULL, *last_node = NULL;
3745
3746         /* Set up domains */
3747         for_each_cpu(i) {
3748                 int node = cpu_to_node(i);
3749                 cpumask_t nodemask = node_to_cpumask(node);
3750                 struct sched_domain *node_sd = &per_cpu(node_domains, i);
3751                 struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
3752
3753                 *node_sd = SD_NODE_INIT;
3754                 node_sd->span = cpu_possible_map;
3755                 node_sd->groups = &sched_group_nodes[cpu_to_node(i)];
3756
3757                 *cpu_sd = SD_CPU_INIT;
3758                 cpus_and(cpu_sd->span, nodemask, cpu_possible_map);
3759                 cpu_sd->groups = &sched_group_cpus[i];
3760                 cpu_sd->parent = node_sd;
3761         }
3762
3763         /* Set up groups */
3764         for (i = 0; i < MAX_NUMNODES; i++) {
3765                 cpumask_t tmp = node_to_cpumask(i);
3766                 cpumask_t nodemask;
3767                 struct sched_group *first_cpu = NULL, *last_cpu = NULL;
3768                 struct sched_group *node = &sched_group_nodes[i];
3769                 int j;
3770
3771                 cpus_and(nodemask, tmp, cpu_possible_map);
3772
3773                 if (cpus_empty(nodemask))
3774                         continue;
3775
3776                 node->cpumask = nodemask;
3777                 node->cpu_power = SCHED_LOAD_SCALE * cpus_weight(node->cpumask);
3778
3779                 for_each_cpu_mask(j, node->cpumask) {
3780                         struct sched_group *cpu = &sched_group_cpus[j];
3781
3782                         cpus_clear(cpu->cpumask);
3783                         cpu_set(j, cpu->cpumask);
3784                         cpu->cpu_power = SCHED_LOAD_SCALE;
3785
3786                         if (!first_cpu)
3787                                 first_cpu = cpu;
3788                         if (last_cpu)
3789                                 last_cpu->next = cpu;
3790                         last_cpu = cpu;
3791                 }
3792                 last_cpu->next = first_cpu;
3793
3794                 if (!first_node)
3795                         first_node = node;
3796                 if (last_node)
3797                         last_node->next = node;
3798                 last_node = node;
3799         }
3800         last_node->next = first_node;
3801
3802         mb();
3803         for_each_cpu(i) {
3804                 struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
3805                 cpu_attach_domain(cpu_sd, i);
3806         }
3807 }
3808
3809 #else /* !CONFIG_NUMA */
3810 static void __init arch_init_sched_domains(void)
3811 {
3812         int i;
3813         struct sched_group *first_cpu = NULL, *last_cpu = NULL;
3814
3815         /* Set up domains */
3816         for_each_cpu(i) {
3817                 struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
3818
3819                 *cpu_sd = SD_CPU_INIT;
3820                 cpu_sd->span = cpu_possible_map;
3821                 cpu_sd->groups = &sched_group_cpus[i];
3822         }
3823
3824         /* Set up CPU groups */
3825         for_each_cpu_mask(i, cpu_possible_map) {
3826                 struct sched_group *cpu = &sched_group_cpus[i];
3827
3828                 cpus_clear(cpu->cpumask);
3829                 cpu_set(i, cpu->cpumask);
3830                 cpu->cpu_power = SCHED_LOAD_SCALE;
3831
3832                 if (!first_cpu)
3833                         first_cpu = cpu;
3834                 if (last_cpu)
3835                         last_cpu->next = cpu;
3836                 last_cpu = cpu;
3837         }
3838         last_cpu->next = first_cpu;
3839
3840         mb(); /* domains were modified outside the lock */
3841         for_each_cpu(i) {
3842                 struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
3843                 cpu_attach_domain(cpu_sd, i);
3844         }
3845 }
3846
3847 #endif /* CONFIG_NUMA */
3848 #endif /* ARCH_HAS_SCHED_DOMAIN */
3849
3850 #define SCHED_DOMAIN_DEBUG
3851 #ifdef SCHED_DOMAIN_DEBUG
3852 void sched_domain_debug(void)
3853 {
3854         int i;
3855
3856         for_each_cpu(i) {
3857                 runqueue_t *rq = cpu_rq(i);
3858                 struct sched_domain *sd;
3859                 int level = 0;
3860
3861                 sd = rq->sd;
3862
3863                 printk(KERN_DEBUG "CPU%d: %s\n",
3864                                 i, (cpu_online(i) ? " online" : "offline"));
3865
3866                 do {
3867                         int j;
3868                         char str[NR_CPUS];
3869                         struct sched_group *group = sd->groups;
3870                         cpumask_t groupmask;
3871
3872                         cpumask_scnprintf(str, NR_CPUS, sd->span);
3873                         cpus_clear(groupmask);
3874
3875                         printk(KERN_DEBUG);
3876                         for (j = 0; j < level + 1; j++)
3877                                 printk(" ");
3878                         printk("domain %d: span %s\n", level, str);
3879
3880                         if (!cpu_isset(i, sd->span))
3881                                 printk(KERN_DEBUG "ERROR domain->span does not contain CPU%d\n", i);
3882                         if (!cpu_isset(i, group->cpumask))
3883                                 printk(KERN_DEBUG "ERROR domain->groups does not contain CPU%d\n", i);
3884                         if (!group->cpu_power)
3885                                 printk(KERN_DEBUG "ERROR domain->cpu_power not set\n");
3886
3887                         printk(KERN_DEBUG);
3888                         for (j = 0; j < level + 2; j++)
3889                                 printk(" ");
3890                         printk("groups:");
3891                         do {
3892                                 if (!group) {
3893                                         printk(" ERROR: NULL");
3894                                         break;
3895                                 }
3896
3897                                 if (!cpus_weight(group->cpumask))
3898                                         printk(" ERROR empty group:");
3899
3900                                 if (cpus_intersects(groupmask, group->cpumask))
3901                                         printk(" ERROR repeated CPUs:");
3902
3903                                 cpus_or(groupmask, groupmask, group->cpumask);
3904
3905                                 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
3906                                 printk(" %s", str);
3907
3908                                 group = group->next;
3909                         } while (group != sd->groups);
3910                         printk("\n");
3911
3912                         if (!cpus_equal(sd->span, groupmask))
3913                                 printk(KERN_DEBUG "ERROR groups don't span domain->span\n");
3914
3915                         level++;
3916                         sd = sd->parent;
3917
3918                         if (sd) {
3919                                 if (!cpus_subset(groupmask, sd->span))
3920                                         printk(KERN_DEBUG "ERROR parent span is not a superset of domain->span\n");
3921                         }
3922
3923                 } while (sd);
3924         }
3925 }
3926 #else
3927 #define sched_domain_debug() {}
3928 #endif
3929
3930 void __init sched_init_smp(void)
3931 {
3932         arch_init_sched_domains();
3933         sched_domain_debug();
3934 }
3935 #else
3936 void __init sched_init_smp(void)
3937 {
3938 }
3939 #endif /* CONFIG_SMP */
3940
3941 int in_sched_functions(unsigned long addr)
3942 {
3943         /* Linker adds these: start and end of __sched functions */
3944         extern char __sched_text_start[], __sched_text_end[];
3945         return addr >= (unsigned long)__sched_text_start
3946                 && addr < (unsigned long)__sched_text_end;
3947 }
3948
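/*
 * Example (sketch): architecture get_wchan() implementations walk the
 * kernel stack of a sleeping task and use in_sched_functions() to skip
 * scheduler frames, roughly like this (the frame-walking helpers are
 * hypothetical stand-ins for arch-specific code):
 *
 *      while (walk_one_frame(&frame, &pc))
 *              if (!in_sched_functions(pc))
 *                      return pc;
 *      return 0;
 */
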
3949 void __init sched_init(void)
3950 {
3951         runqueue_t *rq;
3952         int i, j, k;
3953
3954 #ifdef CONFIG_SMP
3955         /* Set up an initial dummy domain for early boot */
3956         static struct sched_domain sched_domain_init;
3957         static struct sched_group sched_group_init;
3958
3959         memset(&sched_domain_init, 0, sizeof(struct sched_domain));
3960         sched_domain_init.span = CPU_MASK_ALL;
3961         sched_domain_init.groups = &sched_group_init;
3962         sched_domain_init.last_balance = jiffies;
3963         sched_domain_init.balance_interval = INT_MAX; /* Don't balance */
3964
3965         memset(&sched_group_init, 0, sizeof(struct sched_group));
3966         sched_group_init.cpumask = CPU_MASK_ALL;
3967         sched_group_init.next = &sched_group_init;
3968         sched_group_init.cpu_power = SCHED_LOAD_SCALE;
3969 #endif
3970
3971         for (i = 0; i < NR_CPUS; i++) {
3972                 prio_array_t *array;
3973
3974                 rq = cpu_rq(i);
3975                 spin_lock_init(&rq->lock);
3976                 rq->active = rq->arrays;
3977                 rq->expired = rq->arrays + 1;
3978                 rq->best_expired_prio = MAX_PRIO;
3979
3980 #ifdef CONFIG_SMP
3981                 rq->sd = &sched_domain_init;
3982                 rq->cpu_load = 0;
3983                 rq->active_balance = 0;
3984                 rq->push_cpu = 0;
3985                 rq->migration_thread = NULL;
3986                 INIT_LIST_HEAD(&rq->migration_queue);
3987 #endif
3988                 atomic_set(&rq->nr_iowait, 0);
3989
3990                 for (j = 0; j < 2; j++) {
3991                         array = rq->arrays + j;
3992                         for (k = 0; k < MAX_PRIO; k++) {
3993                                 INIT_LIST_HEAD(array->queue + k);
3994                                 __clear_bit(k, array->bitmap);
3995                         }
3996                         // delimiter for bitsearch
3997                         __set_bit(MAX_PRIO, array->bitmap);
3998                 }
3999         }
4000         /*
4001          * We have to do a little magic to get the first
4002          * thread right in SMP mode.
4003          */
4004         rq = this_rq();
4005         rq->curr = current;
4006         rq->idle = current;
4007         set_task_cpu(current, smp_processor_id());
4008         wake_up_forked_process(current);
4009
4010         /*
4011          * The boot idle thread does lazy MMU switching as well:
4012          */
4013         atomic_inc(&init_mm.mm_count);
4014         enter_lazy_tlb(&init_mm, current);
4015 }
4016
4017 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4018 void __might_sleep(char *file, int line, int atomic_depth)
4019 {
4020 #if defined(in_atomic)
4021         static unsigned long prev_jiffy;        /* ratelimiting */
4022
4023 #ifndef CONFIG_PREEMPT
4024         atomic_depth = 0;
4025 #endif
4026         if (((in_atomic() != atomic_depth) || irqs_disabled()) &&
4027             system_state == SYSTEM_RUNNING) {
4028                 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
4029                         return;
4030                 prev_jiffy = jiffies;
4031                 printk(KERN_ERR "Debug: sleeping function called from invalid"
4032                                 " context at %s:%d\n", file, line);
4033                 printk("in_atomic():%d[expected: %d], irqs_disabled():%d\n",
4034                         in_atomic(), atomic_depth, irqs_disabled());
4035                 dump_stack();
4036         }
4037 #endif
4038 }
4039 EXPORT_SYMBOL(__might_sleep);
4040 #endif
4041
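/*
 * Example (sketch): functions that may sleep typically call the
 * might_sleep() wrapper from <linux/kernel.h> at their head; on debug
 * kernels it expands to a __might_sleep() call that warns when invoked
 * from atomic context.  The helper below is hypothetical.
 *
 *      static void *my_alloc(size_t size)
 *      {
 *              might_sleep();
 *              return kmalloc(size, GFP_KERNEL);
 *      }
 */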
4042
4043 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
4044 /*
4045  * This could be a long-held lock.  If another CPU holds it for a long time,
4046  * and that CPU is not asked to reschedule then *this* CPU will spin on the
4047  * lock for a long time, even if *this* CPU is asked to reschedule.
4048  *
4049  * So what we do here, in the slow (contended) path is to spin on the lock by
4050  * hand while permitting preemption.
4051  *
4052  * Called inside preempt_disable().
4053  */
4054 void __sched __preempt_spin_lock(spinlock_t *lock)
4055 {
4056         if (preempt_count() > 1) {
4057                 _raw_spin_lock(lock);
4058                 return;
4059         }
4060         do {
4061                 preempt_enable();
4062                 while (spin_is_locked(lock))
4063                         cpu_relax();
4064                 preempt_disable();
4065         } while (!_raw_spin_trylock(lock));
4066 }
4067
4068 EXPORT_SYMBOL(__preempt_spin_lock);
4069
4070 void __sched __preempt_write_lock(rwlock_t *lock)
4071 {
4072         if (preempt_count() > 1) {
4073                 _raw_write_lock(lock);
4074                 return;
4075         }
4076
4077         do {
4078                 preempt_enable();
4079                 while (rwlock_is_locked(lock))
4080                         cpu_relax();
4081                 preempt_disable();
4082         } while (!_raw_write_trylock(lock));
4083 }
4084
4085 EXPORT_SYMBOL(__preempt_write_lock);
4086 #endif /* defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) */