patch-2.6.6-vs1.9.0
1 /*
2  *  kernel/sched.c
3  *
4  *  Kernel scheduler and related syscalls
5  *
6  *  Copyright (C) 1991-2002  Linus Torvalds
7  *
8  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
9  *              make semaphores SMP safe
10  *  1998-11-19  Implemented schedule_timeout() and related stuff
11  *              by Andrea Arcangeli
12  *  2002-01-04  New ultra-scalable O(1) scheduler by Ingo Molnar:
13  *              hybrid priority-list and round-robin design with
14  *              an array-switch method of distributing timeslices
15  *              and per-CPU runqueues.  Cleanups and useful suggestions
16  *              by Davide Libenzi, preemptible kernel bits by Robert Love.
17  *  2003-09-03  Interactivity tuning by Con Kolivas.
18  */
19
20 #include <linux/mm.h>
21 #include <linux/module.h>
22 #include <linux/nmi.h>
23 #include <linux/init.h>
24 #include <asm/uaccess.h>
25 #include <linux/highmem.h>
26 #include <linux/smp_lock.h>
27 #include <asm/mmu_context.h>
28 #include <linux/interrupt.h>
29 #include <linux/completion.h>
30 #include <linux/kernel_stat.h>
31 #include <linux/security.h>
32 #include <linux/notifier.h>
33 #include <linux/suspend.h>
34 #include <linux/blkdev.h>
35 #include <linux/delay.h>
36 #include <linux/smp.h>
37 #include <linux/timer.h>
38 #include <linux/rcupdate.h>
39 #include <linux/cpu.h>
40 #include <linux/percpu.h>
41 #include <linux/kthread.h>
42 #include <linux/vserver/sched.h>
43 #include <linux/vinline.h>
44
45 #ifdef CONFIG_NUMA
46 #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
47 #else
48 #define cpu_to_node_mask(cpu) (cpu_online_map)
49 #endif
50
51 /*
52  * Convert user-nice values [ -20 ... 0 ... 19 ]
53  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
54  * and back.
55  */
56 #define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
57 #define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)
58 #define TASK_NICE(p)            PRIO_TO_NICE((p)->static_prio)
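/*
 * Illustrative sketch (not part of the original 2.6.6-vs1.9.0 source): with
 * the stock 2.6 values MAX_RT_PRIO == 100 and MAX_PRIO == 140, the mapping
 * above puts nice -20 at static_prio 100, nice 0 at 120 and nice +19 at 139.
 * The two macros are exact inverses of each other:
 */
static inline int __nice_prio_roundtrip_ok(int nice)
{
        /* PRIO_TO_NICE() must undo NICE_TO_PRIO() for every nice value */
        return PRIO_TO_NICE(NICE_TO_PRIO(nice)) == nice;
}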
59
60 /*
61  * 'User priority' is the nice value converted to something we
62  * can work with better when scaling various scheduler parameters,
63  * it's a [ 0 ... 39 ] range.
64  */
65 #define USER_PRIO(p)            ((p)-MAX_RT_PRIO)
66 #define TASK_USER_PRIO(p)       USER_PRIO((p)->static_prio)
67 #define MAX_USER_PRIO           (USER_PRIO(MAX_PRIO))
68 #define AVG_TIMESLICE   (MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) *\
69                         (MAX_PRIO-1-NICE_TO_PRIO(0))/(MAX_USER_PRIO - 1)))
70
71 /*
72  * Some helpers for converting nanosecond timing to jiffy resolution
73  */
74 #define NS_TO_JIFFIES(TIME)     ((TIME) / (1000000000 / HZ))
75 #define JIFFIES_TO_NS(TIME)     ((TIME) * (1000000000 / HZ))
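/*
 * Illustrative sketch (not part of the original source): the helpers above
 * truncate, so a nanosecond value converted to jiffies and back loses any
 * sub-jiffy remainder. With HZ == 1000 one jiffy is 1000000 ns, and e.g.
 * 1499999 ns rounds down to 1 jiffy.
 */
static inline unsigned long long __ns_to_jiffy_resolution(unsigned long long ns)
{
        /* truncate a nanosecond value to whole-jiffy resolution */
        return JIFFIES_TO_NS(NS_TO_JIFFIES(ns));
}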
76
77 /*
78  * These are the 'tuning knobs' of the scheduler:
79  *
80  * Minimum timeslice is 10 msecs, default timeslice is 100 msecs,
81  * maximum timeslice is 200 msecs. Timeslices get refilled after
82  * they expire.
83  */
84 #define MIN_TIMESLICE           ( 10 * HZ / 1000)
85 #define MAX_TIMESLICE           (200 * HZ / 1000)
86 #define ON_RUNQUEUE_WEIGHT       30
87 #define CHILD_PENALTY            95
88 #define PARENT_PENALTY          100
89 #define EXIT_WEIGHT               3
90 #define PRIO_BONUS_RATIO         25
91 #define MAX_BONUS               (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
92 #define INTERACTIVE_DELTA         2
93 #define MAX_SLEEP_AVG           (AVG_TIMESLICE * MAX_BONUS)
94 #define STARVATION_LIMIT        (MAX_SLEEP_AVG)
95 #define NS_MAX_SLEEP_AVG        (JIFFIES_TO_NS(MAX_SLEEP_AVG))
96 #define NODE_THRESHOLD          125
97 #define CREDIT_LIMIT            100
98
99 /*
100  * If a task is 'interactive' then we reinsert it in the active
101  * array after it has expired its current timeslice. (it will not
102  * continue to run immediately, it will still round-robin with
103  * other interactive tasks.)
104  *
105  * This part scales the interactivity limit depending on niceness.
106  *
107  * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
108  * Here are a few examples of different nice levels:
109  *
110  *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
111  *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
112  *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
113  *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
114  *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
115  *
116  * (the X axis represents the possible -5 ... 0 ... +5 dynamic
117  *  priority range a task can explore; a value of '1' means the
118  *  task is rated interactive.)
119  *
120  * I.e. nice +19 tasks can never get 'interactive' enough to be
121  * reinserted into the active array, and only heavily CPU-bound nice -20
122  * tasks will be expired. Default nice 0 tasks are somewhere in between:
123  * it takes some effort for them to become interactive, but it's not
124  * too hard.
125  */
126
127 #define CURRENT_BONUS(p) \
128         (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
129                 MAX_SLEEP_AVG)
130
131 #ifdef CONFIG_SMP
132 #define TIMESLICE_GRANULARITY(p)        (MIN_TIMESLICE * \
133                 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
134                         num_online_cpus())
135 #else
136 #define TIMESLICE_GRANULARITY(p)        (MIN_TIMESLICE * \
137                 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
138 #endif
139
140 #define SCALE(v1,v1_max,v2_max) \
141         (v1) * (v2_max) / (v1_max)
142
143 #define DELTA(p) \
144         (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \
145                 INTERACTIVE_DELTA)
146
147 #define TASK_INTERACTIVE(p) \
148         ((p)->prio <= (p)->static_prio - DELTA(p))
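/*
 * Illustrative sketch (not part of the original source): DELTA() for a given
 * nice level, without needing a task_struct. With MAX_USER_PRIO == 40 and
 * PRIO_BONUS_RATIO == 25 this yields -3 for nice -20, 2 for nice 0 and 6 for
 * nice +19, consistent with the first, middle and last rows of the
 * TASK_INTERACTIVE() table above (a dynamic bonus of at least DELTA is
 * needed to be rated interactive, and the bonus can never exceed +5).
 */
static inline int __nice_to_interactive_delta(int nice)
{
        return SCALE(nice, 40, MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) +
                INTERACTIVE_DELTA;
}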
149
150 #define INTERACTIVE_SLEEP(p) \
151         (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
152                 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
153
154 #define HIGH_CREDIT(p) \
155         ((p)->interactive_credit > CREDIT_LIMIT)
156
157 #define LOW_CREDIT(p) \
158         ((p)->interactive_credit < -CREDIT_LIMIT)
159
160 #define TASK_PREEMPTS_CURR(p, rq) \
161         ((p)->prio < (rq)->curr->prio)
162
163 /*
164  * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
165  * to time slice values.
166  *
167  * The higher a thread's priority, the bigger timeslices
168  * it gets during one round of execution. But even the lowest
169  * priority thread gets MIN_TIMESLICE worth of execution time.
170  *
171  * task_timeslice() is the interface that is used by the scheduler.
172  */
173
174 #define BASE_TIMESLICE(p) (MIN_TIMESLICE + \
175                 ((MAX_TIMESLICE - MIN_TIMESLICE) * \
176                         (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1)))
177
178 static inline unsigned int task_timeslice(task_t *p)
179 {
180         return BASE_TIMESLICE(p);
181 }
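/*
 * Illustrative sketch (not part of the original source): the same formula
 * expressed in terms of a nice value rather than a task. Assuming HZ == 1000
 * and the stock 2.6 values MAX_RT_PRIO == 100 and MAX_PRIO == 140, this
 * gives 200ms for nice -20, ~102ms for nice 0 and 10ms for nice +19.
 */
static inline unsigned int __nice_to_timeslice(int nice)
{
        return MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) *
                (MAX_PRIO-1 - NICE_TO_PRIO(nice)) / (MAX_USER_PRIO-1));
}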
182
183 /*
184  * These are the runqueue data structures:
185  */
186
187 #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
188
189 typedef struct runqueue runqueue_t;
190
191 struct prio_array {
192         int nr_active;
193         unsigned long bitmap[BITMAP_SIZE];
194         struct list_head queue[MAX_PRIO];
195 };
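/*
 * Illustrative sketch (not part of the original source): picking the next
 * task out of a prio_array is O(1) - find the first set bit in the priority
 * bitmap, then take the head of the corresponding list. The hot path in
 * schedule() and the load balancer below both follow this pattern.
 */
static inline task_t *__peek_next_task(prio_array_t *array)
{
        int idx = sched_find_first_bit(array->bitmap);

        if (idx >= MAX_PRIO)
                /* empty array */
                return NULL;
        return list_entry(array->queue[idx].next, task_t, run_list);
}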
196
197 /*
198  * This is the main, per-CPU runqueue data structure.
199  *
200  * Locking rule: in places that need to lock multiple runqueues
201  * (such as the load-balancing or thread-migration code), lock
202  * acquire operations must be ordered by ascending runqueue address.
203  */
204 struct runqueue {
205         spinlock_t lock;
206         unsigned long long nr_switches;
207         unsigned long nr_running, expired_timestamp, nr_uninterruptible,
208                 timestamp_last_tick;
209         task_t *curr, *idle;
210         struct mm_struct *prev_mm;
211         prio_array_t *active, *expired, arrays[2];
212         int best_expired_prio, prev_cpu_load[NR_CPUS];
213 #ifdef CONFIG_NUMA
214         atomic_t *node_nr_running;
215         int prev_node_load[MAX_NUMNODES];
216 #endif
217         task_t *migration_thread;
218         struct list_head migration_queue;
219         struct list_head hold_queue;
220         int idle_tokens;
221
222         atomic_t nr_iowait;
223 };
224
225 static DEFINE_PER_CPU(struct runqueue, runqueues);
226
227 #define cpu_rq(cpu)             (&per_cpu(runqueues, (cpu)))
228 #define this_rq()               (&__get_cpu_var(runqueues))
229 #define task_rq(p)              cpu_rq(task_cpu(p))
230 #define cpu_curr(cpu)           (cpu_rq(cpu)->curr)
231
232 extern unsigned long __scheduling_functions_start_here;
233 extern unsigned long __scheduling_functions_end_here;
234 const unsigned long scheduling_functions_start_here =
235                         (unsigned long)&__scheduling_functions_start_here;
236 const unsigned long scheduling_functions_end_here =
237                         (unsigned long)&__scheduling_functions_end_here;
238
239 /*
240  * Default context-switch locking:
241  */
242 #ifndef prepare_arch_switch
243 # define prepare_arch_switch(rq, next)  do { } while (0)
244 # define finish_arch_switch(rq, next)   spin_unlock_irq(&(rq)->lock)
245 # define task_running(rq, p)            ((rq)->curr == (p))
246 #endif
247
248 #ifdef CONFIG_NUMA
249
250 /*
251  * Keep track of running tasks.
252  */
253
254 static atomic_t node_nr_running[MAX_NUMNODES] ____cacheline_maxaligned_in_smp =
255         {[0 ...MAX_NUMNODES-1] = ATOMIC_INIT(0)};
256
257 static inline void nr_running_init(struct runqueue *rq)
258 {
259         rq->node_nr_running = &node_nr_running[0];
260 }
261
262 static inline void nr_running_inc(runqueue_t *rq)
263 {
264         atomic_inc(rq->node_nr_running);
265         rq->nr_running++;
266 }
267
268 static inline void nr_running_dec(runqueue_t *rq)
269 {
270         atomic_dec(rq->node_nr_running);
271         rq->nr_running--;
272 }
273
274 __init void node_nr_running_init(void)
275 {
276         int i;
277
278         for (i = 0; i < NR_CPUS; i++) {
279                 if (cpu_possible(i))
280                         cpu_rq(i)->node_nr_running =
281                                 &node_nr_running[cpu_to_node(i)];
282         }
283 }
284
285 #else /* !CONFIG_NUMA */
286
287 # define nr_running_init(rq)    do { } while (0)
288 # define nr_running_inc(rq)     do { (rq)->nr_running++; } while (0)
289 # define nr_running_dec(rq)     do { (rq)->nr_running--; } while (0)
290
291 #endif /* CONFIG_NUMA */
292
293 /*
294  * task_rq_lock - lock the runqueue a given task resides on and disable
295  * interrupts.  Note the ordering: we can safely look up the task_rq without
296  * explicitly disabling preemption.
297  */
298 static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
299 {
300         struct runqueue *rq;
301
302 repeat_lock_task:
303         local_irq_save(*flags);
304         rq = task_rq(p);
305         spin_lock(&rq->lock);
306         if (unlikely(rq != task_rq(p))) {
307                 spin_unlock_irqrestore(&rq->lock, *flags);
308                 goto repeat_lock_task;
309         }
310         return rq;
311 }
312
313 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
314 {
315         spin_unlock_irqrestore(&rq->lock, *flags);
316 }
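/*
 * Illustrative usage sketch (not part of the original source): a
 * hypothetical helper showing the usual task_rq_lock()/task_rq_unlock()
 * calling pattern - look the runqueue up, operate on it with interrupts
 * disabled, then unlock and restore the interrupt state.
 */
static inline unsigned long __task_rq_nr_running(task_t *p)
{
        unsigned long flags, nr;
        runqueue_t *rq = task_rq_lock(p, &flags);

        nr = rq->nr_running;
        task_rq_unlock(rq, &flags);
        return nr;
}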
317
318 /*
319  * rq_lock - lock a given runqueue and disable interrupts.
320  */
321 static inline runqueue_t *this_rq_lock(void)
322 {
323         runqueue_t *rq;
324
325         local_irq_disable();
326         rq = this_rq();
327         spin_lock(&rq->lock);
328
329         return rq;
330 }
331
332 static inline void rq_unlock(runqueue_t *rq)
333 {
334         spin_unlock_irq(&rq->lock);
335 }
336
337 /*
338  * Adding/removing a task to/from a priority array:
339  */
340 static inline void dequeue_task(struct task_struct *p, prio_array_t *array)
341 {
342         array->nr_active--;
343         list_del(&p->run_list);
344         if (list_empty(array->queue + p->prio))
345                 __clear_bit(p->prio, array->bitmap);
346 }
347
348 static inline void enqueue_task(struct task_struct *p, prio_array_t *array)
349 {
350         list_add_tail(&p->run_list, array->queue + p->prio);
351         __set_bit(p->prio, array->bitmap);
352         array->nr_active++;
353         p->array = array;
354 }
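/*
 * Illustrative sketch (not part of the original source): dropping a task to
 * the tail of its priority list is just a dequeue followed by an enqueue on
 * the same array - this is what the round-robin paths in scheduler_tick()
 * do for SCHED_RR tasks and for interactive tasks being split up.
 */
static inline void __requeue_task_tail(struct task_struct *p, prio_array_t *array)
{
        dequeue_task(p, array);
        enqueue_task(p, array);
}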
355
356 /*
357  * effective_prio - return the priority that is based on the static
358  * priority but is modified by bonuses/penalties.
359  *
360  * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
361  * into the -5 ... 0 ... +5 bonus/penalty range.
362  *
363  * We use 25% of the full 0...39 priority range so that:
364  *
365  * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
366  * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
367  *
368  * Both properties are important to certain workloads.
369  */
370 static int effective_prio(task_t *p)
371 {
372         int bonus, prio;
373
374         if (rt_task(p))
375                 return p->prio;
376
377         bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
378
379         prio = p->static_prio - bonus;
380         if (__vx_task_flags(p, VXF_SCHED_PRIO, 0))
381                 prio += effective_vavavoom(p, MAX_USER_PRIO);
382
383         if (prio < MAX_RT_PRIO)
384                 prio = MAX_RT_PRIO;
385         if (prio > MAX_PRIO-1)
386                 prio = MAX_PRIO-1;
387         return prio;
388 }
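/*
 * Illustrative sketch (not part of the original source): the bonus applied
 * in effective_prio(), computed straight from a sleep_avg value. With
 * MAX_BONUS == 10 the result ranges from -5 (sleep_avg == 0, a pure CPU hog)
 * to +5 (sleep_avg at NS_MAX_SLEEP_AVG, a fully interactive task).
 */
static inline int __sleep_avg_to_bonus(unsigned long long sleep_avg)
{
        return (int)(NS_TO_JIFFIES(sleep_avg) * MAX_BONUS / MAX_SLEEP_AVG) -
                MAX_BONUS / 2;
}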
389
390 /*
391  * __activate_task - move a task to the runqueue.
392  */
393 static inline void __activate_task(task_t *p, runqueue_t *rq)
394 {
395         enqueue_task(p, rq->active);
396         nr_running_inc(rq);
397 }
398
399 static void recalc_task_prio(task_t *p, unsigned long long now)
400 {
401         unsigned long long __sleep_time = now - p->timestamp;
402         unsigned long sleep_time;
403
404         if (__sleep_time > NS_MAX_SLEEP_AVG)
405                 sleep_time = NS_MAX_SLEEP_AVG;
406         else
407                 sleep_time = (unsigned long)__sleep_time;
408
409         if (likely(sleep_time > 0)) {
410                 /*
411                  * User tasks that sleep a long time are categorised as
412                  * idle and get just interactive status to stay active,
413                  * preventing them from suddenly becoming cpu hogs and starving
414                  * other processes.
415                  */
416                 if (p->mm && p->activated != -1 &&
417                         sleep_time > INTERACTIVE_SLEEP(p)) {
418                                 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
419                                                 AVG_TIMESLICE);
420                                 if (!HIGH_CREDIT(p))
421                                         p->interactive_credit++;
422                 } else {
423                         /*
424                          * The lower the sleep avg a task has the more
425                          * rapidly it will rise with sleep time.
426                          */
427                         sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
428
429                         /*
430                          * Tasks with low interactive_credit are limited to
431                          * one timeslice worth of sleep avg bonus.
432                          */
433                         if (LOW_CREDIT(p) &&
434                             sleep_time > JIFFIES_TO_NS(task_timeslice(p)))
435                                 sleep_time = JIFFIES_TO_NS(task_timeslice(p));
436
437                         /*
438                          * Non high_credit tasks waking from uninterruptible
439                          * sleep are limited in their sleep_avg rise as they
440                          * are likely to be cpu hogs waiting on I/O
441                          */
442                         if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm) {
443                                 if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
444                                         sleep_time = 0;
445                                 else if (p->sleep_avg + sleep_time >=
446                                                 INTERACTIVE_SLEEP(p)) {
447                                         p->sleep_avg = INTERACTIVE_SLEEP(p);
448                                         sleep_time = 0;
449                                 }
450                         }
451
452                         /*
453                          * This code gives a bonus to interactive tasks.
454                          *
455                          * The boost works by updating the 'average sleep time'
456                          * value here, based on ->timestamp. The more time a
457                          * task spends sleeping, the higher the average gets -
458                          * and the higher the priority boost gets as well.
459                          */
460                         p->sleep_avg += sleep_time;
461
462                         if (p->sleep_avg > NS_MAX_SLEEP_AVG) {
463                                 p->sleep_avg = NS_MAX_SLEEP_AVG;
464                                 if (!HIGH_CREDIT(p))
465                                         p->interactive_credit++;
466                         }
467                 }
468         }
469
470         p->prio = effective_prio(p);
471 }
472
473 /*
474  * activate_task - move a task to the runqueue and do priority recalculation
475  *
476  * Update all the scheduling statistics stuff. (sleep average
477  * calculation, priority modifiers, etc.)
478  */
479 static inline void activate_task(task_t *p, runqueue_t *rq)
480 {
481         unsigned long long now = sched_clock();
482
483         recalc_task_prio(p, now);
484
485         /*
486          * This checks to make sure it's not an uninterruptible task
487          * that is now waking up.
488          */
489         if (!p->activated) {
490                 /*
491                  * Tasks which were woken up by interrupts (ie. hw events)
492                  * are most likely of interactive nature. So we give them
493                  * the credit of extending their sleep time to the period
494                  * of time they spend on the runqueue, waiting for execution
495                  * on a CPU, first time around:
496                  */
497                 if (in_interrupt())
498                         p->activated = 2;
499                 else {
500                         /*
501                          * Normal first-time wakeups get a credit too for
502                          * on-runqueue time, but it will be weighted down:
503                          */
504                         p->activated = 1;
505                 }
506         }
507         p->timestamp = now;
508
509         __activate_task(p, rq);
510 }
511
512 /*
513  * deactivate_task - remove a task from the runqueue.
514  */
515 static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
516 {
517         nr_running_dec(rq);
518         if (p->state == TASK_UNINTERRUPTIBLE)
519                 rq->nr_uninterruptible++;
520         dequeue_task(p, p->array);
521         p->array = NULL;
522 }
523
524 /*
525  * resched_task - mark a task 'to be rescheduled now'.
526  *
527  * On UP this means the setting of the need_resched flag, on SMP it
528  * might also involve a cross-CPU call to trigger the scheduler on
529  * the target CPU.
530  */
531 static inline void resched_task(task_t *p)
532 {
533 #ifdef CONFIG_SMP
534         int need_resched, nrpolling;
535
536         preempt_disable();
537         /* minimise the chance of sending an interrupt to poll_idle() */
538         nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
539         need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
540         nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
541
542         if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
543                 smp_send_reschedule(task_cpu(p));
544         preempt_enable();
545 #else
546         set_tsk_need_resched(p);
547 #endif
548 }
549
550 /**
551  * task_curr - is this task currently executing on a CPU?
552  * @p: the task in question.
553  */
554 inline int task_curr(task_t *p)
555 {
556         return cpu_curr(task_cpu(p)) == p;
557 }
558
559 #ifdef CONFIG_SMP
560 typedef struct {
561         struct list_head list;
562         task_t *task;
563         struct completion done;
564 } migration_req_t;
565
566 /*
567  * The task's runqueue lock must be held, and the new mask must be valid.
568  * Returns true if you have to wait for migration thread.
569  */
570 static int __set_cpus_allowed(task_t *p, cpumask_t new_mask,
571                                 migration_req_t *req)
572 {
573         runqueue_t *rq = task_rq(p);
574
575         p->cpus_allowed = new_mask;
576         /*
577          * Can the task run on the task's current CPU? If not then
578          * migrate the thread off to a proper CPU.
579          */
580         if (cpu_isset(task_cpu(p), new_mask))
581                 return 0;
582
583         /*
584          * If the task is not on a runqueue (and not running), then
585          * it is sufficient to simply update the task's cpu field.
586          */
587         if (!p->array && !task_running(rq, p)) {
588                 set_task_cpu(p, any_online_cpu(p->cpus_allowed));
589                 return 0;
590         }
591
592         init_completion(&req->done);
593         req->task = p;
594         list_add(&req->list, &rq->migration_queue);
595         return 1;
596 }
597
598 /*
599  * wait_task_inactive - wait for a thread to unschedule.
600  *
601  * The caller must ensure that the task *will* unschedule sometime soon,
602  * else this function might spin for a *long* time. This function can't
603  * be called with interrupts off, or it may introduce deadlock with
604  * smp_call_function() if an IPI is sent by the same process we are
605  * waiting to become inactive.
606  */
607 void wait_task_inactive(task_t * p)
608 {
609         unsigned long flags;
610         runqueue_t *rq;
611         int preempted;
612
613 repeat:
614         rq = task_rq_lock(p, &flags);
615         /* Must be off runqueue entirely, not preempted. */
616         if (unlikely(p->array)) {
617                 /* If it's preempted, we yield.  It could be a while. */
618                 preempted = !task_running(rq, p);
619                 task_rq_unlock(rq, &flags);
620                 cpu_relax();
621                 if (preempted)
622                         yield();
623                 goto repeat;
624         }
625         task_rq_unlock(rq, &flags);
626 }
627
628 /***
629  * kick_process - kick a running thread to enter/exit the kernel
630  * @p: the to-be-kicked thread
631  *
632  * Cause a process which is running on another CPU to enter
633  * kernel-mode, without any delay. (to get signals handled.)
634  */
635 void kick_process(task_t *p)
636 {
637         int cpu;
638
639         preempt_disable();
640         cpu = task_cpu(p);
641         if ((cpu != smp_processor_id()) && task_curr(p))
642                 smp_send_reschedule(cpu);
643         preempt_enable();
644 }
645
646 EXPORT_SYMBOL_GPL(kick_process);
647
648 #endif
649
650 /***
651  * try_to_wake_up - wake up a thread
652  * @p: the to-be-woken-up thread
653  * @state: the mask of task states that can be woken
654  * @sync: do a synchronous wakeup?
655  *
656  * Put it on the run-queue if it's not already there. The "current"
657  * thread is always on the run-queue (except when the actual
658  * re-schedule is in progress), and as such you're allowed to do
659  * the simpler "current->state = TASK_RUNNING" to mark yourself
660  * runnable without the overhead of this.
661  *
662  * returns failure only if the task is already active.
663  */
664 static int try_to_wake_up(task_t * p, unsigned int state, int sync)
665 {
666         unsigned long flags;
667         int success = 0;
668         long old_state;
669         runqueue_t *rq;
670
671 repeat_lock_task:
672         rq = task_rq_lock(p, &flags);
673         old_state = p->state;
674         if (old_state & state) {
675                 if (!p->array) {
676                         /*
677                          * Fast-migrate the task if it's not running or runnable
678                          * currently. Do not violate hard affinity.
679                          */
680                         if (unlikely(sync && !task_running(rq, p) &&
681                                 (task_cpu(p) != smp_processor_id()) &&
682                                         cpu_isset(smp_processor_id(),
683                                                         p->cpus_allowed) &&
684                                         !cpu_is_offline(smp_processor_id()))) {
685                                 set_task_cpu(p, smp_processor_id());
686                                 task_rq_unlock(rq, &flags);
687                                 goto repeat_lock_task;
688                         }
689                         if (old_state == TASK_UNINTERRUPTIBLE) {
690                                 rq->nr_uninterruptible--;
691                                 /*
692                                  * Tasks on involuntary sleep don't earn
693                                  * sleep_avg beyond just interactive state.
694                                  */
695                                 p->activated = -1;
696                         }
697                         if (sync && (task_cpu(p) == smp_processor_id()))
698                                 __activate_task(p, rq);
699                         else {
700                                 activate_task(p, rq);
701                                 if (TASK_PREEMPTS_CURR(p, rq))
702                                         resched_task(rq->curr);
703                         }
704                         success = 1;
705                 }
706                 p->state = TASK_RUNNING;
707         }
708         task_rq_unlock(rq, &flags);
709
710         return success;
711 }
712 int fastcall wake_up_process(task_t * p)
713 {
714         return try_to_wake_up(p, TASK_STOPPED |
715                                  TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
716 }
717
718 EXPORT_SYMBOL(wake_up_process);
719
720 int fastcall wake_up_state(task_t *p, unsigned int state)
721 {
722         return try_to_wake_up(p, state, 0);
723 }
724
725 /*
726  * Perform scheduler related setup for a newly forked process p.
727  * p is forked by current.
728  */
729 void fastcall sched_fork(task_t *p)
730 {
731         /*
732          * We mark the process as running here, but have not actually
733          * inserted it onto the runqueue yet. This guarantees that
734          * nobody will actually run it, and a signal or other external
735          * event cannot wake it up and insert it on the runqueue either.
736          */
737         p->state = TASK_RUNNING;
738         INIT_LIST_HEAD(&p->run_list);
739         p->array = NULL;
740         spin_lock_init(&p->switch_lock);
741 #ifdef CONFIG_PREEMPT
742         /*
743          * During context-switch we hold precisely one spinlock, which
744          * schedule_tail drops. (in the common case it's this_rq()->lock,
745          * but it also can be p->switch_lock.) So we compensate with a count
746          * of 1. Also, we want to start with kernel preemption disabled.
747          */
748         p->thread_info->preempt_count = 1;
749 #endif
750         /*
751          * Share the timeslice between parent and child, so that the
752          * total amount of pending timeslices in the system doesn't change,
753          * resulting in more scheduling fairness.
754          */
755         local_irq_disable();
756         p->time_slice = (current->time_slice + 1) >> 1;
757         /*
758          * The remainder of the first timeslice might be recovered by
759          * the parent if the child exits early enough.
760          */
761         p->first_time_slice = 1;
762         current->time_slice >>= 1;
763         p->timestamp = sched_clock();
764         if (!current->time_slice) {
765                 /*
766                  * This case is rare, it happens when the parent has only
767                  * a single jiffy left from its timeslice. Taking the
768                  * runqueue lock is not a problem.
769                  */
770                 current->time_slice = 1;
771                 preempt_disable();
772                 scheduler_tick(0, 0);
773                 local_irq_enable();
774                 preempt_enable();
775         } else
776                 local_irq_enable();
777 }
778
779 /*
780  * wake_up_forked_process - wake up a freshly forked process.
781  *
782  * This function will do some initial scheduler statistics housekeeping
783  * that must be done for every newly created process.
784  */
785 void fastcall wake_up_forked_process(task_t * p)
786 {
787         unsigned long flags;
788         runqueue_t *rq = task_rq_lock(current, &flags);
789
790         BUG_ON(p->state != TASK_RUNNING);
791
792         /*
793          * We decrease the sleep average of forking parents
794          * and children as well, to keep max-interactive tasks
795          * from forking tasks that are max-interactive.
796          */
797         current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
798                 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
799
800         p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
801                 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
802
803         p->interactive_credit = 0;
804
805         p->prio = effective_prio(p);
806         set_task_cpu(p, smp_processor_id());
807
808         if (unlikely(!current->array))
809                 __activate_task(p, rq);
810         else {
811                 p->prio = current->prio;
812                 list_add_tail(&p->run_list, &current->run_list);
813                 p->array = current->array;
814                 p->array->nr_active++;
815                 nr_running_inc(rq);
816         }
817         task_rq_unlock(rq, &flags);
818 }
819
820 /*
821  * Potentially available exiting-child timeslices are
822  * retrieved here - this way the parent does not get
823  * penalized for creating too many threads.
824  *
825  * (this cannot be used to 'generate' timeslices
826  * artificially, because any timeslice recovered here
827  * was given away by the parent in the first place.)
828  */
829 void fastcall sched_exit(task_t * p)
830 {
831         unsigned long flags;
832         runqueue_t *rq;
833
834         local_irq_save(flags);
835         if (p->first_time_slice) {
836                 p->parent->time_slice += p->time_slice;
837                 if (unlikely(p->parent->time_slice > MAX_TIMESLICE))
838                         p->parent->time_slice = MAX_TIMESLICE;
839         }
840         local_irq_restore(flags);
841         /*
842          * If the child was a (relative-) CPU hog then decrease
843          * the sleep_avg of the parent as well.
844          */
845         rq = task_rq_lock(p->parent, &flags);
846         if (p->sleep_avg < p->parent->sleep_avg)
847                 p->parent->sleep_avg = p->parent->sleep_avg /
848                 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
849                 (EXIT_WEIGHT + 1);
850         task_rq_unlock(rq, &flags);
851 }
852
853 /**
854  * finish_task_switch - clean up after a task-switch
855  * @prev: the thread we just switched away from.
856  *
857  * We enter this with the runqueue still locked, and finish_arch_switch()
858  * will unlock it along with doing any other architecture-specific cleanup
859  * actions.
860  *
861  * Note that we may have delayed dropping an mm in context_switch(). If
862  * so, we finish that here outside of the runqueue lock.  (Doing it
863  * with the lock held can cause deadlocks; see schedule() for
864  * details.)
865  */
866 static inline void finish_task_switch(task_t *prev)
867 {
868         runqueue_t *rq = this_rq();
869         struct mm_struct *mm = rq->prev_mm;
870         unsigned long prev_task_flags;
871
872         rq->prev_mm = NULL;
873
874         /*
875          * A task struct has one reference for the use as "current".
876          * If a task dies, then it sets TASK_ZOMBIE in tsk->state and calls
877          * schedule one last time. The schedule call will never return,
878          * and the scheduled task must drop that reference.
879          * The test for TASK_ZOMBIE must occur while the runqueue locks are
880          * still held, otherwise prev could be scheduled on another cpu, die
881          * there before we look at prev->state, and then the reference would
882          * be dropped twice.
883          *              Manfred Spraul <manfred@colorfullife.com>
884          */
885         prev_task_flags = prev->flags;
886         finish_arch_switch(rq, prev);
887         if (mm)
888                 mmdrop(mm);
889         if (unlikely(prev_task_flags & PF_DEAD))
890                 put_task_struct(prev);
891 }
892
893 /**
894  * schedule_tail - first thing a freshly forked thread must call.
895  * @prev: the thread we just switched away from.
896  */
897 asmlinkage void schedule_tail(task_t *prev)
898 {
899         finish_task_switch(prev);
900
901         if (current->set_child_tid)
902                 put_user(current->pid, current->set_child_tid);
903 }
904
905 /*
906  * context_switch - switch to the new MM and the new
907  * thread's register state.
908  */
909 static inline
910 task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
911 {
912         struct mm_struct *mm = next->mm;
913         struct mm_struct *oldmm = prev->active_mm;
914
915         if (unlikely(!mm)) {
916                 next->active_mm = oldmm;
917                 atomic_inc(&oldmm->mm_count);
918                 enter_lazy_tlb(oldmm, next);
919         } else
920                 switch_mm(oldmm, mm, next);
921
922         if (unlikely(!prev->mm)) {
923                 prev->active_mm = NULL;
924                 WARN_ON(rq->prev_mm);
925                 rq->prev_mm = oldmm;
926         }
927
928         /* Here we just switch the register state and the stack. */
929         switch_to(prev, next, prev);
930
931         return prev;
932 }
933
934 /*
935  * nr_running, nr_uninterruptible and nr_context_switches:
936  *
937  * externally visible scheduler statistics: current number of runnable
938  * threads, current number of uninterruptible-sleeping threads, total
939  * number of context switches performed since bootup.
940  */
941 unsigned long nr_running(void)
942 {
943         unsigned long i, sum = 0;
944
945         for (i = 0; i < NR_CPUS; i++)
946                 sum += cpu_rq(i)->nr_running;
947
948         return sum;
949 }
950
951 unsigned long nr_uninterruptible(void)
952 {
953         unsigned long i, sum = 0;
954
955         for_each_cpu(i)
956                 sum += cpu_rq(i)->nr_uninterruptible;
957
958         return sum;
959 }
960
961 unsigned long long nr_context_switches(void)
962 {
963         unsigned long long i, sum = 0;
964
965         for_each_cpu(i)
966                 sum += cpu_rq(i)->nr_switches;
967
968         return sum;
969 }
970
971 unsigned long nr_iowait(void)
972 {
973         unsigned long i, sum = 0;
974
975         for_each_cpu(i)
976                 sum += atomic_read(&cpu_rq(i)->nr_iowait);
977
978         return sum;
979 }
980
981 /*
982  * double_rq_lock - safely lock two runqueues
983  *
984  * Note this does not disable interrupts like task_rq_lock,
985  * you need to do so manually before calling.
986  */
987 static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
988 {
989         if (rq1 == rq2)
990                 spin_lock(&rq1->lock);
991         else {
992                 if (rq1 < rq2) {
993                         spin_lock(&rq1->lock);
994                         spin_lock(&rq2->lock);
995                 } else {
996                         spin_lock(&rq2->lock);
997                         spin_lock(&rq1->lock);
998                 }
999         }
1000 }
1001
1002 /*
1003  * double_rq_unlock - safely unlock two runqueues
1004  *
1005  * Note this does not restore interrupts like task_rq_unlock,
1006  * you need to do so manually after calling.
1007  */
1008 static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
1009 {
1010         spin_unlock(&rq1->lock);
1011         if (rq1 != rq2)
1012                 spin_unlock(&rq2->lock);
1013 }
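/*
 * Illustrative usage sketch (not part of the original source): a
 * hypothetical helper showing the double_rq_lock()/double_rq_unlock()
 * pattern - the caller disables interrupts, and the helpers take the two
 * locks in ascending address order to avoid ABBA deadlocks.
 */
static inline unsigned long __double_rq_nr_running(runqueue_t *rq1,
                                                   runqueue_t *rq2)
{
        unsigned long sum;

        /* caller must have interrupts disabled, as noted above */
        double_rq_lock(rq1, rq2);
        sum = rq1->nr_running;
        if (rq1 != rq2)
                sum += rq2->nr_running;
        double_rq_unlock(rq1, rq2);
        return sum;
}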
1014
1015 #ifdef CONFIG_NUMA
1016 /*
1017  * If dest_cpu is allowed for this process, migrate the task to it.
1018  * This is accomplished by forcing the cpu_allowed mask to only
1019  * allow dest_cpu, which will force the cpu onto dest_cpu.  Then
1020  * the cpu_allowed mask is restored.
1021  */
1022 static void sched_migrate_task(task_t *p, int dest_cpu)
1023 {
1024         runqueue_t *rq;
1025         migration_req_t req;
1026         unsigned long flags;
1027         cpumask_t old_mask, new_mask = cpumask_of_cpu(dest_cpu);
1028
1029         lock_cpu_hotplug();
1030         rq = task_rq_lock(p, &flags);
1031         old_mask = p->cpus_allowed;
1032         if (!cpu_isset(dest_cpu, old_mask) || !cpu_online(dest_cpu))
1033                 goto out;
1034
1035         /* force the process onto the specified CPU */
1036         if (__set_cpus_allowed(p, new_mask, &req)) {
1037                 /* Need to wait for migration thread. */
1038                 task_rq_unlock(rq, &flags);
1039                 wake_up_process(rq->migration_thread);
1040                 wait_for_completion(&req.done);
1041
1042                 /* If we raced with sys_sched_setaffinity, don't
1043                  * restore mask. */
1044                 rq = task_rq_lock(p, &flags);
1045                 if (likely(cpus_equal(p->cpus_allowed, new_mask))) {
1046                         /* Restore old mask: won't need migration
1047                          * thread, since current cpu is allowed. */
1048                         BUG_ON(__set_cpus_allowed(p, old_mask, NULL));
1049                 }
1050         }
1051 out:
1052         task_rq_unlock(rq, &flags);
1053         unlock_cpu_hotplug();
1054 }
1055
1056 /*
1057  * Find the least loaded CPU.  Slightly favor the current CPU by
1058  * setting its runqueue length as the minimum to start.
1059  */
1060 static int sched_best_cpu(struct task_struct *p)
1061 {
1062         int i, minload, load, best_cpu, node = 0;
1063         cpumask_t cpumask;
1064
1065         best_cpu = task_cpu(p);
1066         if (cpu_rq(best_cpu)->nr_running <= 2)
1067                 return best_cpu;
1068
1069         minload = 10000000;
1070         for_each_node_with_cpus(i) {
1071                 /*
1072                  * Node load is always divided by nr_cpus_node to normalise
1073                  * load values in case cpu count differs from node to node.
1074                  * We first multiply node_nr_running by 10 to get a little
1075                  * better resolution.
1076                  */
1077                 load = 10 * atomic_read(&node_nr_running[i]) / nr_cpus_node(i);
1078                 if (load < minload) {
1079                         minload = load;
1080                         node = i;
1081                 }
1082         }
1083
1084         minload = 10000000;
1085         cpumask = node_to_cpumask(node);
1086         for (i = 0; i < NR_CPUS; ++i) {
1087                 if (!cpu_isset(i, cpumask))
1088                         continue;
1089                 if (cpu_rq(i)->nr_running < minload) {
1090                         best_cpu = i;
1091                         minload = cpu_rq(i)->nr_running;
1092                 }
1093         }
1094         return best_cpu;
1095 }
1096
1097 void sched_balance_exec(void)
1098 {
1099         int new_cpu;
1100
1101         if (numnodes > 1) {
1102                 new_cpu = sched_best_cpu(current);
1103                 if (new_cpu != smp_processor_id())
1104                         sched_migrate_task(current, new_cpu);
1105         }
1106 }
1107
1108 /*
1109  * Find the busiest node. All previous node loads contribute with a
1110  * geometrically decaying weight to the load measure:
1111  *      load_{t} = load_{t-1}/2 + nr_node_running_{t}
1112  * This way sudden load peaks are flattened out a bit.
1113  * Node load is divided by nr_cpus_node() in order to compare nodes
1114  * of different cpu count but also [first] multiplied by 10 to
1115  * provide better resolution.
1116  */
1117 static int find_busiest_node(int this_node)
1118 {
1119         int i, node = -1, load, this_load, maxload;
1120
1121         if (!nr_cpus_node(this_node))
1122                 return node;
1123         this_load = maxload = (this_rq()->prev_node_load[this_node] >> 1)
1124                 + (10 * atomic_read(&node_nr_running[this_node])
1125                 / nr_cpus_node(this_node));
1126         this_rq()->prev_node_load[this_node] = this_load;
1127         for_each_node_with_cpus(i) {
1128                 if (i == this_node)
1129                         continue;
1130                 load = (this_rq()->prev_node_load[i] >> 1)
1131                         + (10 * atomic_read(&node_nr_running[i])
1132                         / nr_cpus_node(i));
1133                 this_rq()->prev_node_load[i] = load;
1134                 if (load > maxload && (100*load > NODE_THRESHOLD*this_load)) {
1135                         maxload = load;
1136                         node = i;
1137                 }
1138         }
1139         return node;
1140 }
1141
1142 #endif /* CONFIG_NUMA */
1143
1144 #ifdef CONFIG_SMP
1145
1146 /*
1147  * double_lock_balance - lock the busiest runqueue
1148  *
1149  * this_rq is locked already. Recalculate nr_running if we have to
1150  * drop the runqueue lock.
1151  */
1152 static inline
1153 unsigned int double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest,
1154                                  int this_cpu, int idle,
1155                                  unsigned int nr_running)
1156 {
1157         if (unlikely(!spin_trylock(&busiest->lock))) {
1158                 if (busiest < this_rq) {
1159                         spin_unlock(&this_rq->lock);
1160                         spin_lock(&busiest->lock);
1161                         spin_lock(&this_rq->lock);
1162                         /* Need to recalculate nr_running */
1163                         if (idle || (this_rq->nr_running >
1164                                         this_rq->prev_cpu_load[this_cpu]))
1165                                 nr_running = this_rq->nr_running;
1166                         else
1167                                 nr_running = this_rq->prev_cpu_load[this_cpu];
1168                 } else
1169                         spin_lock(&busiest->lock);
1170         }
1171         return nr_running;
1172 }
1173
1174 /*
1175  * find_busiest_queue - find the busiest runqueue among the cpus in cpumask.
1176  */
1177 static inline
1178 runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle,
1179                                int *imbalance, cpumask_t cpumask)
1180 {
1181         int nr_running, load, max_load, i;
1182         runqueue_t *busiest, *rq_src;
1183
1184         /*
1185          * We search all runqueues to find the most busy one.
1186          * We do this lockless to reduce cache-bouncing overhead,
1187          * we re-check the 'best' source CPU later on again, with
1188          * the lock held.
1189          *
1190          * We fend off statistical fluctuations in runqueue lengths by
1191          * saving the runqueue length (as seen by the balancing CPU) during
1192          * the previous load-balancing operation and using the smaller one
1193  * of the current and saved lengths. If a runqueue remains long
1194  * over a longer period of time, we recognize it and pull tasks
1195          * from it.
1196          *
1197          * The 'current runqueue length' is a statistical maximum variable,
1198          * for that one we take the longer one - to avoid fluctuations in
1199          * the other direction. So for a load-balance to happen it needs
1200          * stable long runqueue on the target CPU and stable short runqueue
1201          * on the local runqueue.
1202          *
1203          * We make an exception if this CPU is about to become idle - in
1204          * that case we are less picky about moving a task across CPUs and
1205          * take what can be taken.
1206          */
1207         if (idle || (this_rq->nr_running > this_rq->prev_cpu_load[this_cpu]))
1208                 nr_running = this_rq->nr_running;
1209         else
1210                 nr_running = this_rq->prev_cpu_load[this_cpu];
1211
1212         busiest = NULL;
1213         max_load = 1;
1214         for (i = 0; i < NR_CPUS; i++) {
1215                 if (!cpu_isset(i, cpumask))
1216                         continue;
1217
1218                 rq_src = cpu_rq(i);
1219                 if (idle || (rq_src->nr_running < this_rq->prev_cpu_load[i]))
1220                         load = rq_src->nr_running;
1221                 else
1222                         load = this_rq->prev_cpu_load[i];
1223                 this_rq->prev_cpu_load[i] = rq_src->nr_running;
1224
1225                 if ((load > max_load) && (rq_src != this_rq)) {
1226                         busiest = rq_src;
1227                         max_load = load;
1228                 }
1229         }
1230
1231         if (likely(!busiest))
1232                 goto out;
1233
1234         *imbalance = max_load - nr_running;
1235
1236          /* It needs at least a ~25% imbalance to trigger balancing. */
1237         if (!idle && ((*imbalance)*4 < max_load)) {
1238                 busiest = NULL;
1239                 goto out;
1240         }
1241
1242         nr_running = double_lock_balance(this_rq, busiest, this_cpu,
1243                                          idle, nr_running);
1244         /*
1245          * Make sure nothing changed since we checked the
1246          * runqueue length.
1247          */
1248         if (busiest->nr_running <= nr_running) {
1249                 spin_unlock(&busiest->lock);
1250                 busiest = NULL;
1251         }
1252 out:
1253         return busiest;
1254 }
1255
1256 /*
1257  * pull_task - move a task from a remote runqueue to the local runqueue.
1258  * Both runqueues must be locked.
1259  */
1260 static inline
1261 void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1262                runqueue_t *this_rq, int this_cpu)
1263 {
1264         dequeue_task(p, src_array);
1265         nr_running_dec(src_rq);
1266         set_task_cpu(p, this_cpu);
1267         nr_running_inc(this_rq);
1268         enqueue_task(p, this_rq->active);
1269         p->timestamp = sched_clock() -
1270                                 (src_rq->timestamp_last_tick - p->timestamp);
1271         /*
1272          * Note that idle threads have a prio of MAX_PRIO, so this test
1273          * is always true for them.
1274          */
1275         if (TASK_PREEMPTS_CURR(p, this_rq))
1276                 set_need_resched();
1277 }
1278
1279 /*
1280  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1281  */
1282 static inline
1283 int can_migrate_task(task_t *tsk, runqueue_t *rq, int this_cpu, int idle)
1284 {
1285         unsigned long delta = rq->timestamp_last_tick - tsk->timestamp;
1286
1287         /*
1288          * We do not migrate tasks that are:
1289          * 1) running (obviously), or
1290          * 2) cannot be migrated to this CPU due to cpus_allowed, or
1291          * 3) are cache-hot on their current CPU.
1292          */
1293         if (task_running(rq, tsk))
1294                 return 0;
1295         if (!cpu_isset(this_cpu, tsk->cpus_allowed))
1296                 return 0;
1297         if (!idle && (delta <= JIFFIES_TO_NS(cache_decay_ticks)))
1298                 return 0;
1299         return 1;
1300 }
1301
1302 /*
1303  * Current runqueue is empty, or rebalance tick: if there is an
1304  * imbalance (current runqueue is too short) then pull from
1305  * busiest runqueue(s).
1306  *
1307  * We call this with the current runqueue locked,
1308  * irqs disabled.
1309  */
1310 static void load_balance(runqueue_t *this_rq, int idle, cpumask_t cpumask)
1311 {
1312         int imbalance, idx, this_cpu = smp_processor_id();
1313         runqueue_t *busiest;
1314         prio_array_t *array;
1315         struct list_head *head, *curr;
1316         task_t *tmp;
1317
1318         if (cpu_is_offline(this_cpu))
1319                 goto out;
1320
1321         busiest = find_busiest_queue(this_rq, this_cpu, idle,
1322                                      &imbalance, cpumask);
1323         if (!busiest)
1324                 goto out;
1325
1326         /*
1327          * We only want to steal a number of tasks equal to 1/2 the imbalance,
1328          * otherwise we'll just shift the imbalance to the new queue:
1329          */
1330         imbalance /= 2;
1331
1332         /*
1333          * We first consider expired tasks. Those will likely not be
1334          * executed in the near future, and they are most likely to
1335          * be cache-cold, thus switching CPUs has the least effect
1336          * on them.
1337          */
1338         if (busiest->expired->nr_active)
1339                 array = busiest->expired;
1340         else
1341                 array = busiest->active;
1342
1343 new_array:
1344         /* Start searching at priority 0: */
1345         idx = 0;
1346 skip_bitmap:
1347         if (!idx)
1348                 idx = sched_find_first_bit(array->bitmap);
1349         else
1350                 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1351         if (idx >= MAX_PRIO) {
1352                 if (array == busiest->expired) {
1353                         array = busiest->active;
1354                         goto new_array;
1355                 }
1356                 goto out_unlock;
1357         }
1358
1359         head = array->queue + idx;
1360         curr = head->prev;
1361 skip_queue:
1362         tmp = list_entry(curr, task_t, run_list);
1363
1364         curr = curr->prev;
1365
1366         if (!can_migrate_task(tmp, busiest, this_cpu, idle)) {
1367                 if (curr != head)
1368                         goto skip_queue;
1369                 idx++;
1370                 goto skip_bitmap;
1371         }
1372         pull_task(busiest, array, tmp, this_rq, this_cpu);
1373
1374         /* Only migrate one task if we are idle */
1375         if (!idle && --imbalance) {
1376                 if (curr != head)
1377                         goto skip_queue;
1378                 idx++;
1379                 goto skip_bitmap;
1380         }
1381 out_unlock:
1382         spin_unlock(&busiest->lock);
1383 out:
1384         ;
1385 }
1386
1387 /*
1388  * One of the idle_cpu_tick() and busy_cpu_tick() functions will
1389  * get called every timer tick, on every CPU. Our balancing action
1390  * frequency and balancing aggressiveness depend on whether the CPU is
1391  * idle or not.
1392  *
1393  * busy-rebalance every 200 msecs. idle-rebalance every 1 msec. (or on
1394  * systems with HZ=100, every 10 msecs.)
1395  *
1396  * On NUMA, do a node-rebalance every 400 msecs.
1397  */
1398 #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
1399 #define BUSY_REBALANCE_TICK (HZ/5 ?: 1)
1400 #define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 5)
1401 #define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2)
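/*
 * Illustrative worked values (not part of the original source): with
 * HZ == 1000 the intervals above come to 1 tick (1ms) for idle rebalancing,
 * 200 ticks (200ms) when busy, 5ms for idle node rebalancing and 400ms for
 * busy node rebalancing. With HZ == 100 the "?: 1" fallback keeps the idle
 * interval at a single 10ms tick, as the comment above notes.
 */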
1402
1403 #ifdef CONFIG_NUMA
1404 static void balance_node(runqueue_t *this_rq, int idle, int this_cpu)
1405 {
1406         int node = find_busiest_node(cpu_to_node(this_cpu));
1407
1408         if (node >= 0) {
1409                 cpumask_t cpumask = node_to_cpumask(node);
1410                 cpu_set(this_cpu, cpumask);
1411                 spin_lock(&this_rq->lock);
1412                 load_balance(this_rq, idle, cpumask);
1413                 spin_unlock(&this_rq->lock);
1414         }
1415 }
1416 #endif
1417
1418 static void rebalance_tick(runqueue_t *this_rq, int idle)
1419 {
1420 #ifdef CONFIG_NUMA
1421         int this_cpu = smp_processor_id();
1422 #endif
1423         unsigned long j = jiffies;
1424
1425         /*
1426          * First do inter-node rebalancing, then intra-node rebalancing,
1427          * if both events happen in the same tick. The inter-node
1428          * rebalancing does not necessarily have to create a perfect
1429          * balance within the node, since we load-balance the most loaded
1430          * node with the current CPU. (ie. other CPUs in the local node
1431          * are not balanced.)
1432          */
1433         if (idle) {
1434 #ifdef CONFIG_NUMA
1435                 if (!(j % IDLE_NODE_REBALANCE_TICK))
1436                         balance_node(this_rq, idle, this_cpu);
1437 #endif
1438                 if (!(j % IDLE_REBALANCE_TICK)) {
1439                         spin_lock(&this_rq->lock);
1440                         load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
1441                         spin_unlock(&this_rq->lock);
1442                 }
1443                 return;
1444         }
1445 #ifdef CONFIG_NUMA
1446         if (!(j % BUSY_NODE_REBALANCE_TICK))
1447                 balance_node(this_rq, idle, this_cpu);
1448 #endif
1449         if (!(j % BUSY_REBALANCE_TICK)) {
1450                 spin_lock(&this_rq->lock);
1451                 load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
1452                 spin_unlock(&this_rq->lock);
1453         }
1454 }
1455 #else
1456 /*
1457  * on UP we do not need to balance between CPUs:
1458  */
1459 static inline void rebalance_tick(runqueue_t *this_rq, int idle)
1460 {
1461 }
1462 #endif
1463
1464 DEFINE_PER_CPU(struct kernel_stat, kstat);
1465
1466 EXPORT_PER_CPU_SYMBOL(kstat);
1467
1468 /*
1469  * We place interactive tasks back into the active array, if possible.
1470  *
1471  * To guarantee that this does not starve expired tasks we ignore the
1472  * interactivity of a task if the first expired task had to wait more
1473  * than a 'reasonable' amount of time. This deadline timeout is
1474  * load-dependent, as the frequency of array switches decreases with
1475  * an increasing number of running tasks. We also ignore the interactivity
1476  * if a better static_prio task has expired:
1477  */
1478 #define EXPIRED_STARVING(rq) \
1479         ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
1480                 (jiffies - (rq)->expired_timestamp >= \
1481                         STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
1482                         ((rq)->curr->static_prio > (rq)->best_expired_prio))
1483
1484 /*
1485  * This function gets called by the timer code, with HZ frequency.
1486  * We call it with interrupts disabled.
1487  *
1488  * It also gets called by the fork code, when changing the parent's
1489  * timeslices.
1490  */
1491 void scheduler_tick(int user_ticks, int sys_ticks)
1492 {
1493         int cpu = smp_processor_id();
1494         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1495         runqueue_t *rq = this_rq();
1496         task_t *p = current;
1497
1498         rq->timestamp_last_tick = sched_clock();
1499
1500         if (rcu_pending(cpu))
1501                 rcu_check_callbacks(cpu, user_ticks);
1502
1503         /* note: this timer irq context must be accounted for as well */
1504         if (hardirq_count() - HARDIRQ_OFFSET) {
1505                 cpustat->irq += sys_ticks;
1506                 sys_ticks = 0;
1507         } else if (softirq_count()) {
1508                 cpustat->softirq += sys_ticks;
1509                 sys_ticks = 0;
1510         }
1511
1512         if (p == rq->idle) {
1513                 if (!--rq->idle_tokens && !list_empty(&rq->hold_queue))
1514                         set_need_resched();
1515
1516                 if (atomic_read(&rq->nr_iowait) > 0)
1517                         cpustat->iowait += sys_ticks;
1518                 else
1519                         cpustat->idle += sys_ticks;
1520                 rebalance_tick(rq, 1);
1521                 return;
1522         }
1523         if (TASK_NICE(p) > 0)
1524                 cpustat->nice += user_ticks;
1525         else
1526                 cpustat->user += user_ticks;
1527         cpustat->system += sys_ticks;
1528
1529         /* Task might have expired already, but not scheduled off yet */
1530         if (p->array != rq->active) {
1531                 set_tsk_need_resched(p);
1532                 goto out;
1533         }
1534         spin_lock(&rq->lock);
1535         /*
1536          * The task was running during this tick - update the
1537          * time slice counter. Note: we do not update a thread's
1538          * priority until it either goes to sleep or uses up its
1539          * timeslice. This makes it possible for interactive tasks
1540          * to use up their timeslices at their highest priority levels.
1541          */
1542         if (unlikely(rt_task(p))) {
1543                 /*
1544                  * RR tasks need a special form of timeslice management.
1545                  * FIFO tasks have no timeslices.
1546                  */
1547                 if ((p->policy == SCHED_RR) && !--p->time_slice) {
1548                         p->time_slice = task_timeslice(p);
1549                         p->first_time_slice = 0;
1550                         set_tsk_need_resched(p);
1551
1552                         /* put it at the end of the queue: */
1553                         dequeue_task(p, rq->active);
1554                         enqueue_task(p, rq->active);
1555                 }
1556                 goto out_unlock;
1557         }
1558         if (vx_need_resched(p)) {
1559                 dequeue_task(p, rq->active);
1560                 set_tsk_need_resched(p);
1561                 p->prio = effective_prio(p);
1562                 p->time_slice = task_timeslice(p);
1563                 p->first_time_slice = 0;
1564
1565                 if (!rq->expired_timestamp)
1566                         rq->expired_timestamp = jiffies;
1567                 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
1568                         enqueue_task(p, rq->expired);
1569                         if (p->static_prio < rq->best_expired_prio)
1570                                 rq->best_expired_prio = p->static_prio;
1571                 } else
1572                         enqueue_task(p, rq->active);
1573         } else {
1574                 /*
1575                  * Prevent a too long timeslice allowing a task to monopolize
1576                  * the CPU. We do this by splitting up the timeslice into
1577                  * smaller pieces.
1578                  *
1579                  * Note: this does not mean the task's timeslices expire or
1580                  * get lost in any way, they just might be preempted by
1581                  * another task of equal priority. (one with higher
1582                  * priority would have preempted this task already.) We
1583                  * requeue this task to the end of the list on this priority
1584                  * level, which is in essence a round-robin of tasks with
1585                  * equal priority.
1586                  *
1587                  * This only applies to tasks in the interactive
1588                  * delta range with at least TIMESLICE_GRANULARITY to requeue.
1589                  */
1590                 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
1591                         p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
1592                         (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
1593                         (p->array == rq->active)) {
1594
1595                         dequeue_task(p, rq->active);
1596                         set_tsk_need_resched(p);
1597                         p->prio = effective_prio(p);
1598                         enqueue_task(p, rq->active);
1599                 }
1600         }
1601 out_unlock:
1602         spin_unlock(&rq->lock);
1603 out:
1604         rebalance_tick(rq, 0);
1605 }
1606
1607 /*
1608  * schedule() is the main scheduler function.
1609  */
1610 asmlinkage void __sched schedule(void)
1611 {
1612         long *switch_count;
1613         task_t *prev, *next;
1614         runqueue_t *rq;
1615         prio_array_t *array;
1616         struct list_head *queue;
1617         unsigned long long now;
1618         unsigned long run_time;
1619 #ifdef  CONFIG_VSERVER_HARDCPU          
1620         struct vx_info *vxi;
1621         int maxidle = -HZ;
1622 #endif
1623         int idx;
1624
1625         /*
1626          * Test if we are atomic.  Since do_exit() needs to call into
1627          * schedule() atomically, we ignore that path for now.
1628          * Otherwise, whine if we are scheduling when we should not be.
1629          */
1630         if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) {
1631                 if (unlikely(in_atomic())) {
1632                         printk(KERN_ERR "bad: scheduling while atomic!\n");
1633                         dump_stack();
1634                 }
1635         }
1636
1637 need_resched:
1638         preempt_disable();
1639         prev = current;
1640         rq = this_rq();
1641
1642         release_kernel_lock(prev);
1643         now = sched_clock();
1644         if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
1645                 run_time = now - prev->timestamp;
1646         else
1647                 run_time = NS_MAX_SLEEP_AVG;
1648
1649         /*
1650          * Tasks with interactive credits get charged less run_time
1651          * at high sleep_avg to delay them losing their interactive
1652          * status
1653          */
1654         if (HIGH_CREDIT(prev))
1655                 run_time /= (CURRENT_BONUS(prev) ? : 1);
1656
1657         spin_lock_irq(&rq->lock);
1658
1659         /*
1660          * if entering off of a kernel preemption go straight
1661          * to picking the next task.
1662          */
1663         switch_count = &prev->nivcsw;
1664         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
1665                 switch_count = &prev->nvcsw;
1666                 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
1667                                 unlikely(signal_pending(prev))))
1668                         prev->state = TASK_RUNNING;
1669                 else
1670                         deactivate_task(prev, rq);
1671         }
1672
1673 #ifdef  CONFIG_VSERVER_HARDCPU          
1674         if (!list_empty(&rq->hold_queue)) {
1675                 struct list_head *l, *n;
1676                 int ret;
1677
1678                 vxi = NULL;
1679                 list_for_each_safe(l, n, &rq->hold_queue) {
1680                         next = list_entry(l, task_t, run_list);
1681                         if (vxi == next->vx_info)
1682                                 continue;
1683
1684                         vxi = next->vx_info;
1685                         ret = vx_tokens_recalc(vxi);
1686                         // tokens = vx_tokens_avail(next);
1687
1688                         if (ret > 0) {
1689                                 list_del(&next->run_list);
1690                                 next->state &= ~TASK_ONHOLD;
1691                                 recalc_task_prio(next, now);
1692                                 __activate_task(next, rq);
1693                                 // printk("··· unhold %p\n", next);
1694                                 break;
1695                         }
1696                         if ((ret < 0) && (maxidle < ret))
1697                                 maxidle = ret;
1698                 }       
1699         }
1700         rq->idle_tokens = -maxidle;
1701
1702 pick_next:
1703 #endif
1704         if (unlikely(!rq->nr_running)) {
1705 #ifdef CONFIG_SMP
1706                 load_balance(rq, 1, cpu_to_node_mask(smp_processor_id()));
1707 #endif
1708                 if (!rq->nr_running) {
1709                         next = rq->idle;
1710                         rq->expired_timestamp = 0;
1711                         goto switch_tasks;
1712                 }
1713         }
1714
1715         array = rq->active;
1716         if (unlikely(!array->nr_active)) {
1717                 /*
1718                  * Switch the active and expired arrays.
1719                  */
1720                 rq->active = rq->expired;
1721                 rq->expired = array;
1722                 array = rq->active;
1723                 rq->expired_timestamp = 0;
1724                 rq->best_expired_prio = MAX_PRIO;
1725         }
1726
1727         idx = sched_find_first_bit(array->bitmap);
1728         queue = array->queue + idx;
1729         next = list_entry(queue->next, task_t, run_list);
1730
1731 #ifdef  CONFIG_VSERVER_HARDCPU          
1732         vxi = next->vx_info;
1733         if (vxi && __vx_flags(vxi->vx_flags,
1734                 VXF_SCHED_PAUSE|VXF_SCHED_HARD, 0)) {
1735                 int ret = vx_tokens_recalc(vxi);
1736
1737                 if (unlikely(ret <= 0)) {
1738                         if (ret && (rq->idle_tokens > -ret))
1739                                 rq->idle_tokens = -ret;
1740                         deactivate_task(next, rq);
1741                         list_add_tail(&next->run_list, &rq->hold_queue);
1742                         next->state |= TASK_ONHOLD;                     
1743                         goto pick_next;
1744                 }
1745         }
1746 #endif
1747
1748         if (!rt_task(next) && next->activated > 0) {
1749                 unsigned long long delta = now - next->timestamp;
1750
1751                 if (next->activated == 1)
1752                         delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
1753
1754                 array = next->array;
1755                 dequeue_task(next, array);
1756                 recalc_task_prio(next, next->timestamp + delta);
1757                 enqueue_task(next, array);
1758         }
1759         next->activated = 0;
1760 switch_tasks:
1761         prefetch(next);
1762         clear_tsk_need_resched(prev);
1763         RCU_qsctr(task_cpu(prev))++;
1764
1765         prev->sleep_avg -= run_time;
1766         if ((long)prev->sleep_avg <= 0) {
1767                 prev->sleep_avg = 0;
1768                 if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev)))
1769                         prev->interactive_credit--;
1770         }
1771         prev->timestamp = now;
1772
1773         if (likely(prev != next)) {
1774                 next->timestamp = now;
1775                 rq->nr_switches++;
1776                 rq->curr = next;
1777                 ++*switch_count;
1778
1779                 prepare_arch_switch(rq, next);
1780                 prev = context_switch(rq, prev, next);
1781                 barrier();
1782
1783                 finish_task_switch(prev);
1784         } else
1785                 spin_unlock_irq(&rq->lock);
1786
1787         reacquire_kernel_lock(current);
1788         preempt_enable_no_resched();
1789         if (test_thread_flag(TIF_NEED_RESCHED))
1790                 goto need_resched;
1791 }
1792
1793 EXPORT_SYMBOL(schedule);
1794
1795 #ifdef CONFIG_PREEMPT
1796 /*
1797  * this is the entry point to schedule() from in-kernel preemption
1798  * off of preempt_enable.  Kernel preemptions off of a return from
1799  * interrupt happen in the low-level entry code, which calls schedule() directly.
1800  */
1801 asmlinkage void __sched preempt_schedule(void)
1802 {
1803         struct thread_info *ti = current_thread_info();
1804
1805         /*
1806          * If there is a non-zero preempt_count or interrupts are disabled,
1807          * we do not want to preempt the current task.  Just return..
1808          */
1809         if (unlikely(ti->preempt_count || irqs_disabled()))
1810                 return;
1811
1812 need_resched:
1813         ti->preempt_count = PREEMPT_ACTIVE;
1814         schedule();
1815         ti->preempt_count = 0;
1816
1817         /* we could miss a preemption opportunity between schedule and now */
1818         barrier();
1819         if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
1820                 goto need_resched;
1821 }
1822
1823 EXPORT_SYMBOL(preempt_schedule);
1824 #endif /* CONFIG_PREEMPT */
1825
1826 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync)
1827 {
1828         task_t *p = curr->task;
1829         return try_to_wake_up(p, mode, sync);
1830 }
1831
1832 EXPORT_SYMBOL(default_wake_function);
1833
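/*
 * Illustrative sketch (hypothetical names, not part of this file): a
 * wait-queue entry whose wake callback wraps default_wake_function()
 * above, here just to count wakeups before waking the task. The entry
 * is initialised normally and only its ->func is overridden.
 */
#if 0
static atomic_t my_wakeups = ATOMIC_INIT(0);    /* hypothetical counter */

static int my_wake_function(wait_queue_t *wait, unsigned mode, int sync)
{
        atomic_inc(&my_wakeups);
        return default_wake_function(wait, mode, sync);
}

static void my_wait(wait_queue_head_t *q, int *condition)
{
        wait_queue_t wait;

        init_waitqueue_entry(&wait, current);
        wait.func = my_wake_function;           /* override the default */
        add_wait_queue(q, &wait);
        while (!*condition) {
                set_current_state(TASK_INTERRUPTIBLE);
                if (!*condition)
                        schedule();
        }
        set_current_state(TASK_RUNNING);
        remove_wait_queue(q, &wait);
}
#endif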
1834 /*
1835  * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
1836  * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small +ve
1837  * number) then we wake all the non-exclusive tasks and one exclusive task.
1838  *
1839  * There are circumstances in which we can try to wake a task which has already
1840  * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
1841  * zero in this (rare) case, and we handle it by continuing to scan the queue.
1842  */
1843 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
1844                              int nr_exclusive, int sync)
1845 {
1846         struct list_head *tmp, *next;
1847
1848         list_for_each_safe(tmp, next, &q->task_list) {
1849                 wait_queue_t *curr;
1850                 unsigned flags;
1851                 curr = list_entry(tmp, wait_queue_t, task_list);
1852                 flags = curr->flags;
1853                 if (curr->func(curr, mode, sync) &&
1854                     (flags & WQ_FLAG_EXCLUSIVE) &&
1855                     !--nr_exclusive)
1856                         break;
1857         }
1858 }
1859
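/*
 * Illustrative sketch (hypothetical names, not part of this file, and
 * simplified - a real caller would serialize access to the flag):
 * waiters added with add_wait_queue_exclusive() carry WQ_FLAG_EXCLUSIVE,
 * so a plain wake_up() on the queue stops after waking one of them,
 * while wake_up_all() wakes every waiter.
 */
#if 0
static DECLARE_WAIT_QUEUE_HEAD(my_work_queue);
static int my_work_available;                   /* hypothetical condition */

static void my_worker_wait(void)
{
        DECLARE_WAITQUEUE(wait, current);

        add_wait_queue_exclusive(&my_work_queue, &wait);
        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE);
                if (my_work_available)
                        break;
                schedule();
        }
        set_current_state(TASK_RUNNING);
        remove_wait_queue(&my_work_queue, &wait);
}

static void my_submit_work(void)
{
        my_work_available = 1;
        wake_up(&my_work_queue);        /* wakes at most one exclusive waiter */
}
#endif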
1860 /**
1861  * __wake_up - wake up threads blocked on a waitqueue.
1862  * @q: the waitqueue
1863  * @mode: which threads
1864  * @nr_exclusive: how many wake-one or wake-many threads to wake up
1865  */
1866 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
1867 {
1868         unsigned long flags;
1869
1870         spin_lock_irqsave(&q->lock, flags);
1871         __wake_up_common(q, mode, nr_exclusive, 0);
1872         spin_unlock_irqrestore(&q->lock, flags);
1873 }
1874
1875 EXPORT_SYMBOL(__wake_up);
1876
1877 /*
1878  * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
1879  */
1880 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
1881 {
1882         __wake_up_common(q, mode, 1, 0);
1883 }
1884
1885 /**
1886  * __wake_up_sync - wake up threads blocked on a waitqueue.
1887  * @q: the waitqueue
1888  * @mode: which threads
1889  * @nr_exclusive: how many wake-one or wake-many threads to wake up
1890  *
1891  * The sync wakeup differs in that the waker knows that it will schedule
1892  * away soon, so while the target thread will be woken up, it will not
1893  * be migrated to another CPU - i.e. the two threads are 'synchronized'
1894  * with each other. This can prevent needless bouncing between CPUs.
1895  *
1896  * On UP it can prevent extra preemption.
1897  */
1898 void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
1899 {
1900         unsigned long flags;
1901
1902         if (unlikely(!q))
1903                 return;
1904
1905         spin_lock_irqsave(&q->lock, flags);
1906         if (likely(nr_exclusive))
1907                 __wake_up_common(q, mode, nr_exclusive, 1);
1908         else
1909                 __wake_up_common(q, mode, nr_exclusive, 0);
1910         spin_unlock_irqrestore(&q->lock, flags);
1911 }
1912 EXPORT_SYMBOL_GPL(__wake_up_sync);      /* For internal use only */
1913
1914 void fastcall complete(struct completion *x)
1915 {
1916         unsigned long flags;
1917
1918         spin_lock_irqsave(&x->wait.lock, flags);
1919         x->done++;
1920         __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
1921                          1, 0);
1922         spin_unlock_irqrestore(&x->wait.lock, flags);
1923 }
1924 EXPORT_SYMBOL(complete);
1925
1926 void fastcall complete_all(struct completion *x)
1927 {
1928         unsigned long flags;
1929
1930         spin_lock_irqsave(&x->wait.lock, flags);
1931         x->done += UINT_MAX/2;
1932         __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
1933                          0, 0);
1934         spin_unlock_irqrestore(&x->wait.lock, flags);
1935 }
1936 EXPORT_SYMBOL(complete_all);
1937
1938 void fastcall __sched wait_for_completion(struct completion *x)
1939 {
1940         might_sleep();
1941         spin_lock_irq(&x->wait.lock);
1942         if (!x->done) {
1943                 DECLARE_WAITQUEUE(wait, current);
1944
1945                 wait.flags |= WQ_FLAG_EXCLUSIVE;
1946                 __add_wait_queue_tail(&x->wait, &wait);
1947                 do {
1948                         __set_current_state(TASK_UNINTERRUPTIBLE);
1949                         spin_unlock_irq(&x->wait.lock);
1950                         schedule();
1951                         spin_lock_irq(&x->wait.lock);
1952                 } while (!x->done);
1953                 __remove_wait_queue(&x->wait, &wait);
1954         }
1955         x->done--;
1956         spin_unlock_irq(&x->wait.lock);
1957 }
1958 EXPORT_SYMBOL(wait_for_completion);
1959
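/*
 * Illustrative sketch (hypothetical names, not part of the scheduler):
 * a typical completion pattern - the parent starts a helper thread and
 * blocks in wait_for_completion() until the helper signals it.
 */
#if 0
static DECLARE_COMPLETION(my_setup_done);

static int my_helper_thread(void *unused)
{
        /* ... perform the setup work ... */
        complete(&my_setup_done);       /* wake exactly one waiter */
        return 0;
}

static void my_start_helper_and_wait(void)
{
        kernel_thread(my_helper_thread, NULL, CLONE_KERNEL);
        wait_for_completion(&my_setup_done);
}
#endif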
1960 #define SLEEP_ON_VAR                                    \
1961         unsigned long flags;                            \
1962         wait_queue_t wait;                              \
1963         init_waitqueue_entry(&wait, current);
1964
1965 #define SLEEP_ON_HEAD                                   \
1966         spin_lock_irqsave(&q->lock,flags);              \
1967         __add_wait_queue(q, &wait);                     \
1968         spin_unlock(&q->lock);
1969
1970 #define SLEEP_ON_TAIL                                   \
1971         spin_lock_irq(&q->lock);                        \
1972         __remove_wait_queue(q, &wait);                  \
1973         spin_unlock_irqrestore(&q->lock, flags);
1974
1975 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
1976 {
1977         SLEEP_ON_VAR
1978
1979         current->state = TASK_INTERRUPTIBLE;
1980
1981         SLEEP_ON_HEAD
1982         schedule();
1983         SLEEP_ON_TAIL
1984 }
1985
1986 EXPORT_SYMBOL(interruptible_sleep_on);
1987
1988 long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
1989 {
1990         SLEEP_ON_VAR
1991
1992         current->state = TASK_INTERRUPTIBLE;
1993
1994         SLEEP_ON_HEAD
1995         timeout = schedule_timeout(timeout);
1996         SLEEP_ON_TAIL
1997
1998         return timeout;
1999 }
2000
2001 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
2002
2003 void fastcall __sched sleep_on(wait_queue_head_t *q)
2004 {
2005         SLEEP_ON_VAR
2006
2007         current->state = TASK_UNINTERRUPTIBLE;
2008
2009         SLEEP_ON_HEAD
2010         schedule();
2011         SLEEP_ON_TAIL
2012 }
2013
2014 EXPORT_SYMBOL(sleep_on);
2015
2016 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
2017 {
2018         SLEEP_ON_VAR
2019
2020         current->state = TASK_UNINTERRUPTIBLE;
2021
2022         SLEEP_ON_HEAD
2023         timeout = schedule_timeout(timeout);
2024         SLEEP_ON_TAIL
2025
2026         return timeout;
2027 }
2028
2029 EXPORT_SYMBOL(sleep_on_timeout);
2030
2031 void set_user_nice(task_t *p, long nice)
2032 {
2033         unsigned long flags;
2034         prio_array_t *array;
2035         runqueue_t *rq;
2036         int old_prio, new_prio, delta;
2037
2038         if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
2039                 return;
2040         /*
2041          * We have to be careful, if called from sys_setpriority(),
2042          * the task might be in the middle of scheduling on another CPU.
2043          */
2044         rq = task_rq_lock(p, &flags);
2045         /*
2046          * The RT priorities are set via setscheduler(), but we still
2047          * allow the 'normal' nice value to be set - but as expected
2048          * it won't have any effect on scheduling until the task
2049          * becomes SCHED_NORMAL again:
2050          */
2051         if (rt_task(p)) {
2052                 p->static_prio = NICE_TO_PRIO(nice);
2053                 goto out_unlock;
2054         }
2055         array = p->array;
2056         if (array)
2057                 dequeue_task(p, array);
2058
2059         old_prio = p->prio;
2060         new_prio = NICE_TO_PRIO(nice);
2061         delta = new_prio - old_prio;
2062         p->static_prio = NICE_TO_PRIO(nice);
2063         p->prio += delta;
2064
2065         if (array) {
2066                 enqueue_task(p, array);
2067                 /*
2068                  * If the task increased its priority or is running and
2069                  * lowered its priority, then reschedule its CPU:
2070                  */
2071                 if (delta < 0 || (delta > 0 && task_running(rq, p)))
2072                         resched_task(rq->curr);
2073         }
2074 out_unlock:
2075         task_rq_unlock(rq, &flags);
2076 }
2077
2078 EXPORT_SYMBOL(set_user_nice);
2079
2080 #ifndef __alpha__
2081
2082 /*
2083  * sys_nice - change the priority of the current process.
2084  * @increment: priority increment
2085  *
2086  * sys_setpriority is a more generic, but much slower function that
2087  * does similar things.
2088  */
2089 asmlinkage long sys_nice(int increment)
2090 {
2091         int retval;
2092         long nice;
2093
2094         /*
2095          * Setpriority might change our priority at the same moment.
2096          * We don't have to worry. Conceptually one call occurs first
2097          * and we have a single winner.
2098          */
2099         if (increment < 0) {
2100                 if (!capable(CAP_SYS_NICE))
2101                         return -EPERM;
2102                 if (increment < -40)
2103                         increment = -40;
2104         }
2105         if (increment > 40)
2106                 increment = 40;
2107
2108         nice = PRIO_TO_NICE(current->static_prio) + increment;
2109         if (nice < -20)
2110                 nice = -20;
2111         if (nice > 19)
2112                 nice = 19;
2113
2114         retval = security_task_setnice(current, nice);
2115         if (retval)
2116                 return retval;
2117
2118         set_user_nice(current, nice);
2119         return 0;
2120 }
2121
2122 #endif
2123
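/*
 * Worked example (follows directly from the clamping in sys_nice()
 * above): a task at nice 10 calling nice(-40) without CAP_SYS_NICE gets
 * -EPERM; with CAP_SYS_NICE the increment is first clamped to -40, the
 * resulting nice of -30 is then clamped to -20, and set_user_nice() is
 * called with -20.
 */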
2124 /**
2125  * task_prio - return the priority value of a given task.
2126  * @p: the task in question.
2127  *
2128  * This is the priority value as seen by users in /proc.
2129  * RT tasks are offset by -200. Normal tasks are centered
2130  * around 0, value goes from -16 to +15.
2131  */
2132 int task_prio(task_t *p)
2133 {
2134         return p->prio - MAX_RT_PRIO;
2135 }
2136
2137 /**
2138  * task_nice - return the nice value of a given task.
2139  * @p: the task in question.
2140  */
2141 int task_nice(task_t *p)
2142 {
2143         return TASK_NICE(p);
2144 }
2145
2146 EXPORT_SYMBOL(task_nice);
2147
2148 /**
2149  * idle_cpu - is a given cpu idle currently?
2150  * @cpu: the processor in question.
2151  */
2152 int idle_cpu(int cpu)
2153 {
2154         return cpu_curr(cpu) == cpu_rq(cpu)->idle;
2155 }
2156
2157 EXPORT_SYMBOL_GPL(idle_cpu);
2158
2159 /**
2160  * find_process_by_pid - find a process with a matching PID value.
2161  * @pid: the pid in question.
2162  */
2163 static inline task_t *find_process_by_pid(pid_t pid)
2164 {
2165         return pid ? find_task_by_pid(pid) : current;
2166 }
2167
2168 /* Actually do priority change: must hold rq lock. */
2169 static void __setscheduler(struct task_struct *p, int policy, int prio)
2170 {
2171         BUG_ON(p->array);
2172         p->policy = policy;
2173         p->rt_priority = prio;
2174         if (policy != SCHED_NORMAL)
2175                 p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
2176         else
2177                 p->prio = p->static_prio;
2178 }
2179
2180 /*
2181  * setscheduler - change the scheduling policy and/or RT priority of a thread.
2182  */
2183 static int setscheduler(pid_t pid, int policy, struct sched_param __user *param)
2184 {
2185         struct sched_param lp;
2186         int retval = -EINVAL;
2187         int oldprio;
2188         prio_array_t *array;
2189         unsigned long flags;
2190         runqueue_t *rq;
2191         task_t *p;
2192
2193         if (!param || pid < 0)
2194                 goto out_nounlock;
2195
2196         retval = -EFAULT;
2197         if (copy_from_user(&lp, param, sizeof(struct sched_param)))
2198                 goto out_nounlock;
2199
2200         /*
2201          * We play it safe to avoid deadlocks.
2202          */
2203         read_lock_irq(&tasklist_lock);
2204
2205         p = find_process_by_pid(pid);
2206
2207         retval = -ESRCH;
2208         if (!p)
2209                 goto out_unlock_tasklist;
2210
2211         /*
2212          * To be able to change p->policy safely, the appropriate
2213          * runqueue lock must be held.
2214          */
2215         rq = task_rq_lock(p, &flags);
2216
2217         if (policy < 0)
2218                 policy = p->policy;
2219         else {
2220                 retval = -EINVAL;
2221                 if (policy != SCHED_FIFO && policy != SCHED_RR &&
2222                                 policy != SCHED_NORMAL)
2223                         goto out_unlock;
2224         }
2225
2226         /*
2227          * Valid priorities for SCHED_FIFO and SCHED_RR are
2228          * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
2229          */
2230         retval = -EINVAL;
2231         if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1)
2232                 goto out_unlock;
2233         if ((policy == SCHED_NORMAL) != (lp.sched_priority == 0))
2234                 goto out_unlock;
2235
2236         retval = -EPERM;
2237         if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
2238             !capable(CAP_SYS_NICE))
2239                 goto out_unlock;
2240         if ((current->euid != p->euid) && (current->euid != p->uid) &&
2241             !capable(CAP_SYS_NICE))
2242                 goto out_unlock;
2243
2244         retval = security_task_setscheduler(p, policy, &lp);
2245         if (retval)
2246                 goto out_unlock;
2247
2248         array = p->array;
2249         if (array)
2250                 deactivate_task(p, task_rq(p));
2251         retval = 0;
2252         oldprio = p->prio;
2253         __setscheduler(p, policy, lp.sched_priority);
2254         if (array) {
2255                 __activate_task(p, task_rq(p));
2256                 /*
2257                  * Reschedule if we are currently running on this runqueue and
2258                  * our priority decreased, or if we are not currently running on
2259                  * this runqueue and our priority is higher than the current's
2260                  */
2261                 if (task_running(rq, p)) {
2262                         if (p->prio > oldprio)
2263                                 resched_task(rq->curr);
2264                 } else if (p->prio < rq->curr->prio)
2265                         resched_task(rq->curr);
2266         }
2267
2268 out_unlock:
2269         task_rq_unlock(rq, &flags);
2270 out_unlock_tasklist:
2271         read_unlock_irq(&tasklist_lock);
2272
2273 out_nounlock:
2274         return retval;
2275 }
2276
2277 /**
2278  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
2279  * @pid: the pid in question.
2280  * @policy: new policy
2281  * @param: structure containing the new RT priority.
2282  */
2283 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
2284                                        struct sched_param __user *param)
2285 {
2286         return setscheduler(pid, policy, param);
2287 }
2288
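/*
 * Illustrative user-space sketch (not part of the kernel; assumes the
 * glibc sched_setscheduler() wrapper): switching the caller to
 * SCHED_FIFO priority 50, which requires CAP_SYS_NICE as enforced in
 * setscheduler() above.
 */
#if 0
#include <sched.h>
#include <stdio.h>

int main(void)
{
        struct sched_param sp = { .sched_priority = 50 };

        if (sched_setscheduler(0, SCHED_FIFO, &sp) < 0) {       /* 0 == self */
                perror("sched_setscheduler");
                return 1;
        }
        return 0;
}
#endif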
2289 /**
2290  * sys_sched_setparam - set/change the RT priority of a thread
2291  * @pid: the pid in question.
2292  * @param: structure containing the new RT priority.
2293  */
2294 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
2295 {
2296         return setscheduler(pid, -1, param);
2297 }
2298
2299 /**
2300  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
2301  * @pid: the pid in question.
2302  */
2303 asmlinkage long sys_sched_getscheduler(pid_t pid)
2304 {
2305         int retval = -EINVAL;
2306         task_t *p;
2307
2308         if (pid < 0)
2309                 goto out_nounlock;
2310
2311         retval = -ESRCH;
2312         read_lock(&tasklist_lock);
2313         p = find_process_by_pid(pid);
2314         if (p) {
2315                 retval = security_task_getscheduler(p);
2316                 if (!retval)
2317                         retval = p->policy;
2318         }
2319         read_unlock(&tasklist_lock);
2320
2321 out_nounlock:
2322         return retval;
2323 }
2324
2325 /**
2326  * sys_sched_getparam - get the RT priority of a thread
2327  * @pid: the pid in question.
2328  * @param: structure containing the RT priority.
2329  */
2330 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
2331 {
2332         struct sched_param lp;
2333         int retval = -EINVAL;
2334         task_t *p;
2335
2336         if (!param || pid < 0)
2337                 goto out_nounlock;
2338
2339         read_lock(&tasklist_lock);
2340         p = find_process_by_pid(pid);
2341         retval = -ESRCH;
2342         if (!p)
2343                 goto out_unlock;
2344
2345         retval = security_task_getscheduler(p);
2346         if (retval)
2347                 goto out_unlock;
2348
2349         lp.sched_priority = p->rt_priority;
2350         read_unlock(&tasklist_lock);
2351
2352         /*
2353          * This one might sleep, we cannot do it with a spinlock held ...
2354          */
2355         retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
2356
2357 out_nounlock:
2358         return retval;
2359
2360 out_unlock:
2361         read_unlock(&tasklist_lock);
2362         return retval;
2363 }
2364
2365 /**
2366  * sys_sched_setaffinity - set the cpu affinity of a process
2367  * @pid: pid of the process
2368  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
2369  * @user_mask_ptr: user-space pointer to the new cpu mask
2370  */
2371 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
2372                                       unsigned long __user *user_mask_ptr)
2373 {
2374         cpumask_t new_mask;
2375         int retval;
2376         task_t *p;
2377
2378         if (len < sizeof(new_mask))
2379                 return -EINVAL;
2380
2381         if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask)))
2382                 return -EFAULT;
2383
2384         lock_cpu_hotplug();
2385         read_lock(&tasklist_lock);
2386
2387         p = find_process_by_pid(pid);
2388         if (!p) {
2389                 read_unlock(&tasklist_lock);
2390                 unlock_cpu_hotplug();
2391                 return -ESRCH;
2392         }
2393
2394         /*
2395          * It is not safe to call set_cpus_allowed with the
2396          * tasklist_lock held.  We will bump the task_struct's
2397          * usage count and then drop tasklist_lock.
2398          */
2399         get_task_struct(p);
2400         read_unlock(&tasklist_lock);
2401
2402         retval = -EPERM;
2403         if ((current->euid != p->euid) && (current->euid != p->uid) &&
2404                         !capable(CAP_SYS_NICE))
2405                 goto out_unlock;
2406
2407         retval = set_cpus_allowed(p, new_mask);
2408
2409 out_unlock:
2410         put_task_struct(p);
2411         unlock_cpu_hotplug();
2412         return retval;
2413 }
2414
2415 /**
2416  * sys_sched_getaffinity - get the cpu affinity of a process
2417  * @pid: pid of the process
2418  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
2419  * @user_mask_ptr: user-space pointer to hold the current cpu mask
2420  */
2421 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
2422                                       unsigned long __user *user_mask_ptr)
2423 {
2424         unsigned int real_len;
2425         cpumask_t mask;
2426         int retval;
2427         task_t *p;
2428
2429         real_len = sizeof(mask);
2430         if (len < real_len)
2431                 return -EINVAL;
2432
2433         read_lock(&tasklist_lock);
2434
2435         retval = -ESRCH;
2436         p = find_process_by_pid(pid);
2437         if (!p)
2438                 goto out_unlock;
2439
2440         retval = 0;
2441         cpus_and(mask, p->cpus_allowed, cpu_possible_map);
2442
2443 out_unlock:
2444         read_unlock(&tasklist_lock);
2445         if (retval)
2446                 return retval;
2447         if (copy_to_user(user_mask_ptr, &mask, real_len))
2448                 return -EFAULT;
2449         return real_len;
2450 }
2451
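/*
 * Illustrative user-space sketch (not part of the kernel; assumes a
 * glibc that exposes the cpu_set_t based sched_setaffinity() wrapper):
 * pinning the caller to CPU 0.
 */
#if 0
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t mask;

        CPU_ZERO(&mask);
        CPU_SET(0, &mask);              /* allow CPU 0 only */
        if (sched_setaffinity(0, sizeof(mask), &mask) < 0) {
                perror("sched_setaffinity");
                return 1;
        }
        return 0;
}
#endif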
2452 /**
2453  * sys_sched_yield - yield the current processor to other threads.
2454  *
2455  * this function yields the current CPU by moving the calling thread
2456  * to the expired array. If there are no other threads running on this
2457  * CPU then this function will return.
2458  */
2459 asmlinkage long sys_sched_yield(void)
2460 {
2461         runqueue_t *rq = this_rq_lock();
2462         prio_array_t *array = current->array;
2463
2464         /*
2465          * We implement yielding by moving the task into the expired
2466          * queue.
2467          *
2468          * (special rule: RT tasks will just roundrobin in the active
2469          *  array.)
2470          */
2471         if (likely(!rt_task(current))) {
2472                 dequeue_task(current, array);
2473                 enqueue_task(current, rq->expired);
2474         } else {
2475                 list_del(&current->run_list);
2476                 list_add_tail(&current->run_list, array->queue + current->prio);
2477         }
2478         /*
2479          * Since we are going to call schedule() anyway, there's
2480          * no need to preempt:
2481          */
2482         _raw_spin_unlock(&rq->lock);
2483         preempt_enable_no_resched();
2484
2485         schedule();
2486
2487         return 0;
2488 }
2489
2490 void __sched __cond_resched(void)
2491 {
2492         set_current_state(TASK_RUNNING);
2493         schedule();
2494 }
2495
2496 EXPORT_SYMBOL(__cond_resched);
2497
2498 /**
2499  * yield - yield the current processor to other threads.
2500  *
2501  * this is a shortcut for kernel-space yielding - it marks the
2502  * thread runnable and calls sys_sched_yield().
2503  */
2504 void __sched yield(void)
2505 {
2506         set_current_state(TASK_RUNNING);
2507         sys_sched_yield();
2508 }
2509
2510 EXPORT_SYMBOL(yield);
2511
2512 /*
2513  * This task is about to go to sleep on IO.  Increment rq->nr_iowait so
2514  * that process accounting knows that this is a task in IO wait state.
2515  *
2516  * But don't do that if it is a deliberate, throttling IO wait (this task
2517  * has set its backing_dev_info: the queue against which it should throttle)
2518  */
2519 void __sched io_schedule(void)
2520 {
2521         struct runqueue *rq = this_rq();
2522
2523         atomic_inc(&rq->nr_iowait);
2524         schedule();
2525         atomic_dec(&rq->nr_iowait);
2526 }
2527
2528 EXPORT_SYMBOL(io_schedule);
2529
2530 long __sched io_schedule_timeout(long timeout)
2531 {
2532         struct runqueue *rq = this_rq();
2533         long ret;
2534
2535         atomic_inc(&rq->nr_iowait);
2536         ret = schedule_timeout(timeout);
2537         atomic_dec(&rq->nr_iowait);
2538         return ret;
2539 }
2540
2541 /**
2542  * sys_sched_get_priority_max - return maximum RT priority.
2543  * @policy: scheduling class.
2544  *
2545  * this syscall returns the maximum rt_priority that can be used
2546  * by a given scheduling class.
2547  */
2548 asmlinkage long sys_sched_get_priority_max(int policy)
2549 {
2550         int ret = -EINVAL;
2551
2552         switch (policy) {
2553         case SCHED_FIFO:
2554         case SCHED_RR:
2555                 ret = MAX_USER_RT_PRIO-1;
2556                 break;
2557         case SCHED_NORMAL:
2558                 ret = 0;
2559                 break;
2560         }
2561         return ret;
2562 }
2563
2564 /**
2565  * sys_sched_get_priority_min - return minimum RT priority.
2566  * @policy: scheduling class.
2567  *
2568  * this syscall returns the minimum rt_priority that can be used
2569  * by a given scheduling class.
2570  */
2571 asmlinkage long sys_sched_get_priority_min(int policy)
2572 {
2573         int ret = -EINVAL;
2574
2575         switch (policy) {
2576         case SCHED_FIFO:
2577         case SCHED_RR:
2578                 ret = 1;
2579                 break;
2580         case SCHED_NORMAL:
2581                 ret = 0;
2582         }
2583         return ret;
2584 }
2585
2586 /**
2587  * sys_sched_rr_get_interval - return the default timeslice of a process.
2588  * @pid: pid of the process.
2589  * @interval: userspace pointer to the timeslice value.
2590  *
2591  * this syscall writes the default timeslice value of a given process
2592  * into the user-space timespec buffer. A value of '0' means infinity.
2593  */
2594 asmlinkage
2595 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
2596 {
2597         int retval = -EINVAL;
2598         struct timespec t;
2599         task_t *p;
2600
2601         if (pid < 0)
2602                 goto out_nounlock;
2603
2604         retval = -ESRCH;
2605         read_lock(&tasklist_lock);
2606         p = find_process_by_pid(pid);
2607         if (!p)
2608                 goto out_unlock;
2609
2610         retval = security_task_getscheduler(p);
2611         if (retval)
2612                 goto out_unlock;
2613
2614         jiffies_to_timespec(p->policy & SCHED_FIFO ?
2615                                 0 : task_timeslice(p), &t);
2616         read_unlock(&tasklist_lock);
2617         retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
2618 out_nounlock:
2619         return retval;
2620 out_unlock:
2621         read_unlock(&tasklist_lock);
2622         return retval;
2623 }
2624
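/*
 * Illustrative user-space sketch (not part of the kernel; assumes the
 * glibc sched_rr_get_interval() wrapper): querying the calling
 * process's timeslice as reported by this syscall.
 */
#if 0
#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
        struct timespec ts;

        if (sched_rr_get_interval(0, &ts) == 0)         /* 0 == current process */
                printf("timeslice: %ld.%09ld s\n",
                       (long)ts.tv_sec, ts.tv_nsec);
        return 0;
}
#endif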
2625 static inline struct task_struct *eldest_child(struct task_struct *p)
2626 {
2627         if (list_empty(&p->children)) return NULL;
2628         return list_entry(p->children.next,struct task_struct,sibling);
2629 }
2630
2631 static inline struct task_struct *older_sibling(struct task_struct *p)
2632 {
2633         if (p->sibling.prev==&p->parent->children) return NULL;
2634         return list_entry(p->sibling.prev,struct task_struct,sibling);
2635 }
2636
2637 static inline struct task_struct *younger_sibling(struct task_struct *p)
2638 {
2639         if (p->sibling.next==&p->parent->children) return NULL;
2640         return list_entry(p->sibling.next,struct task_struct,sibling);
2641 }
2642
2643 static void show_task(task_t * p)
2644 {
2645         task_t *relative;
2646         unsigned state;
2647         unsigned long free = 0;
2648         static const char *stat_nam[] = { "R", "S", "D", "T", "Z", "W" };
2649
2650         printk("%-13.13s ", p->comm);
2651         state = p->state ? __ffs(p->state) + 1 : 0;
2652         if (state < ARRAY_SIZE(stat_nam))
2653                 printk(stat_nam[state]);
2654         else
2655                 printk("?");
2656 #if (BITS_PER_LONG == 32)
2657         if (state == TASK_RUNNING)
2658                 printk(" running ");
2659         else
2660                 printk(" %08lX ", thread_saved_pc(p));
2661 #else
2662         if (state == TASK_RUNNING)
2663                 printk("  running task   ");
2664         else
2665                 printk(" %016lx ", thread_saved_pc(p));
2666 #endif
2667 #ifdef CONFIG_DEBUG_STACK_USAGE
2668         {
2669                 unsigned long * n = (unsigned long *) (p->thread_info+1);
2670                 while (!*n)
2671                         n++;
2672                 free = (unsigned long) n - (unsigned long)(p->thread_info+1);
2673         }
2674 #endif
2675         printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
2676         if ((relative = eldest_child(p)))
2677                 printk("%5d ", relative->pid);
2678         else
2679                 printk("      ");
2680         if ((relative = younger_sibling(p)))
2681                 printk("%7d", relative->pid);
2682         else
2683                 printk("       ");
2684         if ((relative = older_sibling(p)))
2685                 printk(" %5d", relative->pid);
2686         else
2687                 printk("      ");
2688         if (!p->mm)
2689                 printk(" (L-TLB)\n");
2690         else
2691                 printk(" (NOTLB)\n");
2692
2693         if (state != TASK_RUNNING)
2694                 show_stack(p, NULL);
2695 }
2696
2697 void show_state(void)
2698 {
2699         task_t *g, *p;
2700
2701 #if (BITS_PER_LONG == 32)
2702         printk("\n"
2703                "                                               sibling\n");
2704         printk("  task             PC      pid father child younger older\n");
2705 #else
2706         printk("\n"
2707                "                                                       sibling\n");
2708         printk("  task                 PC          pid father child younger older\n");
2709 #endif
2710         read_lock(&tasklist_lock);
2711         do_each_thread(g, p) {
2712                 /*
2713                  * reset the NMI-timeout, listing all tasks on a slow
2714                  * console might take a lot of time:
2715                  */
2716                 touch_nmi_watchdog();
2717                 show_task(p);
2718         } while_each_thread(g, p);
2719
2720         read_unlock(&tasklist_lock);
2721 }
2722
2723 void __init init_idle(task_t *idle, int cpu)
2724 {
2725         runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle));
2726         unsigned long flags;
2727
2728         local_irq_save(flags);
2729         double_rq_lock(idle_rq, rq);
2730
2731         idle_rq->curr = idle_rq->idle = idle;
2732         deactivate_task(idle, rq);
2733         idle->array = NULL;
2734         idle->prio = MAX_PRIO;
2735         idle->state = TASK_RUNNING;
2736         set_task_cpu(idle, cpu);
2737         double_rq_unlock(idle_rq, rq);
2738         set_tsk_need_resched(idle);
2739         local_irq_restore(flags);
2740
2741         /* Set the preempt count _outside_ the spinlocks! */
2742 #ifdef CONFIG_PREEMPT
2743         idle->thread_info->preempt_count = (idle->lock_depth >= 0);
2744 #else
2745         idle->thread_info->preempt_count = 0;
2746 #endif
2747 }
2748
2749 /*
2750  * In a system that switches off the HZ timer idle_cpu_mask
2751  * indicates which cpus entered this state. This is used
2752  * in the rcu update to wait only for active cpus. For systems
2753  * which do not switch off the HZ timer idle_cpu_mask should
2754  * always be CPU_MASK_NONE.
2755  */
2756 cpumask_t idle_cpu_mask = CPU_MASK_NONE;
2757
2758 #ifdef CONFIG_SMP
2759 /*
2760  * This is how migration works:
2761  *
2762  * 1) we queue a migration_req_t structure in the source CPU's
2763  *    runqueue and wake up that CPU's migration thread.
2764  * 2) we down() the locked semaphore => thread blocks.
2765  * 3) migration thread wakes up (implicitly it forces the migrated
2766  *    thread off the CPU)
2767  * 4) it gets the migration request and checks whether the migrated
2768  *    task is still in the wrong runqueue.
2769  * 5) if it's in the wrong runqueue then the migration thread removes
2770  *    it and puts it into the right queue.
2771  * 6) migration thread up()s the semaphore.
2772  * 7) we wake up and the migration is done.
2773  */
2774
2775 /*
2776  * Change a given task's CPU affinity. Migrate the thread to a
2777  * proper CPU and schedule it away if the CPU it's executing on
2778  * is removed from the allowed bitmask.
2779  *
2780  * NOTE: the caller must have a valid reference to the task, the
2781  * task must not exit() & deallocate itself prematurely.  The
2782  * call is not atomic; no spinlocks may be held.
2783  */
2784 int set_cpus_allowed(task_t *p, cpumask_t new_mask)
2785 {
2786         unsigned long flags;
2787         int ret = 0;
2788         migration_req_t req;
2789         runqueue_t *rq;
2790
2791         rq = task_rq_lock(p, &flags);
2792         if (any_online_cpu(new_mask) == NR_CPUS) {
2793                 ret = -EINVAL;
2794                 goto out;
2795         }
2796
2797         if (__set_cpus_allowed(p, new_mask, &req)) {
2798                 /* Need help from migration thread: drop lock and wait. */
2799                 task_rq_unlock(rq, &flags);
2800                 wake_up_process(rq->migration_thread);
2801                 wait_for_completion(&req.done);
2802                 return 0;
2803         }
2804 out:
2805         task_rq_unlock(rq, &flags);
2806         return ret;
2807 }
2808
2809 EXPORT_SYMBOL_GPL(set_cpus_allowed);
2810
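/*
 * Illustrative sketch (hypothetical helper, not part of this file): an
 * in-kernel caller pinning the current thread to CPU 0 through the
 * migration machinery above; cpumask_of_cpu() is assumed to be the
 * usual single-CPU mask constructor.
 */
#if 0
static void my_pin_to_cpu0(void)
{
        /* my_pin_to_cpu0 is a hypothetical example function */
        if (set_cpus_allowed(current, cpumask_of_cpu(0)) < 0)
                printk(KERN_WARNING "my_pin_to_cpu0: could not move to CPU 0\n");
}
#endif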
2811 /* Move (not current) task off this cpu, onto dest cpu. */
2812 static void move_task_away(struct task_struct *p, int dest_cpu)
2813 {
2814         runqueue_t *rq_dest;
2815
2816         rq_dest = cpu_rq(dest_cpu);
2817
2818         double_rq_lock(this_rq(), rq_dest);
2819         if (task_cpu(p) != smp_processor_id())
2820                 goto out; /* Already moved */
2821
2822         set_task_cpu(p, dest_cpu);
2823         if (p->array) {
2824                 deactivate_task(p, this_rq());
2825                 activate_task(p, rq_dest);
2826                 if (p->prio < rq_dest->curr->prio)
2827                         resched_task(rq_dest->curr);
2828         }
2829         p->timestamp = rq_dest->timestamp_last_tick;
2830
2831 out:
2832         double_rq_unlock(this_rq(), rq_dest);
2833 }
2834
2835 /*
2836  * migration_thread - this is a highprio system thread that performs
2837  * thread migration by bumping thread off CPU then 'pushing' onto
2838  * another runqueue.
2839  */
2840 static int migration_thread(void * data)
2841 {
2842         runqueue_t *rq;
2843         int cpu = (long)data;
2844
2845         rq = cpu_rq(cpu);
2846         BUG_ON(rq->migration_thread != current);
2847
2848         while (!kthread_should_stop()) {
2849                 struct list_head *head;
2850                 migration_req_t *req;
2851
2852                 if (current->flags & PF_FREEZE)
2853                         refrigerator(PF_FREEZE);
2854
2855                 spin_lock_irq(&rq->lock);
2856                 head = &rq->migration_queue;
2857                 current->state = TASK_INTERRUPTIBLE;
2858                 if (list_empty(head)) {
2859                         spin_unlock_irq(&rq->lock);
2860                         schedule();
2861                         continue;
2862                 }
2863                 req = list_entry(head->next, migration_req_t, list);
2864                 list_del_init(head->next);
2865                 spin_unlock(&rq->lock);
2866
2867                 move_task_away(req->task,
2868                                any_online_cpu(req->task->cpus_allowed));
2869                 local_irq_enable();
2870                 complete(&req->done);
2871         }
2872         return 0;
2873 }
2874
2875 #ifdef CONFIG_HOTPLUG_CPU
2876 /* migrate_all_tasks - function to migrate all the tasks from the
2877  * current cpu.  The caller must have already scheduled this to the
2878  * target cpu via set_cpus_allowed.  Machine is stopped.  */
2879 void migrate_all_tasks(void)
2880 {
2881         struct task_struct *tsk, *t;
2882         int dest_cpu, src_cpu;
2883         unsigned int node;
2884
2885         /* We're nailed to this CPU. */
2886         src_cpu = smp_processor_id();
2887
2888         /* Not required, but here for neatness. */
2889         write_lock(&tasklist_lock);
2890
2891         /* watch out for per node tasks, let's stay on this node */
2892         node = cpu_to_node(src_cpu);
2893
2894         do_each_thread(t, tsk) {
2895                 cpumask_t mask;
2896                 if (tsk == current)
2897                         continue;
2898
2899                 if (task_cpu(tsk) != src_cpu)
2900                         continue;
2901
2902                 /* Figure out where this task should go (attempting to
2903                  * keep it on-node), and check if it can be migrated
2904                  * as-is.  NOTE that kernel threads bound to more than
2905                  * one online cpu will be migrated. */
2906                 mask = node_to_cpumask(node);
2907                 cpus_and(mask, mask, tsk->cpus_allowed);
2908                 dest_cpu = any_online_cpu(mask);
2909                 if (dest_cpu == NR_CPUS)
2910                         dest_cpu = any_online_cpu(tsk->cpus_allowed);
2911                 if (dest_cpu == NR_CPUS) {
2912                         cpus_clear(tsk->cpus_allowed);
2913                         cpus_complement(tsk->cpus_allowed);
2914                         dest_cpu = any_online_cpu(tsk->cpus_allowed);
2915
2916                         /* Don't tell them about moving exiting tasks
2917                            or kernel threads (both mm NULL), since
2918                            they never leave kernel. */
2919                         if (tsk->mm && printk_ratelimit())
2920                                 printk(KERN_INFO "process %d (%s) no "
2921                                        "longer affine to cpu%d\n",
2922                                        tsk->pid, tsk->comm, src_cpu);
2923                 }
2924
2925                 move_task_away(tsk, dest_cpu);
2926         } while_each_thread(t, tsk);
2927
2928         write_unlock(&tasklist_lock);
2929 }
2930 #endif /* CONFIG_HOTPLUG_CPU */
2931
2932 /*
2933  * migration_call - callback that gets triggered when a CPU is added.
2934  * Here we can start up the necessary migration thread for the new CPU.
2935  */
2936 static int migration_call(struct notifier_block *nfb, unsigned long action,
2937                           void *hcpu)
2938 {
2939         int cpu = (long)hcpu;
2940         struct task_struct *p;
2941         struct runqueue *rq;
2942         unsigned long flags;
2943
2944         switch (action) {
2945         case CPU_UP_PREPARE:
2946                 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
2947                 if (IS_ERR(p))
2948                         return NOTIFY_BAD;
2949                 kthread_bind(p, cpu);
2950                 /* Must be high prio: stop_machine expects to yield to it. */
2951                 rq = task_rq_lock(p, &flags);
2952                 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
2953                 task_rq_unlock(rq, &flags);
2954                 cpu_rq(cpu)->migration_thread = p;
2955                 break;
2956         case CPU_ONLINE:
2957                 /* Strictly unnecessary, as first user will wake it. */
2958                 wake_up_process(cpu_rq(cpu)->migration_thread);
2959                 break;
2960 #ifdef CONFIG_HOTPLUG_CPU
2961         case CPU_UP_CANCELED:
2962                 /* Unbind it from offline cpu so it can run.  Fall thru. */
2963                 kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
2964         case CPU_DEAD:
2965                 kthread_stop(cpu_rq(cpu)->migration_thread);
2966                 cpu_rq(cpu)->migration_thread = NULL;
2967                 BUG_ON(cpu_rq(cpu)->nr_running != 0);
2968                 break;
2969 #endif
2970         }
2971         return NOTIFY_OK;
2972 }
2973
2974 static struct notifier_block __devinitdata migration_notifier = {
2975         .notifier_call = migration_call,
2976 };
2977
2978 int __init migration_init(void)
2979 {
2980         void *cpu = (void *)(long)smp_processor_id();
2981         /* Start one for boot CPU. */
2982         migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
2983         migration_call(&migration_notifier, CPU_ONLINE, cpu);
2984         register_cpu_notifier(&migration_notifier);
2985         return 0;
2986 }
2987 #endif
2988
2989 /*
2990  * The 'big kernel lock'
2991  *
2992  * This spinlock is taken and released recursively by lock_kernel()
2993  * and unlock_kernel().  It is transparently dropped and reacquired
2994  * over schedule().  It is used to protect legacy code that hasn't
2995  * been migrated to a proper locking design yet.
2996  *
2997  * Don't use in new code.
2998  *
2999  * Note: spinlock debugging needs this even on !CONFIG_SMP.
3000  */
3001 spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
3002 EXPORT_SYMBOL(kernel_flag);
3003
3004 void __init sched_init(void)
3005 {
3006         runqueue_t *rq;
3007         int i, j, k;
3008
3009         for (i = 0; i < NR_CPUS; i++) {
3010                 prio_array_t *array;
3011
3012                 rq = cpu_rq(i);
3013                 rq->active = rq->arrays;
3014                 rq->expired = rq->arrays + 1;
3015                 rq->best_expired_prio = MAX_PRIO;
3016
3017                 spin_lock_init(&rq->lock);
3018                 INIT_LIST_HEAD(&rq->migration_queue);
3019                 INIT_LIST_HEAD(&rq->hold_queue);
3020                 atomic_set(&rq->nr_iowait, 0);
3021                 nr_running_init(rq);
3022
3023                 for (j = 0; j < 2; j++) {
3024                         array = rq->arrays + j;
3025                         for (k = 0; k < MAX_PRIO; k++) {
3026                                 INIT_LIST_HEAD(array->queue + k);
3027                                 __clear_bit(k, array->bitmap);
3028                         }
3029                         // delimiter for bitsearch
3030                         __set_bit(MAX_PRIO, array->bitmap);
3031                 }
3032         }
3033         /*
3034          * We have to do a little magic to get the first
3035          * thread right in SMP mode.
3036          */
3037         rq = this_rq();
3038         rq->curr = current;
3039         rq->idle = current;
3040         set_task_cpu(current, smp_processor_id());
3041         wake_up_forked_process(current);
3042
3043         init_timers();
3044
3045         /*
3046          * The boot idle thread does lazy MMU switching as well:
3047          */
3048         atomic_inc(&init_mm.mm_count);
3049         enter_lazy_tlb(&init_mm, current);
3050 }
3051
3052 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
3053 void __might_sleep(char *file, int line)
3054 {
3055 #if defined(in_atomic)
3056         static unsigned long prev_jiffy;        /* ratelimiting */
3057
3058         if ((in_atomic() || irqs_disabled()) &&
3059             system_state == SYSTEM_RUNNING) {
3060                 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
3061                         return;
3062                 prev_jiffy = jiffies;
3063                 printk(KERN_ERR "Debug: sleeping function called from invalid"
3064                                 " context at %s:%d\n", file, line);
3065                 printk("in_atomic():%d, irqs_disabled():%d\n",
3066                         in_atomic(), irqs_disabled());
3067                 dump_stack();
3068         }
3069 #endif
3070 }
3071 EXPORT_SYMBOL(__might_sleep);
3072 #endif
3073
3074
3075 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
3076 /*
3077  * This could be a long-held lock.  If another CPU holds it for a long time,
3078  * and that CPU is not asked to reschedule then *this* CPU will spin on the
3079  * lock for a long time, even if *this* CPU is asked to reschedule.
3080  *
3081  * So what we do here, in the slow (contended) path is to spin on the lock by
3082  * hand while permitting preemption.
3083  *
3084  * Called inside preempt_disable().
3085  */
3086 void __sched __preempt_spin_lock(spinlock_t *lock)
3087 {
3088         if (preempt_count() > 1) {
3089                 _raw_spin_lock(lock);
3090                 return;
3091         }
3092         do {
3093                 preempt_enable();
3094                 while (spin_is_locked(lock))
3095                         cpu_relax();
3096                 preempt_disable();
3097         } while (!_raw_spin_trylock(lock));
3098 }
3099
3100 EXPORT_SYMBOL(__preempt_spin_lock);
3101
3102 void __sched __preempt_write_lock(rwlock_t *lock)
3103 {
3104         if (preempt_count() > 1) {
3105                 _raw_write_lock(lock);
3106                 return;
3107         }
3108
3109         do {
3110                 preempt_enable();
3111                 while (rwlock_is_locked(lock))
3112                         cpu_relax();
3113                 preempt_disable();
3114         } while (!_raw_write_trylock(lock));
3115 }
3116
3117 EXPORT_SYMBOL(__preempt_write_lock);
3118 #endif /* defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) */