1 /*
2  *  kernel/sched.c
3  *
4  *  Kernel scheduler and related syscalls
5  *
6  *  Copyright (C) 1991-2002  Linus Torvalds
7  *
8  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
9  *              make semaphores SMP safe
10  *  1998-11-19  Implemented schedule_timeout() and related stuff
11  *              by Andrea Arcangeli
12  *  2002-01-04  New ultra-scalable O(1) scheduler by Ingo Molnar:
13  *              hybrid priority-list and round-robin design with
14  *              an array-switch method of distributing timeslices
15  *              and per-CPU runqueues.  Cleanups and useful suggestions
16  *              by Davide Libenzi, preemptible kernel bits by Robert Love.
17  *  2003-09-03  Interactivity tuning by Con Kolivas.
18  *  2004-04-02  Scheduler domains code by Nick Piggin
19  */
20
21 #include <linux/mm.h>
22 #include <linux/module.h>
23 #include <linux/nmi.h>
24 #include <linux/init.h>
25 #include <asm/uaccess.h>
26 #include <linux/highmem.h>
27 #include <linux/smp_lock.h>
28 #include <linux/pagemap.h>
29 #include <asm/mmu_context.h>
30 #include <linux/interrupt.h>
31 #include <linux/completion.h>
32 #include <linux/kernel_stat.h>
33 #include <linux/security.h>
34 #include <linux/notifier.h>
35 #include <linux/suspend.h>
36 #include <linux/blkdev.h>
37 #include <linux/delay.h>
38 #include <linux/smp.h>
39 #include <linux/timer.h>
40 #include <linux/rcupdate.h>
41 #include <linux/cpu.h>
42 #include <linux/percpu.h>
43 #include <linux/kthread.h>
44 #include <linux/vserver/sched.h>
45 #include <linux/vs_base.h>
46
47 #include <asm/unistd.h>
48
51 #ifdef CONFIG_NUMA
52 #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
53 #else
54 #define cpu_to_node_mask(cpu) (cpu_online_map)
55 #endif
56
57 /*
58  * Convert user-nice values [ -20 ... 0 ... 19 ]
59  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
60  * and back.
61  */
62 #define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
63 #define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)
64 #define TASK_NICE(p)            PRIO_TO_NICE((p)->static_prio)
65
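/*
 * Illustrative example (assuming the usual MAX_RT_PRIO == 100 and
 * MAX_PRIO == 140 from <linux/sched.h>; not definitive for every config):
 *
 *	NICE_TO_PRIO(-20) == 100	(best non-RT priority)
 *	NICE_TO_PRIO(  0) == 120	(default)
 *	NICE_TO_PRIO( 19) == 139	(== MAX_PRIO-1, worst priority)
 *
 * PRIO_TO_NICE() is the inverse mapping, e.g. PRIO_TO_NICE(120) == 0.
 */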
66 /*
67  * 'User priority' is the nice value converted to something we
68  * can work with better when scaling various scheduler parameters;
69  * it's a [ 0 ... 39 ] range.
70  */
71 #define USER_PRIO(p)            ((p)-MAX_RT_PRIO)
72 #define TASK_USER_PRIO(p)       USER_PRIO((p)->static_prio)
73 #define MAX_USER_PRIO           (USER_PRIO(MAX_PRIO))
74 #define AVG_TIMESLICE   (MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) *\
75                         (MAX_PRIO-1-NICE_TO_PRIO(0))/(MAX_USER_PRIO - 1)))
76
77 /*
78  * Some helpers for converting nanosecond timing to jiffy resolution
79  */
80 #define NS_TO_JIFFIES(TIME)     ((TIME) / (1000000000 / HZ))
81 #define JIFFIES_TO_NS(TIME)     ((TIME) * (1000000000 / HZ))
82
83 /*
84  * These are the 'tuning knobs' of the scheduler:
85  *
86  * Minimum timeslice is 10 msecs, default timeslice is 100 msecs,
87  * maximum timeslice is 200 msecs. Timeslices get refilled after
88  * they expire.
89  */
90 #define MIN_TIMESLICE           ( 10 * HZ / 1000)
91 #define MAX_TIMESLICE           (200 * HZ / 1000)
92 #define ON_RUNQUEUE_WEIGHT       30
93 #define CHILD_PENALTY            95
94 #define PARENT_PENALTY          100
95 #define EXIT_WEIGHT               3
96 #define PRIO_BONUS_RATIO         25
97 #define MAX_BONUS               (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
98 #define INTERACTIVE_DELTA         2
99 #define MAX_SLEEP_AVG           (AVG_TIMESLICE * MAX_BONUS)
100 #define STARVATION_LIMIT        (MAX_SLEEP_AVG)
101 #define NS_MAX_SLEEP_AVG        (JIFFIES_TO_NS(MAX_SLEEP_AVG))
102 #define CREDIT_LIMIT            100
103
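/*
 * Sketch of the derived values, assuming HZ == 1000 and the usual
 * MAX_RT_PRIO == 100 / MAX_PRIO == 140 (other configurations scale
 * accordingly):
 *
 *	MIN_TIMESLICE	 == 10 jiffies   (10 msecs)
 *	MAX_TIMESLICE	 == 200 jiffies  (200 msecs)
 *	MAX_USER_PRIO	 == 40
 *	MAX_BONUS	 == 40 * 25 / 100 == 10
 *	AVG_TIMESLICE	 == 10 + 190*(139-120)/39 == 102 jiffies (~100 msecs)
 *	MAX_SLEEP_AVG	 == 102 * 10 == 1020 jiffies (~1 second)
 *	NS_MAX_SLEEP_AVG == ~1.02e9 nsecs
 */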
104 /*
105  * If a task is 'interactive' then we reinsert it in the active
106  * array after it has expired its current timeslice. (it will not
107  * continue to run immediately; it will still round-robin with
108  * other interactive tasks.)
109  *
110  * This part scales the interactivity limit depending on niceness.
111  *
112  * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
113  * Here are a few examples of different nice levels:
114  *
115  *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
116  *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
117  *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
118  *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
119  *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
120  *
121  * (the X axis represents the possible -5 ... 0 ... +5 dynamic
122  *  priority range a task can explore, a value of '1' means the
123  *  task is rated interactive.)
124  *
125  * Ie. nice +19 tasks can never get 'interactive' enough to be
126  * reinserted into the active array, and only heavily CPU-bound
127  * nice -20 tasks will be expired. Default nice 0 tasks are somewhere
128  * in between: it takes some effort for them to become interactive,
129  * but it's not too hard.
130  */
131
132 #define CURRENT_BONUS(p) \
133         (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
134                 MAX_SLEEP_AVG)
135
136 #ifdef CONFIG_SMP
137 #define TIMESLICE_GRANULARITY(p)        (MIN_TIMESLICE * \
138                 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
139                         num_online_cpus())
140 #else
141 #define TIMESLICE_GRANULARITY(p)        (MIN_TIMESLICE * \
142                 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
143 #endif
144
145 #define SCALE(v1,v1_max,v2_max) \
146         (v1) * (v2_max) / (v1_max)
147
148 #define DELTA(p) \
149         (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA)
150
151 #define TASK_INTERACTIVE(p) \
152         ((p)->prio <= (p)->static_prio - DELTA(p))
153
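/*
 * Worked example (illustrative, using MAX_BONUS == 10 as derived above):
 * a nice 0 task has DELTA() == 0*10/40 + 2 == 2, so TASK_INTERACTIVE()
 * holds once it has earned a dynamic-priority bonus of at least +2,
 * i.e. CURRENT_BONUS() >= 7, i.e. a sleep_avg of at least 7/10 of
 * MAX_SLEEP_AVG.  A nice +19 task has DELTA() == 6, which exceeds the
 * maximum possible bonus of +5, so it can never be rated interactive,
 * matching the table above.
 */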
154 #define INTERACTIVE_SLEEP(p) \
155         (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
156                 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
157
158 #define HIGH_CREDIT(p) \
159         ((p)->interactive_credit > CREDIT_LIMIT)
160
161 #define LOW_CREDIT(p) \
162         ((p)->interactive_credit < -CREDIT_LIMIT)
163
164 #define TASK_PREEMPTS_CURR(p, rq) \
165         ((p)->prio < (rq)->curr->prio)
166
167 /*
168  * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
169  * to time slice values.
170  *
171  * The higher a thread's priority, the bigger timeslices
172  * it gets during one round of execution. But even the lowest
173  * priority thread gets MIN_TIMESLICE worth of execution time.
174  *
175  * task_timeslice() is the interface that is used by the scheduler.
176  */
177
178 #define BASE_TIMESLICE(p) (MIN_TIMESLICE + \
179                 ((MAX_TIMESLICE - MIN_TIMESLICE) * \
180                         (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1)))
181
182 static unsigned int task_timeslice(task_t *p)
183 {
184         return BASE_TIMESLICE(p);
185 }
186
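/*
 * Worked examples (a sketch assuming HZ == 1000 and the priority
 * constants used above): a nice -20 task (static_prio 100) gets the
 * full 200 msec timeslice, a nice 0 task (static_prio 120) gets
 * 10 + 190*19/39 == 102 msecs (the ~100 msec default), and a nice +19
 * task (static_prio 139) gets the 10 msec minimum.
 */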
187 #define task_hot(p, now, sd) ((now) - (p)->timestamp < (sd)->cache_hot_time)
188
189 /*
190  * These are the runqueue data structures:
191  */
192
193 #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
194
195 typedef struct runqueue runqueue_t;
196
197 struct prio_array {
198         unsigned int nr_active;
199         unsigned long bitmap[BITMAP_SIZE];
200         struct list_head queue[MAX_PRIO];
201 };
202
203 /*
204  * This is the main, per-CPU runqueue data structure.
205  *
206  * Locking rule: code paths that need to lock multiple runqueues
207  * (such as the load balancing or the thread migration code) must
208  * acquire the locks in ascending &runqueue address order.
209  */
210 struct runqueue {
211         spinlock_t lock;
212
213         /*
214          * nr_running and cpu_load should be in the same cacheline because
215          * remote CPUs use both these fields when doing load calculation.
216          */
217         unsigned long nr_running;
218 #ifdef CONFIG_SMP
219         unsigned long cpu_load;
220 #endif
221         unsigned long long nr_switches;
222         unsigned long expired_timestamp, nr_uninterruptible;
223         unsigned long long timestamp_last_tick;
224         task_t *curr, *idle;
225         struct mm_struct *prev_mm;
226         prio_array_t *active, *expired, arrays[2];
227         int best_expired_prio;
228         atomic_t nr_iowait;
229
230 #ifdef CONFIG_SMP
231         struct sched_domain *sd;
232
233         /* For active balancing */
234         int active_balance;
235         int push_cpu;
236
237         task_t *migration_thread;
238         struct list_head migration_queue;
239 #endif
240         struct list_head hold_queue;
241         int idle_tokens;
242 };
243
244 static DEFINE_PER_CPU(struct runqueue, runqueues);
245
246 #define for_each_domain(cpu, domain) \
247         for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
248
249 #define cpu_rq(cpu)             (&per_cpu(runqueues, (cpu)))
250 #define this_rq()               (&__get_cpu_var(runqueues))
251 #define task_rq(p)              cpu_rq(task_cpu(p))
252 #define cpu_curr(cpu)           (cpu_rq(cpu)->curr)
253
254 /*
255  * Default context-switch locking:
256  */
257 #ifndef prepare_arch_switch
258 # define prepare_arch_switch(rq, next)  do { } while (0)
259 # define finish_arch_switch(rq, next)   spin_unlock_irq(&(rq)->lock)
260 # define task_running(rq, p)            ((rq)->curr == (p))
261 #endif
262
263 /*
264  * task_rq_lock - lock the runqueue a given task resides on and disable
265  * interrupts.  Note the ordering: we can safely lookup the task_rq without
266  * explicitly disabling preemption.
267  */
268 static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
269 {
270         struct runqueue *rq;
271
272 repeat_lock_task:
273         local_irq_save(*flags);
274         rq = task_rq(p);
275         spin_lock(&rq->lock);
276         if (unlikely(rq != task_rq(p))) {
277                 spin_unlock_irqrestore(&rq->lock, *flags);
278                 goto repeat_lock_task;
279         }
280         return rq;
281 }
282
283 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
284 {
285         spin_unlock_irqrestore(&rq->lock, *flags);
286 }
287
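/*
 * Typical usage sketch (illustrative only; this mirrors the pattern the
 * callers later in this file follow):
 *
 *	unsigned long flags;
 *	runqueue_t *rq;
 *
 *	rq = task_rq_lock(p, &flags);
 *	... examine or modify p's scheduling state under rq->lock ...
 *	task_rq_unlock(rq, &flags);
 *
 * The retry loop in task_rq_lock() is what makes this safe against p
 * being migrated to another CPU between the task_rq() lookup and the
 * spin_lock().
 */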
288 /*
289  * this_rq_lock - lock this CPU's runqueue and disable interrupts.
290  */
291 static runqueue_t *this_rq_lock(void)
292 {
293         runqueue_t *rq;
294
295         local_irq_disable();
296         rq = this_rq();
297         spin_lock(&rq->lock);
298
299         return rq;
300 }
301
302 static inline void rq_unlock(runqueue_t *rq)
303 {
304         spin_unlock_irq(&rq->lock);
305 }
306
307 /*
308  * Adding/removing a task to/from a priority array:
309  */
310 static void dequeue_task(struct task_struct *p, prio_array_t *array)
311 {
312         array->nr_active--;
313         list_del(&p->run_list);
314         if (list_empty(array->queue + p->prio))
315                 __clear_bit(p->prio, array->bitmap);
316 }
317
318 static void enqueue_task(struct task_struct *p, prio_array_t *array)
319 {
320         list_add_tail(&p->run_list, array->queue + p->prio);
321         __set_bit(p->prio, array->bitmap);
322         array->nr_active++;
323         p->array = array;
324 }
325
326 /*
327  * Used by the migration code - we pull tasks from the head of the
328  * remote queue so we want these tasks to show up at the head of the
329  * local queue:
330  */
331 static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
332 {
333         list_add(&p->run_list, array->queue + p->prio);
334         __set_bit(p->prio, array->bitmap);
335         array->nr_active++;
336         p->array = array;
337 }
338
339 /*
340  * effective_prio - return the priority that is based on the static
341  * priority but is modified by bonuses/penalties.
342  *
343  * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
344  * into the -5 ... 0 ... +5 bonus/penalty range.
345  *
346  * We use 25% of the full 0...39 priority range so that:
347  *
348  * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
349  * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
350  *
351  * Both properties are important to certain workloads.
352  */
353 static int effective_prio(task_t *p)
354 {
355         int bonus, prio;
356
357         if (rt_task(p))
358                 return p->prio;
359
360         bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
361
362         prio = p->static_prio - bonus;
363         if (__vx_task_flags(p, VXF_SCHED_PRIO, 0))
364                 prio += effective_vavavoom(p, MAX_USER_PRIO);
365
366         if (prio < MAX_RT_PRIO)
367                 prio = MAX_RT_PRIO;
368         if (prio > MAX_PRIO-1)
369                 prio = MAX_PRIO-1;
370         return prio;
371 }
372
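/*
 * Worked example (illustrative, ignoring the vserver VXF_SCHED_PRIO
 * adjustment): a nice 0 task has static_prio 120.  With a full
 * sleep_avg, CURRENT_BONUS() == 10, so bonus == +5 and the task runs at
 * prio 115; a fully CPU-bound task with sleep_avg == 0 gets bonus == -5
 * and runs at prio 125.
 */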
373 /*
374  * __activate_task - move a task to the runqueue.
375  */
376 static inline void __activate_task(task_t *p, runqueue_t *rq)
377 {
378         enqueue_task(p, rq->active);
379         rq->nr_running++;
380 }
381
382 /*
383  * __activate_idle_task - move idle task to the _front_ of runqueue.
384  */
385 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
386 {
387         enqueue_task_head(p, rq->active);
388         rq->nr_running++;
389 }
390
391 static void recalc_task_prio(task_t *p, unsigned long long now)
392 {
393         unsigned long long __sleep_time = now - p->timestamp;
394         unsigned long sleep_time;
395
396         if (__sleep_time > NS_MAX_SLEEP_AVG)
397                 sleep_time = NS_MAX_SLEEP_AVG;
398         else
399                 sleep_time = (unsigned long)__sleep_time;
400
401         if (likely(sleep_time > 0)) {
402                 /*
403                  * User tasks that sleep a long time are categorised as
404                  * idle: they get just enough interactive status to stay
405                  * active, which prevents them from suddenly becoming CPU
406                  * hogs that starve other processes.
407                  */
408                 if (p->mm && p->activated != -1 &&
409                         sleep_time > INTERACTIVE_SLEEP(p)) {
410                                 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
411                                                 AVG_TIMESLICE);
412                                 if (!HIGH_CREDIT(p))
413                                         p->interactive_credit++;
414                 } else {
415                         /*
416                          * The lower the sleep avg a task has the more
417                          * rapidly it will rise with sleep time.
418                          */
419                         sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
420
421                         /*
422                          * Tasks with low interactive_credit are limited to
423                          * one timeslice worth of sleep avg bonus.
424                          */
425                         if (LOW_CREDIT(p) &&
426                             sleep_time > JIFFIES_TO_NS(task_timeslice(p)))
427                                 sleep_time = JIFFIES_TO_NS(task_timeslice(p));
428
429                         /*
430                  * Tasks without high credit that wake from uninterruptible
431                  * sleep are limited in their sleep_avg rise, as they are
432                  * likely to be CPU hogs waiting on I/O.
433                          */
434                         if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm) {
435                                 if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
436                                         sleep_time = 0;
437                                 else if (p->sleep_avg + sleep_time >=
438                                                 INTERACTIVE_SLEEP(p)) {
439                                         p->sleep_avg = INTERACTIVE_SLEEP(p);
440                                         sleep_time = 0;
441                                 }
442                         }
443
444                         /*
445                          * This code gives a bonus to interactive tasks.
446                          *
447                          * The boost works by updating the 'average sleep time'
448                          * value here, based on ->timestamp. The more time a
449                          * task spends sleeping, the higher the average gets -
450                          * and the higher the priority boost gets as well.
451                          */
452                         p->sleep_avg += sleep_time;
453
454                         if (p->sleep_avg > NS_MAX_SLEEP_AVG) {
455                                 p->sleep_avg = NS_MAX_SLEEP_AVG;
456                                 if (!HIGH_CREDIT(p))
457                                         p->interactive_credit++;
458                         }
459                 }
460         }
461
462         p->prio = effective_prio(p);
463 }
464
465 /*
466  * activate_task - move a task to the runqueue and do priority recalculation
467  *
468  * Update all the scheduling statistics stuff. (sleep average
469  * calculation, priority modifiers, etc.)
470  */
471 static void activate_task(task_t *p, runqueue_t *rq, int local)
472 {
473         unsigned long long now;
474
475         now = sched_clock();
476 #ifdef CONFIG_SMP
477         if (!local) {
478                 /* Compensate for drifting sched_clock */
479                 runqueue_t *this_rq = this_rq();
480                 now = (now - this_rq->timestamp_last_tick)
481                         + rq->timestamp_last_tick;
482         }
483 #endif
484
485         recalc_task_prio(p, now);
486
487         /*
488          * This checks to make sure it's not an uninterruptible task
489          * that is now waking up.
490          */
491         if (!p->activated) {
492                 /*
493                  * Tasks which were woken up by interrupts (ie. hw events)
494                  * are most likely of interactive nature. So we give them
495                  * the credit of extending their sleep time to the period
496                  * of time they spend on the runqueue, waiting for execution
497                  * on a CPU, first time around:
498                  */
499                 if (in_interrupt())
500                         p->activated = 2;
501                 else {
502                         /*
503                          * Normal first-time wakeups get a credit too for
504                          * on-runqueue time, but it will be weighted down:
505                          */
506                         p->activated = 1;
507                 }
508         }
509         p->timestamp = now;
510
511         __activate_task(p, rq);
512 }
513
514 /*
515  * deactivate_task - remove a task from the runqueue.
516  */
517 static void deactivate_task(struct task_struct *p, runqueue_t *rq)
518 {
519         rq->nr_running--;
520         if (p->state == TASK_UNINTERRUPTIBLE)
521                 rq->nr_uninterruptible++;
522         dequeue_task(p, p->array);
523         p->array = NULL;
524 }
525
526 /*
527  * resched_task - mark a task 'to be rescheduled now'.
528  *
529  * On UP this means the setting of the need_resched flag, on SMP it
530  * might also involve a cross-CPU call to trigger the scheduler on
531  * the target CPU.
532  */
533 #ifdef CONFIG_SMP
534 static void resched_task(task_t *p)
535 {
536         int need_resched, nrpolling;
537
538         preempt_disable();
539         /* minimise the chance of sending an interrupt to poll_idle() */
540         nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
541         need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
542         nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
543
544         if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
545                 smp_send_reschedule(task_cpu(p));
546         preempt_enable();
547 }
548 #else
549 static inline void resched_task(task_t *p)
550 {
551         set_tsk_need_resched(p);
552 }
553 #endif
554
555 /**
556  * task_curr - is this task currently executing on a CPU?
557  * @p: the task in question.
558  */
559 inline int task_curr(const task_t *p)
560 {
561         return cpu_curr(task_cpu(p)) == p;
562 }
563
564 #ifdef CONFIG_SMP
565 enum request_type {
566         REQ_MOVE_TASK,
567         REQ_SET_DOMAIN,
568 };
569
570 typedef struct {
571         struct list_head list;
572         enum request_type type;
573
574         /* For REQ_MOVE_TASK */
575         task_t *task;
576         int dest_cpu;
577
578         /* For REQ_SET_DOMAIN */
579         struct sched_domain *sd;
580
581         struct completion done;
582 } migration_req_t;
583
584 /*
585  * The task's runqueue lock must be held.
586  * Returns true if you have to wait for migration thread.
587  */
588 static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
589 {
590         runqueue_t *rq = task_rq(p);
591
592         /*
593          * If the task is not on a runqueue (and not running), then
594          * it is sufficient to simply update the task's cpu field.
595          */
596         if (!p->array && !task_running(rq, p)) {
597                 set_task_cpu(p, dest_cpu);
598                 return 0;
599         }
600
601         init_completion(&req->done);
602         req->type = REQ_MOVE_TASK;
603         req->task = p;
604         req->dest_cpu = dest_cpu;
605         list_add(&req->list, &rq->migration_queue);
606         return 1;
607 }
608
609 /*
610  * wait_task_inactive - wait for a thread to unschedule.
611  *
612  * The caller must ensure that the task *will* unschedule sometime soon,
613  * else this function might spin for a *long* time. This function can't
614  * be called with interrupts off, or it may introduce deadlock with
615  * smp_call_function() if an IPI is sent by the same process we are
616  * waiting to become inactive.
617  */
618 void wait_task_inactive(task_t * p)
619 {
620         unsigned long flags;
621         runqueue_t *rq;
622         int preempted;
623
624 repeat:
625         rq = task_rq_lock(p, &flags);
626         /* Must be off runqueue entirely, not preempted. */
627         if (unlikely(p->array)) {
628                 /* If it's preempted, we yield.  It could be a while. */
629                 preempted = !task_running(rq, p);
630                 task_rq_unlock(rq, &flags);
631                 cpu_relax();
632                 if (preempted)
633                         yield();
634                 goto repeat;
635         }
636         task_rq_unlock(rq, &flags);
637 }
638
639 /***
640  * kick_process - kick a running thread to enter/exit the kernel
641  * @p: the to-be-kicked thread
642  *
643  * Cause a process which is running on another CPU to enter
644  * kernel-mode, without any delay. (to get signals handled.)
645  */
646 void kick_process(task_t *p)
647 {
648         int cpu;
649
650         preempt_disable();
651         cpu = task_cpu(p);
652         if ((cpu != smp_processor_id()) && task_curr(p))
653                 smp_send_reschedule(cpu);
654         preempt_enable();
655 }
656
657 EXPORT_SYMBOL_GPL(kick_process);
658
659 /*
660  * Return a low guess at the load of a migration-source cpu.
661  *
662  * We want to under-estimate the load of migration sources, to
663  * balance conservatively.
664  */
665 static inline unsigned long source_load(int cpu)
666 {
667         runqueue_t *rq = cpu_rq(cpu);
668         unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
669
670         return min(rq->cpu_load, load_now);
671 }
672
673 /*
674  * Return a high guess at the load of a migration-target cpu
675  */
676 static inline unsigned long target_load(int cpu)
677 {
678         runqueue_t *rq = cpu_rq(cpu);
679         unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
680
681         return max(rq->cpu_load, load_now);
682 }
683
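/*
 * Example (illustrative, assuming SCHED_LOAD_SCALE == 128): a runqueue
 * with 3 runnable tasks has load_now == 384.  If its decaying cpu_load
 * is 256, source_load() reports 256 (the conservative, low estimate)
 * while target_load() reports 384 (the high one), which biases the
 * balancer against moving tasks on transient load spikes.
 */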
684 #endif
685
686 /*
687  * wake_idle() is useful especially on SMT architectures to wake a
688  * task onto an idle sibling if we would otherwise wake it onto a
689  * busy sibling.
690  *
691  * Returns the CPU we should wake onto.
692  */
693 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
694 static int wake_idle(int cpu, task_t *p)
695 {
696         cpumask_t tmp;
697         runqueue_t *rq = cpu_rq(cpu);
698         struct sched_domain *sd;
699         int i;
700
701         if (idle_cpu(cpu))
702                 return cpu;
703
704         sd = rq->sd;
705         if (!(sd->flags & SD_WAKE_IDLE))
706                 return cpu;
707
708         cpus_and(tmp, sd->span, cpu_online_map);
709         for_each_cpu_mask(i, tmp) {
710                 if (!cpu_isset(i, p->cpus_allowed))
711                         continue;
712
713                 if (idle_cpu(i))
714                         return i;
715         }
716
717         return cpu;
718 }
719 #else
720 static inline int wake_idle(int cpu, task_t *p)
721 {
722         return cpu;
723 }
724 #endif
725
726 /***
727  * try_to_wake_up - wake up a thread
728  * @p: the to-be-woken-up thread
729  * @state: the mask of task states that can be woken
730  * @sync: do a synchronous wakeup?
731  *
732  * Put it on the run-queue if it's not already there. The "current"
733  * thread is always on the run-queue (except when the actual
734  * re-schedule is in progress), and as such you're allowed to do
735  * the simpler "current->state = TASK_RUNNING" to mark yourself
736  * runnable without the overhead of this.
737  *
738  * returns failure only if the task is already active.
739  */
740 static int try_to_wake_up(task_t * p, unsigned int state, int sync)
741 {
742         int cpu, this_cpu, success = 0;
743         unsigned long flags;
744         long old_state;
745         runqueue_t *rq;
746 #ifdef CONFIG_SMP
747         unsigned long load, this_load;
748         struct sched_domain *sd;
749         int new_cpu;
750 #endif
751
752         rq = task_rq_lock(p, &flags);
753         old_state = p->state;
754         if (!(old_state & state))
755                 goto out;
756
757         if (p->array)
758                 goto out_running;
759
760         cpu = task_cpu(p);
761         this_cpu = smp_processor_id();
762
763 #ifdef CONFIG_SMP
764         if (unlikely(task_running(rq, p)))
765                 goto out_activate;
766
767         new_cpu = cpu;
768
769         if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
770                 goto out_set_cpu;
771
772         load = source_load(cpu);
773         this_load = target_load(this_cpu);
774
775         /*
776          * If sync wakeup then subtract the (maximum possible) effect of
777          * the currently running task from the load of the current CPU:
778          */
779         if (sync)
780                 this_load -= SCHED_LOAD_SCALE;
781
782         /* Don't pull the task off an idle CPU to a busy one */
783         if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
784                 goto out_set_cpu;
785
786         new_cpu = this_cpu; /* Wake to this CPU if we can */
787
788         /*
789          * Scan domains for affine wakeup and passive balancing
790          * possibilities.
791          */
792         for_each_domain(this_cpu, sd) {
793                 unsigned int imbalance;
794                 /*
795                  * Start passive balancing when half the imbalance_pct
796                  * limit is reached.
797                  */
798                 imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
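		/*
		 * e.g. with an imbalance_pct of 125 this yields 137, so
		 * (with SD_WAKE_BALANCE) the wakeup is pulled to this CPU
		 * only while 137*this_load <= 100*load, i.e. while the
		 * task's previous CPU is roughly a third more loaded than
		 * this one.  (Illustrative figure; the real imbalance_pct
		 * comes from the domain setup.)
		 */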
799
800                 if ( ((sd->flags & SD_WAKE_AFFINE) &&
801                                 !task_hot(p, rq->timestamp_last_tick, sd))
802                         || ((sd->flags & SD_WAKE_BALANCE) &&
803                                 imbalance*this_load <= 100*load) ) {
804                         /*
805                          * Now sd has SD_WAKE_AFFINE and p is cache cold in sd
806                          * or sd has SD_WAKE_BALANCE and there is an imbalance
807                          */
808                         if (cpu_isset(cpu, sd->span))
809                                 goto out_set_cpu;
810                 }
811         }
812
813         new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
814 out_set_cpu:
815         new_cpu = wake_idle(new_cpu, p);
816         if (new_cpu != cpu && cpu_isset(new_cpu, p->cpus_allowed)) {
817                 set_task_cpu(p, new_cpu);
818                 task_rq_unlock(rq, &flags);
819                 /* might preempt at this point */
820                 rq = task_rq_lock(p, &flags);
821                 old_state = p->state;
822                 if (!(old_state & state))
823                         goto out;
824                 if (p->array)
825                         goto out_running;
826
827                 this_cpu = smp_processor_id();
828                 cpu = task_cpu(p);
829         }
830
831 out_activate:
832 #endif /* CONFIG_SMP */
833         if (old_state == TASK_UNINTERRUPTIBLE) {
834                 rq->nr_uninterruptible--;
835                 /*
836                  * Tasks on involuntary sleep don't earn
837                  * sleep_avg beyond just interactive state.
838                  */
839                 p->activated = -1;
840         }
841
842         /*
843          * Sync wakeups (i.e. those types of wakeups where the waker
844          * has indicated that it will leave the CPU in short order)
845          * don't trigger a preemption if the woken-up task will run on
846          * this cpu. (in this case the 'I will reschedule' promise of
847          * the waker guarantees that the freshly woken up task is going
848          * to be considered on this CPU.)
849          */
850         activate_task(p, rq, cpu == this_cpu);
851         if (!sync || cpu != this_cpu) {
852                 if (TASK_PREEMPTS_CURR(p, rq))
853                         resched_task(rq->curr);
854         }
855         success = 1;
856
857 out_running:
858         p->state = TASK_RUNNING;
859 out:
860         task_rq_unlock(rq, &flags);
861
862         return success;
863 }
864
865 int fastcall wake_up_process(task_t * p)
866 {
867         return try_to_wake_up(p, TASK_STOPPED |
868                                  TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
869 }
870
871 EXPORT_SYMBOL(wake_up_process);
872
873 int fastcall wake_up_state(task_t *p, unsigned int state)
874 {
875         return try_to_wake_up(p, state, 0);
876 }
877
878 /*
879  * Perform scheduler related setup for a newly forked process p.
880  * p is forked by current.
881  */
882 void fastcall sched_fork(task_t *p)
883 {
884         /*
885          * We mark the process as running here, but have not actually
886          * inserted it onto the runqueue yet. This guarantees that
887          * nobody will actually run it, and a signal or other external
888          * event cannot wake it up and insert it on the runqueue either.
889          */
890         p->state = TASK_RUNNING;
891         INIT_LIST_HEAD(&p->run_list);
892         p->array = NULL;
893         spin_lock_init(&p->switch_lock);
894 #ifdef CONFIG_PREEMPT
895         /*
896          * During context-switch we hold precisely one spinlock, which
897          * schedule_tail drops. (in the common case it's this_rq()->lock,
898          * but it also can be p->switch_lock.) So we compensate with a count
899          * of 1. Also, we want to start with kernel preemption disabled.
900          */
901         p->thread_info->preempt_count = 1;
902 #endif
903         /*
904          * Share the timeslice between parent and child, thus the
905          * total amount of pending timeslices in the system doesn't change,
906          * resulting in more scheduling fairness.
907          */
908         local_irq_disable();
909         p->time_slice = (current->time_slice + 1) >> 1;
910         /*
911          * The remainder of the first timeslice might be recovered by
912          * the parent if the child exits early enough.
913          */
914         p->first_time_slice = 1;
915         current->time_slice >>= 1;
916         p->timestamp = sched_clock();
917         if (!current->time_slice) {
918                 /*
919                  * This case is rare, it happens when the parent has only
920                  * a single jiffy left from its timeslice. Taking the
921                  * runqueue lock is not a problem.
922                  */
923                 current->time_slice = 1;
924                 preempt_disable();
925                 scheduler_tick(0, 0);
926                 local_irq_enable();
927                 preempt_enable();
928         } else
929                 local_irq_enable();
930 }
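/*
 * Example of the timeslice split above (illustrative): a parent with
 * 103 ticks left keeps 51 while the child starts with 52, so the total
 * amount of pending timeslices is preserved (modulo the rounding tick).
 * If the parent had only 1 tick left, the child takes it and the parent
 * is pushed through scheduler_tick() to get a fresh slice.
 */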
931
932 /*
933  * wake_up_forked_process - wake up a freshly forked process.
934  *
935  * This function will do some initial scheduler statistics housekeeping
936  * that must be done for every newly created process.
937  */
938 void fastcall wake_up_forked_process(task_t * p)
939 {
940         unsigned long flags;
941         runqueue_t *rq = task_rq_lock(current, &flags);
942
943         BUG_ON(p->state != TASK_RUNNING);
944
945         /*
946          * We decrease the sleep average of forking parents
947          * and children as well, to keep max-interactive tasks
948          * from forking tasks that are max-interactive.
949          */
950         current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
951                 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
952
953         p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
954                 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
955
956         p->interactive_credit = 0;
957
958         p->prio = effective_prio(p);
959         set_task_cpu(p, smp_processor_id());
960
961         if (unlikely(!current->array))
962                 __activate_task(p, rq);
963         else {
964                 p->prio = current->prio;
965                 list_add_tail(&p->run_list, &current->run_list);
966                 p->array = current->array;
967                 p->array->nr_active++;
968                 rq->nr_running++;
969         }
970         task_rq_unlock(rq, &flags);
971 }
972
973 /*
974  * Potentially available exiting-child timeslices are
975  * retrieved here - this way the parent does not get
976  * penalized for creating too many threads.
977  *
978  * (this cannot be used to 'generate' timeslices
979  * artificially, because any timeslice recovered here
980  * was given away by the parent in the first place.)
981  */
982 void fastcall sched_exit(task_t * p)
983 {
984         unsigned long flags;
985         runqueue_t *rq;
986
987         local_irq_save(flags);
988         if (p->first_time_slice) {
989                 p->parent->time_slice += p->time_slice;
990                 if (unlikely(p->parent->time_slice > MAX_TIMESLICE))
991                         p->parent->time_slice = MAX_TIMESLICE;
992         }
993         local_irq_restore(flags);
994         /*
995          * If the child was a (relative-) CPU hog then decrease
996          * the sleep_avg of the parent as well.
997          */
998         rq = task_rq_lock(p->parent, &flags);
999         if (p->sleep_avg < p->parent->sleep_avg)
1000                 p->parent->sleep_avg = p->parent->sleep_avg /
1001                 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
1002                 (EXIT_WEIGHT + 1);
1003         task_rq_unlock(rq, &flags);
1004 }
1005
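/*
 * Example of the EXIT_WEIGHT blend above (illustrative): with
 * EXIT_WEIGHT == 3, a parent whose sleep_avg corresponds to 800 msecs
 * reaping a CPU-hog child at 200 msecs ends up at roughly
 * 800*3/4 + 200/4 == 650 msecs, so repeatedly forking short-lived CPU
 * hogs gradually drags the parent's own interactivity rating down.
 */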
1006 /**
1007  * finish_task_switch - clean up after a task-switch
1008  * @prev: the thread we just switched away from.
1009  *
1010  * We enter this with the runqueue still locked, and finish_arch_switch()
1011  * will unlock it along with doing any other architecture-specific cleanup
1012  * actions.
1013  *
1014  * Note that we may have delayed dropping an mm in context_switch(). If
1015  * so, we finish that here outside of the runqueue lock.  (Doing it
1016  * with the lock held can cause deadlocks; see schedule() for
1017  * details.)
1018  */
1019 static void finish_task_switch(task_t *prev)
1020 {
1021         runqueue_t *rq = this_rq();
1022         struct mm_struct *mm = rq->prev_mm;
1023         unsigned long prev_task_flags;
1024
1025         rq->prev_mm = NULL;
1026
1027         /*
1028          * A task struct has one reference for the use as "current".
1029          * If a task dies, then it sets TASK_ZOMBIE in tsk->state and calls
1030          * schedule one last time. The schedule call will never return,
1031          * and the scheduled task must drop that reference.
1032          * The test for TASK_ZOMBIE must occur while the runqueue locks are
1033          * still held, otherwise prev could be scheduled on another cpu, die
1034          * there before we look at prev->state, and then the reference would
1035          * be dropped twice.
1036          *              Manfred Spraul <manfred@colorfullife.com>
1037          */
1038         prev_task_flags = prev->flags;
1039         finish_arch_switch(rq, prev);
1040         if (mm)
1041                 mmdrop(mm);
1042         if (unlikely(prev_task_flags & PF_DEAD))
1043                 put_task_struct(prev);
1044 }
1045
1046 /**
1047  * schedule_tail - first thing a freshly forked thread must call.
1048  * @prev: the thread we just switched away from.
1049  */
1050 asmlinkage void schedule_tail(task_t *prev)
1051 {
1052         finish_task_switch(prev);
1053
1054         if (current->set_child_tid)
1055                 put_user(current->pid, current->set_child_tid);
1056 }
1057
1058 /*
1059  * context_switch - switch to the new MM and the new
1060  * thread's register state.
1061  */
1062 static inline
1063 task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
1064 {
1065         struct mm_struct *mm = next->mm;
1066         struct mm_struct *oldmm = prev->active_mm;
1067
1068         if (unlikely(!mm)) {
1069                 next->active_mm = oldmm;
1070                 atomic_inc(&oldmm->mm_count);
1071                 enter_lazy_tlb(oldmm, next);
1072         } else
1073                 switch_mm(oldmm, mm, next);
1074
1075         if (unlikely(!prev->mm)) {
1076                 prev->active_mm = NULL;
1077                 WARN_ON(rq->prev_mm);
1078                 rq->prev_mm = oldmm;
1079         }
1080
1081         /* Here we just switch the register state and the stack. */
1082         switch_to(prev, next, prev);
1083
1084         return prev;
1085 }
1086
1087 /*
1088  * nr_running, nr_uninterruptible and nr_context_switches:
1089  *
1090  * externally visible scheduler statistics: current number of runnable
1091  * threads, current number of uninterruptible-sleeping threads, total
1092  * number of context switches performed since bootup.
1093  */
1094 unsigned long nr_running(void)
1095 {
1096         unsigned long i, sum = 0;
1097
1098         for_each_cpu(i)
1099                 sum += cpu_rq(i)->nr_running;
1100
1101         return sum;
1102 }
1103
1104 unsigned long nr_uninterruptible(void)
1105 {
1106         unsigned long i, sum = 0;
1107
1108         for_each_online_cpu(i)
1109                 sum += cpu_rq(i)->nr_uninterruptible;
1110
1111         return sum;
1112 }
1113
1114 unsigned long long nr_context_switches(void)
1115 {
1116         unsigned long long i, sum = 0;
1117
1118         for_each_online_cpu(i)
1119                 sum += cpu_rq(i)->nr_switches;
1120
1121         return sum;
1122 }
1123
1124 unsigned long nr_iowait(void)
1125 {
1126         unsigned long i, sum = 0;
1127
1128         for_each_online_cpu(i)
1129                 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1130
1131         return sum;
1132 }
1133
1134 /*
1135  * double_rq_lock - safely lock two runqueues
1136  *
1137  * Note this does not disable interrupts like task_rq_lock;
1138  * you need to do so manually before calling.
1139  */
1140 static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1141 {
1142         if (rq1 == rq2)
1143                 spin_lock(&rq1->lock);
1144         else {
1145                 if (rq1 < rq2) {
1146                         spin_lock(&rq1->lock);
1147                         spin_lock(&rq2->lock);
1148                 } else {
1149                         spin_lock(&rq2->lock);
1150                         spin_lock(&rq1->lock);
1151                 }
1152         }
1153 }
1154
1155 /*
1156  * double_rq_unlock - safely unlock two runqueues
1157  *
1158  * Note this does not restore interrupts like task_rq_unlock;
1159  * you need to do so manually after calling.
1160  */
1161 static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
1162 {
1163         spin_unlock(&rq1->lock);
1164         if (rq1 != rq2)
1165                 spin_unlock(&rq2->lock);
1166 }
1167
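/*
 * Usage sketch (illustrative; wake_up_forked_thread() below follows this
 * pattern):
 *
 *	local_irq_save(flags);
 *	double_rq_lock(this_rq, rq);
 *	... move state between the two runqueues ...
 *	double_rq_unlock(this_rq, rq);
 *	local_irq_restore(flags);
 *
 * Always taking the lower-addressed lock first is what keeps this
 * deadlock-free against another CPU locking the same pair in the
 * opposite order.
 */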
1168 enum idle_type
1169 {
1170         IDLE,
1171         NOT_IDLE,
1172         NEWLY_IDLE,
1173 };
1174
1175 #ifdef CONFIG_SMP
1176
1177 /*
1178  * find_idlest_cpu - find the least busy runqueue.
1179  */
1180 static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1181                            struct sched_domain *sd)
1182 {
1183         unsigned long load, min_load, this_load;
1184         int i, min_cpu;
1185         cpumask_t mask;
1186
1187         min_cpu = UINT_MAX;
1188         min_load = ULONG_MAX;
1189
1190         cpus_and(mask, sd->span, cpu_online_map);
1191         cpus_and(mask, mask, p->cpus_allowed);
1192
1193         for_each_cpu_mask(i, mask) {
1194                 load = target_load(i);
1195
1196                 if (load < min_load) {
1197                         min_cpu = i;
1198                         min_load = load;
1199
1200                         /* break out early on an idle CPU: */
1201                         if (!min_load)
1202                                 break;
1203                 }
1204         }
1205
1206         /* add +1 to account for the new task */
1207         this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
1208
1209         /*
1210          * Would the addition of the new task to the current
1211          * CPU create an imbalance between this CPU and the
1212          * idlest CPU?
1213          *
1214          * Use half of the balancing threshold - new-context is
1215          * a good opportunity to balance.
1216          */
1217         if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
1218                 return min_cpu;
1219
1220         return this_cpu;
1221 }
1222
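/*
 * Example of the final test above (illustrative, with an imbalance_pct
 * of 125): the new task is placed on the idlest CPU only if
 * min_load*112 < this_load*100, i.e. only if this CPU, with the new
 * task counted in, would carry at least ~12% more load than the idlest
 * candidate.
 */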
1223 /*
1224  * wake_up_forked_thread - wake up a freshly forked thread.
1225  *
1226  * This function will do some initial scheduler statistics housekeeping
1227  * that must be done for every newly created context, and it also does
1228  * runqueue balancing.
1229  */
1230 void fastcall wake_up_forked_thread(task_t * p)
1231 {
1232         unsigned long flags;
1233         int this_cpu = get_cpu(), cpu;
1234         struct sched_domain *tmp, *sd = NULL;
1235         runqueue_t *this_rq = cpu_rq(this_cpu), *rq;
1236
1237         /*
1238          * Find the largest domain that this CPU is part of that
1239          * is willing to balance on clone:
1240          */
1241         for_each_domain(this_cpu, tmp)
1242                 if (tmp->flags & SD_BALANCE_CLONE)
1243                         sd = tmp;
1244         if (sd)
1245                 cpu = find_idlest_cpu(p, this_cpu, sd);
1246         else
1247                 cpu = this_cpu;
1248
1249         local_irq_save(flags);
1250 lock_again:
1251         rq = cpu_rq(cpu);
1252         double_rq_lock(this_rq, rq);
1253
1254         BUG_ON(p->state != TASK_RUNNING);
1255
1256         /*
1257          * We did find_idlest_cpu() unlocked, so in theory
1258          * the mask could have changed - just don't migrate
1259          * in this case:
1260          */
1261         if (unlikely(!cpu_isset(cpu, p->cpus_allowed))) {
1262                 cpu = this_cpu;
1263                 double_rq_unlock(this_rq, rq);
1264                 goto lock_again;
1265         }
1266         /*
1267          * We decrease the sleep average of forking parents
1268          * and children as well, to keep max-interactive tasks
1269          * from forking tasks that are max-interactive.
1270          */
1271         current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
1272                 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1273
1274         p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
1275                 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1276
1277         p->interactive_credit = 0;
1278
1279         p->prio = effective_prio(p);
1280         set_task_cpu(p, cpu);
1281
1282         if (cpu == this_cpu) {
1283                 if (unlikely(!current->array))
1284                         __activate_task(p, rq);
1285                 else {
1286                         p->prio = current->prio;
1287                         list_add_tail(&p->run_list, &current->run_list);
1288                         p->array = current->array;
1289                         p->array->nr_active++;
1290                         rq->nr_running++;
1291                 }
1292         } else {
1293                 /* Not the local CPU - must adjust timestamp */
1294                 p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
1295                                         + rq->timestamp_last_tick;
1296                 __activate_task(p, rq);
1297                 if (TASK_PREEMPTS_CURR(p, rq))
1298                         resched_task(rq->curr);
1299         }
1300
1301         double_rq_unlock(this_rq, rq);
1302         local_irq_restore(flags);
1303         put_cpu();
1304 }
1305
1306 /*
1307  * If dest_cpu is allowed for this process, migrate the task to it.
1308  * This is accomplished by queueing a migration request for the
1309  * target CPU and waking that runqueue's migration thread, which
1310  * performs the actual move and signals completion.
1311  */
1312 static void sched_migrate_task(task_t *p, int dest_cpu)
1313 {
1314         migration_req_t req;
1315         runqueue_t *rq;
1316         unsigned long flags;
1317
1318         rq = task_rq_lock(p, &flags);
1319         if (!cpu_isset(dest_cpu, p->cpus_allowed)
1320             || unlikely(cpu_is_offline(dest_cpu)))
1321                 goto out;
1322
1323         /* force the process onto the specified CPU */
1324         if (migrate_task(p, dest_cpu, &req)) {
1325                 /* Need to wait for migration thread (might exit: take ref). */
1326                 struct task_struct *mt = rq->migration_thread;
1327                 get_task_struct(mt);
1328                 task_rq_unlock(rq, &flags);
1329                 wake_up_process(mt);
1330                 put_task_struct(mt);
1331                 wait_for_completion(&req.done);
1332                 return;
1333         }
1334 out:
1335         task_rq_unlock(rq, &flags);
1336 }
1337
1338 /*
1339  * sched_balance_exec(): find the highest-level, exec-balance-capable
1340  * domain and try to migrate the task to the least loaded CPU.
1341  *
1342  * execve() is a valuable balancing opportunity, because at this point
1343  * the task has the smallest effective memory and cache footprint.
1344  */
1345 void sched_balance_exec(void)
1346 {
1347         struct sched_domain *tmp, *sd = NULL;
1348         int new_cpu, this_cpu = get_cpu();
1349
1350         /* Prefer the current CPU if there's only this task running */
1351         if (this_rq()->nr_running <= 1)
1352                 goto out;
1353
1354         for_each_domain(this_cpu, tmp)
1355                 if (tmp->flags & SD_BALANCE_EXEC)
1356                         sd = tmp;
1357
1358         if (sd) {
1359                 new_cpu = find_idlest_cpu(current, this_cpu, sd);
1360                 if (new_cpu != this_cpu) {
1361                         put_cpu();
1362                         sched_migrate_task(current, new_cpu);
1363                         return;
1364                 }
1365         }
1366 out:
1367         put_cpu();
1368 }
1369
1370 /*
1371  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1372  */
1373 static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1374 {
1375         if (unlikely(!spin_trylock(&busiest->lock))) {
1376                 if (busiest < this_rq) {
1377                         spin_unlock(&this_rq->lock);
1378                         spin_lock(&busiest->lock);
1379                         spin_lock(&this_rq->lock);
1380                 } else
1381                         spin_lock(&busiest->lock);
1382         }
1383 }
1384
1385 /*
1386  * pull_task - move a task from a remote runqueue to the local runqueue.
1387  * Both runqueues must be locked.
1388  */
1389 static inline
1390 void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1391                runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1392 {
1393         dequeue_task(p, src_array);
1394         src_rq->nr_running--;
1395         set_task_cpu(p, this_cpu);
1396         this_rq->nr_running++;
1397         enqueue_task(p, this_array);
1398         p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1399                                 + this_rq->timestamp_last_tick;
1400         /*
1401          * Note that idle threads have a prio of MAX_PRIO, so this test
1402          * is always true when the destination CPU is running its idle thread.
1403          */
1404         if (TASK_PREEMPTS_CURR(p, this_rq))
1405                 resched_task(this_rq->curr);
1406 }
1407
1408 /*
1409  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1410  */
1411 static inline
1412 int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1413                      struct sched_domain *sd, enum idle_type idle)
1414 {
1415         /*
1416          * We do not migrate tasks that are:
1417          * 1) running (obviously), or
1418          * 2) cannot be migrated to this CPU due to cpus_allowed, or
1419          * 3) are cache-hot on their current CPU.
1420          */
1421         if (task_running(rq, p))
1422                 return 0;
1423         if (!cpu_isset(this_cpu, p->cpus_allowed))
1424                 return 0;
1425
1426         /* Aggressive migration if we've failed balancing */
1427         if (idle == NEWLY_IDLE ||
1428                         sd->nr_balance_failed < sd->cache_nice_tries) {
1429                 if (task_hot(p, rq->timestamp_last_tick, sd))
1430                         return 0;
1431         }
1432
1433         return 1;
1434 }
1435
1436 /*
1437  * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
1438  * as part of a balancing operation within "domain". Returns the number of
1439  * tasks moved.
1440  *
1441  * Called with both runqueues locked.
1442  */
1443 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
1444                       unsigned long max_nr_move, struct sched_domain *sd,
1445                       enum idle_type idle)
1446 {
1447         prio_array_t *array, *dst_array;
1448         struct list_head *head, *curr;
1449         int idx, pulled = 0;
1450         task_t *tmp;
1451
1452         if (max_nr_move <= 0 || busiest->nr_running <= 1)
1453                 goto out;
1454
1455         /*
1456          * We first consider expired tasks. Those will likely not be
1457          * executed in the near future, and they are most likely to
1458          * be cache-cold, thus switching CPUs has the least effect
1459          * on them.
1460          */
1461         if (busiest->expired->nr_active) {
1462                 array = busiest->expired;
1463                 dst_array = this_rq->expired;
1464         } else {
1465                 array = busiest->active;
1466                 dst_array = this_rq->active;
1467         }
1468
1469 new_array:
1470         /* Start searching at priority 0: */
1471         idx = 0;
1472 skip_bitmap:
1473         if (!idx)
1474                 idx = sched_find_first_bit(array->bitmap);
1475         else
1476                 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1477         if (idx >= MAX_PRIO) {
1478                 if (array == busiest->expired && busiest->active->nr_active) {
1479                         array = busiest->active;
1480                         dst_array = this_rq->active;
1481                         goto new_array;
1482                 }
1483                 goto out;
1484         }
1485
1486         head = array->queue + idx;
1487         curr = head->prev;
1488 skip_queue:
1489         tmp = list_entry(curr, task_t, run_list);
1490
1491         curr = curr->prev;
1492
1493         if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
1494                 if (curr != head)
1495                         goto skip_queue;
1496                 idx++;
1497                 goto skip_bitmap;
1498         }
1499         pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
1500         pulled++;
1501
1502         /* We only want to steal up to the prescribed number of tasks. */
1503         if (pulled < max_nr_move) {
1504                 if (curr != head)
1505                         goto skip_queue;
1506                 idx++;
1507                 goto skip_bitmap;
1508         }
1509 out:
1510         return pulled;
1511 }
1512
1513 /*
1514  * find_busiest_group finds and returns the busiest CPU group within the
1515  * domain. It calculates and returns the number of tasks which should be
1516  * moved to restore balance via the imbalance parameter.
1517  */
1518 static struct sched_group *
1519 find_busiest_group(struct sched_domain *sd, int this_cpu,
1520                    unsigned long *imbalance, enum idle_type idle)
1521 {
1522         struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1523         unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1524
1525         max_load = this_load = total_load = total_pwr = 0;
1526
1527         do {
1528                 cpumask_t tmp;
1529                 unsigned long load;
1530                 int local_group;
1531                 int i, nr_cpus = 0;
1532
1533                 local_group = cpu_isset(this_cpu, group->cpumask);
1534
1535                 /* Tally up the load of all CPUs in the group */
1536                 avg_load = 0;
1537                 cpus_and(tmp, group->cpumask, cpu_online_map);
1538                 if (unlikely(cpus_empty(tmp)))
1539                         goto nextgroup;
1540
1541                 for_each_cpu_mask(i, tmp) {
1542                         /* Bias balancing toward cpus of our domain */
1543                         if (local_group)
1544                                 load = target_load(i);
1545                         else
1546                                 load = source_load(i);
1547
1548                         nr_cpus++;
1549                         avg_load += load;
1550                 }
1551
1552                 if (!nr_cpus)
1553                         goto nextgroup;
1554
1555                 total_load += avg_load;
1556                 total_pwr += group->cpu_power;
1557
1558                 /* Adjust by relative CPU power of the group */
1559                 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1560
1561                 if (local_group) {
1562                         this_load = avg_load;
1563                         this = group;
1564                         goto nextgroup;
1565                 } else if (avg_load > max_load) {
1566                         max_load = avg_load;
1567                         busiest = group;
1568                 }
1569 nextgroup:
1570                 group = group->next;
1571         } while (group != sd->groups);
1572
1573         if (!busiest || this_load >= max_load)
1574                 goto out_balanced;
1575
1576         avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
1577
1578         if (this_load >= avg_load ||
1579                         100*max_load <= sd->imbalance_pct*this_load)
1580                 goto out_balanced;
1581
1582         /*
1583          * We're trying to get all the cpus to the average_load, so we don't
1584          * want to push ourselves above the average load, nor do we wish to
1585          * reduce the max loaded cpu below the average load, as either of these
1586          * actions would just result in more rebalancing later, and ping-pong
1587          * tasks around. Thus we look for the minimum possible imbalance.
1588          * Negative imbalances (*we* are more loaded than anyone else) will
1589          * be counted as no imbalance for these purposes -- we can't fix that
1590          * by pulling tasks to us.  Be careful of negative numbers as they'll
1591          * appear as very large values with unsigned longs.
1592          */
1593         *imbalance = min(max_load - avg_load, avg_load - this_load);
1594
1595         /* How much load to actually move to equalise the imbalance */
1596         *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
1597                                 / SCHED_LOAD_SCALE;
1598
1599         if (*imbalance < SCHED_LOAD_SCALE - 1) {
1600                 unsigned long pwr_now = 0, pwr_move = 0;
1601                 unsigned long tmp;
1602
1603                 if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
1604                         *imbalance = 1;
1605                         return busiest;
1606                 }
1607
1608                 /*
1609                  * OK, we don't have enough imbalance to justify moving tasks,
1610                  * however we may be able to increase total CPU power used by
1611                  * moving them.
1612                  */
1613
1614                 pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
1615                 pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
1616                 pwr_now /= SCHED_LOAD_SCALE;
1617
1618                 /* Amount of load we'd subtract */
1619                 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
1620                 if (max_load > tmp)
1621                         pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
1622                                                         max_load - tmp);
1623
1624                 /* Amount of load we'd add */
1625                 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
1626                 if (max_load < tmp)
1627                         tmp = max_load;
1628                 pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
1629                 pwr_move /= SCHED_LOAD_SCALE;
1630
1631                 /* Move if we gain another 8th of a CPU worth of throughput */
1632                 if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
1633                         goto out_balanced;
1634
1635                 *imbalance = 1;
1636                 return busiest;
1637         }
1638
1639         /* Get rid of the scaling factor (the +1 guarantees we still move at least one task) */
1640         *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE;
1641
1642         return busiest;
1643
1644 out_balanced:
1645         if (busiest && (idle == NEWLY_IDLE ||
1646                         (idle == IDLE && max_load > SCHED_LOAD_SCALE))) {
1647                 *imbalance = 1;
1648                 return busiest;
1649         }
1650
1651         *imbalance = 0;
1652         return NULL;
1653 }
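/*
 * Worked example of the arithmetic above (illustrative only; it assumes
 * SCHED_LOAD_SCALE == 128 and two single-CPU groups, each with
 * cpu_power == SCHED_LOAD_SCALE):
 *
 *   busiest group: 3 runnable tasks -> max_load  = 3*128
 *   local group:   1 runnable task  -> this_load = 1*128
 *   total_load = 4*128, total_pwr = 2*128
 *   avg_load   = SCHED_LOAD_SCALE * total_load / total_pwr = 2*128
 *
 *   *imbalance = min(max_load - avg_load, avg_load - this_load) = 128,
 *   which after the cpu_power scaling works out to (128 + 1) / 128 = 1
 *   task to pull, leaving both groups with two runnable tasks.
 */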
1654
1655 /*
1656  * find_busiest_queue - find the busiest runqueue among the cpus in group.
1657  */
1658 static runqueue_t *find_busiest_queue(struct sched_group *group)
1659 {
1660         cpumask_t tmp;
1661         unsigned long load, max_load = 0;
1662         runqueue_t *busiest = NULL;
1663         int i;
1664
1665         cpus_and(tmp, group->cpumask, cpu_online_map);
1666         for_each_cpu_mask(i, tmp) {
1667                 load = source_load(i);
1668
1669                 if (load > max_load) {
1670                         max_load = load;
1671                         busiest = cpu_rq(i);
1672                 }
1673         }
1674
1675         return busiest;
1676 }
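/*
 * Unlike find_busiest_group() above, this helper compares the raw
 * source_load() of each online CPU in the group, with no cpu_power
 * scaling, and it returns NULL when every CPU in the group reports
 * zero load.
 */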
1677
1678 /*
1679  * Check this_cpu to ensure it is balanced within domain. Attempt to move
1680  * tasks if there is an imbalance.
1681  *
1682  * Called with this_rq unlocked.
1683  */
1684 static int load_balance(int this_cpu, runqueue_t *this_rq,
1685                         struct sched_domain *sd, enum idle_type idle)
1686 {
1687         struct sched_group *group;
1688         runqueue_t *busiest;
1689         unsigned long imbalance;
1690         int nr_moved;
1691
1692         spin_lock(&this_rq->lock);
1693
1694         group = find_busiest_group(sd, this_cpu, &imbalance, idle);
1695         if (!group)
1696                 goto out_balanced;
1697
1698         busiest = find_busiest_queue(group);
1699         if (!busiest)
1700                 goto out_balanced;
1701         /*
1702          * This should be "impossible", but since load
1703          * balancing is inherently racy and statistical,
1704          * it could happen in theory.
1705          */
1706         if (unlikely(busiest == this_rq)) {
1707                 WARN_ON(1);
1708                 goto out_balanced;
1709         }
1710
1711         nr_moved = 0;
1712         if (busiest->nr_running > 1) {
1713                 /*
1714                  * Attempt to move tasks. If find_busiest_group has found
1715                  * an imbalance but busiest->nr_running <= 1, the group is
1716                  * still unbalanced. nr_moved simply stays zero, so it is
1717                  * correctly treated as an imbalance.
1718                  */
1719                 double_lock_balance(this_rq, busiest);
1720                 nr_moved = move_tasks(this_rq, this_cpu, busiest,
1721                                                 imbalance, sd, idle);
1722                 spin_unlock(&busiest->lock);
1723         }
1724         spin_unlock(&this_rq->lock);
1725
1726         if (!nr_moved) {
1727                 sd->nr_balance_failed++;
1728
1729                 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
1730                         int wake = 0;
1731
1732                         spin_lock(&busiest->lock);
1733                         if (!busiest->active_balance) {
1734                                 busiest->active_balance = 1;
1735                                 busiest->push_cpu = this_cpu;
1736                                 wake = 1;
1737                         }
1738                         spin_unlock(&busiest->lock);
1739                         if (wake)
1740                                 wake_up_process(busiest->migration_thread);
1741
1742                         /*
1743                          * We've kicked active balancing, reset the failure
1744                          * counter.
1745                          */
1746                         sd->nr_balance_failed = sd->cache_nice_tries;
1747                 }
1748         } else
1749                 sd->nr_balance_failed = 0;
1750
1751         /* We were unbalanced, so reset the balancing interval */
1752         sd->balance_interval = sd->min_interval;
1753
1754         return nr_moved;
1755
1756 out_balanced:
1757         spin_unlock(&this_rq->lock);
1758
1759         /* tune up the balancing interval */
1760         if (sd->balance_interval < sd->max_interval)
1761                 sd->balance_interval *= 2;
1762
1763         return 0;
1764 }
1765
1766 /*
1767  * Check this_cpu to ensure it is balanced within domain. Attempt to move
1768  * tasks if there is an imbalance.
1769  *
1770  * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
1771  * this_rq is locked.
1772  */
1773 static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
1774                                 struct sched_domain *sd)
1775 {
1776         struct sched_group *group;
1777         runqueue_t *busiest = NULL;
1778         unsigned long imbalance;
1779         int nr_moved = 0;
1780
1781         group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
1782         if (!group)
1783                 goto out;
1784
1785         busiest = find_busiest_queue(group);
1786         if (!busiest || busiest == this_rq)
1787                 goto out;
1788
1789         /* Attempt to move tasks */
1790         double_lock_balance(this_rq, busiest);
1791
1792         nr_moved = move_tasks(this_rq, this_cpu, busiest,
1793                                         imbalance, sd, NEWLY_IDLE);
1794
1795         spin_unlock(&busiest->lock);
1796
1797 out:
1798         return nr_moved;
1799 }
1800
1801 /*
1802  * idle_balance is called by schedule() if this_cpu is about to become
1803  * idle. Attempts to pull tasks from other CPUs.
1804  */
1805 static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
1806 {
1807         struct sched_domain *sd;
1808
1809         for_each_domain(this_cpu, sd) {
1810                 if (sd->flags & SD_BALANCE_NEWIDLE) {
1811                         if (load_balance_newidle(this_cpu, this_rq, sd)) {
1812                                 /* We've pulled tasks over so stop searching */
1813                                 break;
1814                         }
1815                 }
1816         }
1817 }
1818
1819 /*
1820  * active_load_balance is run by migration threads. It pushes a running
1821  * task off the cpu. It is needed to ensure that at least one task is
1822  * running on each physical CPU where possible, and that there is no
1823  * physical / logical CPU imbalance.
1824  *
1825  * Called with busiest locked.
1826  */
1827 static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
1828 {
1829         struct sched_domain *sd;
1830         struct sched_group *group, *busy_group;
1831         int i;
1832
1833         if (busiest->nr_running <= 1)
1834                 return;
1835
1836         for_each_domain(busiest_cpu, sd)
1837                 if (cpu_isset(busiest->push_cpu, sd->span))
1838                         break;
1839         if (!sd) {
1840                 WARN_ON(1);
1841                 return;
1842         }
1843
1844         group = sd->groups;
1845         while (!cpu_isset(busiest_cpu, group->cpumask))
1846                 group = group->next;
1847         busy_group = group;
1848
1849         group = sd->groups;
1850         do {
1851                 cpumask_t tmp;
1852                 runqueue_t *rq;
1853                 int push_cpu = 0;
1854
1855                 if (group == busy_group)
1856                         goto next_group;
1857
1858                 cpus_and(tmp, group->cpumask, cpu_online_map);
1859                 if (!cpus_weight(tmp))
1860                         goto next_group;
1861
1862                 for_each_cpu_mask(i, tmp) {
1863                         if (!idle_cpu(i))
1864                                 goto next_group;
1865                         push_cpu = i;
1866                 }
1867
1868                 rq = cpu_rq(push_cpu);
1869
1870                 /*
1871                  * This condition is "impossible", but since load
1872                  * balancing is inherently a bit racy and statistical,
1873                  * it can trigger.. Reported by Bjorn Helgaas on a
1874                  * 128-cpu setup.
1875                  */
1876                 if (unlikely(busiest == rq))
1877                         goto next_group;
1878                 double_lock_balance(busiest, rq);
1879                 move_tasks(rq, push_cpu, busiest, 1, sd, IDLE);
1880                 spin_unlock(&rq->lock);
1881 next_group:
1882                 group = group->next;
1883         } while (group != sd->groups);
1884
1885 }
1886
1887 /*
1888  * rebalance_tick will get called every timer tick, on every CPU.
1889  *
1890  * It checks each scheduling domain to see if it is due to be balanced,
1891  * and initiates a balancing operation if so.
1892  *
1893  * Balancing parameters are set up in arch_init_sched_domains.
1894  */
1895
1896 /* Don't have all balancing operations going off at once */
1897 #define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
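/*
 * For example, with HZ == 1000 and NR_CPUS == 4 (both config-dependent),
 * CPUs 0..3 get offsets of 0, 250, 500 and 750 jiffies, so their domains
 * become due for balancing at different points in time instead of all
 * hammering the runqueue locks on the same tick.
 */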
1898
1899 static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
1900                            enum idle_type idle)
1901 {
1902         unsigned long old_load, this_load;
1903         unsigned long j = jiffies + CPU_OFFSET(this_cpu);
1904         struct sched_domain *sd;
1905
1906         /* Update our load */
1907         old_load = this_rq->cpu_load;
1908         this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
1909         /*
1910          * Round up the averaging division if load is increasing. This
1911          * prevents us from getting stuck on 9 if the load is 10, for
1912          * example.
1913          */
1914         if (this_load > old_load)
1915                 old_load++;
1916         this_rq->cpu_load = (old_load + this_load) / 2;
1917
1918         for_each_domain(this_cpu, sd) {
1919                 unsigned long interval = sd->balance_interval;
1920
1921                 if (idle != IDLE)
1922                         interval *= sd->busy_factor;
1923
1924                 /* scale ms to jiffies */
1925                 interval = msecs_to_jiffies(interval);
1926                 if (unlikely(!interval))
1927                         interval = 1;
1928
1929                 if (j - sd->last_balance >= interval) {
1930                         if (load_balance(this_cpu, this_rq, sd, idle)) {
1931                                 /* We've pulled tasks over so no longer idle */
1932                                 idle = NOT_IDLE;
1933                         }
1934                         sd->last_balance += interval;
1935                 }
1936         }
1937 }
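/*
 * Example of the interval scaling above (the actual balance_interval and
 * busy_factor values come from the per-domain initialisers elsewhere):
 * with balance_interval == 8 (msecs) and busy_factor == 32, a non-idle
 * CPU only attempts a balance roughly every 256ms, while an idle CPU
 * uses the raw 8ms interval; msecs_to_jiffies() then converts that to
 * jiffies, clamped to at least one tick.
 */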
1938 #else
1939 /*
1940  * on UP we do not need to balance between CPUs:
1941  */
1942 static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
1943 {
1944 }
1945 static inline void idle_balance(int cpu, runqueue_t *rq)
1946 {
1947 }
1948 #endif
1949
1950 static inline int wake_priority_sleeper(runqueue_t *rq)
1951 {
1952 #ifdef CONFIG_SCHED_SMT
1953         /*
1954          * If an SMT sibling task has been put to sleep for priority
1955          * reasons reschedule the idle task to see if it can now run.
1956          */
1957         if (rq->nr_running) {
1958                 resched_task(rq->idle);
1959                 return 1;
1960         }
1961 #endif
1962         return 0;
1963 }
1964
1965 DEFINE_PER_CPU(struct kernel_stat, kstat);
1966
1967 EXPORT_PER_CPU_SYMBOL(kstat);
1968
1969 /*
1970  * We place interactive tasks back into the active array, if possible.
1971  *
1972  * To guarantee that this does not starve expired tasks we ignore the
1973  * interactivity of a task if the first expired task had to wait more
1974  * than a 'reasonable' amount of time. This deadline timeout is
1975  * load-dependent, as the frequency of array switches decreases with
1976  * increasing number of running tasks. We also ignore the interactivity
1977  * if a better static_prio task has expired:
1978  */
1979 #define EXPIRED_STARVING(rq) \
1980         ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
1981                 (jiffies - (rq)->expired_timestamp >= \
1982                         STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
1983                         ((rq)->curr->static_prio > (rq)->best_expired_prio))
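/*
 * Reading of the macro above, with a hypothetical STARVATION_LIMIT of
 * about one second's worth of jiffies (the real value is defined earlier
 * in this file): on a runqueue with 4 runnable tasks the first expired
 * task counts as starving once it has waited roughly 4 seconds, and from
 * then on interactive tasks are no longer reinserted into the active
 * array.  The same happens immediately whenever the running task's
 * static_prio is numerically higher (i.e. lower priority) than
 * best_expired_prio.
 */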
1984
1985 /*
1986  * This function gets called by the timer code, with HZ frequency.
1987  * We call it with interrupts disabled.
1988  *
1989  * It also gets called by the fork code, when changing the parent's
1990  * timeslices.
1991  */
1992 void scheduler_tick(int user_ticks, int sys_ticks)
1993 {
1994         int cpu = smp_processor_id();
1995         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1996         runqueue_t *rq = this_rq();
1997         task_t *p = current;
1998
1999         rq->timestamp_last_tick = sched_clock();
2000
2001         if (rcu_pending(cpu))
2002                 rcu_check_callbacks(cpu, user_ticks);
2003
2004         /* note: this timer irq context must be accounted for as well */
2005         if (hardirq_count() - HARDIRQ_OFFSET) {
2006                 cpustat->irq += sys_ticks;
2007                 sys_ticks = 0;
2008         } else if (softirq_count()) {
2009                 cpustat->softirq += sys_ticks;
2010                 sys_ticks = 0;
2011         }
2012
2013         if (p == rq->idle) {
2014                 if (!--rq->idle_tokens && !list_empty(&rq->hold_queue))
2015                         set_need_resched();     
2016
2017                 if (atomic_read(&rq->nr_iowait) > 0)
2018                         cpustat->iowait += sys_ticks;
2019                 else
2020                         cpustat->idle += sys_ticks;
2021                 if (wake_priority_sleeper(rq))
2022                         goto out;
2023                 rebalance_tick(cpu, rq, IDLE);
2024                 return;
2025         }
2026         if (TASK_NICE(p) > 0)
2027                 cpustat->nice += user_ticks;
2028         else
2029                 cpustat->user += user_ticks;
2030         cpustat->system += sys_ticks;
2031
2032         /* Task might have expired already, but not scheduled off yet */
2033         if (p->array != rq->active) {
2034                 set_tsk_need_resched(p);
2035                 goto out;
2036         }
2037         spin_lock(&rq->lock);
2038         /*
2039          * The task was running during this tick - update the
2040          * time slice counter. Note: we do not update a thread's
2041          * priority until it either goes to sleep or uses up its
2042          * timeslice. This makes it possible for interactive tasks
2043          * to use up their timeslices at their highest priority levels.
2044          */
2045         if (unlikely(rt_task(p))) {
2046                 /*
2047                  * RR tasks need a special form of timeslice management.
2048                  * FIFO tasks have no timeslices.
2049                  */
2050                 if ((p->policy == SCHED_RR) && !--p->time_slice) {
2051                         p->time_slice = task_timeslice(p);
2052                         p->first_time_slice = 0;
2053                         set_tsk_need_resched(p);
2054
2055                         /* put it at the end of the queue: */
2056                         dequeue_task(p, rq->active);
2057                         enqueue_task(p, rq->active);
2058                 }
2059                 goto out_unlock;
2060         }
2061         if (vx_need_resched(p)) {
2062                 dequeue_task(p, rq->active);
2063                 set_tsk_need_resched(p);
2064                 p->prio = effective_prio(p);
2065                 p->time_slice = task_timeslice(p);
2066                 p->first_time_slice = 0;
2067
2068                 if (!rq->expired_timestamp)
2069                         rq->expired_timestamp = jiffies;
2070                 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
2071                         enqueue_task(p, rq->expired);
2072                         if (p->static_prio < rq->best_expired_prio)
2073                                 rq->best_expired_prio = p->static_prio;
2074                 } else
2075                         enqueue_task(p, rq->active);
2076         } else {
2077                 /*
2078                  * Prevent a too long timeslice allowing a task to monopolize
2079                  * the CPU. We do this by splitting up the timeslice into
2080                  * smaller pieces.
2081                  *
2082                  * Note: this does not mean the task's timeslices expire or
2083                  * get lost in any way, they just might be preempted by
2084                  * another task of equal priority. (one with higher
2085                  * priority would have preempted this task already.) We
2086                  * requeue this task to the end of the list on this priority
2087                  * level, which is in essence a round-robin of tasks with
2088                  * equal priority.
2089                  *
2090                  * This only applies to tasks in the interactive
2091                  * delta range with at least TIMESLICE_GRANULARITY to requeue.
2092                  */
2093                 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
2094                         p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
2095                         (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
2096                         (p->array == rq->active)) {
2097
2098                         dequeue_task(p, rq->active);
2099                         set_tsk_need_resched(p);
2100                         p->prio = effective_prio(p);
2101                         enqueue_task(p, rq->active);
2102                 }
2103         }
2104 out_unlock:
2105         spin_unlock(&rq->lock);
2106 out:
2107         rebalance_tick(cpu, rq, NOT_IDLE);
2108 }
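/*
 * Illustration of the interactive requeue above (the numbers are made up;
 * TIMESLICE_GRANULARITY() is defined earlier in this file): if an
 * interactive task has a 100ms timeslice and its granularity works out to
 * 25ms, the "(task_timeslice(p) - p->time_slice) % TIMESLICE_GRANULARITY(p)"
 * test hits zero after every further 25ms of consumed CPU time, so the
 * task is bumped to the tail of its priority list roughly every 25ms
 * instead of running its full 100ms uninterrupted.
 */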
2109
2110 #ifdef CONFIG_SCHED_SMT
2111 static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq)
2112 {
2113         int i;
2114         struct sched_domain *sd = rq->sd;
2115         cpumask_t sibling_map;
2116
2117         if (!(sd->flags & SD_SHARE_CPUPOWER))
2118                 return;
2119
2120         cpus_and(sibling_map, sd->span, cpu_online_map);
2121         for_each_cpu_mask(i, sibling_map) {
2122                 runqueue_t *smt_rq;
2123
2124                 if (i == cpu)
2125                         continue;
2126
2127                 smt_rq = cpu_rq(i);
2128
2129                 /*
2130                  * If an SMT sibling task is sleeping due to priority
2131                  * reasons wake it up now.
2132                  */
2133                 if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
2134                         resched_task(smt_rq->idle);
2135         }
2136 }
2137
2138 static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p)
2139 {
2140         struct sched_domain *sd = rq->sd;
2141         cpumask_t sibling_map;
2142         int ret = 0, i;
2143
2144         if (!(sd->flags & SD_SHARE_CPUPOWER))
2145                 return 0;
2146
2147         cpus_and(sibling_map, sd->span, cpu_online_map);
2148         for_each_cpu_mask(i, sibling_map) {
2149                 runqueue_t *smt_rq;
2150                 task_t *smt_curr;
2151
2152                 if (i == cpu)
2153                         continue;
2154
2155                 smt_rq = cpu_rq(i);
2156                 smt_curr = smt_rq->curr;
2157
2158                 /*
2159                  * If a user task with lower static priority than the
2160                  * running task on the SMT sibling is trying to schedule,
2161                  * delay it till there is proportionately less timeslice
2162                  * left of the sibling task to prevent a lower priority
2163                  * task from using an unfair proportion of the
2164                  * physical cpu's resources. -ck
2165                  */
2166                 if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) >
2167                         task_timeslice(p) || rt_task(smt_curr)) &&
2168                         p->mm && smt_curr->mm && !rt_task(p))
2169                                 ret = 1;
2170
2171                 /*
2172                  * Reschedule a lower priority task on the SMT sibling,
2173                  * or wake it up if it has been put to sleep for priority
2174                  * reasons.
2175                  */
2176                 if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) >
2177                         task_timeslice(smt_curr) || rt_task(p)) &&
2178                         smt_curr->mm && p->mm && !rt_task(smt_curr)) ||
2179                         (smt_curr == smt_rq->idle && smt_rq->nr_running))
2180                                 resched_task(smt_curr);
2181         }
2182         return ret;
2183 }
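/*
 * Illustration of the throttling test above, assuming per_cpu_gain == 25
 * (a common value for SMT sibling domains; the real value comes from the
 * domain initialisers): a non-RT task p wanting to run is held back
 * (ret = 1) as long as 75% of the sibling's remaining timeslice still
 * exceeds p's full timeslice - e.g. if task_timeslice(p) were 50ms, p
 * would be delayed until the sibling had less than about 67ms left.  An
 * RT task on the sibling always wins against a non-RT p, and the
 * mirror-image test below it reschedules the sibling (or wakes it if it
 * was put to sleep for priority reasons) when p is the stronger side.
 */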
2184 #else
2185 static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq)
2186 {
2187 }
2188
2189 static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p)
2190 {
2191         return 0;
2192 }
2193 #endif
2194
2195 /*
2196  * schedule() is the main scheduler function.
2197  */
2198 asmlinkage void __sched schedule(void)
2199 {
2200         long *switch_count;
2201         task_t *prev, *next;
2202         runqueue_t *rq;
2203         prio_array_t *array;
2204         struct list_head *queue;
2205         unsigned long long now;
2206         unsigned long run_time;
2207         int cpu, idx;
2208 #ifdef  CONFIG_VSERVER_HARDCPU          
2209         struct vx_info *vxi;
2210         int maxidle = -HZ;
2211 #endif
2212
2213         /*
2214          * Test if we are atomic.  Since do_exit() needs to call into
2215          * schedule() atomically, we ignore that path for now.
2216          * Otherwise, whine if we are scheduling when we should not be.
2217          */
2218         if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) {
2219                 if (unlikely(in_atomic())) {
2220                         printk(KERN_ERR "bad: scheduling while atomic!\n");
2221                         dump_stack();
2222                 }
2223         }
2224
2225 need_resched:
2226         preempt_disable();
2227         prev = current;
2228         rq = this_rq();
2229
2230         release_kernel_lock(prev);
2231         now = sched_clock();
2232         if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
2233                 run_time = now - prev->timestamp;
2234         else
2235                 run_time = NS_MAX_SLEEP_AVG;
2236
2237         /*
2238          * Tasks with interactive credits get charged less run_time
2239          * at high sleep_avg to delay them losing their interactive
2240          * status
2241          */
2242         if (HIGH_CREDIT(prev))
2243                 run_time /= (CURRENT_BONUS(prev) ? : 1);
2244
2245         spin_lock_irq(&rq->lock);
2246
2247         /*
2248          * if entering off of a kernel preemption go straight
2249          * to picking the next task.
2250          */
2251         switch_count = &prev->nivcsw;
2252         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2253                 switch_count = &prev->nvcsw;
2254                 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
2255                                 unlikely(signal_pending(prev))))
2256                         prev->state = TASK_RUNNING;
2257                 else
2258                         deactivate_task(prev, rq);
2259         }
2260
2261         cpu = smp_processor_id();
2262 #ifdef  CONFIG_VSERVER_HARDCPU          
2263         if (!list_empty(&rq->hold_queue)) {
2264                 struct list_head *l, *n;
2265                 int ret;
2266
2267                 vxi = NULL;
2268                 list_for_each_safe(l, n, &rq->hold_queue) {
2269                         next = list_entry(l, task_t, run_list);
2270                         if (vxi == next->vx_info)
2271                                 continue;
2272
2273                         vxi = next->vx_info;
2274                         ret = vx_tokens_recalc(vxi);
2275                         // tokens = vx_tokens_avail(next);
2276
2277                         if (ret > 0) {
2278                                 list_del(&next->run_list);
2279                                 next->state &= ~TASK_ONHOLD;
2280                                 recalc_task_prio(next, now);
2281                                 __activate_task(next, rq);
2282                                 // printk("··· unhold %p\n", next);
2283                                 break;
2284                         }
2285                         if ((ret < 0) && (maxidle < ret))
2286                                 maxidle = ret;
2287                 }       
2288         }
2289         rq->idle_tokens = -maxidle;
2290
2291 pick_next:
2292 #endif
2293         if (unlikely(!rq->nr_running)) {
2294                 idle_balance(cpu, rq);
2295                 if (!rq->nr_running) {
2296                         next = rq->idle;
2297                         rq->expired_timestamp = 0;
2298                         wake_sleeping_dependent(cpu, rq);
2299                         goto switch_tasks;
2300                 }
2301         }
2302
2303         array = rq->active;
2304         if (unlikely(!array->nr_active)) {
2305                 /*
2306                  * Switch the active and expired arrays.
2307                  */
2308                 rq->active = rq->expired;
2309                 rq->expired = array;
2310                 array = rq->active;
2311                 rq->expired_timestamp = 0;
2312                 rq->best_expired_prio = MAX_PRIO;
2313         }
2314
2315         idx = sched_find_first_bit(array->bitmap);
2316         queue = array->queue + idx;
2317         next = list_entry(queue->next, task_t, run_list);
2318
2319         if (dependent_sleeper(cpu, rq, next)) {
2320                 next = rq->idle;
2321                 goto switch_tasks;
2322         }
2323
2324 #ifdef  CONFIG_VSERVER_HARDCPU          
2325         vxi = next->vx_info;
2326         if (vxi && __vx_flags(vxi->vx_flags,
2327                 VXF_SCHED_PAUSE|VXF_SCHED_HARD, 0)) {
2328                 int ret = vx_tokens_recalc(vxi);
2329
2330                 if (unlikely(ret <= 0)) {
2331                         if (ret && (rq->idle_tokens > -ret))
2332                                 rq->idle_tokens = -ret;
2333                         deactivate_task(next, rq);
2334                         list_add_tail(&next->run_list, &rq->hold_queue);
2335                         next->state |= TASK_ONHOLD;                     
2336                         goto pick_next;
2337                 }
2338         }
2339 #endif
2340
2341         if (!rt_task(next) && next->activated > 0) {
2342                 unsigned long long delta = now - next->timestamp;
2343
2344                 if (next->activated == 1)
2345                         delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
2346
2347                 array = next->array;
2348                 dequeue_task(next, array);
2349                 recalc_task_prio(next, next->timestamp + delta);
2350                 enqueue_task(next, array);
2351         }
2352         next->activated = 0;
2353 switch_tasks:
2354         prefetch(next);
2355         clear_tsk_need_resched(prev);
2356         RCU_qsctr(task_cpu(prev))++;
2357
2358         prev->sleep_avg -= run_time;
2359         if ((long)prev->sleep_avg <= 0) {
2360                 prev->sleep_avg = 0;
2361                 if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev)))
2362                         prev->interactive_credit--;
2363         }
2364         add_delay_ts(prev,runcpu_total,prev->timestamp,now);
2365         prev->timestamp = now;
2366
2367         if (likely(prev != next)) {
2368                 add_delay_ts(next,waitcpu_total,next->timestamp,now);
2369                 inc_delay(next,runs);
2370                 next->timestamp = now;
2371                 rq->nr_switches++;
2372                 rq->curr = next;
2373                 ++*switch_count;
2374
2375                 prepare_arch_switch(rq, next);
2376                 prev = context_switch(rq, prev, next);
2377                 barrier();
2378
2379                 finish_task_switch(prev);
2380         } else
2381                 spin_unlock_irq(&rq->lock);
2382
2383         reacquire_kernel_lock(current);
2384         preempt_enable_no_resched();
2385         if (test_thread_flag(TIF_NEED_RESCHED))
2386                 goto need_resched;
2387 }
2388
2389 EXPORT_SYMBOL(schedule);
2390
2391 #ifdef CONFIG_PREEMPT
2392 /*
2393  * this is the entry point to schedule() from in-kernel preemption
2394  * off of preempt_enable. Kernel preemption on return from interrupt is
2395  * handled by the low-level entry code, which calls schedule() directly.
2396  */
2397 asmlinkage void __sched preempt_schedule(void)
2398 {
2399         struct thread_info *ti = current_thread_info();
2400
2401         /*
2402          * If there is a non-zero preempt_count or interrupts are disabled,
2403          * we do not want to preempt the current task.  Just return..
2404          */
2405         if (unlikely(ti->preempt_count || irqs_disabled()))
2406                 return;
2407
2408 need_resched:
2409         ti->preempt_count = PREEMPT_ACTIVE;
2410         schedule();
2411         ti->preempt_count = 0;
2412
2413         /* we could miss a preemption opportunity between schedule and now */
2414         barrier();
2415         if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
2416                 goto need_resched;
2417 }
2418
2419 EXPORT_SYMBOL(preempt_schedule);
2420 #endif /* CONFIG_PREEMPT */
2421
2422 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
2423 {
2424         task_t *p = curr->task;
2425         return try_to_wake_up(p, mode, sync);
2426 }
2427
2428 EXPORT_SYMBOL(default_wake_function);
2429
2430 /*
2431  * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
2432  * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small +ve
2433  * number) then we wake all the non-exclusive tasks and one exclusive task.
2434  *
2435  * There are circumstances in which we can try to wake a task which has already
2436  * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
2437  * zero in this (rare) case, and we handle it by continuing to scan the queue.
2438  */
2439 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
2440                              int nr_exclusive, int sync, void *key)
2441 {
2442         struct list_head *tmp, *next;
2443
2444         list_for_each_safe(tmp, next, &q->task_list) {
2445                 wait_queue_t *curr;
2446                 unsigned flags;
2447                 curr = list_entry(tmp, wait_queue_t, task_list);
2448                 flags = curr->flags;
2449                 if (curr->func(curr, mode, sync, key) &&
2450                     (flags & WQ_FLAG_EXCLUSIVE) &&
2451                     !--nr_exclusive)
2452                         break;
2453         }
2454 }
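/*
 * Example of the nr_exclusive accounting above: with two non-exclusive
 * waiters at the head of the queue and several exclusive waiters (which
 * are normally added at the tail) behind them, a call with
 * nr_exclusive == 1 wakes both non-exclusive waiters plus exactly one
 * exclusive waiter; with nr_exclusive == 0 the "!--nr_exclusive" test
 * never fires, so every waiter is woken.
 */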
2455
2456 /**
2457  * __wake_up - wake up threads blocked on a waitqueue.
2458  * @q: the waitqueue
2459  * @mode: which threads
2460  * @nr_exclusive: how many wake-one or wake-many threads to wake up
2461  */
2462 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
2463                                 int nr_exclusive, void *key)
2464 {
2465         unsigned long flags;
2466
2467         spin_lock_irqsave(&q->lock, flags);
2468         __wake_up_common(q, mode, nr_exclusive, 0, key);
2469         spin_unlock_irqrestore(&q->lock, flags);
2470 }
2471
2472 EXPORT_SYMBOL(__wake_up);
2473
2474 /*
2475  * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
2476  */
2477 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
2478 {
2479         __wake_up_common(q, mode, 1, 0, NULL);
2480 }
2481
2482 /**
2483  * __wake_up_sync - wake up threads blocked on a waitqueue.
2484  * @q: the waitqueue
2485  * @mode: which threads
2486  * @nr_exclusive: how many wake-one or wake-many threads to wake up
2487  *
2488  * The sync wakeup differs in that the waker knows that it will schedule
2489  * away soon, so while the target thread will be woken up, it will not
2490  * be migrated to another CPU - ie. the two threads are 'synchronized'
2491  * with each other. This can prevent needless bouncing between CPUs.
2492  *
2493  * On UP it can prevent extra preemption.
2494  */
2495 void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
2496 {
2497         unsigned long flags;
2498         int sync = 1;
2499
2500         if (unlikely(!q))
2501                 return;
2502
2503         if (unlikely(!nr_exclusive))
2504                 sync = 0;
2505
2506         spin_lock_irqsave(&q->lock, flags);
2507         __wake_up_common(q, mode, nr_exclusive, sync, NULL);
2508         spin_unlock_irqrestore(&q->lock, flags);
2509 }
2510 EXPORT_SYMBOL_GPL(__wake_up_sync);      /* For internal use only */
2511
2512 void fastcall complete(struct completion *x)
2513 {
2514         unsigned long flags;
2515
2516         spin_lock_irqsave(&x->wait.lock, flags);
2517         x->done++;
2518         __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
2519                          1, 0, NULL);
2520         spin_unlock_irqrestore(&x->wait.lock, flags);
2521 }
2522 EXPORT_SYMBOL(complete);
2523
2524 void fastcall complete_all(struct completion *x)
2525 {
2526         unsigned long flags;
2527
2528         spin_lock_irqsave(&x->wait.lock, flags);
2529         x->done += UINT_MAX/2;
2530         __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
2531                          0, 0, NULL);
2532         spin_unlock_irqrestore(&x->wait.lock, flags);
2533 }
2534 EXPORT_SYMBOL(complete_all);
2535
2536 void fastcall __sched wait_for_completion(struct completion *x)
2537 {
2538         might_sleep();
2539         spin_lock_irq(&x->wait.lock);
2540         if (!x->done) {
2541                 DECLARE_WAITQUEUE(wait, current);
2542
2543                 wait.flags |= WQ_FLAG_EXCLUSIVE;
2544                 __add_wait_queue_tail(&x->wait, &wait);
2545                 do {
2546                         __set_current_state(TASK_UNINTERRUPTIBLE);
2547                         spin_unlock_irq(&x->wait.lock);
2548                         schedule();
2549                         spin_lock_irq(&x->wait.lock);
2550                 } while (!x->done);
2551                 __remove_wait_queue(&x->wait, &wait);
2552         }
2553         x->done--;
2554         spin_unlock_irq(&x->wait.lock);
2555 }
2556 EXPORT_SYMBOL(wait_for_completion);
2557
2558 #define SLEEP_ON_VAR                                    \
2559         unsigned long flags;                            \
2560         wait_queue_t wait;                              \
2561         init_waitqueue_entry(&wait, current);
2562
2563 #define SLEEP_ON_HEAD                                   \
2564         spin_lock_irqsave(&q->lock,flags);              \
2565         __add_wait_queue(q, &wait);                     \
2566         spin_unlock(&q->lock);
2567
2568 #define SLEEP_ON_TAIL                                   \
2569         spin_lock_irq(&q->lock);                        \
2570         __remove_wait_queue(q, &wait);                  \
2571         spin_unlock_irqrestore(&q->lock, flags);
2572
2573 #define SLEEP_ON_BKLCHECK                               \
2574         if (unlikely(!kernel_locked()) &&               \
2575             sleep_on_bkl_warnings < 10) {               \
2576                 sleep_on_bkl_warnings++;                \
2577                 WARN_ON(1);                             \
2578         }
2579
2580 static int sleep_on_bkl_warnings;
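/*
 * The BKL check above exists because the sleep_on() family is inherently
 * racy: the wakeup condition can change between the caller's test and the
 * __add_wait_queue() done by SLEEP_ON_HEAD.  Historically these interfaces
 * were only safe when serialised by the big kernel lock, hence the
 * WARN_ON() when a caller shows up without kernel_locked().
 */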
2581
2582 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
2583 {
2584         SLEEP_ON_VAR
2585
2586         SLEEP_ON_BKLCHECK
2587
2588         current->state = TASK_INTERRUPTIBLE;
2589
2590         SLEEP_ON_HEAD
2591         schedule();
2592         SLEEP_ON_TAIL
2593 }
2594
2595 EXPORT_SYMBOL(interruptible_sleep_on);
2596
2597 long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
2598 {
2599         SLEEP_ON_VAR
2600
2601         SLEEP_ON_BKLCHECK
2602
2603         current->state = TASK_INTERRUPTIBLE;
2604
2605         SLEEP_ON_HEAD
2606         timeout = schedule_timeout(timeout);
2607         SLEEP_ON_TAIL
2608
2609         return timeout;
2610 }
2611
2612 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
2613
2614 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
2615 {
2616         SLEEP_ON_VAR
2617
2618         SLEEP_ON_BKLCHECK
2619
2620         current->state = TASK_UNINTERRUPTIBLE;
2621
2622         SLEEP_ON_HEAD
2623         timeout = schedule_timeout(timeout);
2624         SLEEP_ON_TAIL
2625
2626         return timeout;
2627 }
2628
2629 EXPORT_SYMBOL(sleep_on_timeout);
2630
2631 void set_user_nice(task_t *p, long nice)
2632 {
2633         unsigned long flags;
2634         prio_array_t *array;
2635         runqueue_t *rq;
2636         int old_prio, new_prio, delta;
2637
2638         if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
2639                 return;
2640         /*
2641          * We have to be careful, if called from sys_setpriority(),
2642          * the task might be in the middle of scheduling on another CPU.
2643          */
2644         rq = task_rq_lock(p, &flags);
2645         /*
2646          * The RT priorities are set via setscheduler(), but we still
2647  * allow the 'normal' nice value to be set - but as expected
2648  * it won't have any effect on scheduling until the task's policy
2649  * is set back to SCHED_NORMAL:
2650          */
2651         if (rt_task(p)) {
2652                 p->static_prio = NICE_TO_PRIO(nice);
2653                 goto out_unlock;
2654         }
2655         array = p->array;
2656         if (array)
2657                 dequeue_task(p, array);
2658
2659         old_prio = p->prio;
2660         new_prio = NICE_TO_PRIO(nice);
2661         delta = new_prio - old_prio;
2662         p->static_prio = NICE_TO_PRIO(nice);
2663         p->prio += delta;
2664
2665         if (array) {
2666                 enqueue_task(p, array);
2667                 /*
2668                  * If the task increased its priority or is running and
2669                  * lowered its priority, then reschedule its CPU:
2670                  */
2671                 if (delta < 0 || (delta > 0 && task_running(rq, p)))
2672                         resched_task(rq->curr);
2673         }
2674 out_unlock:
2675         task_rq_unlock(rq, &flags);
2676 }
2677
2678 EXPORT_SYMBOL(set_user_nice);
2679
2680 #ifdef __ARCH_WANT_SYS_NICE
2681
2682 /*
2683  * sys_nice - change the priority of the current process.
2684  * @increment: priority increment
2685  *
2686  * sys_setpriority is a more generic, but much slower function that
2687  * does similar things.
2688  */
2689 asmlinkage long sys_nice(int increment)
2690 {
2691         int retval;
2692         long nice;
2693
2694         /*
2695          * Setpriority might change our priority at the same moment.
2696          * We don't have to worry. Conceptually one call occurs first
2697          * and we have a single winner.
2698          */
2699         if (increment < 0) {
2700                 if (!capable(CAP_SYS_NICE))
2701                         return -EPERM;
2702                 if (increment < -40)
2703                         increment = -40;
2704         }
2705         if (increment > 40)
2706                 increment = 40;
2707
2708         nice = PRIO_TO_NICE(current->static_prio) + increment;
2709         if (nice < -20)
2710                 nice = -20;
2711         if (nice > 19)
2712                 nice = 19;
2713
2714         retval = security_task_setnice(current, nice);
2715         if (retval)
2716                 return retval;
2717
2718         set_user_nice(current, nice);
2719         return 0;
2720 }
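/*
 * Worked example of the clamping above: sys_nice(100) called by a nice-0
 * task clamps the increment to 40, computes nice = 0 + 40 = 40 and then
 * clamps that to 19; sys_nice(-5) without CAP_SYS_NICE fails with -EPERM
 * before any of the arithmetic happens.
 */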
2721
2722 #endif
2723
2724 /**
2725  * task_prio - return the priority value of a given task.
2726  * @p: the task in question.
2727  *
2728  * This is the priority value as seen by users in /proc.
2729  * RT tasks are offset by -200. Normal tasks are centered
2730  * around 0, value goes from -16 to +15.
2731  */
2732 int task_prio(const task_t *p)
2733 {
2734         return p->prio - MAX_RT_PRIO;
2735 }
2736
2737 /**
2738  * task_nice - return the nice value of a given task.
2739  * @p: the task in question.
2740  */
2741 int task_nice(const task_t *p)
2742 {
2743         return TASK_NICE(p);
2744 }
2745
2746 EXPORT_SYMBOL(task_nice);
2747
2748 /**
2749  * idle_cpu - is a given cpu idle currently?
2750  * @cpu: the processor in question.
2751  */
2752 int idle_cpu(int cpu)
2753 {
2754         return cpu_curr(cpu) == cpu_rq(cpu)->idle;
2755 }
2756
2757 EXPORT_SYMBOL_GPL(idle_cpu);
2758
2759 /**
2760  * find_process_by_pid - find a process with a matching PID value.
2761  * @pid: the pid in question.
2762  */
2763 static inline task_t *find_process_by_pid(pid_t pid)
2764 {
2765         return pid ? find_task_by_pid(pid) : current;
2766 }
2767
2768 /* Actually do priority change: must hold rq lock. */
2769 static void __setscheduler(struct task_struct *p, int policy, int prio)
2770 {
2771         BUG_ON(p->array);
2772         p->policy = policy;
2773         p->rt_priority = prio;
2774         if (policy != SCHED_NORMAL)
2775                 p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
2776         else
2777                 p->prio = p->static_prio;
2778 }
2779
2780 /*
2781  * setscheduler - change the scheduling policy and/or RT priority of a thread.
2782  */
2783 static int setscheduler(pid_t pid, int policy, struct sched_param __user *param)
2784 {
2785         struct sched_param lp;
2786         int retval = -EINVAL;
2787         int oldprio;
2788         prio_array_t *array;
2789         unsigned long flags;
2790         runqueue_t *rq;
2791         task_t *p;
2792
2793         if (!param || pid < 0)
2794                 goto out_nounlock;
2795
2796         retval = -EFAULT;
2797         if (copy_from_user(&lp, param, sizeof(struct sched_param)))
2798                 goto out_nounlock;
2799
2800         /*
2801          * We play safe to avoid deadlocks.
2802          */
2803         read_lock_irq(&tasklist_lock);
2804
2805         p = find_process_by_pid(pid);
2806
2807         retval = -ESRCH;
2808         if (!p)
2809                 goto out_unlock_tasklist;
2810
2811         /*
2812  * To be able to change p->policy safely, the appropriate
2813          * runqueue lock must be held.
2814          */
2815         rq = task_rq_lock(p, &flags);
2816
2817         if (policy < 0)
2818                 policy = p->policy;
2819         else {
2820                 retval = -EINVAL;
2821                 if (policy != SCHED_FIFO && policy != SCHED_RR &&
2822                                 policy != SCHED_NORMAL)
2823                         goto out_unlock;
2824         }
2825
2826         /*
2827          * Valid priorities for SCHED_FIFO and SCHED_RR are
2828          * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
2829          */
2830         retval = -EINVAL;
2831         if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1)
2832                 goto out_unlock;
2833         if ((policy == SCHED_NORMAL) != (lp.sched_priority == 0))
2834                 goto out_unlock;
2835
2836         retval = -EPERM;
2837         if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
2838             !capable(CAP_SYS_NICE))
2839                 goto out_unlock;
2840         if ((current->euid != p->euid) && (current->euid != p->uid) &&
2841             !capable(CAP_SYS_NICE))
2842                 goto out_unlock;
2843
2844         retval = security_task_setscheduler(p, policy, &lp);
2845         if (retval)
2846                 goto out_unlock;
2847
2848         array = p->array;
2849         if (array)
2850                 deactivate_task(p, task_rq(p));
2851         retval = 0;
2852         oldprio = p->prio;
2853         __setscheduler(p, policy, lp.sched_priority);
2854         if (array) {
2855                 __activate_task(p, task_rq(p));
2856                 /*
2857                  * Reschedule if we are currently running on this runqueue and
2858                  * our priority decreased, or if we are not currently running on
2859                  * this runqueue and our priority is higher than the current's
2860                  */
2861                 if (task_running(rq, p)) {
2862                         if (p->prio > oldprio)
2863                                 resched_task(rq->curr);
2864                 } else if (TASK_PREEMPTS_CURR(p, rq))
2865                         resched_task(rq->curr);
2866         }
2867
2868 out_unlock:
2869         task_rq_unlock(rq, &flags);
2870 out_unlock_tasklist:
2871         read_unlock_irq(&tasklist_lock);
2872
2873 out_nounlock:
2874         return retval;
2875 }
2876
2877 /**
2878  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
2879  * @pid: the pid in question.
2880  * @policy: new policy
2881  * @param: structure containing the new RT priority.
2882  */
2883 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
2884                                        struct sched_param __user *param)
2885 {
2886         return setscheduler(pid, policy, param);
2887 }
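/*
 * Sketch of how userspace typically reaches this syscall (via the glibc
 * wrapper; illustrative only, error handling kept minimal):
 *
 *	struct sched_param param = { .sched_priority = 50 };
 *
 *	if (sched_setscheduler(0, SCHED_FIFO, &param) < 0)
 *		perror("sched_setscheduler");
 *
 * A pid of 0 means the calling process (see find_process_by_pid());
 * SCHED_FIFO/SCHED_RR need CAP_SYS_NICE and a priority in
 * 1..MAX_USER_RT_PRIO-1, while SCHED_NORMAL requires priority 0, per the
 * checks in setscheduler() above.
 */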
2888
2889 /**
2890  * sys_sched_setparam - set/change the RT priority of a thread
2891  * @pid: the pid in question.
2892  * @param: structure containing the new RT priority.
2893  */
2894 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
2895 {
2896         return setscheduler(pid, -1, param);
2897 }
2898
2899 /**
2900  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
2901  * @pid: the pid in question.
2902  */
2903 asmlinkage long sys_sched_getscheduler(pid_t pid)
2904 {
2905         int retval = -EINVAL;
2906         task_t *p;
2907
2908         if (pid < 0)
2909                 goto out_nounlock;
2910
2911         retval = -ESRCH;
2912         read_lock(&tasklist_lock);
2913         p = find_process_by_pid(pid);
2914         if (p) {
2915                 retval = security_task_getscheduler(p);
2916                 if (!retval)
2917                         retval = p->policy;
2918         }
2919         read_unlock(&tasklist_lock);
2920
2921 out_nounlock:
2922         return retval;
2923 }
2924
2925 /**
2926  * sys_sched_getparam - get the RT priority of a thread
2927  * @pid: the pid in question.
2928  * @param: structure containing the RT priority.
2929  */
2930 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
2931 {
2932         struct sched_param lp;
2933         int retval = -EINVAL;
2934         task_t *p;
2935
2936         if (!param || pid < 0)
2937                 goto out_nounlock;
2938
2939         read_lock(&tasklist_lock);
2940         p = find_process_by_pid(pid);
2941         retval = -ESRCH;
2942         if (!p)
2943                 goto out_unlock;
2944
2945         retval = security_task_getscheduler(p);
2946         if (retval)
2947                 goto out_unlock;
2948
2949         lp.sched_priority = p->rt_priority;
2950         read_unlock(&tasklist_lock);
2951
2952         /*
2953          * This one might sleep, so we cannot do it with a spinlock held ...
2954          */
2955         retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
2956
2957 out_nounlock:
2958         return retval;
2959
2960 out_unlock:
2961         read_unlock(&tasklist_lock);
2962         return retval;
2963 }
2964
2965 /**
2966  * sys_sched_setaffinity - set the cpu affinity of a process
2967  * @pid: pid of the process
2968  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
2969  * @user_mask_ptr: user-space pointer to the new cpu mask
2970  */
2971 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
2972                                       unsigned long __user *user_mask_ptr)
2973 {
2974         cpumask_t new_mask;
2975         int retval;
2976         task_t *p;
2977
2978         if (len < sizeof(new_mask))
2979                 return -EINVAL;
2980
2981         if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask)))
2982                 return -EFAULT;
2983
2984         lock_cpu_hotplug();
2985         read_lock(&tasklist_lock);
2986
2987         p = find_process_by_pid(pid);
2988         if (!p) {
2989                 read_unlock(&tasklist_lock);
2990                 unlock_cpu_hotplug();
2991                 return -ESRCH;
2992         }
2993
2994         /*
2995          * It is not safe to call set_cpus_allowed with the
2996          * tasklist_lock held.  We will bump the task_struct's
2997          * usage count and then drop tasklist_lock.
2998          */
2999         get_task_struct(p);
3000         read_unlock(&tasklist_lock);
3001
3002         retval = -EPERM;
3003         if ((current->euid != p->euid) && (current->euid != p->uid) &&
3004                         !capable(CAP_SYS_NICE))
3005                 goto out_unlock;
3006
3007         retval = set_cpus_allowed(p, new_mask);
3008
3009 out_unlock:
3010         put_task_struct(p);
3011         unlock_cpu_hotplug();
3012         return retval;
3013 }
3014
3015 /**
3016  * sys_sched_getaffinity - get the cpu affinity of a process
3017  * @pid: pid of the process
3018  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3019  * @user_mask_ptr: user-space pointer to hold the current cpu mask
3020  */
3021 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
3022                                       unsigned long __user *user_mask_ptr)
3023 {
3024         unsigned int real_len;
3025         cpumask_t mask;
3026         int retval;
3027         task_t *p;
3028
3029         real_len = sizeof(mask);
3030         if (len < real_len)
3031                 return -EINVAL;
3032
3033         lock_cpu_hotplug();
3034         read_lock(&tasklist_lock);
3035
3036         retval = -ESRCH;
3037         p = find_process_by_pid(pid);
3038         if (!p)
3039                 goto out_unlock;
3040
3041         retval = 0;
3042         cpus_and(mask, p->cpus_allowed, cpu_possible_map);
3043
3044 out_unlock:
3045         read_unlock(&tasklist_lock);
3046         unlock_cpu_hotplug();
3047         if (retval)
3048                 return retval;
3049         if (copy_to_user(user_mask_ptr, &mask, real_len))
3050                 return -EFAULT;
3051         return real_len;
3052 }
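/*
 * Note on the return value: on success this syscall returns the number of
 * bytes copied (sizeof(cpumask_t)), not 0, and callers must pass a buffer
 * of at least that size.  Userspace normally goes through the glibc
 * sched_getaffinity() wrapper with a cpu_set_t, e.g. (illustrative
 * sketch):
 *
 *	cpu_set_t set;
 *
 *	if (sched_getaffinity(0, sizeof(set), &set) < 0)
 *		perror("sched_getaffinity");
 */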
3053
3054 /**
3055  * sys_sched_yield - yield the current processor to other threads.
3056  *
3057  * this function yields the current CPU by moving the calling thread
3058  * to the expired array. If there are no other threads running on this
3059  * CPU then this function will return.
3060  */
3061 asmlinkage long sys_sched_yield(void)
3062 {
3063         runqueue_t *rq = this_rq_lock();
3064         prio_array_t *array = current->array;
3065         prio_array_t *target = rq->expired;
3066
3067         /*
3068          * We implement yielding by moving the task into the expired
3069          * queue.
3070          *
3071          * (special rule: RT tasks will just roundrobin in the active
3072          *  array.)
3073          */
3074         if (unlikely(rt_task(current)))
3075                 target = rq->active;
3076
3077         dequeue_task(current, array);
3078         enqueue_task(current, target);
3079
3080         /*
3081          * Since we are going to call schedule() anyway, there's
3082          * no need to preempt or enable interrupts:
3083          */
3084         _raw_spin_unlock(&rq->lock);
3085         preempt_enable_no_resched();
3086
3087         schedule();
3088
3089         return 0;
3090 }
3091
3092 void __sched __cond_resched(void)
3093 {
3094         set_current_state(TASK_RUNNING);
3095         schedule();
3096 }
3097
3098 EXPORT_SYMBOL(__cond_resched);
3099
3100 /**
3101  * yield - yield the current processor to other threads.
3102  *
3103  * this is a shortcut for kernel-space yielding - it marks the
3104  * thread runnable and calls sys_sched_yield().
3105  */
3106 void __sched yield(void)
3107 {
3108         set_current_state(TASK_RUNNING);
3109         sys_sched_yield();
3110 }
3111
3112 EXPORT_SYMBOL(yield);
3113
3114 /*
3115  * This task is about to go to sleep on IO.  Increment rq->nr_iowait so
3116  * that process accounting knows that this is a task in IO wait state.
3117  *
3118  * But don't do that if it is a deliberate, throttling IO wait (this task
3119  * has set its backing_dev_info: the queue against which it should throttle)
3120  */
3121 void __sched io_schedule(void)
3122 {
3123         struct runqueue *rq = this_rq();
3124         def_delay_var(dstart);
3125
3126         start_delay_set(dstart,PF_IOWAIT);
3127         atomic_inc(&rq->nr_iowait);
3128         schedule();
3129         atomic_dec(&rq->nr_iowait);
3130         add_io_delay(dstart);
3131 }
3132
3133 EXPORT_SYMBOL(io_schedule);
3134
3135 long __sched io_schedule_timeout(long timeout)
3136 {
3137         struct runqueue *rq = this_rq();
3138         long ret;
3139         def_delay_var(dstart);
3140
3141         start_delay_set(dstart,PF_IOWAIT);
3142         atomic_inc(&rq->nr_iowait);
3143         ret = schedule_timeout(timeout);
3144         atomic_dec(&rq->nr_iowait);
3145         add_io_delay(dstart);
3146         return ret;
3147 }
3148
3149 /**
3150  * sys_sched_get_priority_max - return maximum RT priority.
3151  * @policy: scheduling class.
3152  *
3153  * this syscall returns the maximum rt_priority that can be used
3154  * by a given scheduling class.
3155  */
3156 asmlinkage long sys_sched_get_priority_max(int policy)
3157 {
3158         int ret = -EINVAL;
3159
3160         switch (policy) {
3161         case SCHED_FIFO:
3162         case SCHED_RR:
3163                 ret = MAX_USER_RT_PRIO-1;
3164                 break;
3165         case SCHED_NORMAL:
3166                 ret = 0;
3167                 break;
3168         }
3169         return ret;
3170 }
3171
3172 /**
3173  * sys_sched_get_priority_min - return minimum RT priority.
3174  * @policy: scheduling class.
3175  *
3176  * this syscall returns the minimum rt_priority that can be used
3177  * by a given scheduling class.
3178  */
3179 asmlinkage long sys_sched_get_priority_min(int policy)
3180 {
3181         int ret = -EINVAL;
3182
3183         switch (policy) {
3184         case SCHED_FIFO:
3185         case SCHED_RR:
3186                 ret = 1;
3187                 break;
3188         case SCHED_NORMAL:
3189                 ret = 0;
3190         }
3191         return ret;
3192 }
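
/*
 * Illustrative sketch (user-space, not part of this file): the usual
 * way to pick a valid real-time priority is to query the range first:
 *
 *	struct sched_param sp;
 *
 *	sp.sched_priority = sched_get_priority_max(SCHED_FIFO) - 1;
 *	if (sched_setscheduler(0, SCHED_FIFO, &sp) < 0)
 *		perror("sched_setscheduler");
 */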
3193
3194 /**
3195  * sys_sched_rr_get_interval - return the default timeslice of a process.
3196  * @pid: pid of the process.
3197  * @interval: userspace pointer to the timeslice value.
3198  *
3199  * this syscall writes the default timeslice value of a given process
3200  * into the user-space timespec buffer. A value of '0' means infinity.
3201  */
3202 asmlinkage
3203 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
3204 {
3205         int retval = -EINVAL;
3206         struct timespec t;
3207         task_t *p;
3208
3209         if (pid < 0)
3210                 goto out_nounlock;
3211
3212         retval = -ESRCH;
3213         read_lock(&tasklist_lock);
3214         p = find_process_by_pid(pid);
3215         if (!p)
3216                 goto out_unlock;
3217
3218         retval = security_task_getscheduler(p);
3219         if (retval)
3220                 goto out_unlock;
3221
3222         jiffies_to_timespec(p->policy == SCHED_FIFO ?
3223                                 0 : task_timeslice(p), &t);
3224         read_unlock(&tasklist_lock);
3225         retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
3226 out_nounlock:
3227         return retval;
3228 out_unlock:
3229         read_unlock(&tasklist_lock);
3230         return retval;
3231 }
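
/*
 * Illustrative sketch (user-space, not part of this file): querying
 * the calling task's round-robin interval; a result of 0 means an
 * infinite timeslice (SCHED_FIFO):
 *
 *	struct timespec ts;
 *
 *	if (sched_rr_get_interval(0, &ts) == 0)
 *		printf("timeslice: %ld.%09ld s\n",
 *			(long)ts.tv_sec, ts.tv_nsec);
 */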
3232
3233 static inline struct task_struct *eldest_child(struct task_struct *p)
3234 {
3235         if (list_empty(&p->children)) return NULL;
3236         return list_entry(p->children.next, struct task_struct, sibling);
3237 }
3238
3239 static inline struct task_struct *older_sibling(struct task_struct *p)
3240 {
3241         if (p->sibling.prev==&p->parent->children) return NULL;
3242         return list_entry(p->sibling.prev, struct task_struct, sibling);
3243 }
3244
3245 static inline struct task_struct *younger_sibling(struct task_struct *p)
3246 {
3247         if (p->sibling.next==&p->parent->children) return NULL;
3248         return list_entry(p->sibling.next, struct task_struct, sibling);
3249 }
3250
3251 static void show_task(task_t * p)
3252 {
3253         task_t *relative;
3254         unsigned state;
3255         unsigned long free = 0;
3256         static const char *stat_nam[] = { "R", "S", "D", "T", "Z", "W" };
3257
3258         printk("%-13.13s ", p->comm);
3259         state = p->state ? __ffs(p->state) + 1 : 0;
3260         if (state < ARRAY_SIZE(stat_nam))
3261                 printk(stat_nam[state]);
3262         else
3263                 printk("?");
3264 #if (BITS_PER_LONG == 32)
3265         if (state == TASK_RUNNING)
3266                 printk(" running ");
3267         else
3268                 printk(" %08lX ", thread_saved_pc(p));
3269 #else
3270         if (state == TASK_RUNNING)
3271                 printk("  running task   ");
3272         else
3273                 printk(" %016lx ", thread_saved_pc(p));
3274 #endif
3275 #ifdef CONFIG_DEBUG_STACK_USAGE
3276         {
3277                 unsigned long * n = (unsigned long *) (p->thread_info+1);
3278                 while (!*n)
3279                         n++;
3280                 free = (unsigned long) n - (unsigned long)(p->thread_info+1);
3281         }
3282 #endif
3283         printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
3284         if ((relative = eldest_child(p)))
3285                 printk("%5d ", relative->pid);
3286         else
3287                 printk("      ");
3288         if ((relative = younger_sibling(p)))
3289                 printk("%7d", relative->pid);
3290         else
3291                 printk("       ");
3292         if ((relative = older_sibling(p)))
3293                 printk(" %5d", relative->pid);
3294         else
3295                 printk("      ");
3296         if (!p->mm)
3297                 printk(" (L-TLB)\n");
3298         else
3299                 printk(" (NOTLB)\n");
3300
3301         if (state != TASK_RUNNING)
3302                 show_stack(p, NULL);
3303 }
3304
3305 void show_state(void)
3306 {
3307         task_t *g, *p;
3308
3309 #if (BITS_PER_LONG == 32)
3310         printk("\n"
3311                "                                               sibling\n");
3312         printk("  task             PC      pid father child younger older\n");
3313 #else
3314         printk("\n"
3315                "                                                       sibling\n");
3316         printk("  task                 PC          pid father child younger older\n");
3317 #endif
3318         read_lock(&tasklist_lock);
3319         do_each_thread(g, p) {
3320                 /*
3321                  * reset the NMI-timeout; listing all tasks on a slow
3322                  * console might take a lot of time:
3323                  */
3324                 touch_nmi_watchdog();
3325                 show_task(p);
3326         } while_each_thread(g, p);
3327
3328         read_unlock(&tasklist_lock);
3329 }
3330
3331 void __devinit init_idle(task_t *idle, int cpu)
3332 {
3333         runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle));
3334         unsigned long flags;
3335
3336         local_irq_save(flags);
3337         double_rq_lock(idle_rq, rq);
3338
3339         idle_rq->curr = idle_rq->idle = idle;
3340         deactivate_task(idle, rq);
3341         idle->array = NULL;
3342         idle->prio = MAX_PRIO;
3343         idle->state = TASK_RUNNING;
3344         set_task_cpu(idle, cpu);
3345         double_rq_unlock(idle_rq, rq);
3346         set_tsk_need_resched(idle);
3347         local_irq_restore(flags);
3348
3349         /* Set the preempt count _outside_ the spinlocks! */
3350 #ifdef CONFIG_PREEMPT
3351         idle->thread_info->preempt_count = (idle->lock_depth >= 0);
3352 #else
3353         idle->thread_info->preempt_count = 0;
3354 #endif
3355 }
3356
3357 /*
3358  * In a system that switches off the HZ timer, nohz_cpu_mask
3359  * indicates which cpus entered this state. This is used
3360  * in the RCU update code to wait only for active cpus. For systems
3361  * that do not switch off the HZ timer, nohz_cpu_mask should
3362  * always be CPU_MASK_NONE.
3363  */
3364 cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
3365
3366 #ifdef CONFIG_SMP
3367 /*
3368  * This is how migration works:
3369  *
3370  * 1) we queue a migration_req_t structure in the source CPU's
3371  *    runqueue and wake up that CPU's migration thread.
3372  * 2) we wait on the request's completion => the calling thread blocks.
3373  * 3) migration thread wakes up (implicitly it forces the migrated
3374  *    thread off the CPU)
3375  * 4) it gets the migration request and checks whether the migrated
3376  *    task is still in the wrong runqueue.
3377  * 5) if it's in the wrong runqueue then the migration thread removes
3378  *    it and puts it into the right queue.
3379  * 6) migration thread signals completion of the request.
3380  * 7) we wake up and the migration is done.
3381  */
3382
3383 /*
3384  * Change a given task's CPU affinity. Migrate the thread to a
3385  * proper CPU and schedule it away if the CPU it's executing on
3386  * is removed from the allowed bitmask.
3387  *
3388  * NOTE: the caller must have a valid reference to the task, the
3389  * task must not exit() & deallocate itself prematurely.  The
3390  * call is not atomic; no spinlocks may be held.
3391  */
3392 int set_cpus_allowed(task_t *p, cpumask_t new_mask)
3393 {
3394         unsigned long flags;
3395         int ret = 0;
3396         migration_req_t req;
3397         runqueue_t *rq;
3398
3399         rq = task_rq_lock(p, &flags);
3400         if (any_online_cpu(new_mask) == NR_CPUS) {
3401                 ret = -EINVAL;
3402                 goto out;
3403         }
3404
3405         p->cpus_allowed = new_mask;
3406         /* Can the task run on the task's current CPU? If so, we're done */
3407         if (cpu_isset(task_cpu(p), new_mask))
3408                 goto out;
3409
3410         if (migrate_task(p, any_online_cpu(new_mask), &req)) {
3411                 /* Need help from migration thread: drop lock and wait. */
3412                 task_rq_unlock(rq, &flags);
3413                 wake_up_process(rq->migration_thread);
3414                 wait_for_completion(&req.done);
3415                 return 0;
3416         }
3417 out:
3418         task_rq_unlock(rq, &flags);
3419         return ret;
3420 }
3421
3422 EXPORT_SYMBOL_GPL(set_cpus_allowed);
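
/*
 * Illustrative sketch (not part of the original source): a caller that
 * holds a valid reference to a task can restrict it to a single CPU.
 * Here helper_task is a hypothetical task_struct pointer and CPU 2 an
 * arbitrary choice:
 *
 *	cpumask_t mask = CPU_MASK_NONE;
 *
 *	cpu_set(2, mask);
 *	if (set_cpus_allowed(helper_task, mask) < 0)
 *		printk(KERN_WARNING "could not pin helper thread\n");
 */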
3423
3424 /*
3425  * Move a (non-current) task off this cpu, onto the dest cpu.  We do this
3426  * either because the task can no longer run here (it has been
3427  * set_cpus_allowed() away from this CPU, or this CPU is going down),
3428  * or because we're attempting to rebalance it on exec (sched_balance_exec).
3429  *
3430  * So we race with normal scheduler movements, but that's OK, as long
3431  * as the task is no longer on this CPU.
3432  */
3433 static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
3434 {
3435         runqueue_t *rq_dest, *rq_src;
3436
3437         if (unlikely(cpu_is_offline(dest_cpu)))
3438                 return;
3439
3440         rq_src  = cpu_rq(src_cpu);
3441         rq_dest = cpu_rq(dest_cpu);
3442
3443         double_rq_lock(rq_src, rq_dest);
3444         /* Already moved. */
3445         if (task_cpu(p) != src_cpu)
3446                 goto out;
3447         /* Affinity changed (again). */
3448         if (!cpu_isset(dest_cpu, p->cpus_allowed))
3449                 goto out;
3450
3451         set_task_cpu(p, dest_cpu);
3452         if (p->array) {
3453                 /*
3454                  * Sync timestamp with rq_dest's before activating.
3455                  * The same thing could be achieved by doing this step
3456                  * afterwards, and pretending it was a local activate.
3457                  * This way is cleaner and logically correct.
3458                  */
3459                 p->timestamp = p->timestamp - rq_src->timestamp_last_tick
3460                                 + rq_dest->timestamp_last_tick;
3461                 deactivate_task(p, rq_src);
3462                 activate_task(p, rq_dest, 0);
3463                 if (TASK_PREEMPTS_CURR(p, rq_dest))
3464                         resched_task(rq_dest->curr);
3465         }
3466
3467 out:
3468         double_rq_unlock(rq_src, rq_dest);
3469 }
3470
3471 /*
3472  * migration_thread - a high-priority system thread that performs
3473  * thread migration by bumping the thread off its CPU and then
3474  * 'pushing' it onto another runqueue.
3475  */
3476 static int migration_thread(void * data)
3477 {
3478         runqueue_t *rq;
3479         int cpu = (long)data;
3480
3481         rq = cpu_rq(cpu);
3482         BUG_ON(rq->migration_thread != current);
3483
3484         set_current_state(TASK_INTERRUPTIBLE);
3485         while (!kthread_should_stop()) {
3486                 struct list_head *head;
3487                 migration_req_t *req;
3488
3489                 if (current->flags & PF_FREEZE)
3490                         refrigerator(PF_FREEZE);
3491
3492                 spin_lock_irq(&rq->lock);
3493
3494                 if (cpu_is_offline(cpu)) {
3495                         spin_unlock_irq(&rq->lock);
3496                         goto wait_to_die;
3497                 }
3498
3499                 if (rq->active_balance) {
3500                         active_load_balance(rq, cpu);
3501                         rq->active_balance = 0;
3502                 }
3503
3504                 head = &rq->migration_queue;
3505
3506                 if (list_empty(head)) {
3507                         spin_unlock_irq(&rq->lock);
3508                         schedule();
3509                         set_current_state(TASK_INTERRUPTIBLE);
3510                         continue;
3511                 }
3512                 req = list_entry(head->next, migration_req_t, list);
3513                 list_del_init(head->next);
3514
3515                 if (req->type == REQ_MOVE_TASK) {
3516                         spin_unlock(&rq->lock);
3517                         __migrate_task(req->task, smp_processor_id(),
3518                                         req->dest_cpu);
3519                         local_irq_enable();
3520                 } else if (req->type == REQ_SET_DOMAIN) {
3521                         rq->sd = req->sd;
3522                         spin_unlock_irq(&rq->lock);
3523                 } else {
3524                         spin_unlock_irq(&rq->lock);
3525                         WARN_ON(1);
3526                 }
3527
3528                 complete(&req->done);
3529         }
3530         __set_current_state(TASK_RUNNING);
3531         return 0;
3532
3533 wait_to_die:
3534         /* Wait for kthread_stop */
3535         set_current_state(TASK_INTERRUPTIBLE);
3536         while (!kthread_should_stop()) {
3537                 schedule();
3538                 set_current_state(TASK_INTERRUPTIBLE);
3539         }
3540         __set_current_state(TASK_RUNNING);
3541         return 0;
3542 }
3543
3544 #ifdef CONFIG_HOTPLUG_CPU
3545 /* migrate_all_tasks - migrate all tasks away from the dead cpu. */
3546 static void migrate_all_tasks(int src_cpu)
3547 {
3548         struct task_struct *tsk, *t;
3549         int dest_cpu;
3550         unsigned int node;
3551
3552         write_lock_irq(&tasklist_lock);
3553
3554         /* Watch out for per-node tasks; try to stay on this node. */
3555         node = cpu_to_node(src_cpu);
3556
3557         do_each_thread(t, tsk) {
3558                 cpumask_t mask;
3559                 if (tsk == current)
3560                         continue;
3561
3562                 if (task_cpu(tsk) != src_cpu)
3563                         continue;
3564
3565                 /* Figure out where this task should go (attempting to
3566                  * keep it on-node), and check if it can be migrated
3567                  * as-is.  NOTE that kernel threads bound to more than
3568                  * one online cpu will be migrated. */
3569                 mask = node_to_cpumask(node);
3570                 cpus_and(mask, mask, tsk->cpus_allowed);
3571                 dest_cpu = any_online_cpu(mask);
3572                 if (dest_cpu == NR_CPUS)
3573                         dest_cpu = any_online_cpu(tsk->cpus_allowed);
3574                 if (dest_cpu == NR_CPUS) {
3575                         cpus_clear(tsk->cpus_allowed);
3576                         cpus_complement(tsk->cpus_allowed);
3577                         dest_cpu = any_online_cpu(tsk->cpus_allowed);
3578
3579                         /* Don't tell them about moving exiting tasks
3580                            or kernel threads (both mm NULL), since
3581                            they never leave the kernel. */
3582                         if (tsk->mm && printk_ratelimit())
3583                                 printk(KERN_INFO "process %d (%s) no "
3584                                        "longer affine to cpu%d\n",
3585                                        tsk->pid, tsk->comm, src_cpu);
3586                 }
3587
3588                 __migrate_task(tsk, src_cpu, dest_cpu);
3589         } while_each_thread(t, tsk);
3590
3591         write_unlock_irq(&tasklist_lock);
3592 }
3593
3594 /* Schedule the idle task to be the next runnable task on the current CPU.
3595  * It does so by boosting the idle task's priority to the highest possible
3596  * and adding it to the _front_ of the runqueue. Used by the CPU offline code.
3597  */
3598 void sched_idle_next(void)
3599 {
3600         int cpu = smp_processor_id();
3601         runqueue_t *rq = this_rq();
3602         struct task_struct *p = rq->idle;
3603         unsigned long flags;
3604
3605         /* cpu has to be offline */
3606         BUG_ON(cpu_online(cpu));
3607
3608         /* Strictly not necessary, since the rest of the CPUs are stopped
3609          * by now and interrupts are disabled on the current cpu.
3610          */
3611         spin_lock_irqsave(&rq->lock, flags);
3612
3613         __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
3614         /* Add the idle task to the _front_ of its priority queue */
3615         __activate_idle_task(p, rq);
3616
3617         spin_unlock_irqrestore(&rq->lock, flags);
3618 }
3619 #endif /* CONFIG_HOTPLUG_CPU */
3620
3621 /*
3622  * migration_call - callback that gets triggered when a CPU is added.
3623  * Here we can start up the necessary migration thread for the new CPU.
3624  */
3625 static int migration_call(struct notifier_block *nfb, unsigned long action,
3626                           void *hcpu)
3627 {
3628         int cpu = (long)hcpu;
3629         struct task_struct *p;
3630         struct runqueue *rq;
3631         unsigned long flags;
3632
3633         switch (action) {
3634         case CPU_UP_PREPARE:
3635                 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
3636                 if (IS_ERR(p))
3637                         return NOTIFY_BAD;
3638                 kthread_bind(p, cpu);
3639                 /* Must be high prio: stop_machine expects to yield to it. */
3640                 rq = task_rq_lock(p, &flags);
3641                 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
3642                 task_rq_unlock(rq, &flags);
3643                 cpu_rq(cpu)->migration_thread = p;
3644                 break;
3645         case CPU_ONLINE:
3646                 /* Strictly unnecessary, as the first user will wake it. */
3647                 wake_up_process(cpu_rq(cpu)->migration_thread);
3648                 break;
3649 #ifdef CONFIG_HOTPLUG_CPU
3650         case CPU_UP_CANCELED:
3651                 /* Unbind it from offline cpu so it can run.  Fall thru. */
3652                 kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
3653                 kthread_stop(cpu_rq(cpu)->migration_thread);
3654                 cpu_rq(cpu)->migration_thread = NULL;
3655                 break;
3656         case CPU_DEAD:
3657                 migrate_all_tasks(cpu);
3658                 rq = cpu_rq(cpu);
3659                 kthread_stop(rq->migration_thread);
3660                 rq->migration_thread = NULL;
3661                 /* Idle task back to normal (off runqueue, low prio) */
3662                 rq = task_rq_lock(rq->idle, &flags);
3663                 deactivate_task(rq->idle, rq);
3664                 rq->idle->static_prio = MAX_PRIO;
3665                 __setscheduler(rq->idle, SCHED_NORMAL, 0);
3666                 task_rq_unlock(rq, &flags);
3667                 BUG_ON(rq->nr_running != 0);
3668
3669                 /* No need to migrate the tasks: it was best-effort if
3670                  * they didn't do lock_cpu_hotplug().  Just wake up
3671                  * the requestors. */
3672                 spin_lock_irq(&rq->lock);
3673                 while (!list_empty(&rq->migration_queue)) {
3674                         migration_req_t *req;
3675                         req = list_entry(rq->migration_queue.next,
3676                                          migration_req_t, list);
3677                         BUG_ON(req->type != REQ_MOVE_TASK);
3678                         list_del_init(&req->list);
3679                         complete(&req->done);
3680                 }
3681                 spin_unlock_irq(&rq->lock);
3682                 break;
3683 #endif
3684         }
3685         return NOTIFY_OK;
3686 }
3687
3688 /* Register at highest priority so that task migration (migrate_all_tasks)
3689  * happens before everything else.
3690  */
3691 static struct notifier_block __devinitdata migration_notifier = {
3692         .notifier_call = migration_call,
3693         .priority = 10
3694 };
3695
3696 int __init migration_init(void)
3697 {
3698         void *cpu = (void *)(long)smp_processor_id();
3699         /* Start one for boot CPU. */
3700         migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
3701         migration_call(&migration_notifier, CPU_ONLINE, cpu);
3702         register_cpu_notifier(&migration_notifier);
3703         return 0;
3704 }
3705 #endif
3706
3707 /*
3708  * The 'big kernel lock'
3709  *
3710  * This spinlock is taken and released recursively by lock_kernel()
3711  * and unlock_kernel().  It is transparently dropped and reacquired
3712  * over schedule().  It is used to protect legacy code that hasn't
3713  * been migrated to a proper locking design yet.
3714  *
3715  * Don't use in new code.
3716  *
3717  * Note: spinlock debugging needs this even on !CONFIG_SMP.
3718  */
3719 spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
3720 EXPORT_SYMBOL(kernel_flag);
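
/*
 * Illustrative sketch (legacy pattern only, not part of the original
 * source): code that still depends on the BKL brackets its critical
 * section with lock_kernel()/unlock_kernel(); the lock is dropped
 * transparently if the task sleeps in schedule().  legacy_ioctl() is
 * a hypothetical BKL-protected function:
 *
 *	lock_kernel();
 *	legacy_ioctl();
 *	unlock_kernel();
 */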
3721
3722 #ifdef CONFIG_SMP
3723 /* Attach the domain 'sd' to 'cpu' as its base domain */
3724 void cpu_attach_domain(struct sched_domain *sd, int cpu)
3725 {
3726         migration_req_t req;
3727         unsigned long flags;
3728         runqueue_t *rq = cpu_rq(cpu);
3729         int local = 1;
3730
3731         lock_cpu_hotplug();
3732
3733         spin_lock_irqsave(&rq->lock, flags);
3734
3735         if (cpu == smp_processor_id() || !cpu_online(cpu)) {
3736                 rq->sd = sd;
3737         } else {
3738                 init_completion(&req.done);
3739                 req.type = REQ_SET_DOMAIN;
3740                 req.sd = sd;
3741                 list_add(&req.list, &rq->migration_queue);
3742                 local = 0;
3743         }
3744
3745         spin_unlock_irqrestore(&rq->lock, flags);
3746
3747         if (!local) {
3748                 wake_up_process(rq->migration_thread);
3749                 wait_for_completion(&req.done);
3750         }
3751
3752         unlock_cpu_hotplug();
3753 }
3754
3755 #ifdef ARCH_HAS_SCHED_DOMAIN
3756 extern void __init arch_init_sched_domains(void);
3757 #else
3758 static struct sched_group sched_group_cpus[NR_CPUS];
3759 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
3760 #ifdef CONFIG_NUMA
3761 static struct sched_group sched_group_nodes[MAX_NUMNODES];
3762 static DEFINE_PER_CPU(struct sched_domain, node_domains);
3763 static void __init arch_init_sched_domains(void)
3764 {
3765         int i;
3766         struct sched_group *first_node = NULL, *last_node = NULL;
3767
3768         /* Set up domains */
3769         for_each_cpu(i) {
3770                 int node = cpu_to_node(i);
3771                 cpumask_t nodemask = node_to_cpumask(node);
3772                 struct sched_domain *node_sd = &per_cpu(node_domains, i);
3773                 struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
3774
3775                 *node_sd = SD_NODE_INIT;
3776                 node_sd->span = cpu_possible_map;
3777                 node_sd->groups = &sched_group_nodes[cpu_to_node(i)];
3778
3779                 *cpu_sd = SD_CPU_INIT;
3780                 cpus_and(cpu_sd->span, nodemask, cpu_possible_map);
3781                 cpu_sd->groups = &sched_group_cpus[i];
3782                 cpu_sd->parent = node_sd;
3783         }
3784
3785         /* Set up groups */
3786         for (i = 0; i < MAX_NUMNODES; i++) {
3787                 cpumask_t tmp = node_to_cpumask(i);
3788                 cpumask_t nodemask;
3789                 struct sched_group *first_cpu = NULL, *last_cpu = NULL;
3790                 struct sched_group *node = &sched_group_nodes[i];
3791                 int j;
3792
3793                 cpus_and(nodemask, tmp, cpu_possible_map);
3794
3795                 if (cpus_empty(nodemask))
3796                         continue;
3797
3798                 node->cpumask = nodemask;
3799                 node->cpu_power = SCHED_LOAD_SCALE * cpus_weight(node->cpumask);
3800
3801                 for_each_cpu_mask(j, node->cpumask) {
3802                         struct sched_group *cpu = &sched_group_cpus[j];
3803
3804                         cpus_clear(cpu->cpumask);
3805                         cpu_set(j, cpu->cpumask);
3806                         cpu->cpu_power = SCHED_LOAD_SCALE;
3807
3808                         if (!first_cpu)
3809                                 first_cpu = cpu;
3810                         if (last_cpu)
3811                                 last_cpu->next = cpu;
3812                         last_cpu = cpu;
3813                 }
3814                 last_cpu->next = first_cpu;
3815
3816                 if (!first_node)
3817                         first_node = node;
3818                 if (last_node)
3819                         last_node->next = node;
3820                 last_node = node;
3821         }
3822         last_node->next = first_node;
3823
3824         mb();
3825         for_each_cpu(i) {
3826                 struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
3827                 cpu_attach_domain(cpu_sd, i);
3828         }
3829 }
3830
3831 #else /* !CONFIG_NUMA */
3832 static void __init arch_init_sched_domains(void)
3833 {
3834         int i;
3835         struct sched_group *first_cpu = NULL, *last_cpu = NULL;
3836
3837         /* Set up domains */
3838         for_each_cpu(i) {
3839                 struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
3840
3841                 *cpu_sd = SD_CPU_INIT;
3842                 cpu_sd->span = cpu_possible_map;
3843                 cpu_sd->groups = &sched_group_cpus[i];
3844         }
3845
3846         /* Set up CPU groups */
3847         for_each_cpu_mask(i, cpu_possible_map) {
3848                 struct sched_group *cpu = &sched_group_cpus[i];
3849
3850                 cpus_clear(cpu->cpumask);
3851                 cpu_set(i, cpu->cpumask);
3852                 cpu->cpu_power = SCHED_LOAD_SCALE;
3853
3854                 if (!first_cpu)
3855                         first_cpu = cpu;
3856                 if (last_cpu)
3857                         last_cpu->next = cpu;
3858                 last_cpu = cpu;
3859         }
3860         last_cpu->next = first_cpu;
3861
3862         mb(); /* domains were modified outside the lock */
3863         for_each_cpu(i) {
3864                 struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
3865                 cpu_attach_domain(cpu_sd, i);
3866         }
3867 }
3868
3869 #endif /* CONFIG_NUMA */
3870 #endif /* ARCH_HAS_SCHED_DOMAIN */
3871
3872 #define SCHED_DOMAIN_DEBUG
3873 #ifdef SCHED_DOMAIN_DEBUG
3874 void sched_domain_debug(void)
3875 {
3876         int i;
3877
3878         for_each_cpu(i) {
3879                 runqueue_t *rq = cpu_rq(i);
3880                 struct sched_domain *sd;
3881                 int level = 0;
3882
3883                 sd = rq->sd;
3884
3885                 printk(KERN_WARNING "CPU%d: %s\n",
3886                                 i, (cpu_online(i) ? "online" : "offline"));
3887
3888                 do {
3889                         int j;
3890                         char str[NR_CPUS];
3891                         struct sched_group *group = sd->groups;
3892                         cpumask_t groupmask, tmp;
3893
3894                         cpumask_scnprintf(str, NR_CPUS, sd->span);
3895                         cpus_clear(groupmask);
3896
3897                         printk(KERN_DEBUG);
3898                         for (j = 0; j < level + 1; j++)
3899                                 printk(" ");
3900                         printk("domain %d: span %s\n", level, str);
3901
3902                         if (!cpu_isset(i, sd->span))
3903                                 printk(KERN_WARNING "ERROR domain->span does not contain CPU%d\n", i);
3904                         if (!cpu_isset(i, group->cpumask))
3905                                 printk(KERN_WARNING "ERROR domain->groups does not contain CPU%d\n", i);
3906                         if (!group->cpu_power)
3907                                 printk(KERN_WARNING "ERROR domain->cpu_power not set\n");
3908
3909                         printk(KERN_WARNING);
3910                         for (j = 0; j < level + 2; j++)
3911                                 printk(" ");
3912                         printk("groups:");
3913                         do {
3914                                 if (!group) {
3915                                         printk(" ERROR: NULL");
3916                                         break;
3917                                 }
3918
3919                                 if (!cpus_weight(group->cpumask))
3920                                         printk(" ERROR empty group:");
3921
3922                                 cpus_and(tmp, groupmask, group->cpumask);
3923                                 if (cpus_weight(tmp) > 0)
3924                                         printk(" ERROR repeated CPUs:");
3925
3926                                 cpus_or(groupmask, groupmask, group->cpumask);
3927
3928                                 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
3929                                 printk(" %s", str);
3930
3931                                 group = group->next;
3932                         } while (group != sd->groups);
3933                         printk("\n");
3934
3935                         if (!cpus_equal(sd->span, groupmask))
3936                                 printk(KERN_DEBUG "ERROR groups don't span domain->span\n");
3937
3938                         level++;
3939                         sd = sd->parent;
3940
3941                         if (sd) {
3942                                 cpus_and(tmp, groupmask, sd->span);
3943                                 if (!cpus_equal(tmp, groupmask))
3944                                         printk(KERN_WARNING "ERROR parent span is not a superset of domain->span\n");
3945                         }
3946
3947                 } while (sd);
3948         }
3949 }
3950 #else
3951 #define sched_domain_debug() {}
3952 #endif
3953
3954 void __init sched_init_smp(void)
3955 {
3956         arch_init_sched_domains();
3957         sched_domain_debug();
3958 }
3959 #else
3960 void __init sched_init_smp(void)
3961 {
3962 }
3963 #endif /* CONFIG_SMP */
3964
3965 int in_sched_functions(unsigned long addr)
3966 {
3967         /* Linker adds these: start and end of __sched functions */
3968         extern char __sched_text_start[], __sched_text_end[];
3969         return addr >= (unsigned long)__sched_text_start
3970                 && addr < (unsigned long)__sched_text_end;
3971 }
3972
3973 void __init sched_init(void)
3974 {
3975         runqueue_t *rq;
3976         int i, j, k;
3977
3978 #ifdef CONFIG_SMP
3979         /* Set up an initial dummy domain for early boot */
3980         static struct sched_domain sched_domain_init;
3981         static struct sched_group sched_group_init;
3982         cpumask_t cpu_mask_all = CPU_MASK_ALL;
3983
3984         memset(&sched_domain_init, 0, sizeof(struct sched_domain));
3985         sched_domain_init.span = cpu_mask_all;
3986         sched_domain_init.groups = &sched_group_init;
3987         sched_domain_init.last_balance = jiffies;
3988         sched_domain_init.balance_interval = INT_MAX; /* Don't balance */
3989
3990         memset(&sched_group_init, 0, sizeof(struct sched_group));
3991         sched_group_init.cpumask = cpu_mask_all;
3992         sched_group_init.next = &sched_group_init;
3993         sched_group_init.cpu_power = SCHED_LOAD_SCALE;
3994 #endif
3995
3996         for (i = 0; i < NR_CPUS; i++) {
3997                 prio_array_t *array;
3998
3999                 rq = cpu_rq(i);
4000                 spin_lock_init(&rq->lock);
4001                 rq->active = rq->arrays;
4002                 rq->expired = rq->arrays + 1;
4003                 rq->best_expired_prio = MAX_PRIO;
4004
4005 #ifdef CONFIG_SMP
4006                 rq->sd = &sched_domain_init;
4007                 rq->cpu_load = 0;
4008                 rq->active_balance = 0;
4009                 rq->push_cpu = 0;
4010                 rq->migration_thread = NULL;
4011                 INIT_LIST_HEAD(&rq->migration_queue);
4012 #endif
4013                 INIT_LIST_HEAD(&rq->hold_queue);
4014                 atomic_set(&rq->nr_iowait, 0);
4015
4016                 for (j = 0; j < 2; j++) {
4017                         array = rq->arrays + j;
4018                         for (k = 0; k < MAX_PRIO; k++) {
4019                                 INIT_LIST_HEAD(array->queue + k);
4020                                 __clear_bit(k, array->bitmap);
4021                         }
4022                         /* delimiter for bitsearch */
4023                         __set_bit(MAX_PRIO, array->bitmap);
4024                 }
4025         }
4026         /*
4027          * We have to do a little magic to get the first
4028          * thread right in SMP mode.
4029          */
4030         rq = this_rq();
4031         rq->curr = current;
4032         rq->idle = current;
4033         set_task_cpu(current, smp_processor_id());
4034         wake_up_forked_process(current);
4035
4036         /*
4037          * The boot idle thread does lazy MMU switching as well:
4038          */
4039         atomic_inc(&init_mm.mm_count);
4040         enter_lazy_tlb(&init_mm, current);
4041 }
4042
4043 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4044 void __might_sleep(char *file, int line)
4045 {
4046 #if defined(in_atomic)
4047         static unsigned long prev_jiffy;        /* ratelimiting */
4048
4049         if ((in_atomic() || irqs_disabled()) &&
4050             system_state == SYSTEM_RUNNING) {
4051                 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
4052                         return;
4053                 prev_jiffy = jiffies;
4054                 printk(KERN_ERR "Debug: sleeping function called from invalid"
4055                                 " context at %s:%d\n", file, line);
4056                 printk("in_atomic():%d, irqs_disabled():%d\n",
4057                         in_atomic(), irqs_disabled());
4058                 dump_stack();
4059         }
4060 #endif
4061 }
4062 EXPORT_SYMBOL(__might_sleep);
4063 #endif
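
/*
 * Illustrative sketch (not part of the original source): functions
 * that may block annotate themselves with might_sleep(), which calls
 * __might_sleep() when CONFIG_DEBUG_SPINLOCK_SLEEP is enabled and so
 * catches callers that are atomic or have interrupts disabled.
 * my_alloc_helper() and its body are hypothetical:
 *
 *	void *my_alloc_helper(size_t size)
 *	{
 *		might_sleep();
 *		return kmalloc(size, GFP_KERNEL);
 *	}
 */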
4064
4065
4066 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
4067 /*
4068  * This could be a long-held lock.  If another CPU holds it for a long time,
4069  * and that CPU is not asked to reschedule then *this* CPU will spin on the
4070  * lock for a long time, even if *this* CPU is asked to reschedule.
4071  *
4072  * So what we do here, in the slow (contended) path is to spin on the lock by
4073  * hand while permitting preemption.
4074  *
4075  * Called inside preempt_disable().
4076  */
4077 void __sched __preempt_spin_lock(spinlock_t *lock)
4078 {
4079         if (preempt_count() > 1) {
4080                 _raw_spin_lock(lock);
4081                 return;
4082         }
4083         do {
4084                 preempt_enable();
4085                 while (spin_is_locked(lock))
4086                         cpu_relax();
4087                 preempt_disable();
4088         } while (!_raw_spin_trylock(lock));
4089 }
4090
4091 EXPORT_SYMBOL(__preempt_spin_lock);
4092
4093 void __sched __preempt_write_lock(rwlock_t *lock)
4094 {
4095         if (preempt_count() > 1) {
4096                 _raw_write_lock(lock);
4097                 return;
4098         }
4099
4100         do {
4101                 preempt_enable();
4102                 while (rwlock_is_locked(lock))
4103                         cpu_relax();
4104                 preempt_disable();
4105         } while (!_raw_write_trylock(lock));
4106 }
4107
4108 EXPORT_SYMBOL(__preempt_write_lock);
4109 #endif /* defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) */
4110
4111 #ifdef CONFIG_DELAY_ACCT
4112 int task_running_sys(struct task_struct *p)
4113 {
4114        return task_running(task_rq(p), p);
4115 }
4116 EXPORT_SYMBOL(task_running_sys);
4117 #endif
4118