/*
 * Kernel scheduler and related syscalls
 *
 * Copyright (C) 1991-2002  Linus Torvalds
 *
 * 1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
 *		make semaphores SMP safe
 * 1998-11-19	Implemented schedule_timeout() and related stuff
 * 2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
 *		hybrid priority-list and round-robin design with
 *		an array-switch method of distributing timeslices
 *		and per-CPU runqueues.  Cleanups and useful suggestions
 *		by Davide Libenzi, preemptible kernel bits by Robert Love.
 * 2003-09-03	Interactivity tuning by Con Kolivas.
 */
21 #include <linux/module.h>
22 #include <linux/nmi.h>
23 #include <linux/init.h>
24 #include <asm/uaccess.h>
25 #include <linux/highmem.h>
26 #include <linux/smp_lock.h>
27 #include <asm/mmu_context.h>
28 #include <linux/interrupt.h>
29 #include <linux/completion.h>
30 #include <linux/kernel_stat.h>
31 #include <linux/security.h>
32 #include <linux/notifier.h>
33 #include <linux/suspend.h>
34 #include <linux/blkdev.h>
35 #include <linux/delay.h>
36 #include <linux/smp.h>
37 #include <linux/timer.h>
38 #include <linux/rcupdate.h>
39 #include <linux/cpu.h>
40 #include <linux/percpu.h>
41 #include <linux/kthread.h>
/*
 * cpu_to_node_mask - cpumask of all CPUs on @cpu's node.
 * On non-NUMA builds every online CPU counts as local.
 * (NOTE(review): the #ifdef/#else guard was dropped by extraction;
 * two unconditional identical-name defines would not compile.)
 */
#ifdef CONFIG_NUMA
#define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
#else
#define cpu_to_node_mask(cpu) (cpu_online_map)
#endif
/*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
 * and back.
 */
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)

/*
 * 'User priority' is the nice value converted to something we
 * can work with better when scaling various scheduler parameters,
 * it's a [ 0 ... 39 ] range.
 */
#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
#define AVG_TIMESLICE	(MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) *\
			(MAX_PRIO-1-NICE_TO_PRIO(0))/(MAX_USER_PRIO - 1)))

/*
 * Some helpers for converting nanosecond timing to jiffy resolution
 */
#define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))

/*
 * These are the 'tuning knobs' of the scheduler:
 *
 * Minimum timeslice is 10 msecs, default timeslice is 100 msecs,
 * maximum timeslice is 200 msecs. Timeslices get refilled after
 * they expire.
 */
#define MIN_TIMESLICE		( 10 * HZ / 1000)
#define MAX_TIMESLICE		(200 * HZ / 1000)
#define ON_RUNQUEUE_WEIGHT	 30
#define CHILD_PENALTY		 95
#define PARENT_PENALTY		100
/* EXIT_WEIGHT restored: sched_exit() below uses it — dropped by extraction */
#define EXIT_WEIGHT		  3
#define PRIO_BONUS_RATIO	 25
#define MAX_BONUS		(MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
#define INTERACTIVE_DELTA	  2
#define MAX_SLEEP_AVG		(AVG_TIMESLICE * MAX_BONUS)
#define STARVATION_LIMIT	(MAX_SLEEP_AVG)
#define NS_MAX_SLEEP_AVG	(JIFFIES_TO_NS(MAX_SLEEP_AVG))
#define NODE_THRESHOLD		125
#define CREDIT_LIMIT		100
98 * If a task is 'interactive' then we reinsert it in the active
99 * array after it has expired its current timeslice. (it will not
100 * continue to run immediately, it will still roundrobin with
101 * other interactive tasks.)
103 * This part scales the interactivity limit depending on niceness.
105 * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
106 * Here are a few examples of different nice levels:
108 * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
109 * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
110 * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
111 * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
112 * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
114 * (the X axis represents the possible -5 ... 0 ... +5 dynamic
115 * priority range a task can explore, a value of '1' means the
116 * task is rated interactive.)
118 * Ie. nice +19 tasks can never get 'interactive' enough to be
119 * reinserted into the active array. And only heavily CPU-hog nice -20
120 * tasks will be expired. Default nice 0 tasks are somewhere between,
121 * it takes some effort for them to get interactive, but it's not
125 #define CURRENT_BONUS(p) \
126 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
130 #define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \
131 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
134 #define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \
135 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
138 #define SCALE(v1,v1_max,v2_max) \
139 (v1) * (v2_max) / (v1_max)
142 (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \
145 #define TASK_INTERACTIVE(p) \
146 ((p)->prio <= (p)->static_prio - DELTA(p))
148 #define INTERACTIVE_SLEEP(p) \
149 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
150 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
152 #define HIGH_CREDIT(p) \
153 ((p)->interactive_credit > CREDIT_LIMIT)
155 #define LOW_CREDIT(p) \
156 ((p)->interactive_credit < -CREDIT_LIMIT)
158 #define TASK_PREEMPTS_CURR(p, rq) \
159 ((p)->prio < (rq)->curr->prio)
162 * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
163 * to time slice values.
165 * The higher a thread's priority, the bigger timeslices
166 * it gets during one round of execution. But even the lowest
167 * priority thread gets MIN_TIMESLICE worth of execution time.
169 * task_timeslice() is the interface that is used by the scheduler.
172 #define BASE_TIMESLICE(p) (MIN_TIMESLICE + \
173 ((MAX_TIMESLICE - MIN_TIMESLICE) * \
174 (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1)))
176 static inline unsigned int task_timeslice(task_t *p)
178 return BASE_TIMESLICE(p);
182 * These are the runqueue data structures:
185 #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
187 typedef struct runqueue runqueue_t;
191 unsigned long bitmap[BITMAP_SIZE];
192 struct list_head queue[MAX_PRIO];
196 * This is the main, per-CPU runqueue data structure.
198 * Locking rule: those places that want to lock multiple runqueues
199 * (such as the load balancing or the thread migration code), lock
200 * acquire operations must be ordered by ascending &runqueue.
204 unsigned long long nr_switches;
205 unsigned long nr_running, expired_timestamp, nr_uninterruptible,
208 struct mm_struct *prev_mm;
209 prio_array_t *active, *expired, arrays[2];
210 int best_expired_prio, prev_cpu_load[NR_CPUS];
212 atomic_t *node_nr_running;
213 int prev_node_load[MAX_NUMNODES];
215 task_t *migration_thread;
216 struct list_head migration_queue;
221 static DEFINE_PER_CPU(struct runqueue, runqueues);
223 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
224 #define this_rq() (&__get_cpu_var(runqueues))
225 #define task_rq(p) cpu_rq(task_cpu(p))
226 #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
228 extern unsigned long __scheduling_functions_start_here;
229 extern unsigned long __scheduling_functions_end_here;
230 const unsigned long scheduling_functions_start_here =
231 (unsigned long)&__scheduling_functions_start_here;
232 const unsigned long scheduling_functions_end_here =
233 (unsigned long)&__scheduling_functions_end_here;
/*
 * Default context-switch locking:
 */
#ifndef prepare_arch_switch
# define prepare_arch_switch(rq, next)	do { } while (0)
# define finish_arch_switch(rq, next)	spin_unlock_irq(&(rq)->lock)
# define task_running(rq, p)		((rq)->curr == (p))
#endif

#ifdef CONFIG_NUMA

/*
 * Keep track of running tasks, per node.
 */

static atomic_t node_nr_running[MAX_NUMNODES] ____cacheline_maxaligned_in_smp =
	{[0 ...MAX_NUMNODES-1] = ATOMIC_INIT(0)};

/* Boot-time default: point at node 0 until node_nr_running_init() runs. */
static inline void nr_running_init(struct runqueue *rq)
{
	rq->node_nr_running = &node_nr_running[0];
}

static inline void nr_running_inc(runqueue_t *rq)
{
	atomic_inc(rq->node_nr_running);
	rq->nr_running++;
}

static inline void nr_running_dec(runqueue_t *rq)
{
	atomic_dec(rq->node_nr_running);
	rq->nr_running--;
}

/* Wire each possible CPU's runqueue to its node's shared counter. */
__init void node_nr_running_init(void)
{
	int i;

	for (i = 0; i < NR_CPUS; i++) {
		if (cpu_possible(i))
			cpu_rq(i)->node_nr_running =
				&node_nr_running[cpu_to_node(i)];
	}
}

#else /* !CONFIG_NUMA */

# define nr_running_init(rq)	do { } while (0)
# define nr_running_inc(rq)	do { (rq)->nr_running++; } while (0)
# define nr_running_dec(rq)	do { (rq)->nr_running--; } while (0)

#endif /* CONFIG_NUMA */
290 * task_rq_lock - lock the runqueue a given task resides on and disable
291 * interrupts. Note the ordering: we can safely lookup the task_rq without
292 * explicitly disabling preemption.
294 static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
299 local_irq_save(*flags);
301 spin_lock(&rq->lock);
302 if (unlikely(rq != task_rq(p))) {
303 spin_unlock_irqrestore(&rq->lock, *flags);
304 goto repeat_lock_task;
309 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
311 spin_unlock_irqrestore(&rq->lock, *flags);
315 * rq_lock - lock a given runqueue and disable interrupts.
317 static inline runqueue_t *this_rq_lock(void)
323 spin_lock(&rq->lock);
328 static inline void rq_unlock(runqueue_t *rq)
330 spin_unlock_irq(&rq->lock);
334 * Adding/removing a task to/from a priority array:
336 static inline void dequeue_task(struct task_struct *p, prio_array_t *array)
339 list_del(&p->run_list);
340 if (list_empty(array->queue + p->prio))
341 __clear_bit(p->prio, array->bitmap);
344 static inline void enqueue_task(struct task_struct *p, prio_array_t *array)
346 list_add_tail(&p->run_list, array->queue + p->prio);
347 __set_bit(p->prio, array->bitmap);
353 * effective_prio - return the priority that is based on the static
354 * priority but is modified by bonuses/penalties.
356 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
357 * into the -5 ... 0 ... +5 bonus/penalty range.
359 * We use 25% of the full 0...39 priority range so that:
361 * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
362 * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
364 * Both properties are important to certain workloads.
366 static int effective_prio(task_t *p)
373 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
375 prio = p->static_prio - bonus;
376 if (prio < MAX_RT_PRIO)
378 if (prio > MAX_PRIO-1)
384 * __activate_task - move a task to the runqueue.
386 static inline void __activate_task(task_t *p, runqueue_t *rq)
388 enqueue_task(p, rq->active);
392 static void recalc_task_prio(task_t *p, unsigned long long now)
394 unsigned long long __sleep_time = now - p->timestamp;
395 unsigned long sleep_time;
397 if (__sleep_time > NS_MAX_SLEEP_AVG)
398 sleep_time = NS_MAX_SLEEP_AVG;
400 sleep_time = (unsigned long)__sleep_time;
402 if (likely(sleep_time > 0)) {
404 * User tasks that sleep a long time are categorised as
405 * idle and will get just interactive status to stay active &
406 * prevent them suddenly becoming cpu hogs and starving
409 if (p->mm && p->activated != -1 &&
410 sleep_time > INTERACTIVE_SLEEP(p)) {
411 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
414 p->interactive_credit++;
417 * The lower the sleep avg a task has the more
418 * rapidly it will rise with sleep time.
420 sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
423 * Tasks with low interactive_credit are limited to
424 * one timeslice worth of sleep avg bonus.
427 sleep_time > JIFFIES_TO_NS(task_timeslice(p)))
428 sleep_time = JIFFIES_TO_NS(task_timeslice(p));
431 * Non high_credit tasks waking from uninterruptible
432 * sleep are limited in their sleep_avg rise as they
433 * are likely to be cpu hogs waiting on I/O
435 if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm) {
436 if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
438 else if (p->sleep_avg + sleep_time >=
439 INTERACTIVE_SLEEP(p)) {
440 p->sleep_avg = INTERACTIVE_SLEEP(p);
446 * This code gives a bonus to interactive tasks.
448 * The boost works by updating the 'average sleep time'
449 * value here, based on ->timestamp. The more time a
450 * task spends sleeping, the higher the average gets -
451 * and the higher the priority boost gets as well.
453 p->sleep_avg += sleep_time;
455 if (p->sleep_avg > NS_MAX_SLEEP_AVG) {
456 p->sleep_avg = NS_MAX_SLEEP_AVG;
458 p->interactive_credit++;
463 p->prio = effective_prio(p);
467 * activate_task - move a task to the runqueue and do priority recalculation
469 * Update all the scheduling statistics stuff. (sleep average
470 * calculation, priority modifiers, etc.)
472 static inline void activate_task(task_t *p, runqueue_t *rq)
474 unsigned long long now = sched_clock();
476 recalc_task_prio(p, now);
479 * This checks to make sure it's not an uninterruptible task
480 * that is now waking up.
484 * Tasks which were woken up by interrupts (ie. hw events)
485 * are most likely of interactive nature. So we give them
486 * the credit of extending their sleep time to the period
487 * of time they spend on the runqueue, waiting for execution
488 * on a CPU, first time around:
494 * Normal first-time wakeups get a credit too for
495 * on-runqueue time, but it will be weighted down:
502 __activate_task(p, rq);
506 * deactivate_task - remove a task from the runqueue.
508 static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
511 if (p->state == TASK_UNINTERRUPTIBLE)
512 rq->nr_uninterruptible++;
513 dequeue_task(p, p->array);
518 * resched_task - mark a task 'to be rescheduled now'.
520 * On UP this means the setting of the need_resched flag, on SMP it
521 * might also involve a cross-CPU call to trigger the scheduler on
524 static inline void resched_task(task_t *p)
527 int need_resched, nrpolling;
530 /* minimise the chance of sending an interrupt to poll_idle() */
531 nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
532 need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
533 nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
535 if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
536 smp_send_reschedule(task_cpu(p));
539 set_tsk_need_resched(p);
544 * task_curr - is this task currently executing on a CPU?
545 * @p: the task in question.
547 inline int task_curr(task_t *p)
549 return cpu_curr(task_cpu(p)) == p;
554 struct list_head list;
556 struct completion done;
560 * The task's runqueue lock must be held, and the new mask must be valid.
561 * Returns true if you have to wait for migration thread.
563 static int __set_cpus_allowed(task_t *p, cpumask_t new_mask,
564 migration_req_t *req)
566 runqueue_t *rq = task_rq(p);
568 p->cpus_allowed = new_mask;
570 * Can the task run on the task's current CPU? If not then
571 * migrate the thread off to a proper CPU.
573 if (cpu_isset(task_cpu(p), new_mask))
577 * If the task is not on a runqueue (and not running), then
578 * it is sufficient to simply update the task's cpu field.
580 if (!p->array && !task_running(rq, p)) {
581 set_task_cpu(p, any_online_cpu(p->cpus_allowed));
585 init_completion(&req->done);
587 list_add(&req->list, &rq->migration_queue);
592 * wait_task_inactive - wait for a thread to unschedule.
594 * The caller must ensure that the task *will* unschedule sometime soon,
595 * else this function might spin for a *long* time. This function can't
596 * be called with interrupts off, or it may introduce deadlock with
597 * smp_call_function() if an IPI is sent by the same process we are
598 * waiting to become inactive.
600 void wait_task_inactive(task_t * p)
607 rq = task_rq_lock(p, &flags);
608 /* Must be off runqueue entirely, not preempted. */
609 if (unlikely(p->array)) {
610 /* If it's preempted, we yield. It could be a while. */
611 preempted = !task_running(rq, p);
612 task_rq_unlock(rq, &flags);
618 task_rq_unlock(rq, &flags);
622 * kick_process - kick a running thread to enter/exit the kernel
623 * @p: the to-be-kicked thread
625 * Cause a process which is running on another CPU to enter
626 * kernel-mode, without any delay. (to get signals handled.)
628 void kick_process(task_t *p)
634 if ((cpu != smp_processor_id()) && task_curr(p))
635 smp_send_reschedule(cpu);
639 EXPORT_SYMBOL_GPL(kick_process);
644 * try_to_wake_up - wake up a thread
645 * @p: the to-be-woken-up thread
646 * @state: the mask of task states that can be woken
647 * @sync: do a synchronous wakeup?
649 * Put it on the run-queue if it's not already there. The "current"
650 * thread is always on the run-queue (except when the actual
651 * re-schedule is in progress), and as such you're allowed to do
652 * the simpler "current->state = TASK_RUNNING" to mark yourself
653 * runnable without the overhead of this.
655 * returns failure only if the task is already active.
657 static int try_to_wake_up(task_t * p, unsigned int state, int sync)
665 rq = task_rq_lock(p, &flags);
666 old_state = p->state;
667 if (old_state & state) {
670 * Fast-migrate the task if it's not running or runnable
671 * currently. Do not violate hard affinity.
673 if (unlikely(sync && !task_running(rq, p) &&
674 (task_cpu(p) != smp_processor_id()) &&
675 cpu_isset(smp_processor_id(),
677 !cpu_is_offline(smp_processor_id()))) {
678 set_task_cpu(p, smp_processor_id());
679 task_rq_unlock(rq, &flags);
680 goto repeat_lock_task;
682 if (old_state == TASK_UNINTERRUPTIBLE) {
683 rq->nr_uninterruptible--;
685 * Tasks on involuntary sleep don't earn
686 * sleep_avg beyond just interactive state.
690 if (sync && (task_cpu(p) == smp_processor_id()))
691 __activate_task(p, rq);
693 activate_task(p, rq);
694 if (TASK_PREEMPTS_CURR(p, rq))
695 resched_task(rq->curr);
699 p->state = TASK_RUNNING;
701 task_rq_unlock(rq, &flags);
705 int fastcall wake_up_process(task_t * p)
707 return try_to_wake_up(p, TASK_STOPPED |
708 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
711 EXPORT_SYMBOL(wake_up_process);
713 int fastcall wake_up_state(task_t *p, unsigned int state)
715 return try_to_wake_up(p, state, 0);
719 * Perform scheduler related setup for a newly forked process p.
720 * p is forked by current.
722 void fastcall sched_fork(task_t *p)
725 * We mark the process as running here, but have not actually
726 * inserted it onto the runqueue yet. This guarantees that
727 * nobody will actually run it, and a signal or other external
728 * event cannot wake it up and insert it on the runqueue either.
730 p->state = TASK_RUNNING;
731 INIT_LIST_HEAD(&p->run_list);
733 spin_lock_init(&p->switch_lock);
734 #ifdef CONFIG_PREEMPT
736 * During context-switch we hold precisely one spinlock, which
737 * schedule_tail drops. (in the common case it's this_rq()->lock,
738 * but it also can be p->switch_lock.) So we compensate with a count
739 * of 1. Also, we want to start with kernel preemption disabled.
741 p->thread_info->preempt_count = 1;
744 * Share the timeslice between parent and child, thus the
745 * total amount of pending timeslices in the system doesn't change,
746 * resulting in more scheduling fairness.
749 p->time_slice = (current->time_slice + 1) >> 1;
751 * The remainder of the first timeslice might be recovered by
752 * the parent if the child exits early enough.
754 p->first_time_slice = 1;
755 current->time_slice >>= 1;
756 p->timestamp = sched_clock();
757 if (!current->time_slice) {
759 * This case is rare, it happens when the parent has only
760 * a single jiffy left from its timeslice. Taking the
761 * runqueue lock is not a problem.
763 current->time_slice = 1;
765 scheduler_tick(0, 0);
773 * wake_up_forked_process - wake up a freshly forked process.
775 * This function will do some initial scheduler statistics housekeeping
776 * that must be done for every newly created process.
778 void fastcall wake_up_forked_process(task_t * p)
781 runqueue_t *rq = task_rq_lock(current, &flags);
783 BUG_ON(p->state != TASK_RUNNING);
786 * We decrease the sleep average of forking parents
787 * and children as well, to keep max-interactive tasks
788 * from forking tasks that are max-interactive.
790 current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
791 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
793 p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
794 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
796 p->interactive_credit = 0;
798 p->prio = effective_prio(p);
799 set_task_cpu(p, smp_processor_id());
801 if (unlikely(!current->array))
802 __activate_task(p, rq);
804 p->prio = current->prio;
805 list_add_tail(&p->run_list, ¤t->run_list);
806 p->array = current->array;
807 p->array->nr_active++;
810 task_rq_unlock(rq, &flags);
814 * Potentially available exiting-child timeslices are
815 * retrieved here - this way the parent does not get
816 * penalized for creating too many threads.
818 * (this cannot be used to 'generate' timeslices
819 * artificially, because any timeslice recovered here
820 * was given away by the parent in the first place.)
822 void fastcall sched_exit(task_t * p)
827 local_irq_save(flags);
828 if (p->first_time_slice) {
829 p->parent->time_slice += p->time_slice;
830 if (unlikely(p->parent->time_slice > MAX_TIMESLICE))
831 p->parent->time_slice = MAX_TIMESLICE;
833 local_irq_restore(flags);
835 * If the child was a (relative-) CPU hog then decrease
836 * the sleep_avg of the parent as well.
838 rq = task_rq_lock(p->parent, &flags);
839 if (p->sleep_avg < p->parent->sleep_avg)
840 p->parent->sleep_avg = p->parent->sleep_avg /
841 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
843 task_rq_unlock(rq, &flags);
847 * finish_task_switch - clean up after a task-switch
848 * @prev: the thread we just switched away from.
850 * We enter this with the runqueue still locked, and finish_arch_switch()
851 * will unlock it along with doing any other architecture-specific cleanup
854 * Note that we may have delayed dropping an mm in context_switch(). If
855 * so, we finish that here outside of the runqueue lock. (Doing it
856 * with the lock held can cause deadlocks; see schedule() for
859 static inline void finish_task_switch(task_t *prev)
861 runqueue_t *rq = this_rq();
862 struct mm_struct *mm = rq->prev_mm;
863 unsigned long prev_task_flags;
868 * A task struct has one reference for the use as "current".
869 * If a task dies, then it sets TASK_ZOMBIE in tsk->state and calls
870 * schedule one last time. The schedule call will never return,
871 * and the scheduled task must drop that reference.
872 * The test for TASK_ZOMBIE must occur while the runqueue locks are
873 * still held, otherwise prev could be scheduled on another cpu, die
874 * there before we look at prev->state, and then the reference would
876 * Manfred Spraul <manfred@colorfullife.com>
878 prev_task_flags = prev->flags;
879 finish_arch_switch(rq, prev);
882 if (unlikely(prev_task_flags & PF_DEAD))
883 put_task_struct(prev);
887 * schedule_tail - first thing a freshly forked thread must call.
888 * @prev: the thread we just switched away from.
890 asmlinkage void schedule_tail(task_t *prev)
892 finish_task_switch(prev);
894 if (current->set_child_tid)
895 put_user(current->pid, current->set_child_tid);
899 * context_switch - switch to the new MM and the new
900 * thread's register state.
903 task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
905 struct mm_struct *mm = next->mm;
906 struct mm_struct *oldmm = prev->active_mm;
909 next->active_mm = oldmm;
910 atomic_inc(&oldmm->mm_count);
911 enter_lazy_tlb(oldmm, next);
913 switch_mm(oldmm, mm, next);
915 if (unlikely(!prev->mm)) {
916 prev->active_mm = NULL;
917 WARN_ON(rq->prev_mm);
921 /* Here we just switch the register state and the stack. */
922 switch_to(prev, next, prev);
928 * nr_running, nr_uninterruptible and nr_context_switches:
930 * externally visible scheduler statistics: current number of runnable
931 * threads, current number of uninterruptible-sleeping threads, total
932 * number of context switches performed since bootup.
934 unsigned long nr_running(void)
936 unsigned long i, sum = 0;
938 for (i = 0; i < NR_CPUS; i++)
939 sum += cpu_rq(i)->nr_running;
944 unsigned long nr_uninterruptible(void)
946 unsigned long i, sum = 0;
949 sum += cpu_rq(i)->nr_uninterruptible;
954 unsigned long long nr_context_switches(void)
956 unsigned long long i, sum = 0;
959 sum += cpu_rq(i)->nr_switches;
964 unsigned long nr_iowait(void)
966 unsigned long i, sum = 0;
969 sum += atomic_read(&cpu_rq(i)->nr_iowait);
975 * double_rq_lock - safely lock two runqueues
977 * Note this does not disable interrupts like task_rq_lock,
978 * you need to do so manually before calling.
980 static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
983 spin_lock(&rq1->lock);
986 spin_lock(&rq1->lock);
987 spin_lock(&rq2->lock);
989 spin_lock(&rq2->lock);
990 spin_lock(&rq1->lock);
996 * double_rq_unlock - safely unlock two runqueues
998 * Note this does not restore interrupts like task_rq_unlock,
999 * you need to do so manually after calling.
1001 static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
1003 spin_unlock(&rq1->lock);
1005 spin_unlock(&rq2->lock);
1010 * If dest_cpu is allowed for this process, migrate the task to it.
1011 * This is accomplished by forcing the cpu_allowed mask to only
1012 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
1013 * the cpu_allowed mask is restored.
1015 static void sched_migrate_task(task_t *p, int dest_cpu)
1018 migration_req_t req;
1019 unsigned long flags;
1020 cpumask_t old_mask, new_mask = cpumask_of_cpu(dest_cpu);
1023 rq = task_rq_lock(p, &flags);
1024 old_mask = p->cpus_allowed;
1025 if (!cpu_isset(dest_cpu, old_mask) || !cpu_online(dest_cpu))
1028 /* force the process onto the specified CPU */
1029 if (__set_cpus_allowed(p, new_mask, &req)) {
1030 /* Need to wait for migration thread. */
1031 task_rq_unlock(rq, &flags);
1032 wake_up_process(rq->migration_thread);
1033 wait_for_completion(&req.done);
1035 /* If we raced with sys_sched_setaffinity, don't
1037 rq = task_rq_lock(p, &flags);
1038 if (likely(cpus_equal(p->cpus_allowed, new_mask))) {
1039 /* Restore old mask: won't need migration
1040 * thread, since current cpu is allowed. */
1041 BUG_ON(__set_cpus_allowed(p, old_mask, NULL));
1045 task_rq_unlock(rq, &flags);
1046 unlock_cpu_hotplug();
1050 * Find the least loaded CPU. Slightly favor the current CPU by
1051 * setting its runqueue length as the minimum to start.
1053 static int sched_best_cpu(struct task_struct *p)
1055 int i, minload, load, best_cpu, node = 0;
1058 best_cpu = task_cpu(p);
1059 if (cpu_rq(best_cpu)->nr_running <= 2)
1063 for_each_node_with_cpus(i) {
1065 * Node load is always divided by nr_cpus_node to normalise
1066 * load values in case cpu count differs from node to node.
1067 * We first multiply node_nr_running by 10 to get a little
1068 * better resolution.
1070 load = 10 * atomic_read(&node_nr_running[i]) / nr_cpus_node(i);
1071 if (load < minload) {
1078 cpumask = node_to_cpumask(node);
1079 for (i = 0; i < NR_CPUS; ++i) {
1080 if (!cpu_isset(i, cpumask))
1082 if (cpu_rq(i)->nr_running < minload) {
1084 minload = cpu_rq(i)->nr_running;
1090 void sched_balance_exec(void)
1095 new_cpu = sched_best_cpu(current);
1096 if (new_cpu != smp_processor_id())
1097 sched_migrate_task(current, new_cpu);
1102 * Find the busiest node. All previous node loads contribute with a
1103 * geometrically deccaying weight to the load measure:
1104 * load_{t} = load_{t-1}/2 + nr_node_running_{t}
1105 * This way sudden load peaks are flattened out a bit.
1106 * Node load is divided by nr_cpus_node() in order to compare nodes
1107 * of different cpu count but also [first] multiplied by 10 to
1108 * provide better resolution.
1110 static int find_busiest_node(int this_node)
1112 int i, node = -1, load, this_load, maxload;
1114 if (!nr_cpus_node(this_node))
1116 this_load = maxload = (this_rq()->prev_node_load[this_node] >> 1)
1117 + (10 * atomic_read(&node_nr_running[this_node])
1118 / nr_cpus_node(this_node));
1119 this_rq()->prev_node_load[this_node] = this_load;
1120 for_each_node_with_cpus(i) {
1123 load = (this_rq()->prev_node_load[i] >> 1)
1124 + (10 * atomic_read(&node_nr_running[i])
1126 this_rq()->prev_node_load[i] = load;
1127 if (load > maxload && (100*load > NODE_THRESHOLD*this_load)) {
1135 #endif /* CONFIG_NUMA */
1140 * double_lock_balance - lock the busiest runqueue
1142 * this_rq is locked already. Recalculate nr_running if we have to
1143 * drop the runqueue lock.
1146 unsigned int double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest,
1147 int this_cpu, int idle,
1148 unsigned int nr_running)
1150 if (unlikely(!spin_trylock(&busiest->lock))) {
1151 if (busiest < this_rq) {
1152 spin_unlock(&this_rq->lock);
1153 spin_lock(&busiest->lock);
1154 spin_lock(&this_rq->lock);
1155 /* Need to recalculate nr_running */
1156 if (idle || (this_rq->nr_running >
1157 this_rq->prev_cpu_load[this_cpu]))
1158 nr_running = this_rq->nr_running;
1160 nr_running = this_rq->prev_cpu_load[this_cpu];
1162 spin_lock(&busiest->lock);
1168 * find_busiest_queue - find the busiest runqueue among the cpus in cpumask.
1171 runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle,
1172 int *imbalance, cpumask_t cpumask)
1174 int nr_running, load, max_load, i;
1175 runqueue_t *busiest, *rq_src;
1178 * We search all runqueues to find the most busy one.
1179 * We do this lockless to reduce cache-bouncing overhead,
1180 * we re-check the 'best' source CPU later on again, with
1183 * We fend off statistical fluctuations in runqueue lengths by
1184 * saving the runqueue length (as seen by the balancing CPU) during
1185 * the previous load-balancing operation and using the smaller one
1186 * of the current and saved lengths. If a runqueue is long enough
1187 * for a longer amount of time then we recognize it and pull tasks
1190 * The 'current runqueue length' is a statistical maximum variable,
1191 * for that one we take the longer one - to avoid fluctuations in
1192 * the other direction. So for a load-balance to happen it needs
1193 * stable long runqueue on the target CPU and stable short runqueue
1194 * on the local runqueue.
1196 * We make an exception if this CPU is about to become idle - in
1197 * that case we are less picky about moving a task across CPUs and
1198 * take what can be taken.
1200 if (idle || (this_rq->nr_running > this_rq->prev_cpu_load[this_cpu]))
1201 nr_running = this_rq->nr_running;
1203 nr_running = this_rq->prev_cpu_load[this_cpu];
1207 for (i = 0; i < NR_CPUS; i++) {
1208 if (!cpu_isset(i, cpumask))
1212 if (idle || (rq_src->nr_running < this_rq->prev_cpu_load[i]))
1213 load = rq_src->nr_running;
1215 load = this_rq->prev_cpu_load[i];
1216 this_rq->prev_cpu_load[i] = rq_src->nr_running;
1218 if ((load > max_load) && (rq_src != this_rq)) {
1224 if (likely(!busiest))
1227 *imbalance = max_load - nr_running;
1229 /* It needs an at least ~25% imbalance to trigger balancing. */
1230 if (!idle && ((*imbalance)*4 < max_load)) {
1235 nr_running = double_lock_balance(this_rq, busiest, this_cpu,
1238 * Make sure nothing changed since we checked the
1241 if (busiest->nr_running <= nr_running) {
1242 spin_unlock(&busiest->lock);
1250 * pull_task - move a task from a remote runqueue to the local runqueue.
1251 * Both runqueues must be locked.
1254 void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1255 runqueue_t *this_rq, int this_cpu)
1257 dequeue_task(p, src_array);
1258 nr_running_dec(src_rq);
1259 set_task_cpu(p, this_cpu);
1260 nr_running_inc(this_rq);
1261 enqueue_task(p, this_rq->active);
1262 p->timestamp = sched_clock() -
1263 (src_rq->timestamp_last_tick - p->timestamp);
1265 * Note that idle threads have a prio of MAX_PRIO, for this test
1266 * to be always true for them.
1268 if (TASK_PREEMPTS_CURR(p, this_rq))
1273 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1276 int can_migrate_task(task_t *tsk, runqueue_t *rq, int this_cpu, int idle)
1278 unsigned long delta = rq->timestamp_last_tick - tsk->timestamp;
1281 * We do not migrate tasks that are:
1282 * 1) running (obviously), or
1283 * 2) cannot be migrated to this CPU due to cpus_allowed, or
1284 * 3) are cache-hot on their current CPU.
1286 if (task_running(rq, tsk))
1288 if (!cpu_isset(this_cpu, tsk->cpus_allowed))
1290 if (!idle && (delta <= JIFFIES_TO_NS(cache_decay_ticks)))
1296 * Current runqueue is empty, or rebalance tick: if there is an
1297 * imbalance (current runqueue is too short) then pull from
1298 * busiest runqueue(s).
1300 * We call this with the current runqueue locked,
1303 static void load_balance(runqueue_t *this_rq, int idle, cpumask_t cpumask)
1305 int imbalance, idx, this_cpu = smp_processor_id();
1306 runqueue_t *busiest;
1307 prio_array_t *array;
1308 struct list_head *head, *curr;
1311 if (cpu_is_offline(this_cpu))
1314 busiest = find_busiest_queue(this_rq, this_cpu, idle,
1315 &imbalance, cpumask);
1320 * We only want to steal a number of tasks equal to 1/2 the imbalance,
1321 * otherwise we'll just shift the imbalance to the new queue:
1326 * We first consider expired tasks. Those will likely not be
1327 * executed in the near future, and they are most likely to
1328 * be cache-cold, thus switching CPUs has the least effect
1331 if (busiest->expired->nr_active)
1332 array = busiest->expired;
1334 array = busiest->active;
1337 /* Start searching at priority 0: */
/* First pass scans from priority 0; subsequent passes continue from idx. */
1341 idx = sched_find_first_bit(array->bitmap);
1343 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
/* Expired array exhausted: fall back to the active array once. */
1344 if (idx >= MAX_PRIO) {
1345 if (array == busiest->expired) {
1346 array = busiest->active;
1352 head = array->queue + idx;
1355 tmp = list_entry(curr, task_t, run_list);
1359 if (!can_migrate_task(tmp, busiest, this_cpu, idle)) {
1365 pull_task(busiest, array, tmp, this_rq, this_cpu);
1367 /* Only migrate one task if we are idle */
1368 if (!idle && --imbalance) {
1375 spin_unlock(&busiest->lock);
1381 * One of the idle_cpu_tick() and busy_cpu_tick() functions will
1382 * get called every timer tick, on every CPU. Our balancing action
1383 * frequency and balancing aggressiveness depends on whether the CPU is
1386 * busy-rebalance every 200 msecs. idle-rebalance every 1 msec. (or on
1387 * systems with HZ=100, every 10 msecs.)
1389 * On NUMA, do a node-rebalance every 400 msecs.
/* The "?: 1" GNU extension guards against a zero interval when HZ is small. */
1391 #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
1392 #define BUSY_REBALANCE_TICK (HZ/5 ?: 1)
1393 #define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 5)
1394 #define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2)
/* Rebalance this CPU against the busiest NUMA node (plus ourselves). */
1397 static void balance_node(runqueue_t *this_rq, int idle, int this_cpu)
1399 int node = find_busiest_node(cpu_to_node(this_cpu));
1402 cpumask_t cpumask = node_to_cpumask(node);
/* Include ourselves so load_balance() can compare against this_rq. */
1403 cpu_set(this_cpu, cpumask);
1404 spin_lock(&this_rq->lock);
1405 load_balance(this_rq, idle, cpumask);
1406 spin_unlock(&this_rq->lock);
/* Called from scheduler_tick(); picks idle vs busy rebalance cadence. */
1411 static void rebalance_tick(runqueue_t *this_rq, int idle)
1414 int this_cpu = smp_processor_id();
1416 unsigned long j = jiffies;
1419 * First do inter-node rebalancing, then intra-node rebalancing,
1420 * if both events happen in the same tick. The inter-node
1421 * rebalancing does not necessarily have to create a perfect
1422 * balance within the node, since we load-balance the most loaded
1423 * node with the current CPU. (ie. other CPUs in the local node
1424 * are not balanced.)
/* Idle CPU: rebalance aggressively (every IDLE_*_TICK jiffies). */
1428 if (!(j % IDLE_NODE_REBALANCE_TICK))
1429 balance_node(this_rq, idle, this_cpu);
1431 if (!(j % IDLE_REBALANCE_TICK)) {
1432 spin_lock(&this_rq->lock);
1433 load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
1434 spin_unlock(&this_rq->lock);
/* Busy CPU: rebalance rarely to limit cache/lock disturbance. */
1439 if (!(j % BUSY_NODE_REBALANCE_TICK))
1440 balance_node(this_rq, idle, this_cpu);
1442 if (!(j % BUSY_REBALANCE_TICK)) {
1443 spin_lock(&this_rq->lock);
1444 load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
1445 spin_unlock(&this_rq->lock);
1450 * on UP we do not need to balance between CPUs:
/* UP stub: same signature as the SMP version; body is empty (elided here). */
1452 static inline void rebalance_tick(runqueue_t *this_rq, int idle)
/* Per-CPU kernel statistics (user/system/irq/iowait tick accounting). */
1457 DEFINE_PER_CPU(struct kernel_stat, kstat);
1459 EXPORT_PER_CPU_SYMBOL(kstat);
1462 * We place interactive tasks back into the active array, if possible.
1464 * To guarantee that this does not starve expired tasks we ignore the
1465 * interactivity of a task if the first expired task had to wait more
1466 * than a 'reasonable' amount of time. This deadline timeout is
1467 * load-dependent, as the frequency of array switched decreases with
1468 * increasing number of running tasks. We also ignore the interactivity
1469 * if a better static_prio task has expired:
/* True when expired tasks have waited too long (scaled by nr_running),
 * or a higher-static-priority task already sits in the expired array. */
1471 #define EXPIRED_STARVING(rq) \
1472 ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
1473 (jiffies - (rq)->expired_timestamp >= \
1474 STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
1475 ((rq)->curr->static_prio > (rq)->best_expired_prio))
1478 * This function gets called by the timer code, with HZ frequency.
1479 * We call it with interrupts disabled.
1481 * It also gets called by the fork code, when changing the parent's
1484 void scheduler_tick(int user_ticks, int sys_ticks)
1486 int cpu = smp_processor_id();
1487 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1488 runqueue_t *rq = this_rq();
1489 task_t *p = current;
1491 rq->timestamp_last_tick = sched_clock();
1493 if (rcu_pending(cpu))
1494 rcu_check_callbacks(cpu, user_ticks);
1496 /* note: this timer irq context must be accounted for as well */
/* hardirq_count() includes this timer irq itself, hence the offset. */
1497 if (hardirq_count() - HARDIRQ_OFFSET) {
1498 cpustat->irq += sys_ticks;
1500 } else if (softirq_count()) {
1501 cpustat->softirq += sys_ticks;
/* Idle task: account idle/iowait time and rebalance in idle mode. */
1505 if (p == rq->idle) {
1506 if (atomic_read(&rq->nr_iowait) > 0)
1507 cpustat->iowait += sys_ticks;
1509 cpustat->idle += sys_ticks;
1510 rebalance_tick(rq, 1);
/* Positive nice (lowered priority) time is accounted separately. */
1513 if (TASK_NICE(p) > 0)
1514 cpustat->nice += user_ticks;
1516 cpustat->user += user_ticks;
1517 cpustat->system += sys_ticks;
1519 /* Task might have expired already, but not scheduled off yet */
1520 if (p->array != rq->active) {
1521 set_tsk_need_resched(p);
1524 spin_lock(&rq->lock);
1526 * The task was running during this tick - update the
1527 * time slice counter. Note: we do not update a thread's
1528 * priority until it either goes to sleep or uses up its
1529 * timeslice. This makes it possible for interactive tasks
1530 * to use up their timeslices at their highest priority levels.
1532 if (unlikely(rt_task(p))) {
1534 * RR tasks need a special form of timeslice management.
1535 * FIFO tasks have no timeslices.
1537 if ((p->policy == SCHED_RR) && !--p->time_slice) {
1538 p->time_slice = task_timeslice(p);
1539 p->first_time_slice = 0;
1540 set_tsk_need_resched(p);
1542 /* put it at the end of the queue: */
1543 dequeue_task(p, rq->active);
1544 enqueue_task(p, rq->active);
/* Non-RT task used up its slice: recompute priority and requeue. */
1548 if (!--p->time_slice) {
1549 dequeue_task(p, rq->active);
1550 set_tsk_need_resched(p);
1551 p->prio = effective_prio(p);
1552 p->time_slice = task_timeslice(p);
1553 p->first_time_slice = 0;
1555 if (!rq->expired_timestamp)
1556 rq->expired_timestamp = jiffies;
/* Interactive tasks go back to the active array unless that would
 * starve the expired array (see EXPIRED_STARVING above). */
1557 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
1558 enqueue_task(p, rq->expired);
1559 if (p->static_prio < rq->best_expired_prio)
1560 rq->best_expired_prio = p->static_prio;
1562 enqueue_task(p, rq->active);
1565 * Prevent a too long timeslice allowing a task to monopolize
1566 * the CPU. We do this by splitting up the timeslice into
1569 * Note: this does not mean the task's timeslices expire or
1570 * get lost in any way, they just might be preempted by
1571 * another task of equal priority. (one with higher
1572 * priority would have preempted this task already.) We
1573 * requeue this task to the end of the list on this priority
1574 * level, which is in essence a round-robin of tasks with
1577 * This only applies to tasks in the interactive
1578 * delta range with at least TIMESLICE_GRANULARITY to requeue.
1580 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
1581 p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
1582 (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
1583 (p->array == rq->active)) {
1585 dequeue_task(p, rq->active);
1586 set_tsk_need_resched(p);
1587 p->prio = effective_prio(p);
1588 enqueue_task(p, rq->active);
1592 spin_unlock(&rq->lock);
/* Not idle here: rebalance in busy mode (longer intervals). */
1594 rebalance_tick(rq, 0);
1598 * schedule() is the main scheduler function.
1600 asmlinkage void __sched schedule(void)
1603 task_t *prev, *next;
1605 prio_array_t *array;
1606 struct list_head *queue;
1607 unsigned long long now;
1608 unsigned long run_time;
1612 * Test if we are atomic. Since do_exit() needs to call into
1613 * schedule() atomically, we ignore that path for now.
1614 * Otherwise, whine if we are scheduling when we should not be.
1616 if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) {
1617 if (unlikely(in_atomic())) {
1618 printk(KERN_ERR "bad: scheduling while atomic!\n");
1628 release_kernel_lock(prev);
1629 now = sched_clock();
/* Clamp the charged run time so one long burst cannot wipe out more
 * than NS_MAX_SLEEP_AVG of accumulated sleep credit. */
1630 if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
1631 run_time = now - prev->timestamp;
1633 run_time = NS_MAX_SLEEP_AVG;
1636 * Tasks with interactive credits get charged less run_time
1637 * at high sleep_avg to delay them losing their interactive
1640 if (HIGH_CREDIT(prev))
1641 run_time /= (CURRENT_BONUS(prev) ? : 1);
1643 spin_lock_irq(&rq->lock);
1646 * if entering off of a kernel preemption go straight
1647 * to picking the next task.
1649 switch_count = &prev->nivcsw;
1650 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
1651 switch_count = &prev->nvcsw;
/* A pending signal turns an interruptible sleep into a wakeup. */
1652 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
1653 unlikely(signal_pending(prev))))
1654 prev->state = TASK_RUNNING;
1656 deactivate_task(prev, rq);
/* Runqueue empty: try to pull work before resorting to the idle task. */
1659 if (unlikely(!rq->nr_running)) {
1661 load_balance(rq, 1, cpu_to_node_mask(smp_processor_id()));
1663 if (!rq->nr_running) {
1665 rq->expired_timestamp = 0;
1671 if (unlikely(!array->nr_active)) {
1673 * Switch the active and expired arrays.
1675 rq->active = rq->expired;
1676 rq->expired = array;
1678 rq->expired_timestamp = 0;
1679 rq->best_expired_prio = MAX_PRIO;
1682 idx = sched_find_first_bit(array->bitmap);
1683 queue = array->queue + idx;
1684 next = list_entry(queue->next, task_t, run_list);
/* Freshly-woken non-RT task: credit (part of) its wait as sleep. */
1686 if (!rt_task(next) && next->activated > 0) {
1687 unsigned long long delta = now - next->timestamp;
/* activated == 1 means woken while already on a runqueue: its wait
 * time is only partially credited (ON_RUNQUEUE_WEIGHT percent). */
1689 if (next->activated == 1)
1690 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
1692 array = next->array;
1693 dequeue_task(next, array);
1694 recalc_task_prio(next, next->timestamp + delta);
1695 enqueue_task(next, array);
1697 next->activated = 0;
1700 clear_tsk_need_resched(prev);
1701 RCU_qsctr(task_cpu(prev))++;
1703 prev->sleep_avg -= run_time;
1704 if ((long)prev->sleep_avg <= 0) {
1705 prev->sleep_avg = 0;
1706 if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev)))
1707 prev->interactive_credit--;
1709 add_delay_ts(prev,runcpu_total,prev->timestamp,now);
1710 prev->timestamp = now;
1712 if (likely(prev != next)) {
1713 add_delay_ts(next,waitcpu_total,next->timestamp,now);
1714 inc_delay(next,runs);
1715 next->timestamp = now;
1720 prepare_arch_switch(rq, next);
/* context_switch() returns the task we switched away from. */
1721 prev = context_switch(rq, prev, next);
1724 finish_task_switch(prev);
1726 spin_unlock_irq(&rq->lock);
1728 reacquire_kernel_lock(current);
1729 preempt_enable_no_resched();
/* A resched may have been requested while we were switching: retry. */
1730 if (test_thread_flag(TIF_NEED_RESCHED))
1734 EXPORT_SYMBOL(schedule);
1736 #ifdef CONFIG_PREEMPT
1738 * this is is the entry point to schedule() from in-kernel preemption
1739 * off of preempt_enable. Kernel preemptions off return from interrupt
1740 * occur there and call schedule directly.
1742 asmlinkage void __sched preempt_schedule(void)
1744 struct thread_info *ti = current_thread_info();
1747 * If there is a non-zero preempt_count or interrupts are disabled,
1748 * we do not want to preempt the current task. Just return..
1750 if (unlikely(ti->preempt_count || irqs_disabled()))
/* PREEMPT_ACTIVE tells schedule() this is a kernel preemption, so the
 * (possibly non-running) task must not be deactivated. */
1754 ti->preempt_count = PREEMPT_ACTIVE;
1756 ti->preempt_count = 0;
1758 /* we could miss a preemption opportunity between schedule and now */
1760 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
1764 EXPORT_SYMBOL(preempt_schedule);
1765 #endif /* CONFIG_PREEMPT */
1767 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync)
1769 task_t *p = curr->task;
1770 return try_to_wake_up(p, mode, sync);
1773 EXPORT_SYMBOL(default_wake_function);
1776 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
1777 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
1778 * number) then we wake all the non-exclusive tasks and one exclusive task.
1780 * There are circumstances in which we can try to wake a task which has already
1781 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
1782 * zero in this (rare) case, and we handle it by continuing to scan the queue.
1784 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
1785 int nr_exclusive, int sync)
1787 struct list_head *tmp, *next;
/* _safe variant: the callback may remove curr from the list. */
1789 list_for_each_safe(tmp, next, &q->task_list) {
1792 curr = list_entry(tmp, wait_queue_t, task_list);
/* Snapshot flags before the callback, which may free the entry. */
1793 flags = curr->flags;
1794 if (curr->func(curr, mode, sync) &&
1795 (flags & WQ_FLAG_EXCLUSIVE) &&
1802 * __wake_up - wake up threads blocked on a waitqueue.
1804 * @mode: which threads
1805 * @nr_exclusive: how many wake-one or wake-many threads to wake up
1807 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
1809 unsigned long flags;
/* Non-sync wakeup (sync == 0): woken tasks may migrate/preempt. */
1811 spin_lock_irqsave(&q->lock, flags);
1812 __wake_up_common(q, mode, nr_exclusive, 0);
1813 spin_unlock_irqrestore(&q->lock, flags);
1816 EXPORT_SYMBOL(__wake_up);
1819 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
/* Wakes exactly one exclusive waiter (nr_exclusive fixed at 1). */
1821 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
1823 __wake_up_common(q, mode, 1, 0);
1827 * __wake_up_sync - sync wake up threads blocked on a waitqueue.
1829 * @mode: which threads
1830 * @nr_exclusive: how many wake-one or wake-many threads to wake up
1832 * The sync wakeup differs that the waker knows that it will schedule
1833 * away soon, so while the target thread will be woken up, it will not
1834 * be migrated to another CPU - ie. the two threads are 'synchronized'
1835 * with each other. This can prevent needless bouncing between CPUs.
1837 * On UP it can prevent extra preemption.
1839 void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
1841 unsigned long flags;
1846 spin_lock_irqsave(&q->lock, flags);
/* sync semantics only make sense for exclusive wakeups. */
1847 if (likely(nr_exclusive))
1848 __wake_up_common(q, mode, nr_exclusive, 1);
1850 __wake_up_common(q, mode, nr_exclusive, 0);
1851 spin_unlock_irqrestore(&q->lock, flags);
1853 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
/* Signal one completion: bump x->done (elided here) and wake one waiter. */
1855 void fastcall complete(struct completion *x)
1857 unsigned long flags;
1859 spin_lock_irqsave(&x->wait.lock, flags);
1861 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
1863 spin_unlock_irqrestore(&x->wait.lock, flags);
1865 EXPORT_SYMBOL(complete);
/* Signal "everyone": make done effectively inexhaustible and wake all. */
1867 void fastcall complete_all(struct completion *x)
1869 unsigned long flags;
1871 spin_lock_irqsave(&x->wait.lock, flags);
/* UINT_MAX/2 is large enough that every current waiter sees done > 0. */
1872 x->done += UINT_MAX/2;
1873 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
1875 spin_unlock_irqrestore(&x->wait.lock, flags);
1877 EXPORT_SYMBOL(complete_all);
/* Block (uninterruptibly) until complete() is called on x. */
1879 void fastcall __sched wait_for_completion(struct completion *x)
1882 spin_lock_irq(&x->wait.lock);
1884 DECLARE_WAITQUEUE(wait, current);
/* Exclusive waiter queued at the tail: FIFO wake order, one per complete(). */
1886 wait.flags |= WQ_FLAG_EXCLUSIVE;
1887 __add_wait_queue_tail(&x->wait, &wait);
1889 __set_current_state(TASK_UNINTERRUPTIBLE);
1890 spin_unlock_irq(&x->wait.lock);
1892 spin_lock_irq(&x->wait.lock);
1894 __remove_wait_queue(&x->wait, &wait);
1897 spin_unlock_irq(&x->wait.lock);
1899 EXPORT_SYMBOL(wait_for_completion);
/* Shared boilerplate for the legacy sleep_on() family below. */
1901 #define SLEEP_ON_VAR \
1902 unsigned long flags; \
1903 wait_queue_t wait; \
1904 init_waitqueue_entry(&wait, current);
/* Queue ourselves, then drop the lock (irqs stay disabled via flags). */
1906 #define SLEEP_ON_HEAD \
1907 spin_lock_irqsave(&q->lock,flags); \
1908 __add_wait_queue(q, &wait); \
1909 spin_unlock(&q->lock);
/* After waking: dequeue and restore the interrupt state saved above. */
1911 #define SLEEP_ON_TAIL \
1912 spin_lock_irq(&q->lock); \
1913 __remove_wait_queue(q, &wait); \
1914 spin_unlock_irqrestore(&q->lock, flags);
/* Legacy sleep primitives; bodies use the SLEEP_ON_* macros above
 * (several lines elided in this extract). */
1916 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
1920 current->state = TASK_INTERRUPTIBLE;
1927 EXPORT_SYMBOL(interruptible_sleep_on);
1929 long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
1933 current->state = TASK_INTERRUPTIBLE;
1936 timeout = schedule_timeout(timeout);
1942 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
1944 void fastcall __sched sleep_on(wait_queue_head_t *q)
1948 current->state = TASK_UNINTERRUPTIBLE;
1955 EXPORT_SYMBOL(sleep_on);
1957 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
1961 current->state = TASK_UNINTERRUPTIBLE;
1964 timeout = schedule_timeout(timeout);
1970 EXPORT_SYMBOL(sleep_on_timeout);
/* Change p's nice value [-20..19]; requeues and reschedules as needed. */
1972 void set_user_nice(task_t *p, long nice)
1974 unsigned long flags;
1975 prio_array_t *array;
1977 int old_prio, new_prio, delta;
/* No-op for an unchanged or out-of-range nice value. */
1979 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
1982 * We have to be careful, if called from sys_setpriority(),
1983 * the task might be in the middle of scheduling on another CPU.
1985 rq = task_rq_lock(p, &flags);
1987 * The RT priorities are set via setscheduler(), but we still
1988 * allow the 'normal' nice value to be set - but as expected
1989 * it wont have any effect on scheduling until the task is
1993 p->static_prio = NICE_TO_PRIO(nice);
/* Dequeue while on a runqueue so the priority change requeues cleanly. */
1998 dequeue_task(p, array);
2001 new_prio = NICE_TO_PRIO(nice);
2002 delta = new_prio - old_prio;
2003 p->static_prio = NICE_TO_PRIO(nice);
2007 enqueue_task(p, array);
2009 * If the task increased its priority or is running and
2010 * lowered its priority, then reschedule its CPU:
2012 if (delta < 0 || (delta > 0 && task_running(rq, p)))
2013 resched_task(rq->curr);
2016 task_rq_unlock(rq, &flags);
2019 EXPORT_SYMBOL(set_user_nice);
2024 * sys_nice - change the priority of the current process.
2025 * @increment: priority increment
2027 * sys_setpriority is a more generic, but much slower function that
2028 * does similar things.
2030 asmlinkage long sys_nice(int increment)
2036 * Setpriority might change our priority at the same moment.
2037 * We don't have to worry. Conceptually one call occurs first
2038 * and we have a single winner.
/* Lowering nice (raising priority) requires CAP_SYS_NICE. */
2040 if (increment < 0) {
2041 if (!capable(CAP_SYS_NICE))
2043 if (increment < -40)
2049 nice = PRIO_TO_NICE(current->static_prio) + increment;
2055 retval = security_task_setnice(current, nice);
2059 set_user_nice(current, nice);
2066 * task_prio - return the priority value of a given task.
2067 * @p: the task in question.
2069 * This is the priority value as seen by users in /proc.
2070 * RT tasks are offset by -200. Normal tasks are centered
2071 * around 0, value goes from -16 to +15.
2073 int task_prio(task_t *p)
2075 return p->prio - MAX_RT_PRIO;
2079 * task_nice - return the nice value of a given task.
2080 * @p: the task in question.
2082 int task_nice(task_t *p)
2084 return TASK_NICE(p);
2087 EXPORT_SYMBOL(task_nice);
2090 * idle_cpu - is a given cpu idle currently?
2091 * @cpu: the processor in question.
2093 int idle_cpu(int cpu)
2095 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
2098 EXPORT_SYMBOL_GPL(idle_cpu);
2101 * find_process_by_pid - find a process with a matching PID value.
2102 * @pid: the pid in question.
2104 static inline task_t *find_process_by_pid(pid_t pid)
2106 return pid ? find_task_by_pid(pid) : current;
2109 /* Actually do priority change: must hold rq lock. */
2110 static void __setscheduler(struct task_struct *p, int policy, int prio)
2114 p->rt_priority = prio;
/* RT policies map rt_priority onto the [0..MAX_USER_RT_PRIO-1) band
 * (higher rt_priority -> numerically lower prio). */
2115 if (policy != SCHED_NORMAL)
2116 p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
2118 p->prio = p->static_prio;
2122 * setscheduler - change the scheduling policy and/or RT priority of a thread.
2124 static int setscheduler(pid_t pid, int policy, struct sched_param __user *param)
2126 struct sched_param lp;
2127 int retval = -EINVAL;
2129 prio_array_t *array;
2130 unsigned long flags;
2134 if (!param || pid < 0)
2138 if (copy_from_user(&lp, param, sizeof(struct sched_param)))
2142 * We play safe to avoid deadlocks.
2144 read_lock_irq(&tasklist_lock);
2146 p = find_process_by_pid(pid);
2150 goto out_unlock_tasklist;
2153 * To be able to change p->policy safely, the apropriate
2154 * runqueue lock must be held.
2156 rq = task_rq_lock(p, &flags);
/* policy < 0 (from sys_sched_setparam) means "keep current policy". */
2162 if (policy != SCHED_FIFO && policy != SCHED_RR &&
2163 policy != SCHED_NORMAL)
2168 * Valid priorities for SCHED_FIFO and SCHED_RR are
2169 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
2172 if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1)
2174 if ((policy == SCHED_NORMAL) != (lp.sched_priority == 0))
/* RT policies and changing other users' tasks need CAP_SYS_NICE. */
2178 if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
2179 !capable(CAP_SYS_NICE))
2181 if ((current->euid != p->euid) && (current->euid != p->uid) &&
2182 !capable(CAP_SYS_NICE))
2185 retval = security_task_setscheduler(p, policy, &lp);
/* Dequeue while changing priority, then requeue under the new one. */
2191 deactivate_task(p, task_rq(p));
2194 __setscheduler(p, policy, lp.sched_priority);
2196 __activate_task(p, task_rq(p));
2198 * Reschedule if we are currently running on this runqueue and
2199 * our priority decreased, or if we are not currently running on
2200 * this runqueue and our priority is higher than the current's
2202 if (task_running(rq, p)) {
2203 if (p->prio > oldprio)
2204 resched_task(rq->curr);
2205 } else if (p->prio < rq->curr->prio)
2206 resched_task(rq->curr);
2210 task_rq_unlock(rq, &flags);
2211 out_unlock_tasklist:
2212 read_unlock_irq(&tasklist_lock);
2219 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
2220 * @pid: the pid in question.
2221 * @policy: new policy
2222 * @param: structure containing the new RT priority.
/* Thin syscall wrapper; all validation happens in setscheduler(). */
2224 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
2225 struct sched_param __user *param)
2227 return setscheduler(pid, policy, param);
2231 * sys_sched_setparam - set/change the RT priority of a thread
2232 * @pid: the pid in question.
2233 * @param: structure containing the new RT priority.
/* policy == -1 tells setscheduler() to keep the current policy. */
2235 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
2237 return setscheduler(pid, -1, param);
2241 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
2242 * @pid: the pid in question.
2244 asmlinkage long sys_sched_getscheduler(pid_t pid)
2246 int retval = -EINVAL;
2253 read_lock(&tasklist_lock);
2254 p = find_process_by_pid(pid);
/* On success the return value is p->policy (assignment elided here). */
2256 retval = security_task_getscheduler(p);
2260 read_unlock(&tasklist_lock);
2267 * sys_sched_getparam - get the RT priority of a thread
2268 * @pid: the pid in question.
2269 * @param: structure containing the RT priority.
2271 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
2273 struct sched_param lp;
2274 int retval = -EINVAL;
2277 if (!param || pid < 0)
2280 read_lock(&tasklist_lock);
2281 p = find_process_by_pid(pid);
2286 retval = security_task_getscheduler(p);
2290 lp.sched_priority = p->rt_priority;
/* Drop tasklist_lock before copy_to_user, which may sleep on a fault. */
2291 read_unlock(&tasklist_lock);
2294 * This one might sleep, we cannot do it with a spinlock held ...
2296 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
2302 read_unlock(&tasklist_lock);
2307 * sys_sched_setaffinity - set the cpu affinity of a process
2308 * @pid: pid of the process
2309 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
2310 * @user_mask_ptr: user-space pointer to the new cpu mask
2312 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
2313 unsigned long __user *user_mask_ptr)
2319 if (len < sizeof(new_mask))
2322 if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask)))
2326 read_lock(&tasklist_lock);
2328 p = find_process_by_pid(pid);
/* Task not found: release everything (cpu hotplug lock taken earlier,
 * acquisition elided in this extract). */
2330 read_unlock(&tasklist_lock);
2331 unlock_cpu_hotplug();
2336 * It is not safe to call set_cpus_allowed with the
2337 * tasklist_lock held. We will bump the task_struct's
2338 * usage count and then drop tasklist_lock.
2341 read_unlock(&tasklist_lock);
/* Only the owner (euid matches) or CAP_SYS_NICE may change affinity. */
2344 if ((current->euid != p->euid) && (current->euid != p->uid) &&
2345 !capable(CAP_SYS_NICE))
2348 retval = set_cpus_allowed(p, new_mask);
2352 unlock_cpu_hotplug();
2357 * sys_sched_getaffinity - get the cpu affinity of a process
2358 * @pid: pid of the process
2359 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
2360 * @user_mask_ptr: user-space pointer to hold the current cpu mask
2362 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
2363 unsigned long __user *user_mask_ptr)
2365 unsigned int real_len;
2370 real_len = sizeof(mask);
2374 read_lock(&tasklist_lock);
2377 p = find_process_by_pid(pid);
/* Report only CPUs that can ever exist, not the raw allowed mask. */
2382 cpus_and(mask, p->cpus_allowed, cpu_possible_map);
2385 read_unlock(&tasklist_lock);
2388 if (copy_to_user(user_mask_ptr, &mask, real_len))
2394 * sys_sched_yield - yield the current processor to other threads.
2396 * this function yields the current CPU by moving the calling thread
2397 * to the expired array. If there are no other threads running on this
2398 * CPU then this function will return.
2400 asmlinkage long sys_sched_yield(void)
2402 runqueue_t *rq = this_rq_lock();
2403 prio_array_t *array = current->array;
2406 * We implement yielding by moving the task into the expired
2409 * (special rule: RT tasks will just roundrobin in the active
2412 if (likely(!rt_task(current))) {
2413 dequeue_task(current, array);
2414 enqueue_task(current, rq->expired);
2416 list_del(¤t->run_list);
2417 list_add_tail(¤t->run_list, array->queue + current->prio);
2420 * Since we are going to call schedule() anyway, there's
2421 * no need to preempt:
2423 _raw_spin_unlock(&rq->lock);
2424 preempt_enable_no_resched();
/* Voluntary preemption point: mark runnable and reschedule (the
 * schedule() call itself is elided in this extract). */
2431 void __sched __cond_resched(void)
2433 set_current_state(TASK_RUNNING);
2437 EXPORT_SYMBOL(__cond_resched);
2440 * yield - yield the current processor to other threads.
2442 * this is a shortcut for kernel-space yielding - it marks the
2443 * thread runnable and calls sys_sched_yield().
2445 void __sched yield(void)
2447 set_current_state(TASK_RUNNING);
2451 EXPORT_SYMBOL(yield);
2454 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
2455 * that process accounting knows that this is a task in IO wait state.
2457 * But don't do that if it is a deliberate, throttling IO wait (this task
2458 * has set its backing_dev_info: the queue against which it should throttle)
2460 void __sched io_schedule(void)
2462 struct runqueue *rq = this_rq();
2463 def_delay_var(dstart);
2465 start_delay_set(dstart,PF_IOWAIT);
/* nr_iowait is bumped across the schedule() (call elided here) so the
 * tick accounting attributes the idle time to iowait. */
2466 atomic_inc(&rq->nr_iowait);
2468 atomic_dec(&rq->nr_iowait);
2469 add_io_delay(dstart);
2472 EXPORT_SYMBOL(io_schedule);
/* Timed variant of io_schedule(); returns schedule_timeout()'s result. */
2474 long __sched io_schedule_timeout(long timeout)
2476 struct runqueue *rq = this_rq();
2478 def_delay_var(dstart);
2480 start_delay_set(dstart,PF_IOWAIT);
2481 atomic_inc(&rq->nr_iowait);
2482 ret = schedule_timeout(timeout);
2483 atomic_dec(&rq->nr_iowait);
2484 add_io_delay(dstart);
2489 * sys_sched_get_priority_max - return maximum RT priority.
2490 * @policy: scheduling class.
2492 * this syscall returns the maximum rt_priority that can be used
2493 * by a given scheduling class.
2495 asmlinkage long sys_sched_get_priority_max(int policy)
/* For SCHED_FIFO/SCHED_RR; SCHED_NORMAL's maximum (0) is elided here. */
2502 ret = MAX_USER_RT_PRIO-1;
2512 * sys_sched_get_priority_min - return minimum RT priority.
2513 * @policy: scheduling class.
2515 * this syscall returns the minimum rt_priority that can be used
2516 * by a given scheduling class.
/* NOTE(review): the function body is elided in this extract. */
2518 asmlinkage long sys_sched_get_priority_min(int policy)
2534 * sys_sched_rr_get_interval - return the default timeslice of a process.
2535 * @pid: pid of the process.
2536 * @interval: userspace pointer to the timeslice value.
2538 * this syscall writes the default timeslice value of a given process
2539 * into the user-space timespec buffer. A value of '0' means infinity.
2542 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
2544 int retval = -EINVAL;
2552 read_lock(&tasklist_lock);
2553 p = find_process_by_pid(pid);
2557 retval = security_task_getscheduler(p);
/* SCHED_FIFO has no timeslice, reported as 0 ("infinity"). */
2561 jiffies_to_timespec(p->policy & SCHED_FIFO ?
2562 0 : task_timeslice(p), &t);
/* Drop the lock before copy_to_user, which may sleep. */
2563 read_unlock(&tasklist_lock);
2564 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
2568 read_unlock(&tasklist_lock);
2572 static inline struct task_struct *eldest_child(struct task_struct *p)
2574 if (list_empty(&p->children)) return NULL;
2575 return list_entry(p->children.next,struct task_struct,sibling);
2578 static inline struct task_struct *older_sibling(struct task_struct *p)
2580 if (p->sibling.prev==&p->parent->children) return NULL;
2581 return list_entry(p->sibling.prev,struct task_struct,sibling);
2584 static inline struct task_struct *younger_sibling(struct task_struct *p)
2586 if (p->sibling.next==&p->parent->children) return NULL;
2587 return list_entry(p->sibling.next,struct task_struct,sibling);
/* Print one line of SysRq-T style task state for p. */
2590 static void show_task(task_t * p)
2594 unsigned long free = 0;
2595 static const char *stat_nam[] = { "R", "S", "D", "T", "Z", "W" };
2597 printk("%-13.13s ", p->comm);
/* state is a bitmask; __ffs picks the lowest set bit, 0 == running. */
2598 state = p->state ? __ffs(p->state) + 1 : 0;
2599 if (state < ARRAY_SIZE(stat_nam))
2600 printk(stat_nam[state]);
2603 #if (BITS_PER_LONG == 32)
2604 if (state == TASK_RUNNING)
2605 printk(" running ");
2607 printk(" %08lX ", thread_saved_pc(p));
2609 if (state == TASK_RUNNING)
2610 printk(" running task ");
2612 printk(" %016lx ", thread_saved_pc(p));
2614 #ifdef CONFIG_DEBUG_STACK_USAGE
/* Estimate free stack by scanning up from the bottom of the stack
 * (which sits just above thread_info). */
2616 unsigned long * n = (unsigned long *) (p->thread_info+1);
2619 free = (unsigned long) n - (unsigned long)(p->thread_info+1);
2622 printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
2623 if ((relative = eldest_child(p)))
2624 printk("%5d ", relative->pid);
2627 if ((relative = younger_sibling(p)))
2628 printk("%7d", relative->pid);
2631 if ((relative = older_sibling(p)))
2632 printk(" %5d", relative->pid);
2636 printk(" (L-TLB)\n");
2638 printk(" (NOTLB)\n");
/* Dumping the stack of the running task would be unreliable. */
2640 if (state != TASK_RUNNING)
2641 show_stack(p, NULL);
/* Dump one show_task() line for every thread in the system. */
2644 void show_state(void)
2648 #if (BITS_PER_LONG == 32)
2651 printk(" task PC pid father child younger older\n");
2655 printk(" task PC pid father child younger older\n");
2657 read_lock(&tasklist_lock);
2658 do_each_thread(g, p) {
2660 * reset the NMI-timeout, listing all files on a slow
2661 * console might take alot of time:
2663 touch_nmi_watchdog();
2665 } while_each_thread(g, p);
2667 read_unlock(&tasklist_lock);
/* Turn the boot-time task "idle" into cpu's idle thread. The task may
 * currently live on another CPU's runqueue, hence the double lock. */
2670 void __init init_idle(task_t *idle, int cpu)
2672 runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle));
2673 unsigned long flags;
2675 local_irq_save(flags);
2676 double_rq_lock(idle_rq, rq);
2678 idle_rq->curr = idle_rq->idle = idle;
2679 deactivate_task(idle, rq);
/* MAX_PRIO: the idle thread must always lose to any runnable task. */
2681 idle->prio = MAX_PRIO;
2682 idle->state = TASK_RUNNING;
2683 set_task_cpu(idle, cpu);
2684 double_rq_unlock(idle_rq, rq);
2685 set_tsk_need_resched(idle);
2686 local_irq_restore(flags);
2688 /* Set the preempt count _outside_ the spinlocks! */
2689 #ifdef CONFIG_PREEMPT
2690 idle->thread_info->preempt_count = (idle->lock_depth >= 0);
2692 idle->thread_info->preempt_count = 0;
2697 * In a system that switches off the HZ timer idle_cpu_mask
2698 * indicates which cpus entered this state. This is used
2699 * in the rcu update to wait only for active cpus. For systems
2700 * which do not switch off the HZ timer idle_cpu_mask should
2701 * always be CPU_MASK_NONE.
2703 cpumask_t idle_cpu_mask = CPU_MASK_NONE;
2707 * This is how migration works:
2709 * 1) we queue a migration_req_t structure in the source CPU's
2710 * runqueue and wake up that CPU's migration thread.
2711 * 2) we down() the locked semaphore => thread blocks.
2712 * 3) migration thread wakes up (implicitly it forces the migrated
2713 * thread off the CPU)
2714 * 4) it gets the migration request and checks whether the migrated
2715 * task is still in the wrong runqueue.
2716 * 5) if it's in the wrong runqueue then the migration thread removes
2717 * it and puts it into the right queue.
2718 * 6) migration thread up()s the semaphore.
2719 * 7) we wake up and the migration is done.
2723 * Change a given task's CPU affinity. Migrate the thread to a
2724 * proper CPU and schedule it away if the CPU it's executing on
2725 * is removed from the allowed bitmask.
2727 * NOTE: the caller must have a valid reference to the task, the
2728 * task must not exit() & deallocate itself prematurely. The
2729 * call is not atomic; no spinlocks may be held.
2731 int set_cpus_allowed(task_t *p, cpumask_t new_mask)
2733 unsigned long flags;
2735 migration_req_t req;
2738 rq = task_rq_lock(p, &flags);
/* Reject a mask containing no online CPUs. */
2739 if (any_online_cpu(new_mask) == NR_CPUS) {
/* Nonzero return means the task is running on a now-disallowed CPU
 * and must be pushed off by the migration thread. */
2744 if (__set_cpus_allowed(p, new_mask, &req)) {
2745 /* Need help from migration thread: drop lock and wait. */
2746 task_rq_unlock(rq, &flags);
2747 wake_up_process(rq->migration_thread);
2748 wait_for_completion(&req.done);
2752 task_rq_unlock(rq, &flags);
2756 EXPORT_SYMBOL_GPL(set_cpus_allowed);
2758 /* Move (not current) task off this cpu, onto dest cpu. */
2759 static void move_task_away(struct task_struct *p, int dest_cpu)
2761 runqueue_t *rq_dest;
2763 rq_dest = cpu_rq(dest_cpu);
/* double_rq_lock() takes both runqueue locks in a fixed order, avoiding
 * an ABBA deadlock between the source and destination runqueues. */
2765 double_rq_lock(this_rq(), rq_dest);
/* Someone (or the task itself) already moved it off this CPU. */
2766 if (task_cpu(p) != smp_processor_id())
2767 goto out; /* Already moved */
2769 set_task_cpu(p, dest_cpu);
/* Requeue: off the local runqueue, onto the destination's. */
2771 deactivate_task(p, this_rq());
2772 activate_task(p, rq_dest);
/* Preempt the destination CPU's current task if p has better priority
 * (lower prio value = higher priority). */
2773 if (p->prio < rq_dest->curr->prio)
2774 resched_task(rq_dest->curr);
/* Rebase p's timestamp onto the destination CPU's tick clock -
 * presumably because per-CPU timestamps are not comparable; confirm. */
2776 p->timestamp = rq_dest->timestamp_last_tick;
2779 double_rq_unlock(this_rq(), rq_dest);
2783 * migration_thread - this is a highprio system thread that performs
2784 * thread migration by bumping thread off CPU then 'pushing' onto
2787 static int migration_thread(void * data)
2790 int cpu = (long)data;
/* Sanity: we must be the thread registered for this CPU's runqueue. */
2793 BUG_ON(rq->migration_thread != current);
2795 while (!kthread_should_stop()) {
2796 struct list_head *head;
2797 migration_req_t *req;
/* Co-operate with the software-suspend freezer. */
2799 if (current->flags & PF_FREEZE)
2800 refrigerator(PF_FREEZE);
2802 spin_lock_irq(&rq->lock);
2803 head = &rq->migration_queue;
/* Mark ourselves sleeping *before* checking the queue, so a wakeup
 * arriving between the emptiness check and the schedule is not lost. */
2804 current->state = TASK_INTERRUPTIBLE;
2805 if (list_empty(head)) {
2806 spin_unlock_irq(&rq->lock);
2810 req = list_entry(head->next, migration_req_t, list);
2811 list_del_init(head->next);
/* NOTE(review): released with spin_unlock(), not spin_unlock_irq() -
 * interrupts seemingly stay disabled across move_task_away(); the
 * re-enable point is not visible in this view - confirm. */
2812 spin_unlock(&rq->lock);
2814 move_task_away(req->task,
2815 any_online_cpu(req->task->cpus_allowed));
/* Release the requester sleeping in set_cpus_allowed(). */
2817 complete(&req->done);
2822 #ifdef CONFIG_HOTPLUG_CPU
2823 /* migrate_all_tasks - function to migrate all the tasks from the
2824 * current cpu caller must have already scheduled this to the target
2825 * cpu via set_cpus_allowed. Machine is stopped. */
2826 void migrate_all_tasks(void)
2828 struct task_struct *tsk, *t;
2829 int dest_cpu, src_cpu;
2832 /* We're nailed to this CPU. */
2833 src_cpu = smp_processor_id();
2835 /* Not required, but here for neatness. */
2836 write_lock(&tasklist_lock);
2838 /* watch out for per node tasks, let's stay on this node */
2839 node = cpu_to_node(src_cpu);
2841 do_each_thread(t, tsk) {
/* Only tasks still assigned to the dying CPU need to move. */
2846 if (task_cpu(tsk) != src_cpu)
2849 /* Figure out where this task should go (attempting to
2850 * keep it on-node), and check if it can be migrated
2851 * as-is. NOTE that kernel threads bound to more than
2852 * one online cpu will be migrated. */
2853 mask = node_to_cpumask(node);
2854 cpus_and(mask, mask, tsk->cpus_allowed);
2855 dest_cpu = any_online_cpu(mask);
/* No allowed CPU on this node: fall back to any allowed online CPU. */
2856 if (dest_cpu == NR_CPUS)
2857 dest_cpu = any_online_cpu(tsk->cpus_allowed);
/* Still nowhere to go: forcibly widen the affinity to all CPUs
 * (clear followed by complement == every bit set) and retry. */
2858 if (dest_cpu == NR_CPUS) {
2859 cpus_clear(tsk->cpus_allowed);
2860 cpus_complement(tsk->cpus_allowed);
2861 dest_cpu = any_online_cpu(tsk->cpus_allowed);
2863 /* Don't tell them about moving exiting tasks
2864 or kernel threads (both mm NULL), since
2865 they never leave kernel. */
2866 if (tsk->mm && printk_ratelimit())
2867 printk(KERN_INFO "process %d (%s) no "
2868 "longer affine to cpu%d\n",
2869 tsk->pid, tsk->comm, src_cpu);
2872 move_task_away(tsk, dest_cpu);
2873 } while_each_thread(t, tsk);
2875 write_unlock(&tasklist_lock);
2877 #endif /* CONFIG_HOTPLUG_CPU */
2880 * migration_call - callback that gets triggered when a CPU is added.
2881 * Here we can start up the necessary migration thread for the new CPU.
2883 static int migration_call(struct notifier_block *nfb, unsigned long action,
2886 int cpu = (long)hcpu;
2887 struct task_struct *p;
2888 struct runqueue *rq;
2889 unsigned long flags;
2892 case CPU_UP_PREPARE:
/* Create (but don't start) the per-CPU migration thread, and pin it
 * to the incoming CPU before it ever runs. */
2893 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
2896 kthread_bind(p, cpu);
2897 /* Must be high prio: stop_machine expects to yield to it. */
2898 rq = task_rq_lock(p, &flags);
2899 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
2900 task_rq_unlock(rq, &flags);
/* Publish the thread so wakeups/requests can reach it. */
2901 cpu_rq(cpu)->migration_thread = p;
2904 /* Strictly unnecessary, as first user will wake it. */
2905 wake_up_process(cpu_rq(cpu)->migration_thread);
2907 #ifdef CONFIG_HOTPLUG_CPU
2908 case CPU_UP_CANCELED:
2909 /* Unbind it from offline cpu so it can run. Fall thru. */
2910 kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
/* NOTE(review): the teardown below appears shared with a CPU_DEAD case;
 * the switch statement, other case labels and breaks are not visible in
 * this view - confirm against the full source. */
2912 kthread_stop(cpu_rq(cpu)->migration_thread);
2913 cpu_rq(cpu)->migration_thread = NULL;
/* The dead CPU must have been drained by migrate_all_tasks() already. */
2914 BUG_ON(cpu_rq(cpu)->nr_running != 0);
/* Hook migration_call() into the CPU hotplug notifier chain. */
2921 static struct notifier_block __devinitdata migration_notifier = {
2922 .notifier_call = migration_call,
2925 int __init migration_init(void)
2927 void *cpu = (void *)(long)smp_processor_id();
2928 /* Start one for boot CPU. */
/* The boot CPU came up before the notifier existed, so replay the
 * UP_PREPARE and ONLINE events for it by hand. */
2929 migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
2930 migration_call(&migration_notifier, CPU_ONLINE, cpu);
/* All later CPUs get their migration threads via the notifier chain. */
2931 register_cpu_notifier(&migration_notifier);
2937 * The 'big kernel lock'
2939 * This spinlock is taken and released recursively by lock_kernel()
2940 * and unlock_kernel(). It is transparently dropped and reacquired
2941 * over schedule(). It is used to protect legacy code that hasn't
2942 * been migrated to a proper locking design yet.
2944 * Don't use in new code.
2946 * Note: spinlock debugging needs this even on !CONFIG_SMP.
2948 spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
2949 EXPORT_SYMBOL(kernel_flag);
/*
 * sched_init - boot-time initialization: set up every CPU's runqueue and
 * turn the boot thread into a proper scheduler citizen. Runs once, early.
 */
2951 void __init sched_init(void)
2956 for (i = 0; i < NR_CPUS; i++) {
2957 prio_array_t *array;
/* Each runqueue owns two priority arrays: tasks run from 'active'
 * and fall into 'expired' when their timeslice is used up. */
2960 rq->active = rq->arrays;
2961 rq->expired = rq->arrays + 1;
2962 rq->best_expired_prio = MAX_PRIO;
2964 spin_lock_init(&rq->lock);
2965 INIT_LIST_HEAD(&rq->migration_queue);
2966 atomic_set(&rq->nr_iowait, 0);
2967 nr_running_init(rq);
/* Empty all MAX_PRIO queues and clear the priority bitmaps of both
 * arrays. */
2969 for (j = 0; j < 2; j++) {
2970 array = rq->arrays + j;
2971 for (k = 0; k < MAX_PRIO; k++) {
2972 INIT_LIST_HEAD(array->queue + k);
2973 __clear_bit(k, array->bitmap);
2975 // delimiter for bitsearch
2976 __set_bit(MAX_PRIO, array->bitmap);
2980 * We have to do a little magic to get the first
2981 * thread right in SMP mode.
2986 set_task_cpu(current, smp_processor_id());
2987 wake_up_forked_process(current);
2992 * The boot idle thread does lazy MMU switching as well:
/* Hold a reference on init_mm so lazy TLB switching can borrow it. */
2994 atomic_inc(&init_mm.mm_count);
2995 enter_lazy_tlb(&init_mm, current);
2998 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
/*
 * __might_sleep - print a rate-limited (once per HZ) warning when a
 * function that may sleep is entered in atomic context or with irqs off.
 * @file, @line: the call site, for the diagnostic message.
 */
2999 void __might_sleep(char *file, int line)
3001 #if defined(in_atomic)
3002 static unsigned long prev_jiffy; /* ratelimiting */
/* Only complain once the system is fully booted; early boot legitimately
 * runs with interrupts disabled. */
3004 if ((in_atomic() || irqs_disabled()) &&
3005 system_state == SYSTEM_RUNNING) {
/* Rate limit: stay quiet if we already warned within the last HZ. */
3006 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
3008 prev_jiffy = jiffies;
3009 printk(KERN_ERR "Debug: sleeping function called from invalid"
3010 " context at %s:%d\n", file, line);
3011 printk("in_atomic():%d, irqs_disabled():%d\n",
3012 in_atomic(), irqs_disabled());
3017 EXPORT_SYMBOL(__might_sleep);
3021 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
3023 * This could be a long-held lock. If another CPU holds it for a long time,
3024 * and that CPU is not asked to reschedule then *this* CPU will spin on the
3025 * lock for a long time, even if *this* CPU is asked to reschedule.
3027 * So what we do here, in the slow (contended) path is to spin on the lock by
3028 * hand while permitting preemption.
3030 * Called inside preempt_disable().
3032 void __sched __preempt_spin_lock(spinlock_t *lock)
/* preempt_count() > 1 means preemption was already disabled before the
 * caller's own preempt_disable(): we cannot be preempted anyway, so a
 * plain raw spin is the only option. */
3034 if (preempt_count() > 1) {
3035 _raw_spin_lock(lock);
/* Busy-wait while the lock is held, then race for it with trylock.
 * NOTE(review): the preemption-enable/disable bracketing of this loop is
 * not visible in this view - confirm against the full source. */
3040 while (spin_is_locked(lock))
3043 } while (!_raw_spin_trylock(lock));
3046 EXPORT_SYMBOL(__preempt_spin_lock);
/*
 * rwlock analogue of __preempt_spin_lock(): spin for a contended write
 * lock by hand so the CPU is not pinned with preemption disabled.
 */
3048 void __sched __preempt_write_lock(rwlock_t *lock)
/* Nested preempt-disable: preemption impossible, take the raw lock. */
3050 if (preempt_count() > 1) {
3051 _raw_write_lock(lock);
/* Busy-wait while held, then race for the lock with trylock. */
3057 while (rwlock_is_locked(lock))
3060 } while (!_raw_write_trylock(lock));
3063 EXPORT_SYMBOL(__preempt_write_lock);
3064 #endif /* defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) */
3066 #ifdef CONFIG_DELAY_ACCT
3067 int task_running_sys(struct task_struct *p)
3069 return task_running(task_rq(p),p);
3071 EXPORT_SYMBOL(task_running_sys);