4 * Kernel scheduler and related syscalls
6 * Copyright (C) 1991-2002 Linus Torvalds
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas.
21 #include <linux/module.h>
22 #include <linux/nmi.h>
23 #include <linux/init.h>
24 #include <asm/uaccess.h>
25 #include <linux/highmem.h>
26 #include <linux/smp_lock.h>
27 #include <asm/mmu_context.h>
28 #include <linux/interrupt.h>
29 #include <linux/completion.h>
30 #include <linux/kernel_stat.h>
31 #include <linux/security.h>
32 #include <linux/notifier.h>
33 #include <linux/suspend.h>
34 #include <linux/blkdev.h>
35 #include <linux/delay.h>
36 #include <linux/smp.h>
37 #include <linux/timer.h>
38 #include <linux/rcupdate.h>
39 #include <linux/cpu.h>
40 #include <linux/percpu.h>
41 #include <linux/kthread.h>
44 #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
46 #define cpu_to_node_mask(cpu) (cpu_online_map)
50 * Convert user-nice values [ -20 ... 0 ... 19 ]
51 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], and back.
54 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
55 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
56 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
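/*
 * Worked example of the mapping (assuming the usual MAX_RT_PRIO of 100 and
 * MAX_PRIO of 140 from <linux/sched.h>):
 *
 *	NICE_TO_PRIO(-20) = 100 + (-20) + 20 = 100   (highest static prio)
 *	NICE_TO_PRIO(  0) = 100 +    0  + 20 = 120   (default)
 *	NICE_TO_PRIO(+19) = 100 +   19  + 20 = 139   (lowest static prio)
 *
 * PRIO_TO_NICE() simply inverts this, e.g. PRIO_TO_NICE(120) == 0.
 */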
59 * 'User priority' is the nice value converted to something we
60 * can work with better when scaling various scheduler parameters,
61 * it's a [ 0 ... 39 ] range.
63 #define USER_PRIO(p) ((p)-MAX_RT_PRIO)
64 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
65 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
66 #define AVG_TIMESLICE (MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) *\
67 (MAX_PRIO-1-NICE_TO_PRIO(0))/(MAX_USER_PRIO - 1)))
70 * Some helpers for converting nanosecond timing to jiffy resolution
72 #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
73 #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
76 * These are the 'tuning knobs' of the scheduler:
78 * Minimum timeslice is 10 msecs, default timeslice is 100 msecs,
79 * maximum timeslice is 200 msecs. Timeslices get refilled after they expire.
82 #define MIN_TIMESLICE ( 10 * HZ / 1000)
83 #define MAX_TIMESLICE (200 * HZ / 1000)
84 #define ON_RUNQUEUE_WEIGHT 30
85 #define CHILD_PENALTY 95
86 #define PARENT_PENALTY 100
88 #define PRIO_BONUS_RATIO 25
89 #define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
90 #define INTERACTIVE_DELTA 2
91 #define MAX_SLEEP_AVG (AVG_TIMESLICE * MAX_BONUS)
92 #define STARVATION_LIMIT (MAX_SLEEP_AVG)
93 #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
94 #define NODE_THRESHOLD 125
95 #define CREDIT_LIMIT 100
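/*
 * Rough magnitudes of the knobs above, as a sanity check (assuming HZ=1000
 * and the usual MAX_USER_PRIO of 40; with HZ=100 the jiffy counts shrink
 * tenfold but the wall-clock values stay the same):
 *
 *	MIN_TIMESLICE  = 10 jiffies   (10 msecs)
 *	MAX_TIMESLICE  = 200 jiffies  (200 msecs)
 *	MAX_BONUS      = 40*25/100 = 10 priority levels
 *	AVG_TIMESLICE  ~ 102 jiffies  (the nice-0 timeslice)
 *	MAX_SLEEP_AVG  ~ 1020 jiffies (about one second)
 */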
98 * If a task is 'interactive' then we reinsert it in the active
99 * array after it has expired its current timeslice. (it will not
100 * continue to run immediately, it will still round-robin with
101 * other interactive tasks.)
103 * This part scales the interactivity limit depending on niceness.
105 * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
106 * Here are a few examples of different nice levels:
108 * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
109 * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
110 * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
111 * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
112 * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
114 * (the X axis represents the possible -5 ... 0 ... +5 dynamic
115 * priority range a task can explore, a value of '1' means the
116 * task is rated interactive.)
118 * I.e. nice +19 tasks can never get 'interactive' enough to be
119 * reinserted into the active array. And only heavily CPU-hog nice -20
120 * tasks will be expired. Default nice 0 tasks are somewhere between,
121 * it takes some effort for them to get interactive, but it's not too hard.
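/*
 * Worked example of the table above for a nice-0 task, using the DELTA()
 * and TASK_INTERACTIVE() macros defined below (and assuming the usual
 * MAX_USER_PRIO of 40, so MAX_USER_PRIO*PRIO_BONUS_RATIO/100 == 10):
 *
 *	DELTA(p) = SCALE(0, 40, 10) + INTERACTIVE_DELTA = 0 + 2 = 2
 *
 * so a nice-0 task (static_prio 120) only counts as interactive while its
 * dynamic priority is 118 or better, i.e. while it holds a sleep bonus of
 * at least +2; that is exactly the [1,1,1,1,0,...] row shown above.
 */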
125 #define CURRENT_BONUS(p) \
126 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / MAX_SLEEP_AVG)
129 #ifdef CONFIG_SMP
130 #define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \
131 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
132 num_online_cpus())
133 #else
134 #define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \
135 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
136 #endif
138 #define SCALE(v1,v1_max,v2_max) \
139 (v1) * (v2_max) / (v1_max)
141 #define DELTA(p) \
142 (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \
143 INTERACTIVE_DELTA)
145 #define TASK_INTERACTIVE(p) \
146 ((p)->prio <= (p)->static_prio - DELTA(p))
148 #define INTERACTIVE_SLEEP(p) \
149 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
150 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
152 #define HIGH_CREDIT(p) \
153 ((p)->interactive_credit > CREDIT_LIMIT)
155 #define LOW_CREDIT(p) \
156 ((p)->interactive_credit < -CREDIT_LIMIT)
158 #define TASK_PREEMPTS_CURR(p, rq) \
159 ((p)->prio < (rq)->curr->prio)
162 * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
163 * to time slice values.
165 * The higher a thread's priority, the bigger timeslices
166 * it gets during one round of execution. But even the lowest
167 * priority thread gets MIN_TIMESLICE worth of execution time.
169 * task_timeslice() is the interface that is used by the scheduler.
172 #define BASE_TIMESLICE(p) (MIN_TIMESLICE + \
173 ((MAX_TIMESLICE - MIN_TIMESLICE) * \
174 (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1)))
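/*
 * Worked example (assuming HZ=1000 and the usual static_prio range of
 * 100..139):
 *
 *	nice -20 (static_prio 100): 10 + 190*39/39 = 200 msecs
 *	nice   0 (static_prio 120): 10 + 190*19/39 = 102 msecs
 *	nice +19 (static_prio 139): 10 + 190* 0/39 =  10 msecs
 */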
176 static inline unsigned int task_timeslice(task_t *p)
178 return BASE_TIMESLICE(p);
182 * These are the runqueue data structures:
185 #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
187 typedef struct runqueue runqueue_t;
191 unsigned long bitmap[BITMAP_SIZE];
192 struct list_head queue[MAX_PRIO];
196 * This is the main, per-CPU runqueue data structure.
198 * Locking rule: those places that want to lock multiple runqueues
199 * (such as the load balancing or the thread migration code) must
200 * order their lock-acquire operations by ascending &runqueue address.
204 unsigned long long nr_switches;
205 unsigned long nr_running, expired_timestamp, nr_uninterruptible,
208 struct mm_struct *prev_mm;
209 prio_array_t *active, *expired, arrays[2];
210 int best_expired_prio, prev_cpu_load[NR_CPUS];
212 atomic_t *node_nr_running;
213 int prev_node_load[MAX_NUMNODES];
215 task_t *migration_thread;
216 struct list_head migration_queue;
221 static DEFINE_PER_CPU(struct runqueue, runqueues);
223 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
224 #define this_rq() (&__get_cpu_var(runqueues))
225 #define task_rq(p) cpu_rq(task_cpu(p))
226 #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
228 extern unsigned long __scheduling_functions_start_here;
229 extern unsigned long __scheduling_functions_end_here;
230 const unsigned long scheduling_functions_start_here =
231 (unsigned long)&__scheduling_functions_start_here;
232 const unsigned long scheduling_functions_end_here =
233 (unsigned long)&__scheduling_functions_end_here;
236 * Default context-switch locking:
238 #ifndef prepare_arch_switch
239 # define prepare_arch_switch(rq, next) do { } while (0)
240 # define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock)
241 # define task_running(rq, p) ((rq)->curr == (p))
247 * Keep track of running tasks.
250 static atomic_t node_nr_running[MAX_NUMNODES] ____cacheline_maxaligned_in_smp =
251 {[0 ...MAX_NUMNODES-1] = ATOMIC_INIT(0)};
253 static inline void nr_running_init(struct runqueue *rq)
255 rq->node_nr_running = &node_nr_running[0];
258 static inline void nr_running_inc(runqueue_t *rq)
260 atomic_inc(rq->node_nr_running);
264 static inline void nr_running_dec(runqueue_t *rq)
266 atomic_dec(rq->node_nr_running);
270 __init void node_nr_running_init(void)
274 for (i = 0; i < NR_CPUS; i++) {
276 cpu_rq(i)->node_nr_running =
277 &node_nr_running[cpu_to_node(i)];
281 #else /* !CONFIG_NUMA */
283 # define nr_running_init(rq) do { } while (0)
284 # define nr_running_inc(rq) do { (rq)->nr_running++; } while (0)
285 # define nr_running_dec(rq) do { (rq)->nr_running--; } while (0)
287 #endif /* CONFIG_NUMA */
290 * task_rq_lock - lock the runqueue a given task resides on and disable
291 * interrupts. Note the ordering: we can safely lookup the task_rq without
292 * explicitly disabling preemption.
294 static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
299 local_irq_save(*flags);
301 spin_lock(&rq->lock);
302 if (unlikely(rq != task_rq(p))) {
303 spin_unlock_irqrestore(&rq->lock, *flags);
304 goto repeat_lock_task;
309 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
311 spin_unlock_irqrestore(&rq->lock, *flags);
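/*
 * Typical usage of the pair above, as an illustrative sketch rather than a
 * real call site:
 *
 *	unsigned long flags;
 *	runqueue_t *rq = task_rq_lock(p, &flags);
 *
 *	... examine or modify p's scheduling state under rq->lock ...
 *
 *	task_rq_unlock(rq, &flags);
 *
 * The retry loop in task_rq_lock() covers the window in which the task may
 * migrate to another CPU between the task_rq() lookup and the spin_lock().
 */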
315 * rq_lock - lock a given runqueue and disable interrupts.
317 static inline runqueue_t *this_rq_lock(void)
323 spin_lock(&rq->lock);
328 static inline void rq_unlock(runqueue_t *rq)
330 spin_unlock_irq(&rq->lock);
334 * Adding/removing a task to/from a priority array:
336 static inline void dequeue_task(struct task_struct *p, prio_array_t *array)
339 list_del(&p->run_list);
340 if (list_empty(array->queue + p->prio))
341 __clear_bit(p->prio, array->bitmap);
344 static inline void enqueue_task(struct task_struct *p, prio_array_t *array)
346 list_add_tail(&p->run_list, array->queue + p->prio);
347 __set_bit(p->prio, array->bitmap);
353 * effective_prio - return the priority that is based on the static
354 * priority but is modified by bonuses/penalties.
356 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
357 * into the -5 ... 0 ... +5 bonus/penalty range.
359 * We use 25% of the full 0...39 priority range so that:
361 * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
362 * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
364 * Both properties are important to certain workloads.
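/*
 * Numerically (assuming the usual MAX_USER_PRIO of 40, so MAX_BONUS is 10,
 * and static_prio 120 for nice 0): a fully rested nice-0 task gets a
 * CURRENT_BONUS of 10, i.e. a +5 bonus, and runs at dynamic priority 115;
 * the same task with a zero sleep_avg gets a -5 penalty and runs at 125.
 */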
366 static int effective_prio(task_t *p)
373 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
375 prio = p->static_prio - bonus;
376 if (prio < MAX_RT_PRIO)
378 if (prio > MAX_PRIO-1)
384 * __activate_task - move a task to the runqueue.
386 static inline void __activate_task(task_t *p, runqueue_t *rq)
388 enqueue_task(p, rq->active);
392 static void recalc_task_prio(task_t *p, unsigned long long now)
394 unsigned long long __sleep_time = now - p->timestamp;
395 unsigned long sleep_time;
397 if (__sleep_time > NS_MAX_SLEEP_AVG)
398 sleep_time = NS_MAX_SLEEP_AVG;
400 sleep_time = (unsigned long)__sleep_time;
402 if (likely(sleep_time > 0)) {
404 * User tasks that sleep a long time are categorised as
405 * idle and will get just interactive status to stay active &
406 * prevent them suddenly becoming cpu hogs and starving other processes.
409 if (p->mm && p->activated != -1 &&
410 sleep_time > INTERACTIVE_SLEEP(p)) {
411 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
414 p->interactive_credit++;
417 * The lower the sleep avg a task has the more
418 * rapidly it will rise with sleep time.
420 sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
423 * Tasks with low interactive_credit are limited to
424 * one timeslice worth of sleep avg bonus.
427 sleep_time > JIFFIES_TO_NS(task_timeslice(p)))
428 sleep_time = JIFFIES_TO_NS(task_timeslice(p));
431 * Non high_credit tasks waking from uninterruptible
432 * sleep are limited in their sleep_avg rise as they
433 * are likely to be cpu hogs waiting on I/O
435 if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm) {
436 if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
438 else if (p->sleep_avg + sleep_time >=
439 INTERACTIVE_SLEEP(p)) {
440 p->sleep_avg = INTERACTIVE_SLEEP(p);
446 * This code gives a bonus to interactive tasks.
448 * The boost works by updating the 'average sleep time'
449 * value here, based on ->timestamp. The more time a
450 * task spends sleeping, the higher the average gets -
451 * and the higher the priority boost gets as well.
453 p->sleep_avg += sleep_time;
455 if (p->sleep_avg > NS_MAX_SLEEP_AVG) {
456 p->sleep_avg = NS_MAX_SLEEP_AVG;
458 p->interactive_credit++;
463 p->prio = effective_prio(p);
467 * activate_task - move a task to the runqueue and do priority recalculation
469 * Update all the scheduling statistics stuff. (sleep average
470 * calculation, priority modifiers, etc.)
472 static inline void activate_task(task_t *p, runqueue_t *rq)
474 unsigned long long now = sched_clock();
476 recalc_task_prio(p, now);
479 * This checks to make sure it's not an uninterruptible task
480 * that is now waking up.
484 * Tasks which were woken up by interrupts (ie. hw events)
485 * are most likely of interactive nature. So we give them
486 * the credit of extending their sleep time to the period
487 * of time they spend on the runqueue, waiting for execution
488 * on a CPU, first time around:
494 * Normal first-time wakeups get a credit too for
495 * on-runqueue time, but it will be weighted down:
502 __activate_task(p, rq);
506 * deactivate_task - remove a task from the runqueue.
508 static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
511 if (p->state == TASK_UNINTERRUPTIBLE)
512 rq->nr_uninterruptible++;
513 dequeue_task(p, p->array);
518 * resched_task - mark a task 'to be rescheduled now'.
520 * On UP this means the setting of the need_resched flag, on SMP it
521 * might also involve a cross-CPU call to trigger the scheduler on the target CPU.
524 static inline void resched_task(task_t *p)
527 int need_resched, nrpolling;
530 /* minimise the chance of sending an interrupt to poll_idle() */
531 nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
532 need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
533 nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
535 if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
536 smp_send_reschedule(task_cpu(p));
539 set_tsk_need_resched(p);
544 * task_curr - is this task currently executing on a CPU?
545 * @p: the task in question.
547 inline int task_curr(task_t *p)
549 return cpu_curr(task_cpu(p)) == p;
554 struct list_head list;
556 struct completion done;
560 * The task's runqueue lock must be held, and the new mask must be valid.
561 * Returns true if you have to wait for migration thread.
563 static int __set_cpus_allowed(task_t *p, cpumask_t new_mask,
564 migration_req_t *req)
566 runqueue_t *rq = task_rq(p);
568 p->cpus_allowed = new_mask;
570 * Can the task run on the task's current CPU? If not then
571 * migrate the thread off to a proper CPU.
573 if (cpu_isset(task_cpu(p), new_mask))
577 * If the task is not on a runqueue (and not running), then
578 * it is sufficient to simply update the task's cpu field.
580 if (!p->array && !task_running(rq, p)) {
581 set_task_cpu(p, any_online_cpu(p->cpus_allowed));
585 init_completion(&req->done);
587 list_add(&req->list, &rq->migration_queue);
592 * wait_task_inactive - wait for a thread to unschedule.
594 * The caller must ensure that the task *will* unschedule sometime soon,
595 * else this function might spin for a *long* time. This function can't
596 * be called with interrupts off, or it may introduce deadlock with
597 * smp_call_function() if an IPI is sent by the same process we are
598 * waiting to become inactive.
600 void wait_task_inactive(task_t * p)
607 rq = task_rq_lock(p, &flags);
608 /* Must be off runqueue entirely, not preempted. */
609 if (unlikely(p->array)) {
610 /* If it's preempted, we yield. It could be a while. */
611 preempted = !task_running(rq, p);
612 task_rq_unlock(rq, &flags);
618 task_rq_unlock(rq, &flags);
622 * kick_process - kick a running thread to enter/exit the kernel
623 * @p: the to-be-kicked thread
625 * Cause a process which is running on another CPU to enter
626 * kernel-mode, without any delay. (to get signals handled.)
628 void kick_process(task_t *p)
634 if ((cpu != smp_processor_id()) && task_curr(p))
635 smp_send_reschedule(cpu);
639 EXPORT_SYMBOL_GPL(kick_process);
644 * try_to_wake_up - wake up a thread
645 * @p: the to-be-woken-up thread
646 * @state: the mask of task states that can be woken
647 * @sync: do a synchronous wakeup?
649 * Put it on the run-queue if it's not already there. The "current"
650 * thread is always on the run-queue (except when the actual
651 * re-schedule is in progress), and as such you're allowed to do
652 * the simpler "current->state = TASK_RUNNING" to mark yourself
653 * runnable without the overhead of this.
655 * returns failure only if the task is already active.
657 static int try_to_wake_up(task_t * p, unsigned int state, int sync)
665 rq = task_rq_lock(p, &flags);
666 old_state = p->state;
667 if (old_state & state) {
670 * Fast-migrate the task if it's not running or runnable
671 * currently. Do not violate hard affinity.
673 if (unlikely(sync && !task_running(rq, p) &&
674 (task_cpu(p) != smp_processor_id()) &&
675 cpu_isset(smp_processor_id(),
677 !cpu_is_offline(smp_processor_id()))) {
678 set_task_cpu(p, smp_processor_id());
679 task_rq_unlock(rq, &flags);
680 goto repeat_lock_task;
682 if (old_state == TASK_UNINTERRUPTIBLE) {
683 rq->nr_uninterruptible--;
685 * Tasks on involuntary sleep don't earn
686 * sleep_avg beyond just interactive state.
690 if (sync && (task_cpu(p) == smp_processor_id()))
691 __activate_task(p, rq);
693 activate_task(p, rq);
694 if (TASK_PREEMPTS_CURR(p, rq))
695 resched_task(rq->curr);
699 p->state = TASK_RUNNING;
701 task_rq_unlock(rq, &flags);
705 int fastcall wake_up_process(task_t * p)
707 return try_to_wake_up(p, TASK_STOPPED |
708 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
711 EXPORT_SYMBOL(wake_up_process);
713 int fastcall wake_up_state(task_t *p, unsigned int state)
715 return try_to_wake_up(p, state, 0);
719 * Perform scheduler related setup for a newly forked process p.
720 * p is forked by current.
722 void fastcall sched_fork(task_t *p)
725 * We mark the process as running here, but have not actually
726 * inserted it onto the runqueue yet. This guarantees that
727 * nobody will actually run it, and a signal or other external
728 * event cannot wake it up and insert it on the runqueue either.
730 p->state = TASK_RUNNING;
731 INIT_LIST_HEAD(&p->run_list);
733 spin_lock_init(&p->switch_lock);
734 #ifdef CONFIG_PREEMPT
736 * During context-switch we hold precisely one spinlock, which
737 * schedule_tail drops. (in the common case it's this_rq()->lock,
738 * but it also can be p->switch_lock.) So we compensate with a count
739 * of 1. Also, we want to start with kernel preemption disabled.
741 p->thread_info->preempt_count = 1;
744 * Share the timeslice between parent and child, thus the
745 * total amount of pending timeslices in the system doesn't change,
746 * resulting in more scheduling fairness.
749 p->time_slice = (current->time_slice + 1) >> 1;
751 * The remainder of the first timeslice might be recovered by
752 * the parent if the child exits early enough.
754 p->first_time_slice = 1;
755 current->time_slice >>= 1;
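/*
 * A sketch of the arithmetic above: a parent with 101 ticks left hands
 * (101+1)>>1 = 51 ticks to the child and keeps 101>>1 = 50 for itself,
 * so the total number of pending ticks stays at 101.
 */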
756 p->timestamp = sched_clock();
757 if (!current->time_slice) {
759 * This case is rare, it happens when the parent has only
760 * a single jiffy left from its timeslice. Taking the
761 * runqueue lock is not a problem.
763 current->time_slice = 1;
765 scheduler_tick(0, 0);
773 * wake_up_forked_process - wake up a freshly forked process.
775 * This function will do some initial scheduler statistics housekeeping
776 * that must be done for every newly created process.
778 void fastcall wake_up_forked_process(task_t * p)
781 runqueue_t *rq = task_rq_lock(current, &flags);
783 BUG_ON(p->state != TASK_RUNNING);
786 * We decrease the sleep average of forking parents
787 * and children as well, to keep max-interactive tasks
788 * from forking tasks that are max-interactive.
790 current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
791 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
793 p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
794 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
796 p->interactive_credit = 0;
798 p->prio = effective_prio(p);
799 set_task_cpu(p, smp_processor_id());
801 if (unlikely(!current->array))
802 __activate_task(p, rq);
804 p->prio = current->prio;
805 list_add_tail(&p->run_list, ¤t->run_list);
806 p->array = current->array;
807 p->array->nr_active++;
810 task_rq_unlock(rq, &flags);
814 * Potentially available exiting-child timeslices are
815 * retrieved here - this way the parent does not get
816 * penalized for creating too many threads.
818 * (this cannot be used to 'generate' timeslices
819 * artificially, because any timeslice recovered here
820 * was given away by the parent in the first place.)
822 void fastcall sched_exit(task_t * p)
827 local_irq_save(flags);
828 if (p->first_time_slice) {
829 p->parent->time_slice += p->time_slice;
830 if (unlikely(p->parent->time_slice > MAX_TIMESLICE))
831 p->parent->time_slice = MAX_TIMESLICE;
833 local_irq_restore(flags);
835 * If the child was a (relative-) CPU hog then decrease
836 * the sleep_avg of the parent as well.
838 rq = task_rq_lock(p->parent, &flags);
839 if (p->sleep_avg < p->parent->sleep_avg)
840 p->parent->sleep_avg = p->parent->sleep_avg /
841 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
843 task_rq_unlock(rq, &flags);
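/*
 * Numerically: assuming the usual EXIT_WEIGHT of 3 (the define is not shown
 * in this extract), the parent keeps 3/4 of its own sleep_avg and inherits
 * 1/4 of the exiting child's, so a CPU-hog child drags the parent's
 * interactivity down only gradually.
 */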
847 * finish_task_switch - clean up after a task-switch
848 * @prev: the thread we just switched away from.
850 * We enter this with the runqueue still locked, and finish_arch_switch()
851 * will unlock it along with doing any other architecture-specific cleanup actions.
854 * Note that we may have delayed dropping an mm in context_switch(). If
855 * so, we finish that here outside of the runqueue lock. (Doing it
856 * with the lock held can cause deadlocks; see schedule() for details.)
859 static inline void finish_task_switch(task_t *prev)
861 runqueue_t *rq = this_rq();
862 struct mm_struct *mm = rq->prev_mm;
863 unsigned long prev_task_flags;
868 * A task struct has one reference for the use as "current".
869 * If a task dies, then it sets TASK_ZOMBIE in tsk->state and calls
870 * schedule one last time. The schedule call will never return,
871 * and the scheduled task must drop that reference.
872 * The test for TASK_ZOMBIE must occur while the runqueue locks are
873 * still held, otherwise prev could be scheduled on another cpu, die
874 * there before we look at prev->state, and then the reference would be dropped twice.
876 * Manfred Spraul <manfred@colorfullife.com>
878 prev_task_flags = prev->flags;
879 finish_arch_switch(rq, prev);
882 if (unlikely(prev_task_flags & PF_DEAD))
883 put_task_struct(prev);
887 * schedule_tail - first thing a freshly forked thread must call.
888 * @prev: the thread we just switched away from.
890 asmlinkage void schedule_tail(task_t *prev)
892 finish_task_switch(prev);
894 if (current->set_child_tid)
895 put_user(current->pid, current->set_child_tid);
899 * context_switch - switch to the new MM and the new
900 * thread's register state.
903 task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
905 struct mm_struct *mm = next->mm;
906 struct mm_struct *oldmm = prev->active_mm;
909 next->active_mm = oldmm;
910 atomic_inc(&oldmm->mm_count);
911 enter_lazy_tlb(oldmm, next);
913 switch_mm(oldmm, mm, next);
915 if (unlikely(!prev->mm)) {
916 prev->active_mm = NULL;
917 WARN_ON(rq->prev_mm);
921 /* Here we just switch the register state and the stack. */
922 switch_to(prev, next, prev);
928 * nr_running, nr_uninterruptible and nr_context_switches:
930 * externally visible scheduler statistics: current number of runnable
931 * threads, current number of uninterruptible-sleeping threads, total
932 * number of context switches performed since bootup.
934 unsigned long nr_running(void)
936 unsigned long i, sum = 0;
938 for (i = 0; i < NR_CPUS; i++)
939 sum += cpu_rq(i)->nr_running;
944 unsigned long nr_uninterruptible(void)
946 unsigned long i, sum = 0;
949 sum += cpu_rq(i)->nr_uninterruptible;
954 unsigned long long nr_context_switches(void)
956 unsigned long long i, sum = 0;
959 sum += cpu_rq(i)->nr_switches;
964 unsigned long nr_iowait(void)
966 unsigned long i, sum = 0;
969 sum += atomic_read(&cpu_rq(i)->nr_iowait);
975 * double_rq_lock - safely lock two runqueues
977 * Note this does not disable interrupts like task_rq_lock,
978 * you need to do so manually before calling.
980 static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
983 spin_lock(&rq1->lock);
986 spin_lock(&rq1->lock);
987 spin_lock(&rq2->lock);
989 spin_lock(&rq2->lock);
990 spin_lock(&rq1->lock);
996 * double_rq_unlock - safely unlock two runqueues
998 * Note this does not restore interrupts like task_rq_unlock,
999 * you need to do so manually after calling.
1001 static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
1003 spin_unlock(&rq1->lock);
1005 spin_unlock(&rq2->lock);
1010 * If dest_cpu is allowed for this process, migrate the task to it.
1011 * This is accomplished by forcing the cpu_allowed mask to only
1012 * allow dest_cpu, which will force the task onto dest_cpu. Then
1013 * the cpu_allowed mask is restored.
1015 static void sched_migrate_task(task_t *p, int dest_cpu)
1018 migration_req_t req;
1019 unsigned long flags;
1020 cpumask_t old_mask, new_mask = cpumask_of_cpu(dest_cpu);
1023 rq = task_rq_lock(p, &flags);
1024 old_mask = p->cpus_allowed;
1025 if (!cpu_isset(dest_cpu, old_mask) || !cpu_online(dest_cpu))
1028 /* force the process onto the specified CPU */
1029 if (__set_cpus_allowed(p, new_mask, &req)) {
1030 /* Need to wait for migration thread. */
1031 task_rq_unlock(rq, &flags);
1032 wake_up_process(rq->migration_thread);
1033 wait_for_completion(&req.done);
1035 /* If we raced with sys_sched_setaffinity, don't
1037 rq = task_rq_lock(p, &flags);
1038 if (likely(cpus_equal(p->cpus_allowed, new_mask))) {
1039 /* Restore old mask: won't need migration
1040 * thread, since current cpu is allowed. */
1041 BUG_ON(__set_cpus_allowed(p, old_mask, NULL));
1045 task_rq_unlock(rq, &flags);
1046 unlock_cpu_hotplug();
1050 * Find the least loaded CPU. Slightly favor the current CPU by
1051 * setting its runqueue length as the minimum to start.
1053 static int sched_best_cpu(struct task_struct *p)
1055 int i, minload, load, best_cpu, node = 0;
1058 best_cpu = task_cpu(p);
1059 if (cpu_rq(best_cpu)->nr_running <= 2)
1063 for_each_node_with_cpus(i) {
1065 * Node load is always divided by nr_cpus_node to normalise
1066 * load values in case cpu count differs from node to node.
1067 * We first multiply node_nr_running by 10 to get a little
1068 * better resolution.
1070 load = 10 * atomic_read(&node_nr_running[i]) / nr_cpus_node(i);
1071 if (load < minload) {
1078 cpumask = node_to_cpumask(node);
1079 for (i = 0; i < NR_CPUS; ++i) {
1080 if (!cpu_isset(i, cpumask))
1082 if (cpu_rq(i)->nr_running < minload) {
1084 minload = cpu_rq(i)->nr_running;
1090 void sched_balance_exec(void)
1095 new_cpu = sched_best_cpu(current);
1096 if (new_cpu != smp_processor_id())
1097 sched_migrate_task(current, new_cpu);
1102 * Find the busiest node. All previous node loads contribute with a
1103 * geometrically decaying weight to the load measure:
1104 * load_{t} = load_{t-1}/2 + nr_node_running_{t}
1105 * This way sudden load peaks are flattened out a bit.
1106 * Node load is divided by nr_cpus_node() in order to compare nodes
1107 * of different cpu count but also [first] multiplied by 10 to
1108 * provide better resolution.
1110 static int find_busiest_node(int this_node)
1112 int i, node = -1, load, this_load, maxload;
1114 if (!nr_cpus_node(this_node))
1116 this_load = maxload = (this_rq()->prev_node_load[this_node] >> 1)
1117 + (10 * atomic_read(&node_nr_running[this_node])
1118 / nr_cpus_node(this_node));
1119 this_rq()->prev_node_load[this_node] = this_load;
1120 for_each_node_with_cpus(i) {
1123 load = (this_rq()->prev_node_load[i] >> 1)
1124 + (10 * atomic_read(&node_nr_running[i])
1126 this_rq()->prev_node_load[i] = load;
1127 if (load > maxload && (100*load > NODE_THRESHOLD*this_load)) {
1135 #endif /* CONFIG_NUMA */
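/*
 * Worked example of the node-load formula above: a node with 4 CPUs and 8
 * running tasks contributes an instantaneous term of 10*8/4 = 20; if its
 * previous decayed load was 30, the new load becomes 30/2 + 20 = 35. With
 * NODE_THRESHOLD at 125, a remote node is only picked as "busiest" if its
 * load exceeds 125% of the local node's load.
 */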
1140 * double_lock_balance - lock the busiest runqueue
1142 * this_rq is locked already. Recalculate nr_running if we have to
1143 * drop the runqueue lock.
1146 unsigned int double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest,
1147 int this_cpu, int idle,
1148 unsigned int nr_running)
1150 if (unlikely(!spin_trylock(&busiest->lock))) {
1151 if (busiest < this_rq) {
1152 spin_unlock(&this_rq->lock);
1153 spin_lock(&busiest->lock);
1154 spin_lock(&this_rq->lock);
1155 /* Need to recalculate nr_running */
1156 if (idle || (this_rq->nr_running >
1157 this_rq->prev_cpu_load[this_cpu]))
1158 nr_running = this_rq->nr_running;
1160 nr_running = this_rq->prev_cpu_load[this_cpu];
1162 spin_lock(&busiest->lock);
1168 * find_busiest_queue - find the busiest runqueue among the cpus in cpumask.
1171 runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle,
1172 int *imbalance, cpumask_t cpumask)
1174 int nr_running, load, max_load, i;
1175 runqueue_t *busiest, *rq_src;
1178 * We search all runqueues to find the most busy one.
1179 * We do this lockless to reduce cache-bouncing overhead,
1180 * we re-check the 'best' source CPU later on again, with the lock held.
1183 * We fend off statistical fluctuations in runqueue lengths by
1184 * saving the runqueue length (as seen by the balancing CPU) during
1185 * the previous load-balancing operation and using the smaller one
1186 * of the current and saved lengths. If a runqueue is long enough
1187 * for a longer amount of time then we recognize it and pull tasks from it.
1190 * The 'current runqueue length' is a statistical maximum variable,
1191 * for that one we take the longer one - to avoid fluctuations in
1192 * the other direction. So for a load-balance to happen it needs
1193 * stable long runqueue on the target CPU and stable short runqueue
1194 * on the local runqueue.
1196 * We make an exception if this CPU is about to become idle - in
1197 * that case we are less picky about moving a task across CPUs and
1198 * take what can be taken.
1200 if (idle || (this_rq->nr_running > this_rq->prev_cpu_load[this_cpu]))
1201 nr_running = this_rq->nr_running;
1203 nr_running = this_rq->prev_cpu_load[this_cpu];
1207 for (i = 0; i < NR_CPUS; i++) {
1208 if (!cpu_isset(i, cpumask))
1212 if (idle || (rq_src->nr_running < this_rq->prev_cpu_load[i]))
1213 load = rq_src->nr_running;
1215 load = this_rq->prev_cpu_load[i];
1216 this_rq->prev_cpu_load[i] = rq_src->nr_running;
1218 if ((load > max_load) && (rq_src != this_rq)) {
1224 if (likely(!busiest))
1227 *imbalance = max_load - nr_running;
1229 /* It needs an at least ~25% imbalance to trigger balancing. */
1230 if (!idle && ((*imbalance)*4 < max_load)) {
1235 nr_running = double_lock_balance(this_rq, busiest, this_cpu,
1238 * Make sure nothing changed since we checked the
1241 if (busiest->nr_running <= nr_running) {
1242 spin_unlock(&busiest->lock);
1250 * pull_task - move a task from a remote runqueue to the local runqueue.
1251 * Both runqueues must be locked.
1254 void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1255 runqueue_t *this_rq, int this_cpu)
1257 dequeue_task(p, src_array);
1258 nr_running_dec(src_rq);
1259 set_task_cpu(p, this_cpu);
1260 nr_running_inc(this_rq);
1261 enqueue_task(p, this_rq->active);
1262 p->timestamp = sched_clock() -
1263 (src_rq->timestamp_last_tick - p->timestamp);
1265 * Note that idle threads have a prio of MAX_PRIO, for this test
1266 * to be always true for them.
1268 if (TASK_PREEMPTS_CURR(p, this_rq))
1273 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1276 int can_migrate_task(task_t *tsk, runqueue_t *rq, int this_cpu, int idle)
1278 unsigned long delta = rq->timestamp_last_tick - tsk->timestamp;
1281 * We do not migrate tasks that are:
1282 * 1) running (obviously), or
1283 * 2) cannot be migrated to this CPU due to cpus_allowed, or
1284 * 3) are cache-hot on their current CPU.
1286 if (task_running(rq, tsk))
1288 if (!cpu_isset(this_cpu, tsk->cpus_allowed))
1290 if (!idle && (delta <= JIFFIES_TO_NS(cache_decay_ticks)))
1296 * Current runqueue is empty, or rebalance tick: if there is an
1297 * imbalance (current runqueue is too short) then pull from
1298 * busiest runqueue(s).
1300 * We call this with the current runqueue locked,
1303 static void load_balance(runqueue_t *this_rq, int idle, cpumask_t cpumask)
1305 int imbalance, idx, this_cpu = smp_processor_id();
1306 runqueue_t *busiest;
1307 prio_array_t *array;
1308 struct list_head *head, *curr;
1311 if (cpu_is_offline(this_cpu))
1314 busiest = find_busiest_queue(this_rq, this_cpu, idle,
1315 &imbalance, cpumask);
1320 * We only want to steal a number of tasks equal to 1/2 the imbalance,
1321 * otherwise we'll just shift the imbalance to the new queue:
1326 * We first consider expired tasks. Those will likely not be
1327 * executed in the near future, and they are most likely to
1328 * be cache-cold, thus switching CPUs has the least effect on them.
1331 if (busiest->expired->nr_active)
1332 array = busiest->expired;
1334 array = busiest->active;
1337 /* Start searching at priority 0: */
1341 idx = sched_find_first_bit(array->bitmap);
1343 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1344 if (idx >= MAX_PRIO) {
1345 if (array == busiest->expired) {
1346 array = busiest->active;
1352 head = array->queue + idx;
1355 tmp = list_entry(curr, task_t, run_list);
1359 if (!can_migrate_task(tmp, busiest, this_cpu, idle)) {
1365 pull_task(busiest, array, tmp, this_rq, this_cpu);
1367 /* Only migrate one task if we are idle */
1368 if (!idle && --imbalance) {
1375 spin_unlock(&busiest->lock);
1381 * One of the idle_cpu_tick() and busy_cpu_tick() functions will
1382 * get called every timer tick, on every CPU. Our balancing action
1383 * frequency and balancing aggressiveness depend on whether the CPU is idle or not.
1386 * busy-rebalance every 200 msecs. idle-rebalance every 1 msec. (or on
1387 * systems with HZ=100, every 10 msecs.)
1389 * On NUMA, do a node-rebalance every 400 msecs.
1391 #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
1392 #define BUSY_REBALANCE_TICK (HZ/5 ?: 1)
1393 #define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 5)
1394 #define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2)
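/*
 * In concrete terms (assuming HZ=1000): an idle CPU tries to pull work
 * every jiffy (1 msec) and a busy CPU every 200 jiffies (200 msecs); the
 * cross-node passes run every 5 msecs and 400 msecs respectively. With
 * HZ=100 the "?: 1" fallback makes idle rebalancing run every 10 msecs.
 */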
1397 static void balance_node(runqueue_t *this_rq, int idle, int this_cpu)
1399 int node = find_busiest_node(cpu_to_node(this_cpu));
1402 cpumask_t cpumask = node_to_cpumask(node);
1403 cpu_set(this_cpu, cpumask);
1404 spin_lock(&this_rq->lock);
1405 load_balance(this_rq, idle, cpumask);
1406 spin_unlock(&this_rq->lock);
1411 static void rebalance_tick(runqueue_t *this_rq, int idle)
1414 int this_cpu = smp_processor_id();
1416 unsigned long j = jiffies;
1419 * First do inter-node rebalancing, then intra-node rebalancing,
1420 * if both events happen in the same tick. The inter-node
1421 * rebalancing does not necessarily have to create a perfect
1422 * balance within the node, since we load-balance the most loaded
1423 * node with the current CPU. (ie. other CPUs in the local node
1424 * are not balanced.)
1428 if (!(j % IDLE_NODE_REBALANCE_TICK))
1429 balance_node(this_rq, idle, this_cpu);
1431 if (!(j % IDLE_REBALANCE_TICK)) {
1432 spin_lock(&this_rq->lock);
1433 load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
1434 spin_unlock(&this_rq->lock);
1439 if (!(j % BUSY_NODE_REBALANCE_TICK))
1440 balance_node(this_rq, idle, this_cpu);
1442 if (!(j % BUSY_REBALANCE_TICK)) {
1443 spin_lock(&this_rq->lock);
1444 load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
1445 spin_unlock(&this_rq->lock);
1450 * on UP we do not need to balance between CPUs:
1452 static inline void rebalance_tick(runqueue_t *this_rq, int idle)
1457 DEFINE_PER_CPU(struct kernel_stat, kstat);
1459 EXPORT_PER_CPU_SYMBOL(kstat);
1462 * We place interactive tasks back into the active array, if possible.
1464 * To guarantee that this does not starve expired tasks we ignore the
1465 * interactivity of a task if the first expired task had to wait more
1466 * than a 'reasonable' amount of time. This deadline timeout is
1467 * load-dependent, as the frequency of array switches decreases with
1468 * increasing number of running tasks. We also ignore the interactivity
1469 * if a better static_prio task has expired:
1471 #define EXPIRED_STARVING(rq) \
1472 ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
1473 (jiffies - (rq)->expired_timestamp >= \
1474 STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
1475 ((rq)->curr->static_prio > (rq)->best_expired_prio))
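/*
 * Worked example (assuming HZ=1000 and the roughly 1020-jiffy MAX_SLEEP_AVG
 * noted earlier): with STARVATION_LIMIT equal to MAX_SLEEP_AVG and two
 * runnable tasks, interactive reinsertion is suppressed once the first
 * expired task has been waiting for more than about two seconds.
 */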
1478 * This function gets called by the timer code, with HZ frequency.
1479 * We call it with interrupts disabled.
1481 * It also gets called by the fork code, when changing the parent's
1484 void scheduler_tick(int user_ticks, int sys_ticks)
1486 int cpu = smp_processor_id();
1487 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1488 runqueue_t *rq = this_rq();
1489 task_t *p = current;
1491 rq->timestamp_last_tick = sched_clock();
1493 if (rcu_pending(cpu))
1494 rcu_check_callbacks(cpu, user_ticks);
1496 /* note: this timer irq context must be accounted for as well */
1497 if (hardirq_count() - HARDIRQ_OFFSET) {
1498 cpustat->irq += sys_ticks;
1500 } else if (softirq_count()) {
1501 cpustat->softirq += sys_ticks;
1505 if (p == rq->idle) {
1506 if (atomic_read(&rq->nr_iowait) > 0)
1507 cpustat->iowait += sys_ticks;
1509 cpustat->idle += sys_ticks;
1510 rebalance_tick(rq, 1);
1513 if (TASK_NICE(p) > 0)
1514 cpustat->nice += user_ticks;
1516 cpustat->user += user_ticks;
1517 cpustat->system += sys_ticks;
1519 /* Task might have expired already, but not scheduled off yet */
1520 if (p->array != rq->active) {
1521 set_tsk_need_resched(p);
1524 spin_lock(&rq->lock);
1526 * The task was running during this tick - update the
1527 * time slice counter. Note: we do not update a thread's
1528 * priority until it either goes to sleep or uses up its
1529 * timeslice. This makes it possible for interactive tasks
1530 * to use up their timeslices at their highest priority levels.
1532 if (unlikely(rt_task(p))) {
1534 * RR tasks need a special form of timeslice management.
1535 * FIFO tasks have no timeslices.
1537 if ((p->policy == SCHED_RR) && !--p->time_slice) {
1538 p->time_slice = task_timeslice(p);
1539 p->first_time_slice = 0;
1540 set_tsk_need_resched(p);
1542 /* put it at the end of the queue: */
1543 dequeue_task(p, rq->active);
1544 enqueue_task(p, rq->active);
1548 if (!--p->time_slice) {
1549 dequeue_task(p, rq->active);
1550 set_tsk_need_resched(p);
1551 p->prio = effective_prio(p);
1552 p->time_slice = task_timeslice(p);
1553 p->first_time_slice = 0;
1555 if (!rq->expired_timestamp)
1556 rq->expired_timestamp = jiffies;
1557 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
1558 enqueue_task(p, rq->expired);
1559 if (p->static_prio < rq->best_expired_prio)
1560 rq->best_expired_prio = p->static_prio;
1562 enqueue_task(p, rq->active);
1565 * Prevent a too long timeslice allowing a task to monopolize
1566 * the CPU. We do this by splitting up the timeslice into smaller pieces.
1569 * Note: this does not mean the task's timeslices expire or
1570 * get lost in any way, they just might be preempted by
1571 * another task of equal priority. (one with higher
1572 * priority would have preempted this task already.) We
1573 * requeue this task to the end of the list on this priority
1574 * level, which is in essence a round-robin of tasks with equal priority.
1577 * This only applies to tasks in the interactive
1578 * delta range with at least TIMESLICE_GRANULARITY to requeue.
1580 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
1581 p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
1582 (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
1583 (p->array == rq->active)) {
1585 dequeue_task(p, rq->active);
1586 set_tsk_need_resched(p);
1587 p->prio = effective_prio(p);
1588 enqueue_task(p, rq->active);
1592 spin_unlock(&rq->lock);
1594 rebalance_tick(rq, 0);
1598 * schedule() is the main scheduler function.
1600 asmlinkage void __sched schedule(void)
1603 task_t *prev, *next;
1605 prio_array_t *array;
1606 struct list_head *queue;
1607 unsigned long long now;
1608 unsigned long run_time;
1612 * Test if we are atomic. Since do_exit() needs to call into
1613 * schedule() atomically, we ignore that path for now.
1614 * Otherwise, whine if we are scheduling when we should not be.
1616 if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) {
1617 if (unlikely(in_atomic())) {
1618 printk(KERN_ERR "bad: scheduling while atomic!\n");
1628 release_kernel_lock(prev);
1629 now = sched_clock();
1630 if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
1631 run_time = now - prev->timestamp;
1633 run_time = NS_MAX_SLEEP_AVG;
1636 * Tasks with interactive credits get charged less run_time
1637 * at high sleep_avg to delay them losing their interactive status.
1640 if (HIGH_CREDIT(prev))
1641 run_time /= (CURRENT_BONUS(prev) ? : 1);
1643 spin_lock_irq(&rq->lock);
1646 * if entering off of a kernel preemption go straight
1647 * to picking the next task.
1649 switch_count = &prev->nivcsw;
1650 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
1651 switch_count = &prev->nvcsw;
1652 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
1653 unlikely(signal_pending(prev))))
1654 prev->state = TASK_RUNNING;
1656 deactivate_task(prev, rq);
1659 if (unlikely(!rq->nr_running)) {
1661 load_balance(rq, 1, cpu_to_node_mask(smp_processor_id()));
1663 if (!rq->nr_running) {
1665 rq->expired_timestamp = 0;
1671 if (unlikely(!array->nr_active)) {
1673 * Switch the active and expired arrays.
1675 rq->active = rq->expired;
1676 rq->expired = array;
1678 rq->expired_timestamp = 0;
1679 rq->best_expired_prio = MAX_PRIO;
1682 idx = sched_find_first_bit(array->bitmap);
1683 queue = array->queue + idx;
1684 next = list_entry(queue->next, task_t, run_list);
1686 if (!rt_task(next) && next->activated > 0) {
1687 unsigned long long delta = now - next->timestamp;
1689 if (next->activated == 1)
1690 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
1692 array = next->array;
1693 dequeue_task(next, array);
1694 recalc_task_prio(next, next->timestamp + delta);
1695 enqueue_task(next, array);
1697 next->activated = 0;
1700 clear_tsk_need_resched(prev);
1701 RCU_qsctr(task_cpu(prev))++;
1703 prev->sleep_avg -= run_time;
1704 if ((long)prev->sleep_avg <= 0) {
1705 prev->sleep_avg = 0;
1706 if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev)))
1707 prev->interactive_credit--;
1709 prev->timestamp = now;
1711 if (likely(prev != next)) {
1712 next->timestamp = now;
1717 prepare_arch_switch(rq, next);
1718 prev = context_switch(rq, prev, next);
1721 finish_task_switch(prev);
1723 spin_unlock_irq(&rq->lock);
1725 reacquire_kernel_lock(current);
1726 preempt_enable_no_resched();
1727 if (test_thread_flag(TIF_NEED_RESCHED))
1731 EXPORT_SYMBOL(schedule);
1733 #ifdef CONFIG_PREEMPT
1735 * this is the entry point to schedule() from in-kernel preemption
1736 * off of preempt_enable. Kernel preemptions off return from interrupt
1737 * occur there and call schedule directly.
1739 asmlinkage void __sched preempt_schedule(void)
1741 struct thread_info *ti = current_thread_info();
1744 * If there is a non-zero preempt_count or interrupts are disabled,
1745 * we do not want to preempt the current task. Just return..
1747 if (unlikely(ti->preempt_count || irqs_disabled()))
1751 ti->preempt_count = PREEMPT_ACTIVE;
1753 ti->preempt_count = 0;
1755 /* we could miss a preemption opportunity between schedule and now */
1757 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
1761 EXPORT_SYMBOL(preempt_schedule);
1762 #endif /* CONFIG_PREEMPT */
1764 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync)
1766 task_t *p = curr->task;
1767 return try_to_wake_up(p, mode, sync);
1770 EXPORT_SYMBOL(default_wake_function);
1773 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
1774 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
1775 * number) then we wake all the non-exclusive tasks and one exclusive task.
1777 * There are circumstances in which we can try to wake a task which has already
1778 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
1779 * zero in this (rare) case, and we handle it by continuing to scan the queue.
1781 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
1782 int nr_exclusive, int sync)
1784 struct list_head *tmp, *next;
1786 list_for_each_safe(tmp, next, &q->task_list) {
1789 curr = list_entry(tmp, wait_queue_t, task_list);
1790 flags = curr->flags;
1791 if (curr->func(curr, mode, sync) &&
1792 (flags & WQ_FLAG_EXCLUSIVE) &&
1799 * __wake_up - wake up threads blocked on a waitqueue.
1801 * @mode: which threads
1802 * @nr_exclusive: how many wake-one or wake-many threads to wake up
1804 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
1806 unsigned long flags;
1808 spin_lock_irqsave(&q->lock, flags);
1809 __wake_up_common(q, mode, nr_exclusive, 0);
1810 spin_unlock_irqrestore(&q->lock, flags);
1813 EXPORT_SYMBOL(__wake_up);
1816 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
1818 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
1820 __wake_up_common(q, mode, 1, 0);
1824 * __wake_up_sync - wake up threads blocked on a waitqueue.
1826 * @mode: which threads
1827 * @nr_exclusive: how many wake-one or wake-many threads to wake up
1829 * The sync wakeup differs in that the waker knows that it will schedule
1830 * away soon, so while the target thread will be woken up, it will not
1831 * be migrated to another CPU - ie. the two threads are 'synchronized'
1832 * with each other. This can prevent needless bouncing between CPUs.
1834 * On UP it can prevent extra preemption.
1836 void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
1838 unsigned long flags;
1843 spin_lock_irqsave(&q->lock, flags);
1844 if (likely(nr_exclusive))
1845 __wake_up_common(q, mode, nr_exclusive, 1);
1847 __wake_up_common(q, mode, nr_exclusive, 0);
1848 spin_unlock_irqrestore(&q->lock, flags);
1850 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
1852 void fastcall complete(struct completion *x)
1854 unsigned long flags;
1856 spin_lock_irqsave(&x->wait.lock, flags);
1858 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
1860 spin_unlock_irqrestore(&x->wait.lock, flags);
1862 EXPORT_SYMBOL(complete);
1864 void fastcall complete_all(struct completion *x)
1866 unsigned long flags;
1868 spin_lock_irqsave(&x->wait.lock, flags);
1869 x->done += UINT_MAX/2;
1870 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
1872 spin_unlock_irqrestore(&x->wait.lock, flags);
1874 EXPORT_SYMBOL(complete_all);
1876 void fastcall __sched wait_for_completion(struct completion *x)
1879 spin_lock_irq(&x->wait.lock);
1881 DECLARE_WAITQUEUE(wait, current);
1883 wait.flags |= WQ_FLAG_EXCLUSIVE;
1884 __add_wait_queue_tail(&x->wait, &wait);
1886 __set_current_state(TASK_UNINTERRUPTIBLE);
1887 spin_unlock_irq(&x->wait.lock);
1889 spin_lock_irq(&x->wait.lock);
1891 __remove_wait_queue(&x->wait, &wait);
1894 spin_unlock_irq(&x->wait.lock);
1896 EXPORT_SYMBOL(wait_for_completion);
1898 #define SLEEP_ON_VAR \
1899 unsigned long flags; \
1900 wait_queue_t wait; \
1901 init_waitqueue_entry(&wait, current);
1903 #define SLEEP_ON_HEAD \
1904 spin_lock_irqsave(&q->lock,flags); \
1905 __add_wait_queue(q, &wait); \
1906 spin_unlock(&q->lock);
1908 #define SLEEP_ON_TAIL \
1909 spin_lock_irq(&q->lock); \
1910 __remove_wait_queue(q, &wait); \
1911 spin_unlock_irqrestore(&q->lock, flags);
1913 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
1917 current->state = TASK_INTERRUPTIBLE;
1924 EXPORT_SYMBOL(interruptible_sleep_on);
1926 long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
1930 current->state = TASK_INTERRUPTIBLE;
1933 timeout = schedule_timeout(timeout);
1939 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
1941 void fastcall __sched sleep_on(wait_queue_head_t *q)
1945 current->state = TASK_UNINTERRUPTIBLE;
1952 EXPORT_SYMBOL(sleep_on);
1954 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
1958 current->state = TASK_UNINTERRUPTIBLE;
1961 timeout = schedule_timeout(timeout);
1967 EXPORT_SYMBOL(sleep_on_timeout);
1969 void set_user_nice(task_t *p, long nice)
1971 unsigned long flags;
1972 prio_array_t *array;
1974 int old_prio, new_prio, delta;
1976 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
1979 * We have to be careful, if called from sys_setpriority(),
1980 * the task might be in the middle of scheduling on another CPU.
1982 rq = task_rq_lock(p, &flags);
1984 * The RT priorities are set via setscheduler(), but we still
1985 * allow the 'normal' nice value to be set - but as expected
1986 * it won't have any effect on scheduling until the task is SCHED_NORMAL.
1990 p->static_prio = NICE_TO_PRIO(nice);
1995 dequeue_task(p, array);
1998 new_prio = NICE_TO_PRIO(nice);
1999 delta = new_prio - old_prio;
2000 p->static_prio = NICE_TO_PRIO(nice);
2004 enqueue_task(p, array);
2006 * If the task increased its priority or is running and
2007 * lowered its priority, then reschedule its CPU:
2009 if (delta < 0 || (delta > 0 && task_running(rq, p)))
2010 resched_task(rq->curr);
2013 task_rq_unlock(rq, &flags);
2016 EXPORT_SYMBOL(set_user_nice);
2021 * sys_nice - change the priority of the current process.
2022 * @increment: priority increment
2024 * sys_setpriority is a more generic, but much slower function that
2025 * does similar things.
2027 asmlinkage long sys_nice(int increment)
2033 * Setpriority might change our priority at the same moment.
2034 * We don't have to worry. Conceptually one call occurs first
2035 * and we have a single winner.
2037 if (increment < 0) {
2038 if (!capable(CAP_SYS_NICE))
2040 if (increment < -40)
2046 nice = PRIO_TO_NICE(current->static_prio) + increment;
2052 retval = security_task_setnice(current, nice);
2056 set_user_nice(current, nice);
2063 * task_prio - return the priority value of a given task.
2064 * @p: the task in question.
2066 * This is the priority value as seen by users in /proc.
2067 * RT tasks are offset by -200. Normal tasks are centered
2068 * around 0, value goes from -16 to +15.
2070 int task_prio(task_t *p)
2072 return p->prio - MAX_RT_PRIO;
2076 * task_nice - return the nice value of a given task.
2077 * @p: the task in question.
2079 int task_nice(task_t *p)
2081 return TASK_NICE(p);
2084 EXPORT_SYMBOL(task_nice);
2087 * idle_cpu - is a given cpu idle currently?
2088 * @cpu: the processor in question.
2090 int idle_cpu(int cpu)
2092 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
2095 EXPORT_SYMBOL_GPL(idle_cpu);
2098 * find_process_by_pid - find a process with a matching PID value.
2099 * @pid: the pid in question.
2101 static inline task_t *find_process_by_pid(pid_t pid)
2103 return pid ? find_task_by_pid(pid) : current;
2106 /* Actually do priority change: must hold rq lock. */
2107 static void __setscheduler(struct task_struct *p, int policy, int prio)
2111 p->rt_priority = prio;
2112 if (policy != SCHED_NORMAL)
2113 p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
2115 p->prio = p->static_prio;
2119 * setscheduler - change the scheduling policy and/or RT priority of a thread.
2121 static int setscheduler(pid_t pid, int policy, struct sched_param __user *param)
2123 struct sched_param lp;
2124 int retval = -EINVAL;
2126 prio_array_t *array;
2127 unsigned long flags;
2131 if (!param || pid < 0)
2135 if (copy_from_user(&lp, param, sizeof(struct sched_param)))
2139 * We play safe to avoid deadlocks.
2141 read_lock_irq(&tasklist_lock);
2143 p = find_process_by_pid(pid);
2147 goto out_unlock_tasklist;
2150 * To be able to change p->policy safely, the appropriate
2151 * runqueue lock must be held.
2153 rq = task_rq_lock(p, &flags);
2159 if (policy != SCHED_FIFO && policy != SCHED_RR &&
2160 policy != SCHED_NORMAL)
2165 * Valid priorities for SCHED_FIFO and SCHED_RR are
2166 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
2169 if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1)
2171 if ((policy == SCHED_NORMAL) != (lp.sched_priority == 0))
2175 if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
2176 !capable(CAP_SYS_NICE))
2178 if ((current->euid != p->euid) && (current->euid != p->uid) &&
2179 !capable(CAP_SYS_NICE))
2182 retval = security_task_setscheduler(p, policy, &lp);
2188 deactivate_task(p, task_rq(p));
2191 __setscheduler(p, policy, lp.sched_priority);
2193 __activate_task(p, task_rq(p));
2195 * Reschedule if we are currently running on this runqueue and
2196 * our priority decreased, or if we are not currently running on
2197 * this runqueue and our priority is higher than the current's
2199 if (task_running(rq, p)) {
2200 if (p->prio > oldprio)
2201 resched_task(rq->curr);
2202 } else if (p->prio < rq->curr->prio)
2203 resched_task(rq->curr);
2207 task_rq_unlock(rq, &flags);
2208 out_unlock_tasklist:
2209 read_unlock_irq(&tasklist_lock);
2216 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
2217 * @pid: the pid in question.
2218 * @policy: new policy
2219 * @param: structure containing the new RT priority.
2221 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
2222 struct sched_param __user *param)
2224 return setscheduler(pid, policy, param);
2228 * sys_sched_setparam - set/change the RT priority of a thread
2229 * @pid: the pid in question.
2230 * @param: structure containing the new RT priority.
2232 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
2234 return setscheduler(pid, -1, param);
2238 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
2239 * @pid: the pid in question.
2241 asmlinkage long sys_sched_getscheduler(pid_t pid)
2243 int retval = -EINVAL;
2250 read_lock(&tasklist_lock);
2251 p = find_process_by_pid(pid);
2253 retval = security_task_getscheduler(p);
2257 read_unlock(&tasklist_lock);
2264 * sys_sched_getparam - get the RT priority of a thread
2265 * @pid: the pid in question.
2266 * @param: structure containing the RT priority.
2268 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
2270 struct sched_param lp;
2271 int retval = -EINVAL;
2274 if (!param || pid < 0)
2277 read_lock(&tasklist_lock);
2278 p = find_process_by_pid(pid);
2283 retval = security_task_getscheduler(p);
2287 lp.sched_priority = p->rt_priority;
2288 read_unlock(&tasklist_lock);
2291 * This one might sleep, we cannot do it with a spinlock held ...
2293 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
2299 read_unlock(&tasklist_lock);
2304 * sys_sched_setaffinity - set the cpu affinity of a process
2305 * @pid: pid of the process
2306 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
2307 * @user_mask_ptr: user-space pointer to the new cpu mask
2309 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
2310 unsigned long __user *user_mask_ptr)
2316 if (len < sizeof(new_mask))
2319 if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask)))
2323 read_lock(&tasklist_lock);
2325 p = find_process_by_pid(pid);
2327 read_unlock(&tasklist_lock);
2328 unlock_cpu_hotplug();
2333 * It is not safe to call set_cpus_allowed with the
2334 * tasklist_lock held. We will bump the task_struct's
2335 * usage count and then drop tasklist_lock.
2338 read_unlock(&tasklist_lock);
2341 if ((current->euid != p->euid) && (current->euid != p->uid) &&
2342 !capable(CAP_SYS_NICE))
2345 retval = set_cpus_allowed(p, new_mask);
2349 unlock_cpu_hotplug();
2354 * sys_sched_getaffinity - get the cpu affinity of a process
2355 * @pid: pid of the process
2356 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
2357 * @user_mask_ptr: user-space pointer to hold the current cpu mask
2359 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
2360 unsigned long __user *user_mask_ptr)
2362 unsigned int real_len;
2367 real_len = sizeof(mask);
2371 read_lock(&tasklist_lock);
2374 p = find_process_by_pid(pid);
2379 cpus_and(mask, p->cpus_allowed, cpu_possible_map);
2382 read_unlock(&tasklist_lock);
2385 if (copy_to_user(user_mask_ptr, &mask, real_len))
2391 * sys_sched_yield - yield the current processor to other threads.
2393 * this function yields the current CPU by moving the calling thread
2394 * to the expired array. If there are no other threads running on this
2395 * CPU then this function will return.
2397 asmlinkage long sys_sched_yield(void)
2399 runqueue_t *rq = this_rq_lock();
2400 prio_array_t *array = current->array;
2403 * We implement yielding by moving the task into the expired queue.
2406 * (special rule: RT tasks will just round-robin in the active array.)
2409 if (likely(!rt_task(current))) {
2410 dequeue_task(current, array);
2411 enqueue_task(current, rq->expired);
2413 list_del(¤t->run_list);
2414 list_add_tail(¤t->run_list, array->queue + current->prio);
2417 * Since we are going to call schedule() anyway, there's
2418 * no need to preempt:
2420 _raw_spin_unlock(&rq->lock);
2421 preempt_enable_no_resched();
2428 void __sched __cond_resched(void)
2430 set_current_state(TASK_RUNNING);
2434 EXPORT_SYMBOL(__cond_resched);
2437 * yield - yield the current processor to other threads.
2439 * this is a shortcut for kernel-space yielding - it marks the
2440 * thread runnable and calls sys_sched_yield().
2442 void __sched yield(void)
2444 set_current_state(TASK_RUNNING);
2448 EXPORT_SYMBOL(yield);
2451 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
2452 * that process accounting knows that this is a task in IO wait state.
2454 * But don't do that if it is a deliberate, throttling IO wait (this task
2455 * has set its backing_dev_info: the queue against which it should throttle)
2457 void __sched io_schedule(void)
2459 struct runqueue *rq = this_rq();
2461 atomic_inc(&rq->nr_iowait);
2463 atomic_dec(&rq->nr_iowait);
2466 EXPORT_SYMBOL(io_schedule);
2468 long __sched io_schedule_timeout(long timeout)
2470 struct runqueue *rq = this_rq();
2473 atomic_inc(&rq->nr_iowait);
2474 ret = schedule_timeout(timeout);
2475 atomic_dec(&rq->nr_iowait);
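/*
 * Illustrative sketch, not part of the original source: the typical shape of
 * a caller that sleeps via io_schedule() while waiting for I/O.  The wait
 * queue head and the io_done() predicate are hypothetical stand-ins.
 */
#if 0
static void wait_for_io(wait_queue_head_t *wq)
{
	DEFINE_WAIT(wait);

	while (!io_done()) {
		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
		if (!io_done())
			io_schedule();	/* sleep, accounted as iowait */
		finish_wait(wq, &wait);
	}
}
#endif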
2480 * sys_sched_get_priority_max - return maximum RT priority.
2481 * @policy: scheduling class.
2483 * this syscall returns the maximum rt_priority that can be used
2484 * by a given scheduling class.
2486 asmlinkage long sys_sched_get_priority_max(int policy)
2493 ret = MAX_USER_RT_PRIO-1;
2503 * sys_sched_get_priority_min - return minimum RT priority.
2504 * @policy: scheduling class.
2506 * this syscall returns the minimum rt_priority that can be used
2507 * by a given scheduling class.
2509 asmlinkage long sys_sched_get_priority_min(int policy)
2525 * sys_sched_rr_get_interval - return the default timeslice of a process.
2526 * @pid: pid of the process.
2527 * @interval: userspace pointer to the timeslice value.
2529 * this syscall writes the default timeslice value of a given process
2530 * into the user-space timespec buffer. A value of '0' means infinity.
2533 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
2535 int retval = -EINVAL;
2543 read_lock(&tasklist_lock);
2544 p = find_process_by_pid(pid);
2548 retval = security_task_getscheduler(p);
2552 jiffies_to_timespec(p->policy & SCHED_FIFO ?
2553 0 : task_timeslice(p), &t);
2554 read_unlock(&tasklist_lock);
2555 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
2559 read_unlock(&tasklist_lock);
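/*
 * Illustrative sketch, not part of the original source: querying the values
 * above from userspace through the glibc wrappers sched_get_priority_min(),
 * sched_get_priority_max() and sched_rr_get_interval().  Guarded out of
 * this file; build it separately as a user program.
 */
#if 0
#include <sched.h>
#include <stdio.h>
#include <time.h>

static void demo_query_scheduler(pid_t pid)
{
	struct timespec ts;

	/* Valid RT priority range for SCHED_RR on this kernel. */
	printf("SCHED_RR priority range: %d..%d\n",
	       sched_get_priority_min(SCHED_RR),
	       sched_get_priority_max(SCHED_RR));

	/* Default timeslice of the task; 0 seconds means "infinite". */
	if (sched_rr_get_interval(pid, &ts) == 0)
		printf("timeslice: %ld.%09ld s\n",
		       (long)ts.tv_sec, ts.tv_nsec);
}
#endif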
2563 static inline struct task_struct *eldest_child(struct task_struct *p)
2565 if (list_empty(&p->children)) return NULL;
2566 return list_entry(p->children.next,struct task_struct,sibling);
2569 static inline struct task_struct *older_sibling(struct task_struct *p)
2571 if (p->sibling.prev==&p->parent->children) return NULL;
2572 return list_entry(p->sibling.prev,struct task_struct,sibling);
2575 static inline struct task_struct *younger_sibling(struct task_struct *p)
2577 if (p->sibling.next==&p->parent->children) return NULL;
2578 return list_entry(p->sibling.next,struct task_struct,sibling);
2581 static void show_task(task_t * p)
2585 unsigned long free = 0;
2586 static const char *stat_nam[] = { "R", "S", "D", "T", "Z", "W" };
2588 printk("%-13.13s ", p->comm);
2589 state = p->state ? __ffs(p->state) + 1 : 0;
2590 if (state < ARRAY_SIZE(stat_nam))
2591 printk(stat_nam[state]);
2594 #if (BITS_PER_LONG == 32)
2595 if (state == TASK_RUNNING)
2596 printk(" running ");
2598 printk(" %08lX ", thread_saved_pc(p));
2600 if (state == TASK_RUNNING)
2601 printk(" running task ");
2603 printk(" %016lx ", thread_saved_pc(p));
2605 #ifdef CONFIG_DEBUG_STACK_USAGE
2607 unsigned long * n = (unsigned long *) (p->thread_info+1);
2610 free = (unsigned long) n - (unsigned long)(p->thread_info+1);
2613 printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
2614 if ((relative = eldest_child(p)))
2615 printk("%5d ", relative->pid);
2618 if ((relative = younger_sibling(p)))
2619 printk("%7d", relative->pid);
2622 if ((relative = older_sibling(p)))
2623 printk(" %5d", relative->pid);
2627 printk(" (L-TLB)\n");
2629 printk(" (NOTLB)\n");
2631 if (state != TASK_RUNNING)
2632 show_stack(p, NULL);
2635 void show_state(void)
2639 #if (BITS_PER_LONG == 32)
2642 printk(" task PC pid father child younger older\n");
2646 printk(" task PC pid father child younger older\n");
2648 read_lock(&tasklist_lock);
2649 do_each_thread(g, p) {
2651 * reset the NMI-timeout, listing all tasks on a slow
2652 * console might take a lot of time:
2654 touch_nmi_watchdog();
2656 } while_each_thread(g, p);
2658 read_unlock(&tasklist_lock);
2661 void __init init_idle(task_t *idle, int cpu)
2663 runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle));
2664 unsigned long flags;
2666 local_irq_save(flags);
2667 double_rq_lock(idle_rq, rq);
2669 idle_rq->curr = idle_rq->idle = idle;
2670 deactivate_task(idle, rq);
2672 idle->prio = MAX_PRIO;
2673 idle->state = TASK_RUNNING;
2674 set_task_cpu(idle, cpu);
2675 double_rq_unlock(idle_rq, rq);
2676 set_tsk_need_resched(idle);
2677 local_irq_restore(flags);
2679 /* Set the preempt count _outside_ the spinlocks! */
2680 #ifdef CONFIG_PREEMPT
2681 idle->thread_info->preempt_count = (idle->lock_depth >= 0);
2683 idle->thread_info->preempt_count = 0;
2688 * In a system that switches off the HZ timer idle_cpu_mask
2689 * indicates which cpus entered this state. This is used
2690 * in the rcu update to wait only for active cpus. For systems
2691 * which do not switch off the HZ timer idle_cpu_mask should
2692 * always be CPU_MASK_NONE.
2694 cpumask_t idle_cpu_mask = CPU_MASK_NONE;
2698 * This is how migration works:
2700 * 1) we queue a migration_req_t structure in the source CPU's
2701 * runqueue and wake up that CPU's migration thread.
2702 * 2) we down() the locked semaphore => thread blocks.
2703 * 3) migration thread wakes up (implicitly it forces the migrated
2704 * thread off the CPU)
2705 * 4) it gets the migration request and checks whether the migrated
2706 * task is still in the wrong runqueue.
2707 * 5) if it's in the wrong runqueue then the migration thread removes
2708 * it and puts it into the right queue.
2709 * 6) migration thread up()s the semaphore.
2710 * 7) we wake up and the migration is done.
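/*
 * Illustrative sketch, not part of the original source: the rough shape of
 * the request that the steps above pass between the caller and the
 * migration thread, inferred from how req->list, req->task and req->done
 * are used in the code below.  Field order and any additional members are
 * assumptions.
 */
#if 0
typedef struct {
	struct list_head list;		/* linked into rq->migration_queue */
	task_t *task;			/* the task to be migrated */
	struct completion done;	/* completed once the move is finished */
} migration_req_t_sketch;
#endif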
2714 * Change a given task's CPU affinity. Migrate the thread to a
2715 * proper CPU and schedule it away if the CPU it's executing on
2716 * is removed from the allowed bitmask.
2718 * NOTE: the caller must have a valid reference to the task; the
2719 * task must not exit() & deallocate itself prematurely. The
2720 * call is not atomic; no spinlocks may be held.
2722 int set_cpus_allowed(task_t *p, cpumask_t new_mask)
2724 unsigned long flags;
2726 migration_req_t req;
2729 rq = task_rq_lock(p, &flags);
2730 if (any_online_cpu(new_mask) == NR_CPUS) {
2735 if (__set_cpus_allowed(p, new_mask, &req)) {
2736 /* Need help from migration thread: drop lock and wait. */
2737 task_rq_unlock(rq, &flags);
2738 wake_up_process(rq->migration_thread);
2739 wait_for_completion(&req.done);
2743 task_rq_unlock(rq, &flags);
2747 EXPORT_SYMBOL_GPL(set_cpus_allowed);
2749 /* Move (not current) task off this cpu, onto dest cpu. */
2750 static void move_task_away(struct task_struct *p, int dest_cpu)
2752 runqueue_t *rq_dest;
2754 rq_dest = cpu_rq(dest_cpu);
2756 double_rq_lock(this_rq(), rq_dest);
2757 if (task_cpu(p) != smp_processor_id())
2758 goto out; /* Already moved */
2760 set_task_cpu(p, dest_cpu);
2762 deactivate_task(p, this_rq());
2763 activate_task(p, rq_dest);
2764 if (p->prio < rq_dest->curr->prio)
2765 resched_task(rq_dest->curr);
2767 p->timestamp = rq_dest->timestamp_last_tick;
2770 double_rq_unlock(this_rq(), rq_dest);
2774 * migration_thread - this is a highprio system thread that performs
2775 * thread migration by bumping the thread off its CPU then 'pushing' it onto
2778 static int migration_thread(void * data)
2781 int cpu = (long)data;
2784 BUG_ON(rq->migration_thread != current);
2786 while (!kthread_should_stop()) {
2787 struct list_head *head;
2788 migration_req_t *req;
2790 if (current->flags & PF_FREEZE)
2791 refrigerator(PF_FREEZE);
2793 spin_lock_irq(&rq->lock);
2794 head = &rq->migration_queue;
2795 current->state = TASK_INTERRUPTIBLE;
2796 if (list_empty(head)) {
2797 spin_unlock_irq(&rq->lock);
2801 req = list_entry(head->next, migration_req_t, list);
2802 list_del_init(head->next);
2803 spin_unlock(&rq->lock);
2805 move_task_away(req->task,
2806 any_online_cpu(req->task->cpus_allowed));
2808 complete(&req->done);
2813 #ifdef CONFIG_HOTPLUG_CPU
2814 /* migrate_all_tasks - function to migrate all the tasks from the
2815 * current cpu. The caller must have already scheduled this to the target
2816 * cpu via set_cpus_allowed. Machine is stopped. */
2817 void migrate_all_tasks(void)
2819 struct task_struct *tsk, *t;
2820 int dest_cpu, src_cpu;
2823 /* We're nailed to this CPU. */
2824 src_cpu = smp_processor_id();
2826 /* Not required, but here for neatness. */
2827 write_lock(&tasklist_lock);
2829 /* watch out for per node tasks, let's stay on this node */
2830 node = cpu_to_node(src_cpu);
2832 do_each_thread(t, tsk) {
2837 if (task_cpu(tsk) != src_cpu)
2840 /* Figure out where this task should go (attempting to
2841 * keep it on-node), and check if it can be migrated
2842 * as-is. NOTE that kernel threads bound to more than
2843 * one online cpu will be migrated. */
2844 mask = node_to_cpumask(node);
2845 cpus_and(mask, mask, tsk->cpus_allowed);
2846 dest_cpu = any_online_cpu(mask);
2847 if (dest_cpu == NR_CPUS)
2848 dest_cpu = any_online_cpu(tsk->cpus_allowed);
2849 if (dest_cpu == NR_CPUS) {
2850 cpus_clear(tsk->cpus_allowed);
2851 cpus_complement(tsk->cpus_allowed);
2852 dest_cpu = any_online_cpu(tsk->cpus_allowed);
2854 /* Don't tell them about moving exiting tasks
2855 or kernel threads (both mm NULL), since
2856 they never leave the kernel. */
2857 if (tsk->mm && printk_ratelimit())
2858 printk(KERN_INFO "process %d (%s) no "
2859 "longer affine to cpu%d\n",
2860 tsk->pid, tsk->comm, src_cpu);
2863 move_task_away(tsk, dest_cpu);
2864 } while_each_thread(t, tsk);
2866 write_unlock(&tasklist_lock);
2868 #endif /* CONFIG_HOTPLUG_CPU */
2871 * migration_call - callback that gets triggered when a CPU is added.
2872 * Here we can start up the necessary migration thread for the new CPU.
2874 static int migration_call(struct notifier_block *nfb, unsigned long action,
2877 int cpu = (long)hcpu;
2878 struct task_struct *p;
2879 struct runqueue *rq;
2880 unsigned long flags;
2883 case CPU_UP_PREPARE:
2884 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
2887 kthread_bind(p, cpu);
2888 /* Must be high prio: stop_machine expects to yield to it. */
2889 rq = task_rq_lock(p, &flags);
2890 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
2891 task_rq_unlock(rq, &flags);
2892 cpu_rq(cpu)->migration_thread = p;
2895 /* Strictly unnecessary, as the first user will wake it. */
2896 wake_up_process(cpu_rq(cpu)->migration_thread);
2898 #ifdef CONFIG_HOTPLUG_CPU
2899 case CPU_UP_CANCELED:
2900 /* Unbind it from offline cpu so it can run. Fall thru. */
2901 kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
2903 kthread_stop(cpu_rq(cpu)->migration_thread);
2904 cpu_rq(cpu)->migration_thread = NULL;
2905 BUG_ON(cpu_rq(cpu)->nr_running != 0);
2912 static struct notifier_block __devinitdata migration_notifier = {
2913 .notifier_call = migration_call,
2916 int __init migration_init(void)
2918 void *cpu = (void *)(long)smp_processor_id();
2919 /* Start one for boot CPU. */
2920 migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
2921 migration_call(&migration_notifier, CPU_ONLINE, cpu);
2922 register_cpu_notifier(&migration_notifier);
2928 * The 'big kernel lock'
2930 * This spinlock is taken and released recursively by lock_kernel()
2931 * and unlock_kernel(). It is transparently dropped and reacquired
2932 * over schedule(). It is used to protect legacy code that hasn't
2933 * been migrated to a proper locking design yet.
2935 * Don't use in new code.
2937 * Note: spinlock debugging needs this even on !CONFIG_SMP.
2939 spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
2940 EXPORT_SYMBOL(kernel_flag);
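/*
 * Illustrative sketch, not part of the original source: the usual pattern in
 * legacy code that still relies on the big kernel lock.  lock_kernel() and
 * unlock_kernel() nest, and the lock is transparently dropped and retaken
 * across schedule().
 */
#if 0
static void legacy_bkl_user(void)
{
	lock_kernel();
	/* ... touch state that is still serialized only by the BKL ... */
	unlock_kernel();
}
#endif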
2942 void __init sched_init(void)
2947 for (i = 0; i < NR_CPUS; i++) {
2948 prio_array_t *array;
2951 rq->active = rq->arrays;
2952 rq->expired = rq->arrays + 1;
2953 rq->best_expired_prio = MAX_PRIO;
2955 spin_lock_init(&rq->lock);
2956 INIT_LIST_HEAD(&rq->migration_queue);
2957 atomic_set(&rq->nr_iowait, 0);
2958 nr_running_init(rq);
2960 for (j = 0; j < 2; j++) {
2961 array = rq->arrays + j;
2962 for (k = 0; k < MAX_PRIO; k++) {
2963 INIT_LIST_HEAD(array->queue + k);
2964 __clear_bit(k, array->bitmap);
2966 // delimiter for bitsearch
2967 __set_bit(MAX_PRIO, array->bitmap);
2971 * We have to do a little magic to get the first
2972 * thread right in SMP mode.
2977 set_task_cpu(current, smp_processor_id());
2978 wake_up_forked_process(current);
2983 * The boot idle thread does lazy MMU switching as well:
2985 atomic_inc(&init_mm.mm_count);
2986 enter_lazy_tlb(&init_mm, current);
2989 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
2990 void __might_sleep(char *file, int line)
2992 #if defined(in_atomic)
2993 static unsigned long prev_jiffy; /* ratelimiting */
2995 if ((in_atomic() || irqs_disabled()) &&
2996 system_state == SYSTEM_RUNNING) {
2997 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
2999 prev_jiffy = jiffies;
3000 printk(KERN_ERR "Debug: sleeping function called from invalid"
3001 " context at %s:%d\n", file, line);
3002 printk("in_atomic():%d, irqs_disabled():%d\n",
3003 in_atomic(), irqs_disabled());
3008 EXPORT_SYMBOL(__might_sleep);
3012 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
3014 * This could be a long-held lock. If another CPU holds it for a long time,
3015 * and that CPU is not asked to reschedule then *this* CPU will spin on the
3016 * lock for a long time, even if *this* CPU is asked to reschedule.
3018 * So what we do here, in the slow (contended) path is to spin on the lock by
3019 * hand while permitting preemption.
3021 * Called inside preempt_disable().
3023 void __sched __preempt_spin_lock(spinlock_t *lock)
3025 if (preempt_count() > 1) {
3026 _raw_spin_lock(lock);
3031 while (spin_is_locked(lock))
3034 } while (!_raw_spin_trylock(lock));
3037 EXPORT_SYMBOL(__preempt_spin_lock);
3039 void __sched __preempt_write_lock(rwlock_t *lock)
3041 if (preempt_count() > 1) {
3042 _raw_write_lock(lock);
3048 while (rwlock_is_locked(lock))
3051 } while (!_raw_write_trylock(lock));
3054 EXPORT_SYMBOL(__preempt_write_lock);
3055 #endif /* defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) */