4 * Kernel scheduler and related syscalls
6 * Copyright (C) 1991-2002 Linus Torvalds
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas.
21 #include <linux/module.h>
22 #include <linux/nmi.h>
23 #include <linux/init.h>
24 #include <asm/uaccess.h>
25 #include <linux/highmem.h>
26 #include <linux/smp_lock.h>
27 #include <asm/mmu_context.h>
28 #include <linux/interrupt.h>
29 #include <linux/completion.h>
30 #include <linux/kernel_stat.h>
31 #include <linux/security.h>
32 #include <linux/notifier.h>
33 #include <linux/suspend.h>
34 #include <linux/blkdev.h>
35 #include <linux/delay.h>
36 #include <linux/smp.h>
37 #include <linux/timer.h>
38 #include <linux/rcupdate.h>
39 #include <linux/cpu.h>
40 #include <linux/percpu.h>
41 #include <linux/kthread.h>
42 #include <linux/vserver/sched.h>
43 #include <linux/vinline.h>
46 #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
48 #define cpu_to_node_mask(cpu) (cpu_online_map)
52 * Convert user-nice values [ -20 ... 0 ... 19 ]
53 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
56 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
57 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
58 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
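/*
 * A quick sanity check of this mapping, assuming the usual values of
 * MAX_RT_PRIO == 100 and MAX_PRIO == 140 (both are config-dependent):
 *
 *	NICE_TO_PRIO(-20) == 100	(highest non-RT static priority)
 *	NICE_TO_PRIO(  0) == 120	(default)
 *	NICE_TO_PRIO(+19) == 139	(lowest priority, MAX_PRIO-1)
 *
 * PRIO_TO_NICE() simply inverts the mapping.
 */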
61 * 'User priority' is the nice value converted to something we
62 * can work with better when scaling various scheduler parameters,
63 * it's a [ 0 ... 39 ] range.
65 #define USER_PRIO(p) ((p)-MAX_RT_PRIO)
66 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
67 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
68 #define AVG_TIMESLICE (MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) *\
69 (MAX_PRIO-1-NICE_TO_PRIO(0))/(MAX_USER_PRIO - 1)))
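/*
 * For illustration: with HZ=1000 (so MIN_TIMESLICE == 10 and
 * MAX_TIMESLICE == 200 jiffies) and a default static priority of 120,
 * AVG_TIMESLICE works out to 10 + 190*19/39 == 102 jiffies - the
 * timeslice of a nice-0 task, roughly 100 msecs. Exact numbers depend
 * on HZ and on the configured priority range.
 */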
72 * Some helpers for converting nanosecond timing to jiffy resolution
74 #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
75 #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
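/*
 * Example, assuming HZ=1000: one jiffy corresponds to 1,000,000 ns, so
 * NS_TO_JIFFIES(2500000) == 2 and JIFFIES_TO_NS(5) == 5000000. Both
 * macros use integer arithmetic, so sub-jiffy remainders are truncated.
 */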
78 * These are the 'tuning knobs' of the scheduler:
80 * Minimum timeslice is 10 msecs, default timeslice is 100 msecs,
81 * maximum timeslice is 200 msecs. Timeslices get refilled after
84 #define MIN_TIMESLICE ( 10 * HZ / 1000)
85 #define MAX_TIMESLICE (200 * HZ / 1000)
86 #define ON_RUNQUEUE_WEIGHT 30
87 #define CHILD_PENALTY 95
88 #define PARENT_PENALTY 100
90 #define PRIO_BONUS_RATIO 25
91 #define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
92 #define INTERACTIVE_DELTA 2
93 #define MAX_SLEEP_AVG (AVG_TIMESLICE * MAX_BONUS)
94 #define STARVATION_LIMIT (MAX_SLEEP_AVG)
95 #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
96 #define NODE_THRESHOLD 125
97 #define CREDIT_LIMIT 100
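/*
 * Derived values, for illustration (assuming the usual 40-level nice
 * range and HZ=1000): MAX_BONUS == 40 * 25/100 == 10, so dynamic
 * priority can move at most +/-5 around static_prio, and
 * MAX_SLEEP_AVG == AVG_TIMESLICE * 10, roughly one second of tracked
 * sleep time (NS_MAX_SLEEP_AVG is the same limit in nanoseconds).
 */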
100 * If a task is 'interactive' then we reinsert it in the active
101 * array after it has expired its current timeslice. (it will not
102 * continue to run immediately, it will still roundrobin with
103 * other interactive tasks.)
105 * This part scales the interactivity limit depending on niceness.
107 * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
108 * Here are a few examples of different nice levels:
110 * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
111 * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
112 * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
113 * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
114 * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
116 * (the X axis represents the possible -5 ... 0 ... +5 dynamic
117 * priority range a task can explore, a value of '1' means the
118 * task is rated interactive.)
120 * I.e. nice +19 tasks can never get 'interactive' enough to be
121 * reinserted into the active array. And only heavy CPU-hog nice -20

122 * tasks will be expired. Default nice 0 tasks are somewhere between,
123 * it takes some effort for them to get interactive, but it's not
127 #define CURRENT_BONUS(p) \
128 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
132 #define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \
133 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
136 #define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \
137 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
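/*
 * Note that "x ? : 1" is the GCC conditional-with-omitted-operand
 * extension, i.e. "x ? x : 1".  For illustration: a maximally
 * interactive task (CURRENT_BONUS == MAX_BONUS) gets a granularity of
 * MIN_TIMESLICE, while one three bonus points below the maximum gets
 * MIN_TIMESLICE << 2, so less interactive tasks are requeued less often.
 */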
140 #define SCALE(v1,v1_max,v2_max) \
141 (v1) * (v2_max) / (v1_max)
144 (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \
147 #define TASK_INTERACTIVE(p) \
148 ((p)->prio <= (p)->static_prio - DELTA(p))
150 #define INTERACTIVE_SLEEP(p) \
151 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
152 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
154 #define HIGH_CREDIT(p) \
155 ((p)->interactive_credit > CREDIT_LIMIT)
157 #define LOW_CREDIT(p) \
158 ((p)->interactive_credit < -CREDIT_LIMIT)
160 #define TASK_PREEMPTS_CURR(p, rq) \
161 ((p)->prio < (rq)->curr->prio)
164 * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
165 * to time slice values.
167 * The higher a thread's priority, the bigger timeslices
168 * it gets during one round of execution. But even the lowest
169 * priority thread gets MIN_TIMESLICE worth of execution time.
171 * task_timeslice() is the interface that is used by the scheduler.
174 #define BASE_TIMESLICE(p) (MIN_TIMESLICE + \
175 ((MAX_TIMESLICE - MIN_TIMESLICE) * \
176 (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1)))
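/*
 * Worked examples, assuming HZ=1000 (one jiffy per millisecond) and the
 * usual 100..139 static priority range:
 *
 *	nice -20 (static_prio 100): 200 msecs
 *	nice   0 (static_prio 120): ~100 msecs
 *	nice +19 (static_prio 139):  10 msecs
 */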
178 static inline unsigned int task_timeslice(task_t *p)
180 return BASE_TIMESLICE(p);
184 * These are the runqueue data structures:
187 #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
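/*
 * BITMAP_SIZE rounds MAX_PRIO+1 bits up to whole longs.  For
 * illustration, with MAX_PRIO == 140 this comes to 5 longs on a 32-bit
 * machine and 3 longs on a 64-bit machine.
 */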
189 typedef struct runqueue runqueue_t;
193 unsigned long bitmap[BITMAP_SIZE];
194 struct list_head queue[MAX_PRIO];
198 * This is the main, per-CPU runqueue data structure.
200 * Locking rule: in places that need to lock multiple runqueues
201 * (such as the load-balancing or thread-migration code), lock
202 * acquire operations must be ordered by ascending &runqueue.
206 unsigned long long nr_switches;
207 unsigned long nr_running, expired_timestamp, nr_uninterruptible,
210 struct mm_struct *prev_mm;
211 prio_array_t *active, *expired, arrays[2];
212 int best_expired_prio, prev_cpu_load[NR_CPUS];
214 atomic_t *node_nr_running;
215 int prev_node_load[MAX_NUMNODES];
217 task_t *migration_thread;
218 struct list_head migration_queue;
219 struct list_head hold_queue;
225 static DEFINE_PER_CPU(struct runqueue, runqueues);
227 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
228 #define this_rq() (&__get_cpu_var(runqueues))
229 #define task_rq(p) cpu_rq(task_cpu(p))
230 #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
232 extern unsigned long __scheduling_functions_start_here;
233 extern unsigned long __scheduling_functions_end_here;
234 const unsigned long scheduling_functions_start_here =
235 (unsigned long)&__scheduling_functions_start_here;
236 const unsigned long scheduling_functions_end_here =
237 (unsigned long)&__scheduling_functions_end_here;
240 * Default context-switch locking:
242 #ifndef prepare_arch_switch
243 # define prepare_arch_switch(rq, next) do { } while (0)
244 # define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock)
245 # define task_running(rq, p) ((rq)->curr == (p))
251 * Keep track of running tasks.
254 static atomic_t node_nr_running[MAX_NUMNODES] ____cacheline_maxaligned_in_smp =
255 {[0 ...MAX_NUMNODES-1] = ATOMIC_INIT(0)};
257 static inline void nr_running_init(struct runqueue *rq)
259 rq->node_nr_running = &node_nr_running[0];
262 static inline void nr_running_inc(runqueue_t *rq)
264 atomic_inc(rq->node_nr_running);
268 static inline void nr_running_dec(runqueue_t *rq)
270 atomic_dec(rq->node_nr_running);
274 __init void node_nr_running_init(void)
278 for (i = 0; i < NR_CPUS; i++) {
280 cpu_rq(i)->node_nr_running =
281 &node_nr_running[cpu_to_node(i)];
285 #else /* !CONFIG_NUMA */
287 # define nr_running_init(rq) do { } while (0)
288 # define nr_running_inc(rq) do { (rq)->nr_running++; } while (0)
289 # define nr_running_dec(rq) do { (rq)->nr_running--; } while (0)
291 #endif /* CONFIG_NUMA */
294 * task_rq_lock - lock the runqueue a given task resides on and disable
295 * interrupts. Note the ordering: we can safely lookup the task_rq without
296 * explicitly disabling preemption.
298 static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
303 local_irq_save(*flags);
305 spin_lock(&rq->lock);
306 if (unlikely(rq != task_rq(p))) {
307 spin_unlock_irqrestore(&rq->lock, *flags);
308 goto repeat_lock_task;
313 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
315 spin_unlock_irqrestore(&rq->lock, *flags);
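/*
 * A minimal usage sketch of the two helpers above (illustrative only):
 *
 *	unsigned long flags;
 *	runqueue_t *rq = task_rq_lock(p, &flags);
 *	... p's runqueue is pinned and IRQs are off here ...
 *	task_rq_unlock(rq, &flags);
 *
 * The retry loop in task_rq_lock() handles the case where p migrates to
 * another CPU between looking up its runqueue and taking the lock.
 */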
319 * rq_lock - lock a given runqueue and disable interrupts.
321 static inline runqueue_t *this_rq_lock(void)
327 spin_lock(&rq->lock);
332 static inline void rq_unlock(runqueue_t *rq)
334 spin_unlock_irq(&rq->lock);
338 * Adding/removing a task to/from a priority array:
340 static inline void dequeue_task(struct task_struct *p, prio_array_t *array)
343 list_del(&p->run_list);
344 if (list_empty(array->queue + p->prio))
345 __clear_bit(p->prio, array->bitmap);
348 static inline void enqueue_task(struct task_struct *p, prio_array_t *array)
350 list_add_tail(&p->run_list, array->queue + p->prio);
351 __set_bit(p->prio, array->bitmap);
357 * effective_prio - return the priority that is based on the static
358 * priority but is modified by bonuses/penalties.
360 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
361 * into the -5 ... 0 ... +5 bonus/penalty range.
363 * We use 25% of the full 0...39 priority range so that:
365 * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
366 * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
368 * Both properties are important to certain workloads.
370 static int effective_prio(task_t *p)
377 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
379 prio = p->static_prio - bonus;
380 if (__vx_task_flags(p, VXF_SCHED_PRIO, 0))
381 prio += effective_vavavoom(p, MAX_USER_PRIO);
383 if (prio < MAX_RT_PRIO)
385 if (prio > MAX_PRIO-1)
391 * __activate_task - move a task to the runqueue.
393 static inline void __activate_task(task_t *p, runqueue_t *rq)
395 enqueue_task(p, rq->active);
399 static void recalc_task_prio(task_t *p, unsigned long long now)
401 unsigned long long __sleep_time = now - p->timestamp;
402 unsigned long sleep_time;
404 if (__sleep_time > NS_MAX_SLEEP_AVG)
405 sleep_time = NS_MAX_SLEEP_AVG;
407 sleep_time = (unsigned long)__sleep_time;
409 if (likely(sleep_time > 0)) {
411 * User tasks that sleep a long time are categorised as
412 * idle and will get just interactive status, to stay active and
413 * prevent them from suddenly becoming cpu hogs and starving
416 if (p->mm && p->activated != -1 &&
417 sleep_time > INTERACTIVE_SLEEP(p)) {
418 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
421 p->interactive_credit++;
424 * The lower the sleep avg a task has the more
425 * rapidly it will rise with sleep time.
427 sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
430 * Tasks with low interactive_credit are limited to
431 * one timeslice worth of sleep avg bonus.
434 sleep_time > JIFFIES_TO_NS(task_timeslice(p)))
435 sleep_time = JIFFIES_TO_NS(task_timeslice(p));
438 * Non high_credit tasks waking from uninterruptible
439 * sleep are limited in their sleep_avg rise as they
440 * are likely to be cpu hogs waiting on I/O
442 if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm) {
443 if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
445 else if (p->sleep_avg + sleep_time >=
446 INTERACTIVE_SLEEP(p)) {
447 p->sleep_avg = INTERACTIVE_SLEEP(p);
453 * This code gives a bonus to interactive tasks.
455 * The boost works by updating the 'average sleep time'
456 * value here, based on ->timestamp. The more time a
457 * task spends sleeping, the higher the average gets -
458 * and the higher the priority boost gets as well.
460 p->sleep_avg += sleep_time;
462 if (p->sleep_avg > NS_MAX_SLEEP_AVG) {
463 p->sleep_avg = NS_MAX_SLEEP_AVG;
465 p->interactive_credit++;
470 p->prio = effective_prio(p);
474 * activate_task - move a task to the runqueue and do priority recalculation
476 * Update all the scheduling statistics stuff. (sleep average
477 * calculation, priority modifiers, etc.)
479 static inline void activate_task(task_t *p, runqueue_t *rq)
481 unsigned long long now = sched_clock();
483 recalc_task_prio(p, now);
486 * This checks to make sure it's not an uninterruptible task
487 * that is now waking up.
491 * Tasks which were woken up by interrupts (ie. hw events)
492 * are most likely of interactive nature. So we give them
493 * the credit of extending their sleep time to the period
494 * of time they spend on the runqueue, waiting for execution
495 * on a CPU, first time around:
501 * Normal first-time wakeups get a credit too for
502 * on-runqueue time, but it will be weighted down:
509 __activate_task(p, rq);
513 * deactivate_task - remove a task from the runqueue.
515 static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
518 if (p->state == TASK_UNINTERRUPTIBLE)
519 rq->nr_uninterruptible++;
520 dequeue_task(p, p->array);
525 * resched_task - mark a task 'to be rescheduled now'.
527 * On UP this means the setting of the need_resched flag, on SMP it
528 * might also involve a cross-CPU call to trigger the scheduler on
531 static inline void resched_task(task_t *p)
534 int need_resched, nrpolling;
537 /* minimise the chance of sending an interrupt to poll_idle() */
538 nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
539 need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
540 nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
542 if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
543 smp_send_reschedule(task_cpu(p));
546 set_tsk_need_resched(p);
551 * task_curr - is this task currently executing on a CPU?
552 * @p: the task in question.
554 inline int task_curr(task_t *p)
556 return cpu_curr(task_cpu(p)) == p;
561 struct list_head list;
563 struct completion done;
567 * The task's runqueue lock must be held, and the new mask must be valid.
568 * Returns true if you have to wait for migration thread.
570 static int __set_cpus_allowed(task_t *p, cpumask_t new_mask,
571 migration_req_t *req)
573 runqueue_t *rq = task_rq(p);
575 p->cpus_allowed = new_mask;
577 * Can the task run on the task's current CPU? If not then
578 * migrate the thread off to a proper CPU.
580 if (cpu_isset(task_cpu(p), new_mask))
584 * If the task is not on a runqueue (and not running), then
585 * it is sufficient to simply update the task's cpu field.
587 if (!p->array && !task_running(rq, p)) {
588 set_task_cpu(p, any_online_cpu(p->cpus_allowed));
592 init_completion(&req->done);
594 list_add(&req->list, &rq->migration_queue);
599 * wait_task_inactive - wait for a thread to unschedule.
601 * The caller must ensure that the task *will* unschedule sometime soon,
602 * else this function might spin for a *long* time. This function can't
603 * be called with interrupts off, or it may introduce deadlock with
604 * smp_call_function() if an IPI is sent by the same process we are
605 * waiting to become inactive.
607 void wait_task_inactive(task_t * p)
614 rq = task_rq_lock(p, &flags);
615 /* Must be off runqueue entirely, not preempted. */
616 if (unlikely(p->array)) {
617 /* If it's preempted, we yield. It could be a while. */
618 preempted = !task_running(rq, p);
619 task_rq_unlock(rq, &flags);
625 task_rq_unlock(rq, &flags);
629 * kick_process - kick a running thread to enter/exit the kernel
630 * @p: the to-be-kicked thread
632 * Cause a process which is running on another CPU to enter
633 * kernel-mode, without any delay. (to get signals handled.)
635 void kick_process(task_t *p)
641 if ((cpu != smp_processor_id()) && task_curr(p))
642 smp_send_reschedule(cpu);
646 EXPORT_SYMBOL_GPL(kick_process);
651 * try_to_wake_up - wake up a thread
652 * @p: the to-be-woken-up thread
653 * @state: the mask of task states that can be woken
654 * @sync: do a synchronous wakeup?
656 * Put it on the run-queue if it's not already there. The "current"
657 * thread is always on the run-queue (except when the actual
658 * re-schedule is in progress), and as such you're allowed to do
659 * the simpler "current->state = TASK_RUNNING" to mark yourself
660 * runnable without the overhead of this.
662 * returns failure only if the task is already active.
664 static int try_to_wake_up(task_t * p, unsigned int state, int sync)
672 rq = task_rq_lock(p, &flags);
673 old_state = p->state;
674 if (old_state & state) {
677 * Fast-migrate the task if it's not running or runnable
678 * currently. Do not violate hard affinity.
680 if (unlikely(sync && !task_running(rq, p) &&
681 (task_cpu(p) != smp_processor_id()) &&
682 cpu_isset(smp_processor_id(),
684 !cpu_is_offline(smp_processor_id()))) {
685 set_task_cpu(p, smp_processor_id());
686 task_rq_unlock(rq, &flags);
687 goto repeat_lock_task;
689 if (old_state == TASK_UNINTERRUPTIBLE) {
690 rq->nr_uninterruptible--;
692 * Tasks on involuntary sleep don't earn
693 * sleep_avg beyond just interactive state.
697 if (sync && (task_cpu(p) == smp_processor_id()))
698 __activate_task(p, rq);
700 activate_task(p, rq);
701 if (TASK_PREEMPTS_CURR(p, rq))
702 resched_task(rq->curr);
706 p->state = TASK_RUNNING;
708 task_rq_unlock(rq, &flags);
712 int fastcall wake_up_process(task_t * p)
714 return try_to_wake_up(p, TASK_STOPPED |
715 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
718 EXPORT_SYMBOL(wake_up_process);
720 int fastcall wake_up_state(task_t *p, unsigned int state)
722 return try_to_wake_up(p, state, 0);
726 * Perform scheduler related setup for a newly forked process p.
727 * p is forked by current.
729 void fastcall sched_fork(task_t *p)
732 * We mark the process as running here, but have not actually
733 * inserted it onto the runqueue yet. This guarantees that
734 * nobody will actually run it, and a signal or other external
735 * event cannot wake it up and insert it on the runqueue either.
737 p->state = TASK_RUNNING;
738 INIT_LIST_HEAD(&p->run_list);
740 spin_lock_init(&p->switch_lock);
741 #ifdef CONFIG_PREEMPT
743 * During context-switch we hold precisely one spinlock, which
744 * schedule_tail drops. (in the common case it's this_rq()->lock,
745 * but it also can be p->switch_lock.) So we compensate with a count
746 * of 1. Also, we want to start with kernel preemption disabled.
748 p->thread_info->preempt_count = 1;
751 * Share the timeslice between parent and child, thus the
752 * total amount of pending timeslices in the system doesn't change,
753 * resulting in more scheduling fairness.
756 p->time_slice = (current->time_slice + 1) >> 1;
758 * The remainder of the first timeslice might be recovered by
759 * the parent if the child exits early enough.
761 p->first_time_slice = 1;
762 current->time_slice >>= 1;
763 p->timestamp = sched_clock();
764 if (!current->time_slice) {
766 * This case is rare, it happens when the parent has only
767 * a single jiffy left from its timeslice. Taking the
768 * runqueue lock is not a problem.
770 current->time_slice = 1;
772 scheduler_tick(0, 0);
780 * wake_up_forked_process - wake up a freshly forked process.
782 * This function will do some initial scheduler statistics housekeeping
783 * that must be done for every newly created process.
785 void fastcall wake_up_forked_process(task_t * p)
788 runqueue_t *rq = task_rq_lock(current, &flags);
790 BUG_ON(p->state != TASK_RUNNING);
793 * We decrease the sleep average of forking parents
794 * and children as well, to keep max-interactive tasks
795 * from forking tasks that are max-interactive.
797 current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
798 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
800 p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
801 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
803 p->interactive_credit = 0;
805 p->prio = effective_prio(p);
806 set_task_cpu(p, smp_processor_id());
808 if (unlikely(!current->array))
809 __activate_task(p, rq);
811 p->prio = current->prio;
812 list_add_tail(&p->run_list, &current->run_list);
813 p->array = current->array;
814 p->array->nr_active++;
817 task_rq_unlock(rq, &flags);
821 * Potentially available exiting-child timeslices are
822 * retrieved here - this way the parent does not get
823 * penalized for creating too many threads.
825 * (this cannot be used to 'generate' timeslices
826 * artificially, because any timeslice recovered here
827 * was given away by the parent in the first place.)
829 void fastcall sched_exit(task_t * p)
834 local_irq_save(flags);
835 if (p->first_time_slice) {
836 p->parent->time_slice += p->time_slice;
837 if (unlikely(p->parent->time_slice > MAX_TIMESLICE))
838 p->parent->time_slice = MAX_TIMESLICE;
840 local_irq_restore(flags);
842 * If the child was a (relative-) CPU hog then decrease
843 * the sleep_avg of the parent as well.
845 rq = task_rq_lock(p->parent, &flags);
846 if (p->sleep_avg < p->parent->sleep_avg)
847 p->parent->sleep_avg = p->parent->sleep_avg /
848 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
850 task_rq_unlock(rq, &flags);
854 * finish_task_switch - clean up after a task-switch
855 * @prev: the thread we just switched away from.
857 * We enter this with the runqueue still locked, and finish_arch_switch()
858 * will unlock it along with doing any other architecture-specific cleanup
861 * Note that we may have delayed dropping an mm in context_switch(). If
862 * so, we finish that here outside of the runqueue lock. (Doing it
863 * with the lock held can cause deadlocks; see schedule() for
866 static inline void finish_task_switch(task_t *prev)
868 runqueue_t *rq = this_rq();
869 struct mm_struct *mm = rq->prev_mm;
870 unsigned long prev_task_flags;
875 * A task struct has one reference for the use as "current".
876 * If a task dies, then it sets TASK_ZOMBIE in tsk->state and calls
877 * schedule one last time. The schedule call will never return,
878 * and the scheduled task must drop that reference.
879 * The test for TASK_ZOMBIE must occur while the runqueue locks are
880 * still held, otherwise prev could be scheduled on another cpu, die
881 * there before we look at prev->state, and then the reference would
883 * Manfred Spraul <manfred@colorfullife.com>
885 prev_task_flags = prev->flags;
886 finish_arch_switch(rq, prev);
889 if (unlikely(prev_task_flags & PF_DEAD))
890 put_task_struct(prev);
894 * schedule_tail - first thing a freshly forked thread must call.
895 * @prev: the thread we just switched away from.
897 asmlinkage void schedule_tail(task_t *prev)
899 finish_task_switch(prev);
901 if (current->set_child_tid)
902 put_user(current->pid, current->set_child_tid);
906 * context_switch - switch to the new MM and the new
907 * thread's register state.
910 task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
912 struct mm_struct *mm = next->mm;
913 struct mm_struct *oldmm = prev->active_mm;
916 next->active_mm = oldmm;
917 atomic_inc(&oldmm->mm_count);
918 enter_lazy_tlb(oldmm, next);
920 switch_mm(oldmm, mm, next);
922 if (unlikely(!prev->mm)) {
923 prev->active_mm = NULL;
924 WARN_ON(rq->prev_mm);
928 /* Here we just switch the register state and the stack. */
929 switch_to(prev, next, prev);
935 * nr_running, nr_uninterruptible and nr_context_switches:
937 * externally visible scheduler statistics: current number of runnable
938 * threads, current number of uninterruptible-sleeping threads, total
939 * number of context switches performed since bootup.
941 unsigned long nr_running(void)
943 unsigned long i, sum = 0;
945 for (i = 0; i < NR_CPUS; i++)
946 sum += cpu_rq(i)->nr_running;
951 unsigned long nr_uninterruptible(void)
953 unsigned long i, sum = 0;
956 sum += cpu_rq(i)->nr_uninterruptible;
961 unsigned long long nr_context_switches(void)
963 unsigned long long i, sum = 0;
966 sum += cpu_rq(i)->nr_switches;
971 unsigned long nr_iowait(void)
973 unsigned long i, sum = 0;
976 sum += atomic_read(&cpu_rq(i)->nr_iowait);
982 * double_rq_lock - safely lock two runqueues
984 * Note this does not disable interrupts like task_rq_lock,
985 * you need to do so manually before calling.
987 static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
990 spin_lock(&rq1->lock);
993 spin_lock(&rq1->lock);
994 spin_lock(&rq2->lock);
996 spin_lock(&rq2->lock);
997 spin_lock(&rq1->lock);
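/*
 * double_rq_lock() above always takes the lock of whichever runqueue
 * has the lower address first, matching the "ascending &runqueue"
 * ordering rule described near the top of the file, so two CPUs locking
 * the same pair of runqueues cannot deadlock against each other.
 */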
1003 * double_rq_unlock - safely unlock two runqueues
1005 * Note this does not restore interrupts like task_rq_unlock,
1006 * you need to do so manually after calling.
1008 static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
1010 spin_unlock(&rq1->lock);
1012 spin_unlock(&rq2->lock);
1017 * If dest_cpu is allowed for this process, migrate the task to it.
1018 * This is accomplished by forcing the cpu_allowed mask to only
1019 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
1020 * the cpu_allowed mask is restored.
1022 static void sched_migrate_task(task_t *p, int dest_cpu)
1025 migration_req_t req;
1026 unsigned long flags;
1027 cpumask_t old_mask, new_mask = cpumask_of_cpu(dest_cpu);
1030 rq = task_rq_lock(p, &flags);
1031 old_mask = p->cpus_allowed;
1032 if (!cpu_isset(dest_cpu, old_mask) || !cpu_online(dest_cpu))
1035 /* force the process onto the specified CPU */
1036 if (__set_cpus_allowed(p, new_mask, &req)) {
1037 /* Need to wait for migration thread. */
1038 task_rq_unlock(rq, &flags);
1039 wake_up_process(rq->migration_thread);
1040 wait_for_completion(&req.done);
1042 /* If we raced with sys_sched_setaffinity, don't
1044 rq = task_rq_lock(p, &flags);
1045 if (likely(cpus_equal(p->cpus_allowed, new_mask))) {
1046 /* Restore old mask: won't need migration
1047 * thread, since current cpu is allowed. */
1048 BUG_ON(__set_cpus_allowed(p, old_mask, NULL));
1052 task_rq_unlock(rq, &flags);
1053 unlock_cpu_hotplug();
1057 * Find the least loaded CPU. Slightly favor the current CPU by
1058 * setting its runqueue length as the minimum to start.
1060 static int sched_best_cpu(struct task_struct *p)
1062 int i, minload, load, best_cpu, node = 0;
1065 best_cpu = task_cpu(p);
1066 if (cpu_rq(best_cpu)->nr_running <= 2)
1070 for_each_node_with_cpus(i) {
1072 * Node load is always divided by nr_cpus_node to normalise
1073 * load values in case cpu count differs from node to node.
1074 * We first multiply node_nr_running by 10 to get a little
1075 * better resolution.
1077 load = 10 * atomic_read(&node_nr_running[i]) / nr_cpus_node(i);
1078 if (load < minload) {
1085 cpumask = node_to_cpumask(node);
1086 for (i = 0; i < NR_CPUS; ++i) {
1087 if (!cpu_isset(i, cpumask))
1089 if (cpu_rq(i)->nr_running < minload) {
1091 minload = cpu_rq(i)->nr_running;
1097 void sched_balance_exec(void)
1102 new_cpu = sched_best_cpu(current);
1103 if (new_cpu != smp_processor_id())
1104 sched_migrate_task(current, new_cpu);
1109 * Find the busiest node. All previous node loads contribute with a
1110 * geometrically decaying weight to the load measure:
1111 * load_{t} = load_{t-1}/2 + nr_node_running_{t}
1112 * This way sudden load peaks are flattened out a bit.
1113 * Node load is divided by nr_cpus_node() in order to compare nodes
1114 * of different cpu count but also [first] multiplied by 10 to
1115 * provide better resolution.
1117 static int find_busiest_node(int this_node)
1119 int i, node = -1, load, this_load, maxload;
1121 if (!nr_cpus_node(this_node))
1123 this_load = maxload = (this_rq()->prev_node_load[this_node] >> 1)
1124 + (10 * atomic_read(&node_nr_running[this_node])
1125 / nr_cpus_node(this_node));
1126 this_rq()->prev_node_load[this_node] = this_load;
1127 for_each_node_with_cpus(i) {
1130 load = (this_rq()->prev_node_load[i] >> 1)
1131 + (10 * atomic_read(&node_nr_running[i])
1133 this_rq()->prev_node_load[i] = load;
1134 if (load > maxload && (100*load > NODE_THRESHOLD*this_load)) {
1142 #endif /* CONFIG_NUMA */
1147 * double_lock_balance - lock the busiest runqueue
1149 * this_rq is locked already. Recalculate nr_running if we have to
1150 * drop the runqueue lock.
1153 unsigned int double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest,
1154 int this_cpu, int idle,
1155 unsigned int nr_running)
1157 if (unlikely(!spin_trylock(&busiest->lock))) {
1158 if (busiest < this_rq) {
1159 spin_unlock(&this_rq->lock);
1160 spin_lock(&busiest->lock);
1161 spin_lock(&this_rq->lock);
1162 /* Need to recalculate nr_running */
1163 if (idle || (this_rq->nr_running >
1164 this_rq->prev_cpu_load[this_cpu]))
1165 nr_running = this_rq->nr_running;
1167 nr_running = this_rq->prev_cpu_load[this_cpu];
1169 spin_lock(&busiest->lock);
1175 * find_busiest_queue - find the busiest runqueue among the cpus in cpumask.
1178 runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle,
1179 int *imbalance, cpumask_t cpumask)
1181 int nr_running, load, max_load, i;
1182 runqueue_t *busiest, *rq_src;
1185 * We search all runqueues to find the most busy one.
1186 * We do this lockless to reduce cache-bouncing overhead,
1187 * we re-check the 'best' source CPU later on again, with
1190 * We fend off statistical fluctuations in runqueue lengths by
1191 * saving the runqueue length (as seen by the balancing CPU) during
1192 * the previous load-balancing operation and using the smaller one
1193 * of the current and saved lengths. If a runqueue stays long
1194 * for a sustained amount of time, we recognize it and pull tasks
1197 * The 'current runqueue length' is a statistical maximum variable,
1198 * for that one we take the longer one - to avoid fluctuations in
1199 * the other direction. So for a load-balance to happen it needs
1200 * stable long runqueue on the target CPU and stable short runqueue
1201 * on the local runqueue.
1203 * We make an exception if this CPU is about to become idle - in
1204 * that case we are less picky about moving a task across CPUs and
1205 * take what can be taken.
1207 if (idle || (this_rq->nr_running > this_rq->prev_cpu_load[this_cpu]))
1208 nr_running = this_rq->nr_running;
1210 nr_running = this_rq->prev_cpu_load[this_cpu];
1214 for (i = 0; i < NR_CPUS; i++) {
1215 if (!cpu_isset(i, cpumask))
1219 if (idle || (rq_src->nr_running < this_rq->prev_cpu_load[i]))
1220 load = rq_src->nr_running;
1222 load = this_rq->prev_cpu_load[i];
1223 this_rq->prev_cpu_load[i] = rq_src->nr_running;
1225 if ((load > max_load) && (rq_src != this_rq)) {
1231 if (likely(!busiest))
1234 *imbalance = max_load - nr_running;
1236 /* It needs an at least ~25% imbalance to trigger balancing. */
1237 if (!idle && ((*imbalance)*4 < max_load)) {
1242 nr_running = double_lock_balance(this_rq, busiest, this_cpu,
1245 * Make sure nothing changed since we checked the
1248 if (busiest->nr_running <= nr_running) {
1249 spin_unlock(&busiest->lock);
1257 * pull_task - move a task from a remote runqueue to the local runqueue.
1258 * Both runqueues must be locked.
1261 void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1262 runqueue_t *this_rq, int this_cpu)
1264 dequeue_task(p, src_array);
1265 nr_running_dec(src_rq);
1266 set_task_cpu(p, this_cpu);
1267 nr_running_inc(this_rq);
1268 enqueue_task(p, this_rq->active);
1269 p->timestamp = sched_clock() -
1270 (src_rq->timestamp_last_tick - p->timestamp);
1272 * Note that idle threads have a prio of MAX_PRIO, for this test
1273 * to be always true for them.
1275 if (TASK_PREEMPTS_CURR(p, this_rq))
1280 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1283 int can_migrate_task(task_t *tsk, runqueue_t *rq, int this_cpu, int idle)
1285 unsigned long delta = rq->timestamp_last_tick - tsk->timestamp;
1288 * We do not migrate tasks that are:
1289 * 1) running (obviously), or
1290 * 2) cannot be migrated to this CPU due to cpus_allowed, or
1291 * 3) are cache-hot on their current CPU.
1293 if (task_running(rq, tsk))
1295 if (!cpu_isset(this_cpu, tsk->cpus_allowed))
1297 if (!idle && (delta <= JIFFIES_TO_NS(cache_decay_ticks)))
1303 * Current runqueue is empty, or rebalance tick: if there is an
1304 * imbalance (current runqueue is too short) then pull from
1305 * busiest runqueue(s).
1307 * We call this with the current runqueue locked,
1310 static void load_balance(runqueue_t *this_rq, int idle, cpumask_t cpumask)
1312 int imbalance, idx, this_cpu = smp_processor_id();
1313 runqueue_t *busiest;
1314 prio_array_t *array;
1315 struct list_head *head, *curr;
1318 if (cpu_is_offline(this_cpu))
1321 busiest = find_busiest_queue(this_rq, this_cpu, idle,
1322 &imbalance, cpumask);
1327 * We only want to steal a number of tasks equal to 1/2 the imbalance,
1328 * otherwise we'll just shift the imbalance to the new queue:
1333 * We first consider expired tasks. Those will likely not be
1334 * executed in the near future, and they are most likely to
1335 * be cache-cold, thus switching CPUs has the least effect
1338 if (busiest->expired->nr_active)
1339 array = busiest->expired;
1341 array = busiest->active;
1344 /* Start searching at priority 0: */
1348 idx = sched_find_first_bit(array->bitmap);
1350 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1351 if (idx >= MAX_PRIO) {
1352 if (array == busiest->expired) {
1353 array = busiest->active;
1359 head = array->queue + idx;
1362 tmp = list_entry(curr, task_t, run_list);
1366 if (!can_migrate_task(tmp, busiest, this_cpu, idle)) {
1372 pull_task(busiest, array, tmp, this_rq, this_cpu);
1374 /* Only migrate one task if we are idle */
1375 if (!idle && --imbalance) {
1382 spin_unlock(&busiest->lock);
1388 * One of the idle_cpu_tick() and busy_cpu_tick() functions will
1389 * get called every timer tick, on every CPU. Our balancing action
1390 * frequency and balancing aggressiveness depend on whether the CPU is
1393 * busy-rebalance every 200 msecs. idle-rebalance every 1 msec. (or on
1394 * systems with HZ=100, every 10 msecs.)
1396 * On NUMA, do a node-rebalance every 400 msecs.
1398 #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
1399 #define BUSY_REBALANCE_TICK (HZ/5 ?: 1)
1400 #define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 5)
1401 #define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2)
1404 static void balance_node(runqueue_t *this_rq, int idle, int this_cpu)
1406 int node = find_busiest_node(cpu_to_node(this_cpu));
1409 cpumask_t cpumask = node_to_cpumask(node);
1410 cpu_set(this_cpu, cpumask);
1411 spin_lock(&this_rq->lock);
1412 load_balance(this_rq, idle, cpumask);
1413 spin_unlock(&this_rq->lock);
1418 static void rebalance_tick(runqueue_t *this_rq, int idle)
1421 int this_cpu = smp_processor_id();
1423 unsigned long j = jiffies;
1426 * First do inter-node rebalancing, then intra-node rebalancing,
1427 * if both events happen in the same tick. The inter-node
1428 * rebalancing does not necessarily have to create a perfect
1429 * balance within the node, since we load-balance the most loaded
1430 * node with the current CPU. (ie. other CPUs in the local node
1431 * are not balanced.)
1435 if (!(j % IDLE_NODE_REBALANCE_TICK))
1436 balance_node(this_rq, idle, this_cpu);
1438 if (!(j % IDLE_REBALANCE_TICK)) {
1439 spin_lock(&this_rq->lock);
1440 load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
1441 spin_unlock(&this_rq->lock);
1446 if (!(j % BUSY_NODE_REBALANCE_TICK))
1447 balance_node(this_rq, idle, this_cpu);
1449 if (!(j % BUSY_REBALANCE_TICK)) {
1450 spin_lock(&this_rq->lock);
1451 load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
1452 spin_unlock(&this_rq->lock);
1457 * on UP we do not need to balance between CPUs:
1459 static inline void rebalance_tick(runqueue_t *this_rq, int idle)
1464 DEFINE_PER_CPU(struct kernel_stat, kstat);
1466 EXPORT_PER_CPU_SYMBOL(kstat);
1469 * We place interactive tasks back into the active array, if possible.
1471 * To guarantee that this does not starve expired tasks we ignore the
1472 * interactivity of a task if the first expired task had to wait more
1473 * than a 'reasonable' amount of time. This deadline timeout is
1474 * load-dependent, as the frequency of array switches decreases with
1475 * increasing number of running tasks. We also ignore the interactivity
1476 * if a better static_prio task has expired:
1478 #define EXPIRED_STARVING(rq) \
1479 ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
1480 (jiffies - (rq)->expired_timestamp >= \
1481 STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
1482 ((rq)->curr->static_prio > (rq)->best_expired_prio))
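/*
 * For illustration, with HZ=1000 STARVATION_LIMIT is roughly one second
 * per runnable task: on a runqueue with two runnable tasks, interactive
 * reinsertion is switched off once the oldest expired task has been
 * waiting about two seconds, and likewise whenever the running task has
 * a worse static priority than the best expired one.
 */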
1485 * This function gets called by the timer code, with HZ frequency.
1486 * We call it with interrupts disabled.
1488 * It also gets called by the fork code, when changing the parent's
1491 void scheduler_tick(int user_ticks, int sys_ticks)
1493 int cpu = smp_processor_id();
1494 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1495 runqueue_t *rq = this_rq();
1496 task_t *p = current;
1498 rq->timestamp_last_tick = sched_clock();
1500 if (rcu_pending(cpu))
1501 rcu_check_callbacks(cpu, user_ticks);
1503 /* note: this timer irq context must be accounted for as well */
1504 if (hardirq_count() - HARDIRQ_OFFSET) {
1505 cpustat->irq += sys_ticks;
1507 } else if (softirq_count()) {
1508 cpustat->softirq += sys_ticks;
1512 if (p == rq->idle) {
1513 if (!--rq->idle_tokens && !list_empty(&rq->hold_queue))
1516 if (atomic_read(&rq->nr_iowait) > 0)
1517 cpustat->iowait += sys_ticks;
1519 cpustat->idle += sys_ticks;
1520 rebalance_tick(rq, 1);
1523 if (TASK_NICE(p) > 0)
1524 cpustat->nice += user_ticks;
1526 cpustat->user += user_ticks;
1527 cpustat->system += sys_ticks;
1529 /* Task might have expired already, but not scheduled off yet */
1530 if (p->array != rq->active) {
1531 set_tsk_need_resched(p);
1534 spin_lock(&rq->lock);
1536 * The task was running during this tick - update the
1537 * time slice counter. Note: we do not update a thread's
1538 * priority until it either goes to sleep or uses up its
1539 * timeslice. This makes it possible for interactive tasks
1540 * to use up their timeslices at their highest priority levels.
1542 if (unlikely(rt_task(p))) {
1544 * RR tasks need a special form of timeslice management.
1545 * FIFO tasks have no timeslices.
1547 if ((p->policy == SCHED_RR) && !--p->time_slice) {
1548 p->time_slice = task_timeslice(p);
1549 p->first_time_slice = 0;
1550 set_tsk_need_resched(p);
1552 /* put it at the end of the queue: */
1553 dequeue_task(p, rq->active);
1554 enqueue_task(p, rq->active);
1558 if (vx_need_resched(p)) {
1559 dequeue_task(p, rq->active);
1560 set_tsk_need_resched(p);
1561 p->prio = effective_prio(p);
1562 p->time_slice = task_timeslice(p);
1563 p->first_time_slice = 0;
1565 if (!rq->expired_timestamp)
1566 rq->expired_timestamp = jiffies;
1567 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
1568 enqueue_task(p, rq->expired);
1569 if (p->static_prio < rq->best_expired_prio)
1570 rq->best_expired_prio = p->static_prio;
1572 enqueue_task(p, rq->active);
1575 * Prevent too long a timeslice from allowing a task to monopolize
1576 * the CPU. We do this by splitting up the timeslice into
1579 * Note: this does not mean the task's timeslices expire or
1580 * get lost in any way, they just might be preempted by
1581 * another task of equal priority. (one with higher
1582 * priority would have preempted this task already.) We
1583 * requeue this task to the end of the list on this priority
1584 * level, which is in essence a round-robin of tasks with
1587 * This only applies to tasks in the interactive
1588 * delta range with at least TIMESLICE_GRANULARITY to requeue.
1590 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
1591 p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
1592 (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
1593 (p->array == rq->active)) {
1595 dequeue_task(p, rq->active);
1596 set_tsk_need_resched(p);
1597 p->prio = effective_prio(p);
1598 enqueue_task(p, rq->active);
1602 spin_unlock(&rq->lock);
1604 rebalance_tick(rq, 0);
1608 * schedule() is the main scheduler function.
1610 asmlinkage void __sched schedule(void)
1613 task_t *prev, *next;
1615 prio_array_t *array;
1616 struct list_head *queue;
1617 unsigned long long now;
1618 unsigned long run_time;
1619 #ifdef CONFIG_VSERVER_HARDCPU
1620 struct vx_info *vxi;
1626 * Test if we are atomic. Since do_exit() needs to call into
1627 * schedule() atomically, we ignore that path for now.
1628 * Otherwise, whine if we are scheduling when we should not be.
1630 if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) {
1631 if (unlikely(in_atomic())) {
1632 printk(KERN_ERR "bad: scheduling while atomic!\n");
1642 release_kernel_lock(prev);
1643 now = sched_clock();
1644 if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
1645 run_time = now - prev->timestamp;
1647 run_time = NS_MAX_SLEEP_AVG;
1650 * Tasks with interactive credits get charged less run_time
1651 * at high sleep_avg to delay them losing their interactive
1654 if (HIGH_CREDIT(prev))
1655 run_time /= (CURRENT_BONUS(prev) ? : 1);
1657 spin_lock_irq(&rq->lock);
1660 * if entering off of a kernel preemption go straight
1661 * to picking the next task.
1663 switch_count = &prev->nivcsw;
1664 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
1665 switch_count = &prev->nvcsw;
1666 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
1667 unlikely(signal_pending(prev))))
1668 prev->state = TASK_RUNNING;
1670 deactivate_task(prev, rq);
1673 #ifdef CONFIG_VSERVER_HARDCPU
1674 if (!list_empty(&rq->hold_queue)) {
1675 struct list_head *l, *n;
1679 list_for_each_safe(l, n, &rq->hold_queue) {
1680 next = list_entry(l, task_t, run_list);
1681 if (vxi == next->vx_info)
1684 vxi = next->vx_info;
1685 ret = vx_tokens_recalc(vxi);
1686 // tokens = vx_tokens_avail(next);
1689 list_del(&next->run_list);
1690 next->state &= ~TASK_ONHOLD;
1691 recalc_task_prio(next, now);
1692 __activate_task(next, rq);
1693 // printk("··· unhold %p\n", next);
1696 if ((ret < 0) && (maxidle < ret))
1700 rq->idle_tokens = -maxidle;
1704 if (unlikely(!rq->nr_running)) {
1706 load_balance(rq, 1, cpu_to_node_mask(smp_processor_id()));
1708 if (!rq->nr_running) {
1710 rq->expired_timestamp = 0;
1716 if (unlikely(!array->nr_active)) {
1718 * Switch the active and expired arrays.
1720 rq->active = rq->expired;
1721 rq->expired = array;
1723 rq->expired_timestamp = 0;
1724 rq->best_expired_prio = MAX_PRIO;
1727 idx = sched_find_first_bit(array->bitmap);
1728 queue = array->queue + idx;
1729 next = list_entry(queue->next, task_t, run_list);
1731 #ifdef CONFIG_VSERVER_HARDCPU
1732 vxi = next->vx_info;
1733 if (vxi && __vx_flags(vxi->vx_flags,
1734 VXF_SCHED_PAUSE|VXF_SCHED_HARD, 0)) {
1735 int ret = vx_tokens_recalc(vxi);
1737 if (unlikely(ret <= 0)) {
1738 if (ret && (rq->idle_tokens > -ret))
1739 rq->idle_tokens = -ret;
1740 deactivate_task(next, rq);
1741 list_add_tail(&next->run_list, &rq->hold_queue);
1742 next->state |= TASK_ONHOLD;
1748 if (!rt_task(next) && next->activated > 0) {
1749 unsigned long long delta = now - next->timestamp;
1751 if (next->activated == 1)
1752 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
1754 array = next->array;
1755 dequeue_task(next, array);
1756 recalc_task_prio(next, next->timestamp + delta);
1757 enqueue_task(next, array);
1759 next->activated = 0;
1762 clear_tsk_need_resched(prev);
1763 RCU_qsctr(task_cpu(prev))++;
1765 prev->sleep_avg -= run_time;
1766 if ((long)prev->sleep_avg <= 0) {
1767 prev->sleep_avg = 0;
1768 if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev)))
1769 prev->interactive_credit--;
1771 prev->timestamp = now;
1773 if (likely(prev != next)) {
1774 next->timestamp = now;
1779 prepare_arch_switch(rq, next);
1780 prev = context_switch(rq, prev, next);
1783 finish_task_switch(prev);
1785 spin_unlock_irq(&rq->lock);
1787 reacquire_kernel_lock(current);
1788 preempt_enable_no_resched();
1789 if (test_thread_flag(TIF_NEED_RESCHED))
1793 EXPORT_SYMBOL(schedule);
1795 #ifdef CONFIG_PREEMPT
1797 * this is the entry point to schedule() from in-kernel preemption
1798 * off of preempt_enable. Kernel preemption off of return-from-interrupt
1799 * occurs there and calls schedule() directly.
1801 asmlinkage void __sched preempt_schedule(void)
1803 struct thread_info *ti = current_thread_info();
1806 * If there is a non-zero preempt_count or interrupts are disabled,
1807 * we do not want to preempt the current task. Just return..
1809 if (unlikely(ti->preempt_count || irqs_disabled()))
1813 ti->preempt_count = PREEMPT_ACTIVE;
1815 ti->preempt_count = 0;
1817 /* we could miss a preemption opportunity between schedule and now */
1819 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
1823 EXPORT_SYMBOL(preempt_schedule);
1824 #endif /* CONFIG_PREEMPT */
1826 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync)
1828 task_t *p = curr->task;
1829 return try_to_wake_up(p, mode, sync);
1832 EXPORT_SYMBOL(default_wake_function);
1835 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
1836 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
1837 * number) then we wake all the non-exclusive tasks and one exclusive task.
1839 * There are circumstances in which we can try to wake a task which has already
1840 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
1841 * zero in this (rare) case, and we handle it by continuing to scan the queue.
1843 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
1844 int nr_exclusive, int sync)
1846 struct list_head *tmp, *next;
1848 list_for_each_safe(tmp, next, &q->task_list) {
1851 curr = list_entry(tmp, wait_queue_t, task_list);
1852 flags = curr->flags;
1853 if (curr->func(curr, mode, sync) &&
1854 (flags & WQ_FLAG_EXCLUSIVE) &&
1861 * __wake_up - wake up threads blocked on a waitqueue.
1863 * @mode: which threads
1864 * @nr_exclusive: how many wake-one or wake-many threads to wake up
1866 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
1868 unsigned long flags;
1870 spin_lock_irqsave(&q->lock, flags);
1871 __wake_up_common(q, mode, nr_exclusive, 0);
1872 spin_unlock_irqrestore(&q->lock, flags);
1875 EXPORT_SYMBOL(__wake_up);
1878 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
1880 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
1882 __wake_up_common(q, mode, 1, 0);
1886 * __wake_up_sync - wake up threads blocked on a waitqueue.
1888 * @mode: which threads
1889 * @nr_exclusive: how many wake-one or wake-many threads to wake up
1891 * The sync wakeup differs in that the waker knows that it will schedule
1892 * away soon, so while the target thread will be woken up, it will not
1893 * be migrated to another CPU - ie. the two threads are 'synchronized'
1894 * with each other. This can prevent needless bouncing between CPUs.
1896 * On UP it can prevent extra preemption.
1898 void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
1900 unsigned long flags;
1905 spin_lock_irqsave(&q->lock, flags);
1906 if (likely(nr_exclusive))
1907 __wake_up_common(q, mode, nr_exclusive, 1);
1909 __wake_up_common(q, mode, nr_exclusive, 0);
1910 spin_unlock_irqrestore(&q->lock, flags);
1912 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
1914 void fastcall complete(struct completion *x)
1916 unsigned long flags;
1918 spin_lock_irqsave(&x->wait.lock, flags);
1920 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
1922 spin_unlock_irqrestore(&x->wait.lock, flags);
1924 EXPORT_SYMBOL(complete);
1926 void fastcall complete_all(struct completion *x)
1928 unsigned long flags;
1930 spin_lock_irqsave(&x->wait.lock, flags);
1931 x->done += UINT_MAX/2;
1932 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
1934 spin_unlock_irqrestore(&x->wait.lock, flags);
1936 EXPORT_SYMBOL(complete_all);
1938 void fastcall __sched wait_for_completion(struct completion *x)
1941 spin_lock_irq(&x->wait.lock);
1943 DECLARE_WAITQUEUE(wait, current);
1945 wait.flags |= WQ_FLAG_EXCLUSIVE;
1946 __add_wait_queue_tail(&x->wait, &wait);
1948 __set_current_state(TASK_UNINTERRUPTIBLE);
1949 spin_unlock_irq(&x->wait.lock);
1951 spin_lock_irq(&x->wait.lock);
1953 __remove_wait_queue(&x->wait, &wait);
1956 spin_unlock_irq(&x->wait.lock);
1958 EXPORT_SYMBOL(wait_for_completion);
1960 #define SLEEP_ON_VAR \
1961 unsigned long flags; \
1962 wait_queue_t wait; \
1963 init_waitqueue_entry(&wait, current);
1965 #define SLEEP_ON_HEAD \
1966 spin_lock_irqsave(&q->lock,flags); \
1967 __add_wait_queue(q, &wait); \
1968 spin_unlock(&q->lock);
1970 #define SLEEP_ON_TAIL \
1971 spin_lock_irq(&q->lock); \
1972 __remove_wait_queue(q, &wait); \
1973 spin_unlock_irqrestore(&q->lock, flags);
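/*
 * The sleep_on() family below uses these three macros as a fixed
 * pattern, roughly (illustrative sketch):
 *
 *	SLEEP_ON_VAR			declare and init a wait entry
 *	current->state = TASK_(UN)INTERRUPTIBLE;
 *	SLEEP_ON_HEAD			add to the waitqueue, drop the lock
 *	schedule();			or schedule_timeout(timeout)
 *	SLEEP_ON_TAIL			remove from the waitqueue
 */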
1975 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
1979 current->state = TASK_INTERRUPTIBLE;
1986 EXPORT_SYMBOL(interruptible_sleep_on);
1988 long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
1992 current->state = TASK_INTERRUPTIBLE;
1995 timeout = schedule_timeout(timeout);
2001 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
2003 void fastcall __sched sleep_on(wait_queue_head_t *q)
2007 current->state = TASK_UNINTERRUPTIBLE;
2014 EXPORT_SYMBOL(sleep_on);
2016 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
2020 current->state = TASK_UNINTERRUPTIBLE;
2023 timeout = schedule_timeout(timeout);
2029 EXPORT_SYMBOL(sleep_on_timeout);
2031 void set_user_nice(task_t *p, long nice)
2033 unsigned long flags;
2034 prio_array_t *array;
2036 int old_prio, new_prio, delta;
2038 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
2041 * We have to be careful, if called from sys_setpriority(),
2042 * the task might be in the middle of scheduling on another CPU.
2044 rq = task_rq_lock(p, &flags);
2046 * The RT priorities are set via setscheduler(), but we still
2047 * allow the 'normal' nice value to be set - but as expected
2048 * it won't have any effect on scheduling until the task is
2052 p->static_prio = NICE_TO_PRIO(nice);
2057 dequeue_task(p, array);
2060 new_prio = NICE_TO_PRIO(nice);
2061 delta = new_prio - old_prio;
2062 p->static_prio = NICE_TO_PRIO(nice);
2066 enqueue_task(p, array);
2068 * If the task increased its priority or is running and
2069 * lowered its priority, then reschedule its CPU:
2071 if (delta < 0 || (delta > 0 && task_running(rq, p)))
2072 resched_task(rq->curr);
2075 task_rq_unlock(rq, &flags);
2078 EXPORT_SYMBOL(set_user_nice);
2083 * sys_nice - change the priority of the current process.
2084 * @increment: priority increment
2086 * sys_setpriority is a more generic, but much slower function that
2087 * does similar things.
2089 asmlinkage long sys_nice(int increment)
2095 * Setpriority might change our priority at the same moment.
2096 * We don't have to worry. Conceptually one call occurs first
2097 * and we have a single winner.
2099 if (increment < 0) {
2100 if (!capable(CAP_SYS_NICE))
2102 if (increment < -40)
2108 nice = PRIO_TO_NICE(current->static_prio) + increment;
2114 retval = security_task_setnice(current, nice);
2118 set_user_nice(current, nice);
2125 * task_prio - return the priority value of a given task.
2126 * @p: the task in question.
2128 * This is the priority value as seen by users in /proc.
2129 * RT tasks are offset by -200. Normal tasks are centered
2130 * around 0, value goes from -16 to +15.
2132 int task_prio(task_t *p)
2134 return p->prio - MAX_RT_PRIO;
2138 * task_nice - return the nice value of a given task.
2139 * @p: the task in question.
2141 int task_nice(task_t *p)
2143 return TASK_NICE(p);
2146 EXPORT_SYMBOL(task_nice);
2149 * idle_cpu - is a given cpu idle currently?
2150 * @cpu: the processor in question.
2152 int idle_cpu(int cpu)
2154 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
2157 EXPORT_SYMBOL_GPL(idle_cpu);
2160 * find_process_by_pid - find a process with a matching PID value.
2161 * @pid: the pid in question.
2163 static inline task_t *find_process_by_pid(pid_t pid)
2165 return pid ? find_task_by_pid(pid) : current;
2168 /* Actually do priority change: must hold rq lock. */
2169 static void __setscheduler(struct task_struct *p, int policy, int prio)
2173 p->rt_priority = prio;
2174 if (policy != SCHED_NORMAL)
2175 p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
2177 p->prio = p->static_prio;
2181 * setscheduler - change the scheduling policy and/or RT priority of a thread.
2183 static int setscheduler(pid_t pid, int policy, struct sched_param __user *param)
2185 struct sched_param lp;
2186 int retval = -EINVAL;
2188 prio_array_t *array;
2189 unsigned long flags;
2193 if (!param || pid < 0)
2197 if (copy_from_user(&lp, param, sizeof(struct sched_param)))
2201 * We play safe to avoid deadlocks.
2203 read_lock_irq(&tasklist_lock);
2205 p = find_process_by_pid(pid);
2209 goto out_unlock_tasklist;
2212 * To be able to change p->policy safely, the appropriate
2213 * runqueue lock must be held.
2215 rq = task_rq_lock(p, &flags);
2221 if (policy != SCHED_FIFO && policy != SCHED_RR &&
2222 policy != SCHED_NORMAL)
2227 * Valid priorities for SCHED_FIFO and SCHED_RR are
2228 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
2231 if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1)
2233 if ((policy == SCHED_NORMAL) != (lp.sched_priority == 0))
2237 if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
2238 !capable(CAP_SYS_NICE))
2240 if ((current->euid != p->euid) && (current->euid != p->uid) &&
2241 !capable(CAP_SYS_NICE))
2244 retval = security_task_setscheduler(p, policy, &lp);
2250 deactivate_task(p, task_rq(p));
2253 __setscheduler(p, policy, lp.sched_priority);
2255 __activate_task(p, task_rq(p));
2257 * Reschedule if we are currently running on this runqueue and
2258 * our priority decreased, or if we are not currently running on
2259 * this runqueue and our priority is higher than the current's
2261 if (task_running(rq, p)) {
2262 if (p->prio > oldprio)
2263 resched_task(rq->curr);
2264 } else if (p->prio < rq->curr->prio)
2265 resched_task(rq->curr);
2269 task_rq_unlock(rq, &flags);
2270 out_unlock_tasklist:
2271 read_unlock_irq(&tasklist_lock);
2278 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
2279 * @pid: the pid in question.
2280 * @policy: new policy
2281 * @param: structure containing the new RT priority.
2283 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
2284 struct sched_param __user *param)
2286 return setscheduler(pid, policy, param);
2290 * sys_sched_setparam - set/change the RT priority of a thread
2291 * @pid: the pid in question.
2292 * @param: structure containing the new RT priority.
2294 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
2296 return setscheduler(pid, -1, param);
2300 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
2301 * @pid: the pid in question.
2303 asmlinkage long sys_sched_getscheduler(pid_t pid)
2305 int retval = -EINVAL;
2312 read_lock(&tasklist_lock);
2313 p = find_process_by_pid(pid);
2315 retval = security_task_getscheduler(p);
2319 read_unlock(&tasklist_lock);
2326 * sys_sched_getparam - get the RT priority of a thread
2327 * @pid: the pid in question.
2328 * @param: structure containing the RT priority.
2330 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
2332 struct sched_param lp;
2333 int retval = -EINVAL;
2336 if (!param || pid < 0)
2339 read_lock(&tasklist_lock);
2340 p = find_process_by_pid(pid);
2345 retval = security_task_getscheduler(p);
2349 lp.sched_priority = p->rt_priority;
2350 read_unlock(&tasklist_lock);
2353 * This one might sleep, we cannot do it with a spinlock held ...
2355 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
2361 read_unlock(&tasklist_lock);
2366 * sys_sched_setaffinity - set the cpu affinity of a process
2367 * @pid: pid of the process
2368 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
2369 * @user_mask_ptr: user-space pointer to the new cpu mask
2371 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
2372 unsigned long __user *user_mask_ptr)
2378 if (len < sizeof(new_mask))
2381 if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask)))
2385 read_lock(&tasklist_lock);
2387 p = find_process_by_pid(pid);
2389 read_unlock(&tasklist_lock);
2390 unlock_cpu_hotplug();
2395 * It is not safe to call set_cpus_allowed with the
2396 * tasklist_lock held. We will bump the task_struct's
2397 * usage count and then drop tasklist_lock.
2400 read_unlock(&tasklist_lock);
2403 if ((current->euid != p->euid) && (current->euid != p->uid) &&
2404 !capable(CAP_SYS_NICE))
2407 retval = set_cpus_allowed(p, new_mask);
2411 unlock_cpu_hotplug();
2416 * sys_sched_getaffinity - get the cpu affinity of a process
2417 * @pid: pid of the process
2418 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
2419 * @user_mask_ptr: user-space pointer to hold the current cpu mask
2421 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
2422 unsigned long __user *user_mask_ptr)
2424 unsigned int real_len;
2429 real_len = sizeof(mask);
2433 read_lock(&tasklist_lock);
2436 p = find_process_by_pid(pid);
2441 cpus_and(mask, p->cpus_allowed, cpu_possible_map);
2444 read_unlock(&tasklist_lock);
2447 if (copy_to_user(user_mask_ptr, &mask, real_len))
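/*
 * Illustrative user-space example (editor's addition, not part of this
 * file): pinning the calling process to CPU 0.  This assumes a glibc
 * that exposes the three-argument sched_setaffinity() wrapper and the
 * cpu_set_t helpers; older wrappers took a raw unsigned long mask.
 */
#if 0	/* example only -- never compiled into the kernel */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int pin_to_cpu0(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);
	if (sched_setaffinity(0, sizeof(set), &set) < 0) {
		perror("sched_setaffinity");
		return -1;
	}
	return 0;
}
#endif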
2453 * sys_sched_yield - yield the current processor to other threads.
2455 * this function yields the current CPU by moving the calling thread
2456 * to the expired array. If there are no other threads running on this
2457 * CPU then this function will return.
2459 asmlinkage long sys_sched_yield(void)
2461 runqueue_t *rq = this_rq_lock();
2462 prio_array_t *array = current->array;
2465 * We implement yielding by moving the task into the expired
2468 * (special rule: RT tasks will just roundrobin in the active
2471 if (likely(!rt_task(current))) {
2472 dequeue_task(current, array);
2473 enqueue_task(current, rq->expired);
2475 list_del(&current->run_list);
2476 list_add_tail(&current->run_list, array->queue + current->prio);
2479 * Since we are going to call schedule() anyway, there's
2480 * no need to preempt:
2482 _raw_spin_unlock(&rq->lock);
2483 preempt_enable_no_resched();
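/*
 * Illustrative user-space example (editor's addition, not part of this
 * file): a busy-polling loop that lets other runnable threads get the
 * CPU between polls by calling the syscall implemented above.  The flag
 * and its writer are hypothetical.
 */
#if 0	/* example only -- never compiled into the kernel */
#include <sched.h>

extern volatile int flag;	/* hypothetical: set by another thread */

void wait_for_flag(void)
{
	while (!flag)
		sched_yield();
}
#endif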
2490 void __sched __cond_resched(void)
2492 set_current_state(TASK_RUNNING);
2496 EXPORT_SYMBOL(__cond_resched);
2499 * yield - yield the current processor to other threads.
2501 * this is a shortcut for kernel-space yielding - it marks the
2502 * thread runnable and calls sys_sched_yield().
2504 void __sched yield(void)
2506 set_current_state(TASK_RUNNING);
2510 EXPORT_SYMBOL(yield);
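/*
 * Illustrative in-kernel example (editor's addition, not part of this
 * file): a long-running loop that offers the CPU back periodically.
 * cond_resched() (which wraps __cond_resched() above) only reschedules
 * when a reschedule is actually pending, so it is cheap to call often.
 * The function and item names are hypothetical.
 */
#if 0	/* example only -- sketch, not compiled with this file */
static void process_many_items(struct my_item *items, int nr)
{
	int i;

	for (i = 0; i < nr; i++) {
		process_one_item(&items[i]);	/* hypothetical helper */
		cond_resched();			/* may sleep */
	}
}
#endif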
2513 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
2514 * that process accounting knows that this is a task in IO wait state.
2516 * But don't do that if it is a deliberate, throttling IO wait (this task
2517 * has set its backing_dev_info: the queue against which it should throttle)
2519 void __sched io_schedule(void)
2521 struct runqueue *rq = this_rq();
2523 atomic_inc(&rq->nr_iowait);
2525 atomic_dec(&rq->nr_iowait);
2528 EXPORT_SYMBOL(io_schedule);
2530 long __sched io_schedule_timeout(long timeout)
2532 struct runqueue *rq = this_rq();
2535 atomic_inc(&rq->nr_iowait);
2536 ret = schedule_timeout(timeout);
2537 atomic_dec(&rq->nr_iowait);
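/*
 * Illustrative in-kernel example (editor's addition, not part of this
 * file): sleeping for roughly 100ms while being accounted as I/O wait.
 * The task state must be set before the call, exactly as for
 * schedule_timeout(), which io_schedule_timeout() wraps.
 */
#if 0	/* example only -- sketch, not compiled with this file */
static void throttle_for_io(void)
{
	set_current_state(TASK_UNINTERRUPTIBLE);
	io_schedule_timeout(HZ / 10);	/* returns in TASK_RUNNING */
}
#endif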
2542 * sys_sched_get_priority_max - return maximum RT priority.
2543 * @policy: scheduling class.
2545 * this syscall returns the maximum rt_priority that can be used
2546 * by a given scheduling class.
2548 asmlinkage long sys_sched_get_priority_max(int policy)
2555 ret = MAX_USER_RT_PRIO-1;
2565 * sys_sched_get_priority_min - return minimum RT priority.
2566 * @policy: scheduling class.
2568 * this syscall returns the minimum rt_priority that can be used
2569 * by a given scheduling class.
2571 asmlinkage long sys_sched_get_priority_min(int policy)
2587 * sys_sched_rr_get_interval - return the default timeslice of a process.
2588 * @pid: pid of the process.
2589 * @interval: userspace pointer to the timeslice value.
2591 * this syscall writes the default timeslice value of a given process
2592 * into the user-space timespec buffer. A value of '0' means infinity.
2595 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
2597 int retval = -EINVAL;
2605 read_lock(&tasklist_lock);
2606 p = find_process_by_pid(pid);
2610 retval = security_task_getscheduler(p);
2614 jiffies_to_timespec(p->policy & SCHED_FIFO ?
2615 0 : task_timeslice(p), &t);
2616 read_unlock(&tasklist_lock);
2617 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
2621 read_unlock(&tasklist_lock);
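/*
 * Illustrative user-space example (editor's addition, not part of this
 * file): querying the calling process's timeslice through the glibc
 * wrapper for the syscall above.
 */
#if 0	/* example only -- never compiled into the kernel */
#include <sched.h>
#include <stdio.h>
#include <time.h>

void print_timeslice(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts) == 0)
		printf("timeslice: %ld.%09ld s\n",
		       (long)ts.tv_sec, ts.tv_nsec);
}
#endif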
2625 static inline struct task_struct *eldest_child(struct task_struct *p)
2627 if (list_empty(&p->children)) return NULL;
2628 return list_entry(p->children.next,struct task_struct,sibling);
2631 static inline struct task_struct *older_sibling(struct task_struct *p)
2633 if (p->sibling.prev==&p->parent->children) return NULL;
2634 return list_entry(p->sibling.prev,struct task_struct,sibling);
2637 static inline struct task_struct *younger_sibling(struct task_struct *p)
2639 if (p->sibling.next==&p->parent->children) return NULL;
2640 return list_entry(p->sibling.next,struct task_struct,sibling);
2643 static void show_task(task_t * p)
2647 unsigned long free = 0;
2648 static const char *stat_nam[] = { "R", "S", "D", "T", "Z", "W" };
2650 printk("%-13.13s ", p->comm);
2651 state = p->state ? __ffs(p->state) + 1 : 0;
2652 if (state < ARRAY_SIZE(stat_nam))
2653 printk(stat_nam[state]);
2656 #if (BITS_PER_LONG == 32)
2657 if (state == TASK_RUNNING)
2658 printk(" running ");
2660 printk(" %08lX ", thread_saved_pc(p));
2662 if (state == TASK_RUNNING)
2663 printk(" running task ");
2665 printk(" %016lx ", thread_saved_pc(p));
2667 #ifdef CONFIG_DEBUG_STACK_USAGE
2669 unsigned long * n = (unsigned long *) (p->thread_info+1);
2672 free = (unsigned long) n - (unsigned long)(p->thread_info+1);
2675 printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
2676 if ((relative = eldest_child(p)))
2677 printk("%5d ", relative->pid);
2680 if ((relative = younger_sibling(p)))
2681 printk("%7d", relative->pid);
2684 if ((relative = older_sibling(p)))
2685 printk(" %5d", relative->pid);
2689 printk(" (L-TLB)\n");
2691 printk(" (NOTLB)\n");
2693 if (state != TASK_RUNNING)
2694 show_stack(p, NULL);
2697 void show_state(void)
2701 #if (BITS_PER_LONG == 32)
2704 printk(" task PC pid father child younger older\n");
2708 printk(" task PC pid father child younger older\n");
2710 read_lock(&tasklist_lock);
2711 do_each_thread(g, p) {
2713 * reset the NMI-timeout, listing all tasks on a slow
2714 * console might take a lot of time:
2716 touch_nmi_watchdog();
2718 } while_each_thread(g, p);
2720 read_unlock(&tasklist_lock);
2723 void __init init_idle(task_t *idle, int cpu)
2725 runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle));
2726 unsigned long flags;
2728 local_irq_save(flags);
2729 double_rq_lock(idle_rq, rq);
2731 idle_rq->curr = idle_rq->idle = idle;
2732 deactivate_task(idle, rq);
2734 idle->prio = MAX_PRIO;
2735 idle->state = TASK_RUNNING;
2736 set_task_cpu(idle, cpu);
2737 double_rq_unlock(idle_rq, rq);
2738 set_tsk_need_resched(idle);
2739 local_irq_restore(flags);
2741 /* Set the preempt count _outside_ the spinlocks! */
2742 #ifdef CONFIG_PREEMPT
2743 idle->thread_info->preempt_count = (idle->lock_depth >= 0);
2745 idle->thread_info->preempt_count = 0;
2750 * In a system that switches off the HZ timer idle_cpu_mask
2751 * indicates which cpus entered this state. This is used
2752 * in the rcu update to wait only for active cpus. For systems
2753 * which do not switch off the HZ timer idle_cpu_mask should
2754 * always be CPU_MASK_NONE.
2756 cpumask_t idle_cpu_mask = CPU_MASK_NONE;
2760 * This is how migration works:
2762 * 1) we queue a migration_req_t structure in the source CPU's
2763 * runqueue and wake up that CPU's migration thread.
2764 * 2) we wait on the request's completion => thread blocks.
2765 * 3) migration thread wakes up (implicitly it forces the migrated
2766 * thread off the CPU)
2767 * 4) it gets the migration request and checks whether the migrated
2768 * task is still in the wrong runqueue.
2769 * 5) if it's in the wrong runqueue then the migration thread removes
2770 * it and puts it into the right queue.
2771 * 6) migration thread completes the request.
2772 * 7) we wake up and the migration is done.
2776 * Change a given task's CPU affinity. Migrate the thread to a
2777 * proper CPU and schedule it away if the CPU it's executing on
2778 * is removed from the allowed bitmask.
2780 * NOTE: the caller must have a valid reference to the task, the
2781 * task must not exit() & deallocate itself prematurely. The
2782 * call is not atomic; no spinlocks may be held.
2784 int set_cpus_allowed(task_t *p, cpumask_t new_mask)
2786 unsigned long flags;
2788 migration_req_t req;
2791 rq = task_rq_lock(p, &flags);
2792 if (any_online_cpu(new_mask) == NR_CPUS) {
2797 if (__set_cpus_allowed(p, new_mask, &req)) {
2798 /* Need help from migration thread: drop lock and wait. */
2799 task_rq_unlock(rq, &flags);
2800 wake_up_process(rq->migration_thread);
2801 wait_for_completion(&req.done);
2805 task_rq_unlock(rq, &flags);
2809 EXPORT_SYMBOL_GPL(set_cpus_allowed);
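/*
 * Illustrative in-kernel example (editor's addition, not part of this
 * file): restricting a task to a single CPU.  Per the rules above, the
 * caller must hold a valid reference on @p and must not hold any
 * spinlocks.  The helper name is hypothetical; cpumask_of_cpu() is
 * assumed to be available from <linux/cpumask.h>.
 */
#if 0	/* example only -- sketch, not compiled with this file */
static int bind_task_to_cpu(task_t *p, int cpu)
{
	return set_cpus_allowed(p, cpumask_of_cpu(cpu));
}
#endif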
2811 /* Move (not current) task off this cpu, onto dest cpu. */
2812 static void move_task_away(struct task_struct *p, int dest_cpu)
2814 runqueue_t *rq_dest;
2816 rq_dest = cpu_rq(dest_cpu);
2818 double_rq_lock(this_rq(), rq_dest);
2819 if (task_cpu(p) != smp_processor_id())
2820 goto out; /* Already moved */
2822 set_task_cpu(p, dest_cpu);
2824 deactivate_task(p, this_rq());
2825 activate_task(p, rq_dest);
2826 if (p->prio < rq_dest->curr->prio)
2827 resched_task(rq_dest->curr);
2829 p->timestamp = rq_dest->timestamp_last_tick;
2832 double_rq_unlock(this_rq(), rq_dest);
2836 * migration_thread - this is a highprio system thread that performs
2837 * thread migration by bumping thread off CPU then 'pushing' onto
2840 static int migration_thread(void * data)
2843 int cpu = (long)data;
2846 BUG_ON(rq->migration_thread != current);
2848 while (!kthread_should_stop()) {
2849 struct list_head *head;
2850 migration_req_t *req;
2852 if (current->flags & PF_FREEZE)
2853 refrigerator(PF_FREEZE);
2855 spin_lock_irq(&rq->lock);
2856 head = &rq->migration_queue;
2857 current->state = TASK_INTERRUPTIBLE;
2858 if (list_empty(head)) {
2859 spin_unlock_irq(&rq->lock);
2863 req = list_entry(head->next, migration_req_t, list);
2864 list_del_init(head->next);
2865 spin_unlock(&rq->lock);
2867 move_task_away(req->task,
2868 any_online_cpu(req->task->cpus_allowed));
2870 complete(&req->done);
2875 #ifdef CONFIG_HOTPLUG_CPU
2876 /* migrate_all_tasks - migrate all the tasks off the current cpu.
2877 * The caller must have already scheduled itself onto the target cpu
2878 * via set_cpus_allowed. Machine is stopped. */
2879 void migrate_all_tasks(void)
2881 struct task_struct *tsk, *t;
2882 int dest_cpu, src_cpu;
2885 /* We're nailed to this CPU. */
2886 src_cpu = smp_processor_id();
2888 /* Not required, but here for neatness. */
2889 write_lock(&tasklist_lock);
2891 /* watch out for per node tasks, let's stay on this node */
2892 node = cpu_to_node(src_cpu);
2894 do_each_thread(t, tsk) {
2899 if (task_cpu(tsk) != src_cpu)
2902 /* Figure out where this task should go (attempting to
2903 * keep it on-node), and check if it can be migrated
2904 * as-is. NOTE that kernel threads bound to more than
2905 * one online cpu will be migrated. */
2906 mask = node_to_cpumask(node);
2907 cpus_and(mask, mask, tsk->cpus_allowed);
2908 dest_cpu = any_online_cpu(mask);
2909 if (dest_cpu == NR_CPUS)
2910 dest_cpu = any_online_cpu(tsk->cpus_allowed);
2911 if (dest_cpu == NR_CPUS) {
2912 cpus_clear(tsk->cpus_allowed);
2913 cpus_complement(tsk->cpus_allowed);
2914 dest_cpu = any_online_cpu(tsk->cpus_allowed);
2916 /* Don't tell them about moving exiting tasks
2917 or kernel threads (both mm NULL), since
2918 they never leave kernel. */
2919 if (tsk->mm && printk_ratelimit())
2920 printk(KERN_INFO "process %d (%s) no "
2921 "longer affine to cpu%d\n",
2922 tsk->pid, tsk->comm, src_cpu);
2925 move_task_away(tsk, dest_cpu);
2926 } while_each_thread(t, tsk);
2928 write_unlock(&tasklist_lock);
2930 #endif /* CONFIG_HOTPLUG_CPU */
2933 * migration_call - callback that gets triggered when a CPU is added.
2934 * Here we can start up the necessary migration thread for the new CPU.
2936 static int migration_call(struct notifier_block *nfb, unsigned long action,
2939 int cpu = (long)hcpu;
2940 struct task_struct *p;
2941 struct runqueue *rq;
2942 unsigned long flags;
2945 case CPU_UP_PREPARE:
2946 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
2949 kthread_bind(p, cpu);
2950 /* Must be high prio: stop_machine expects to yield to it. */
2951 rq = task_rq_lock(p, &flags);
2952 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
2953 task_rq_unlock(rq, &flags);
2954 cpu_rq(cpu)->migration_thread = p;
2957 /* Strictly unnecessary, as first user will wake it. */
2958 wake_up_process(cpu_rq(cpu)->migration_thread);
2960 #ifdef CONFIG_HOTPLUG_CPU
2961 case CPU_UP_CANCELED:
2962 /* Unbind it from offline cpu so it can run. Fall thru. */
2963 kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
2965 kthread_stop(cpu_rq(cpu)->migration_thread);
2966 cpu_rq(cpu)->migration_thread = NULL;
2967 BUG_ON(cpu_rq(cpu)->nr_running != 0);
2974 static struct notifier_block __devinitdata migration_notifier = {
2975 .notifier_call = migration_call,
2978 int __init migration_init(void)
2980 void *cpu = (void *)(long)smp_processor_id();
2981 /* Start one for boot CPU. */
2982 migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
2983 migration_call(&migration_notifier, CPU_ONLINE, cpu);
2984 register_cpu_notifier(&migration_notifier);
2990 * The 'big kernel lock'
2992 * This spinlock is taken and released recursively by lock_kernel()
2993 * and unlock_kernel(). It is transparently dropped and reacquired
2994 * over schedule(). It is used to protect legacy code that hasn't
2995 * been migrated to a proper locking design yet.
2997 * Don't use in new code.
2999 * Note: spinlock debugging needs this even on !CONFIG_SMP.
3001 spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
3002 EXPORT_SYMBOL(kernel_flag);
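/*
 * Illustrative example (editor's addition, not part of this file) of the
 * legacy pattern this lock supports -- shown only for completeness, since
 * new code should use proper fine-grained locking instead.  The data
 * being protected is hypothetical.
 */
#if 0	/* example only -- sketch, not compiled with this file */
	lock_kernel();
	legacy_global_counter++;	/* hypothetical BKL-protected data */
	unlock_kernel();
#endif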
3004 void __init sched_init(void)
3009 for (i = 0; i < NR_CPUS; i++) {
3010 prio_array_t *array;
3013 rq->active = rq->arrays;
3014 rq->expired = rq->arrays + 1;
3015 rq->best_expired_prio = MAX_PRIO;
3017 spin_lock_init(&rq->lock);
3018 INIT_LIST_HEAD(&rq->migration_queue);
3019 INIT_LIST_HEAD(&rq->hold_queue);
3020 atomic_set(&rq->nr_iowait, 0);
3021 nr_running_init(rq);
3023 for (j = 0; j < 2; j++) {
3024 array = rq->arrays + j;
3025 for (k = 0; k < MAX_PRIO; k++) {
3026 INIT_LIST_HEAD(array->queue + k);
3027 __clear_bit(k, array->bitmap);
3029 // delimiter for bitsearch
3030 __set_bit(MAX_PRIO, array->bitmap);
3034 * We have to do a little magic to get the first
3035 * thread right in SMP mode.
3040 set_task_cpu(current, smp_processor_id());
3041 wake_up_forked_process(current);
3046 * The boot idle thread does lazy MMU switching as well:
3048 atomic_inc(&init_mm.mm_count);
3049 enter_lazy_tlb(&init_mm, current);
3052 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
3053 void __might_sleep(char *file, int line)
3055 #if defined(in_atomic)
3056 static unsigned long prev_jiffy; /* ratelimiting */
3058 if ((in_atomic() || irqs_disabled()) &&
3059 system_state == SYSTEM_RUNNING) {
3060 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
3062 prev_jiffy = jiffies;
3063 printk(KERN_ERR "Debug: sleeping function called from invalid"
3064 " context at %s:%d\n", file, line);
3065 printk("in_atomic():%d, irqs_disabled():%d\n",
3066 in_atomic(), irqs_disabled());
3071 EXPORT_SYMBOL(__might_sleep);
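/*
 * Illustrative in-kernel example (editor's addition, not part of this
 * file): a function that may block announces that fact with
 * might_sleep(), which (with CONFIG_DEBUG_SPINLOCK_SLEEP) expands to a
 * call to __might_sleep() above and so catches callers in atomic
 * context.  The helper name is hypothetical; assumes <linux/slab.h>.
 */
#if 0	/* example only -- sketch, not compiled with this file */
static void *my_alloc_buffer(size_t len)
{
	might_sleep();				/* may be called only from process context */
	return kmalloc(len, GFP_KERNEL);	/* GFP_KERNEL can sleep */
}
#endif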
3075 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
3077 * This could be a long-held lock. If another CPU holds it for a long time,
3078 * and that CPU is not asked to reschedule then *this* CPU will spin on the
3079 * lock for a long time, even if *this* CPU is asked to reschedule.
3081 * So what we do here, in the slow (contended) path, is to spin on the lock by
3082 * hand while permitting preemption.
3084 * Called inside preempt_disable().
3086 void __sched __preempt_spin_lock(spinlock_t *lock)
3088 if (preempt_count() > 1) {
3089 _raw_spin_lock(lock);
3094 while (spin_is_locked(lock))
3097 } while (!_raw_spin_trylock(lock));
3100 EXPORT_SYMBOL(__preempt_spin_lock);
3102 void __sched __preempt_write_lock(rwlock_t *lock)
3104 if (preempt_count() > 1) {
3105 _raw_write_lock(lock);
3111 while (rwlock_is_locked(lock))
3114 } while (!_raw_write_trylock(lock));
3117 EXPORT_SYMBOL(__preempt_write_lock);
3118 #endif /* defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) */