/*
 * Kernel scheduler and related syscalls
 *
 * Copyright (C) 1991-2002  Linus Torvalds
 *
 * 1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
 *		make semaphores SMP safe
 * 1998-11-19	Implemented schedule_timeout() and related stuff
 * 2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
 *		hybrid priority-list and round-robin design with
 *		an array-switch method of distributing timeslices
 *		and per-CPU runqueues.  Cleanups and useful suggestions
 *		by Davide Libenzi, preemptible kernel bits by Robert Love.
 * 2003-09-03	Interactivity tuning by Con Kolivas.
 */
21 #include <linux/module.h>
22 #include <linux/nmi.h>
23 #include <linux/init.h>
24 #include <asm/uaccess.h>
25 #include <linux/highmem.h>
26 #include <linux/smp_lock.h>
27 #include <asm/mmu_context.h>
28 #include <linux/interrupt.h>
29 #include <linux/completion.h>
30 #include <linux/kernel_stat.h>
31 #include <linux/security.h>
32 #include <linux/notifier.h>
33 #include <linux/suspend.h>
34 #include <linux/blkdev.h>
35 #include <linux/delay.h>
36 #include <linux/smp.h>
37 #include <linux/timer.h>
38 #include <linux/rcupdate.h>
39 #include <linux/cpu.h>
40 #include <linux/percpu.h>
41 #include <linux/kthread.h>
/*
 * cpu_to_node_mask - cpumask of all CPUs on @cpu's node.
 * On non-NUMA builds every online CPU counts as local.
 * (NOTE(review): the #ifdef/#else guard was dropped by extraction;
 * two unconditional identical-name defines would not compile.)
 */
#ifdef CONFIG_NUMA
#define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
#else
#define cpu_to_node_mask(cpu) (cpu_online_map)
#endif
/*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
 * and back.
 */
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)

/*
 * 'User priority' is the nice value converted to something we
 * can work with better when scaling various scheduler parameters,
 * it's a [ 0 ... 39 ] range.
 */
#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
#define AVG_TIMESLICE	(MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) *\
			(MAX_PRIO-1-NICE_TO_PRIO(0))/(MAX_USER_PRIO - 1)))

/*
 * Some helpers for converting nanosecond timing to jiffy resolution
 */
#define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))

/*
 * These are the 'tuning knobs' of the scheduler:
 *
 * Minimum timeslice is 10 msecs, default timeslice is 100 msecs,
 * maximum timeslice is 200 msecs. Timeslices get refilled after
 * they expire.
 */
#define MIN_TIMESLICE		( 10 * HZ / 1000)
#define MAX_TIMESLICE		(200 * HZ / 1000)
#define ON_RUNQUEUE_WEIGHT	 30
#define CHILD_PENALTY		 95
#define PARENT_PENALTY		100
/* EXIT_WEIGHT restored: sched_exit() below uses it — dropped by extraction */
#define EXIT_WEIGHT		  3
#define PRIO_BONUS_RATIO	 25
#define MAX_BONUS		(MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
#define INTERACTIVE_DELTA	  2
#define MAX_SLEEP_AVG		(AVG_TIMESLICE * MAX_BONUS)
#define STARVATION_LIMIT	(MAX_SLEEP_AVG)
#define NS_MAX_SLEEP_AVG	(JIFFIES_TO_NS(MAX_SLEEP_AVG))
#define NODE_THRESHOLD		125
#define CREDIT_LIMIT		100
98 * If a task is 'interactive' then we reinsert it in the active
99 * array after it has expired its current timeslice. (it will not
100 * continue to run immediately, it will still roundrobin with
101 * other interactive tasks.)
103 * This part scales the interactivity limit depending on niceness.
105 * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
106 * Here are a few examples of different nice levels:
108 * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
109 * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
110 * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
111 * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
112 * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
114 * (the X axis represents the possible -5 ... 0 ... +5 dynamic
115 * priority range a task can explore, a value of '1' means the
116 * task is rated interactive.)
118 * Ie. nice +19 tasks can never get 'interactive' enough to be
119 * reinserted into the active array. And only heavily CPU-hog nice -20
120 * tasks will be expired. Default nice 0 tasks are somewhere between,
121 * it takes some effort for them to get interactive, but it's not
125 #define CURRENT_BONUS(p) \
126 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
130 #define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \
131 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
134 #define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \
135 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
138 #define SCALE(v1,v1_max,v2_max) \
139 (v1) * (v2_max) / (v1_max)
142 (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \
145 #define TASK_INTERACTIVE(p) \
146 ((p)->prio <= (p)->static_prio - DELTA(p))
148 #define INTERACTIVE_SLEEP(p) \
149 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
150 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
152 #define HIGH_CREDIT(p) \
153 ((p)->interactive_credit > CREDIT_LIMIT)
155 #define LOW_CREDIT(p) \
156 ((p)->interactive_credit < -CREDIT_LIMIT)
158 #define TASK_PREEMPTS_CURR(p, rq) \
159 ((p)->prio < (rq)->curr->prio)
162 * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
163 * to time slice values.
165 * The higher a thread's priority, the bigger timeslices
166 * it gets during one round of execution. But even the lowest
167 * priority thread gets MIN_TIMESLICE worth of execution time.
169 * task_timeslice() is the interface that is used by the scheduler.
172 #define BASE_TIMESLICE(p) (MIN_TIMESLICE + \
173 ((MAX_TIMESLICE - MIN_TIMESLICE) * \
174 (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1)))
176 static inline unsigned int task_timeslice(task_t *p)
178 return BASE_TIMESLICE(p);
182 * These are the runqueue data structures:
185 #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
187 typedef struct runqueue runqueue_t;
191 unsigned long bitmap[BITMAP_SIZE];
192 struct list_head queue[MAX_PRIO];
196 * This is the main, per-CPU runqueue data structure.
198 * Locking rule: those places that want to lock multiple runqueues
199 * (such as the load balancing or the thread migration code), lock
200 * acquire operations must be ordered by ascending &runqueue.
204 unsigned long long nr_switches;
205 unsigned long nr_running, expired_timestamp, nr_uninterruptible,
208 struct mm_struct *prev_mm;
209 prio_array_t *active, *expired, arrays[2];
210 int best_expired_prio, prev_cpu_load[NR_CPUS];
212 atomic_t *node_nr_running;
213 int prev_node_load[MAX_NUMNODES];
215 task_t *migration_thread;
216 struct list_head migration_queue;
221 static DEFINE_PER_CPU(struct runqueue, runqueues);
223 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
224 #define this_rq() (&__get_cpu_var(runqueues))
225 #define task_rq(p) cpu_rq(task_cpu(p))
226 #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
228 extern unsigned long __scheduling_functions_start_here;
229 extern unsigned long __scheduling_functions_end_here;
230 const unsigned long scheduling_functions_start_here =
231 (unsigned long)&__scheduling_functions_start_here;
232 const unsigned long scheduling_functions_end_here =
233 (unsigned long)&__scheduling_functions_end_here;
/*
 * Default context-switch locking:
 */
#ifndef prepare_arch_switch
# define prepare_arch_switch(rq, next)	do { } while (0)
# define finish_arch_switch(rq, next)	spin_unlock_irq(&(rq)->lock)
# define task_running(rq, p)		((rq)->curr == (p))
#endif

#ifdef CONFIG_NUMA

/*
 * Keep track of running tasks, per node.
 */

static atomic_t node_nr_running[MAX_NUMNODES] ____cacheline_maxaligned_in_smp =
	{[0 ...MAX_NUMNODES-1] = ATOMIC_INIT(0)};

/* Boot-time default: point at node 0 until node_nr_running_init() runs. */
static inline void nr_running_init(struct runqueue *rq)
{
	rq->node_nr_running = &node_nr_running[0];
}

static inline void nr_running_inc(runqueue_t *rq)
{
	atomic_inc(rq->node_nr_running);
	rq->nr_running++;
}

static inline void nr_running_dec(runqueue_t *rq)
{
	atomic_dec(rq->node_nr_running);
	rq->nr_running--;
}

/* Wire each possible CPU's runqueue to its node's shared counter. */
__init void node_nr_running_init(void)
{
	int i;

	for (i = 0; i < NR_CPUS; i++) {
		if (cpu_possible(i))
			cpu_rq(i)->node_nr_running =
				&node_nr_running[cpu_to_node(i)];
	}
}

#else /* !CONFIG_NUMA */

# define nr_running_init(rq)	do { } while (0)
# define nr_running_inc(rq)	do { (rq)->nr_running++; } while (0)
# define nr_running_dec(rq)	do { (rq)->nr_running--; } while (0)

#endif /* CONFIG_NUMA */
290 * task_rq_lock - lock the runqueue a given task resides on and disable
291 * interrupts. Note the ordering: we can safely lookup the task_rq without
292 * explicitly disabling preemption.
294 static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
299 local_irq_save(*flags);
301 spin_lock(&rq->lock);
302 if (unlikely(rq != task_rq(p))) {
303 spin_unlock_irqrestore(&rq->lock, *flags);
304 goto repeat_lock_task;
309 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
311 spin_unlock_irqrestore(&rq->lock, *flags);
315 * rq_lock - lock a given runqueue and disable interrupts.
317 static inline runqueue_t *this_rq_lock(void)
323 spin_lock(&rq->lock);
328 static inline void rq_unlock(runqueue_t *rq)
330 spin_unlock_irq(&rq->lock);
334 * Adding/removing a task to/from a priority array:
336 static inline void dequeue_task(struct task_struct *p, prio_array_t *array)
339 list_del(&p->run_list);
340 if (list_empty(array->queue + p->prio))
341 __clear_bit(p->prio, array->bitmap);
344 static inline void enqueue_task(struct task_struct *p, prio_array_t *array)
346 list_add_tail(&p->run_list, array->queue + p->prio);
347 __set_bit(p->prio, array->bitmap);
353 * effective_prio - return the priority that is based on the static
354 * priority but is modified by bonuses/penalties.
356 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
357 * into the -5 ... 0 ... +5 bonus/penalty range.
359 * We use 25% of the full 0...39 priority range so that:
361 * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
362 * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
364 * Both properties are important to certain workloads.
366 static int effective_prio(task_t *p)
373 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
375 prio = p->static_prio - bonus;
376 if (prio < MAX_RT_PRIO)
378 if (prio > MAX_PRIO-1)
384 * __activate_task - move a task to the runqueue.
386 static inline void __activate_task(task_t *p, runqueue_t *rq)
388 enqueue_task(p, rq->active);
392 static void recalc_task_prio(task_t *p, unsigned long long now)
394 unsigned long long __sleep_time = now - p->timestamp;
395 unsigned long sleep_time;
397 if (__sleep_time > NS_MAX_SLEEP_AVG)
398 sleep_time = NS_MAX_SLEEP_AVG;
400 sleep_time = (unsigned long)__sleep_time;
402 if (likely(sleep_time > 0)) {
404 * User tasks that sleep a long time are categorised as
405 * idle and will get just interactive status to stay active &
406 * prevent them suddenly becoming cpu hogs and starving
409 if (p->mm && p->activated != -1 &&
410 sleep_time > INTERACTIVE_SLEEP(p)) {
411 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
414 p->interactive_credit++;
417 * The lower the sleep avg a task has the more
418 * rapidly it will rise with sleep time.
420 sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
423 * Tasks with low interactive_credit are limited to
424 * one timeslice worth of sleep avg bonus.
427 sleep_time > JIFFIES_TO_NS(task_timeslice(p)))
428 sleep_time = JIFFIES_TO_NS(task_timeslice(p));
431 * Non high_credit tasks waking from uninterruptible
432 * sleep are limited in their sleep_avg rise as they
433 * are likely to be cpu hogs waiting on I/O
435 if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm) {
436 if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
438 else if (p->sleep_avg + sleep_time >=
439 INTERACTIVE_SLEEP(p)) {
440 p->sleep_avg = INTERACTIVE_SLEEP(p);
446 * This code gives a bonus to interactive tasks.
448 * The boost works by updating the 'average sleep time'
449 * value here, based on ->timestamp. The more time a
450 * task spends sleeping, the higher the average gets -
451 * and the higher the priority boost gets as well.
453 p->sleep_avg += sleep_time;
455 if (p->sleep_avg > NS_MAX_SLEEP_AVG) {
456 p->sleep_avg = NS_MAX_SLEEP_AVG;
458 p->interactive_credit++;
463 p->prio = effective_prio(p);
467 * activate_task - move a task to the runqueue and do priority recalculation
469 * Update all the scheduling statistics stuff. (sleep average
470 * calculation, priority modifiers, etc.)
472 static inline void activate_task(task_t *p, runqueue_t *rq)
474 unsigned long long now = sched_clock();
476 recalc_task_prio(p, now);
479 * This checks to make sure it's not an uninterruptible task
480 * that is now waking up.
484 * Tasks which were woken up by interrupts (ie. hw events)
485 * are most likely of interactive nature. So we give them
486 * the credit of extending their sleep time to the period
487 * of time they spend on the runqueue, waiting for execution
488 * on a CPU, first time around:
494 * Normal first-time wakeups get a credit too for
495 * on-runqueue time, but it will be weighted down:
502 __activate_task(p, rq);
506 * deactivate_task - remove a task from the runqueue.
508 static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
511 if (p->state == TASK_UNINTERRUPTIBLE)
512 rq->nr_uninterruptible++;
513 dequeue_task(p, p->array);
518 * resched_task - mark a task 'to be rescheduled now'.
520 * On UP this means the setting of the need_resched flag, on SMP it
521 * might also involve a cross-CPU call to trigger the scheduler on
524 static inline void resched_task(task_t *p)
527 int need_resched, nrpolling;
530 /* minimise the chance of sending an interrupt to poll_idle() */
531 nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
532 need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
533 nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
535 if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
536 smp_send_reschedule(task_cpu(p));
539 set_tsk_need_resched(p);
544 * task_curr - is this task currently executing on a CPU?
545 * @p: the task in question.
547 inline int task_curr(task_t *p)
549 return cpu_curr(task_cpu(p)) == p;
554 struct list_head list;
556 struct completion done;
560 * The task's runqueue lock must be held, and the new mask must be valid.
561 * Returns true if you have to wait for migration thread.
563 static int __set_cpus_allowed(task_t *p, cpumask_t new_mask,
564 migration_req_t *req)
566 runqueue_t *rq = task_rq(p);
568 p->cpus_allowed = new_mask;
570 * Can the task run on the task's current CPU? If not then
571 * migrate the thread off to a proper CPU.
573 if (cpu_isset(task_cpu(p), new_mask))
577 * If the task is not on a runqueue (and not running), then
578 * it is sufficient to simply update the task's cpu field.
580 if (!p->array && !task_running(rq, p)) {
581 set_task_cpu(p, any_online_cpu(p->cpus_allowed));
585 init_completion(&req->done);
587 list_add(&req->list, &rq->migration_queue);
592 * wait_task_inactive - wait for a thread to unschedule.
594 * The caller must ensure that the task *will* unschedule sometime soon,
595 * else this function might spin for a *long* time. This function can't
596 * be called with interrupts off, or it may introduce deadlock with
597 * smp_call_function() if an IPI is sent by the same process we are
598 * waiting to become inactive.
600 void wait_task_inactive(task_t * p)
607 rq = task_rq_lock(p, &flags);
608 /* Must be off runqueue entirely, not preempted. */
609 if (unlikely(p->array)) {
610 /* If it's preempted, we yield. It could be a while. */
611 preempted = !task_running(rq, p);
612 task_rq_unlock(rq, &flags);
618 task_rq_unlock(rq, &flags);
622 * kick_process - kick a running thread to enter/exit the kernel
623 * @p: the to-be-kicked thread
625 * Cause a process which is running on another CPU to enter
626 * kernel-mode, without any delay. (to get signals handled.)
628 void kick_process(task_t *p)
634 if ((cpu != smp_processor_id()) && task_curr(p))
635 smp_send_reschedule(cpu);
639 EXPORT_SYMBOL_GPL(kick_process);
644 * try_to_wake_up - wake up a thread
645 * @p: the to-be-woken-up thread
646 * @state: the mask of task states that can be woken
647 * @sync: do a synchronous wakeup?
649 * Put it on the run-queue if it's not already there. The "current"
650 * thread is always on the run-queue (except when the actual
651 * re-schedule is in progress), and as such you're allowed to do
652 * the simpler "current->state = TASK_RUNNING" to mark yourself
653 * runnable without the overhead of this.
655 * returns failure only if the task is already active.
657 static int try_to_wake_up(task_t * p, unsigned int state, int sync)
665 rq = task_rq_lock(p, &flags);
666 old_state = p->state;
667 if (old_state & state) {
670 * Fast-migrate the task if it's not running or runnable
671 * currently. Do not violate hard affinity.
673 if (unlikely(sync && !task_running(rq, p) &&
674 (task_cpu(p) != smp_processor_id()) &&
675 cpu_isset(smp_processor_id(),
677 !cpu_is_offline(smp_processor_id()))) {
678 set_task_cpu(p, smp_processor_id());
679 task_rq_unlock(rq, &flags);
680 goto repeat_lock_task;
682 if (old_state == TASK_UNINTERRUPTIBLE) {
683 rq->nr_uninterruptible--;
685 * Tasks on involuntary sleep don't earn
686 * sleep_avg beyond just interactive state.
690 if (sync && (task_cpu(p) == smp_processor_id()))
691 __activate_task(p, rq);
693 activate_task(p, rq);
694 if (TASK_PREEMPTS_CURR(p, rq))
695 resched_task(rq->curr);
699 p->state = TASK_RUNNING;
701 task_rq_unlock(rq, &flags);
705 int fastcall wake_up_process(task_t * p)
707 return try_to_wake_up(p, TASK_STOPPED |
708 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
711 EXPORT_SYMBOL(wake_up_process);
713 int fastcall wake_up_state(task_t *p, unsigned int state)
715 return try_to_wake_up(p, state, 0);
719 * Perform scheduler related setup for a newly forked process p.
720 * p is forked by current.
722 void fastcall sched_fork(task_t *p)
725 * We mark the process as running here, but have not actually
726 * inserted it onto the runqueue yet. This guarantees that
727 * nobody will actually run it, and a signal or other external
728 * event cannot wake it up and insert it on the runqueue either.
730 p->state = TASK_RUNNING;
731 INIT_LIST_HEAD(&p->run_list);
733 spin_lock_init(&p->switch_lock);
734 #ifdef CONFIG_PREEMPT
736 * During context-switch we hold precisely one spinlock, which
737 * schedule_tail drops. (in the common case it's this_rq()->lock,
738 * but it also can be p->switch_lock.) So we compensate with a count
739 * of 1. Also, we want to start with kernel preemption disabled.
741 p->thread_info->preempt_count = 1;
744 * Share the timeslice between parent and child, thus the
745 * total amount of pending timeslices in the system doesn't change,
746 * resulting in more scheduling fairness.
749 p->time_slice = (current->time_slice + 1) >> 1;
751 * The remainder of the first timeslice might be recovered by
752 * the parent if the child exits early enough.
754 p->first_time_slice = 1;
755 current->time_slice >>= 1;
756 p->timestamp = sched_clock();
757 if (!current->time_slice) {
759 * This case is rare, it happens when the parent has only
760 * a single jiffy left from its timeslice. Taking the
761 * runqueue lock is not a problem.
763 current->time_slice = 1;
765 scheduler_tick(0, 0);
773 * wake_up_forked_process - wake up a freshly forked process.
775 * This function will do some initial scheduler statistics housekeeping
776 * that must be done for every newly created process.
778 void fastcall wake_up_forked_process(task_t * p)
781 runqueue_t *rq = task_rq_lock(current, &flags);
783 BUG_ON(p->state != TASK_RUNNING);
786 * We decrease the sleep average of forking parents
787 * and children as well, to keep max-interactive tasks
788 * from forking tasks that are max-interactive.
790 current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
791 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
793 p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
794 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
796 p->interactive_credit = 0;
798 p->prio = effective_prio(p);
799 set_task_cpu(p, smp_processor_id());
801 if (unlikely(!current->array))
802 __activate_task(p, rq);
804 p->prio = current->prio;
805 list_add_tail(&p->run_list, ¤t->run_list);
806 p->array = current->array;
807 p->array->nr_active++;
810 task_rq_unlock(rq, &flags);
814 * Potentially available exiting-child timeslices are
815 * retrieved here - this way the parent does not get
816 * penalized for creating too many threads.
818 * (this cannot be used to 'generate' timeslices
819 * artificially, because any timeslice recovered here
820 * was given away by the parent in the first place.)
822 void fastcall sched_exit(task_t * p)
827 local_irq_save(flags);
828 if (p->first_time_slice) {
829 p->parent->time_slice += p->time_slice;
830 if (unlikely(p->parent->time_slice > MAX_TIMESLICE))
831 p->parent->time_slice = MAX_TIMESLICE;
833 local_irq_restore(flags);
835 * If the child was a (relative-) CPU hog then decrease
836 * the sleep_avg of the parent as well.
838 rq = task_rq_lock(p->parent, &flags);
839 if (p->sleep_avg < p->parent->sleep_avg)
840 p->parent->sleep_avg = p->parent->sleep_avg /
841 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
843 task_rq_unlock(rq, &flags);
847 * finish_task_switch - clean up after a task-switch
848 * @prev: the thread we just switched away from.
850 * We enter this with the runqueue still locked, and finish_arch_switch()
851 * will unlock it along with doing any other architecture-specific cleanup
854 * Note that we may have delayed dropping an mm in context_switch(). If
855 * so, we finish that here outside of the runqueue lock. (Doing it
856 * with the lock held can cause deadlocks; see schedule() for
859 static inline void finish_task_switch(task_t *prev)
861 runqueue_t *rq = this_rq();
862 struct mm_struct *mm = rq->prev_mm;
863 unsigned long prev_task_flags;
868 * A task struct has one reference for the use as "current".
869 * If a task dies, then it sets TASK_ZOMBIE in tsk->state and calls
870 * schedule one last time. The schedule call will never return,
871 * and the scheduled task must drop that reference.
872 * The test for TASK_ZOMBIE must occur while the runqueue locks are
873 * still held, otherwise prev could be scheduled on another cpu, die
874 * there before we look at prev->state, and then the reference would
876 * Manfred Spraul <manfred@colorfullife.com>
878 prev_task_flags = prev->flags;
879 finish_arch_switch(rq, prev);
882 if (unlikely(prev_task_flags & PF_DEAD))
883 put_task_struct(prev);
887 * schedule_tail - first thing a freshly forked thread must call.
888 * @prev: the thread we just switched away from.
890 asmlinkage void schedule_tail(task_t *prev)
892 finish_task_switch(prev);
894 if (current->set_child_tid)
895 put_user(current->pid, current->set_child_tid);
899 * context_switch - switch to the new MM and the new
900 * thread's register state.
903 task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
905 struct mm_struct *mm = next->mm;
906 struct mm_struct *oldmm = prev->active_mm;
909 next->active_mm = oldmm;
910 atomic_inc(&oldmm->mm_count);
911 enter_lazy_tlb(oldmm, next);
913 switch_mm(oldmm, mm, next);
915 if (unlikely(!prev->mm)) {
916 prev->active_mm = NULL;
917 WARN_ON(rq->prev_mm);
921 /* Here we just switch the register state and the stack. */
922 switch_to(prev, next, prev);
928 * nr_running, nr_uninterruptible and nr_context_switches:
930 * externally visible scheduler statistics: current number of runnable
931 * threads, current number of uninterruptible-sleeping threads, total
932 * number of context switches performed since bootup.
934 unsigned long nr_running(void)
936 unsigned long i, sum = 0;
938 for (i = 0; i < NR_CPUS; i++)
939 sum += cpu_rq(i)->nr_running;
944 unsigned long nr_uninterruptible(void)
946 unsigned long i, sum = 0;
949 sum += cpu_rq(i)->nr_uninterruptible;
954 unsigned long long nr_context_switches(void)
956 unsigned long long i, sum = 0;
959 sum += cpu_rq(i)->nr_switches;
964 unsigned long nr_iowait(void)
966 unsigned long i, sum = 0;
969 sum += atomic_read(&cpu_rq(i)->nr_iowait);
975 * double_rq_lock - safely lock two runqueues
977 * Note this does not disable interrupts like task_rq_lock,
978 * you need to do so manually before calling.
980 static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
983 spin_lock(&rq1->lock);
986 spin_lock(&rq1->lock);
987 spin_lock(&rq2->lock);
989 spin_lock(&rq2->lock);
990 spin_lock(&rq1->lock);
996 * double_rq_unlock - safely unlock two runqueues
998 * Note this does not restore interrupts like task_rq_unlock,
999 * you need to do so manually after calling.
1001 static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
1003 spin_unlock(&rq1->lock);
1005 spin_unlock(&rq2->lock);
1010 * If dest_cpu is allowed for this process, migrate the task to it.
1011 * This is accomplished by forcing the cpu_allowed mask to only
1012 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
1013 * the cpu_allowed mask is restored.
1015 static void sched_migrate_task(task_t *p, int dest_cpu)
1018 migration_req_t req;
1019 unsigned long flags;
1020 cpumask_t old_mask, new_mask = cpumask_of_cpu(dest_cpu);
1023 rq = task_rq_lock(p, &flags);
1024 old_mask = p->cpus_allowed;
1025 if (!cpu_isset(dest_cpu, old_mask) || !cpu_online(dest_cpu))
1028 /* force the process onto the specified CPU */
1029 if (__set_cpus_allowed(p, new_mask, &req)) {
1030 /* Need to wait for migration thread. */
1031 task_rq_unlock(rq, &flags);
1032 wake_up_process(rq->migration_thread);
1033 wait_for_completion(&req.done);
1035 /* If we raced with sys_sched_setaffinity, don't
1037 rq = task_rq_lock(p, &flags);
1038 if (likely(cpus_equal(p->cpus_allowed, new_mask))) {
1039 /* Restore old mask: won't need migration
1040 * thread, since current cpu is allowed. */
1041 BUG_ON(__set_cpus_allowed(p, old_mask, NULL));
1045 task_rq_unlock(rq, &flags);
1046 unlock_cpu_hotplug();
1050 * Find the least loaded CPU. Slightly favor the current CPU by
1051 * setting its runqueue length as the minimum to start.
1053 static int sched_best_cpu(struct task_struct *p)
1055 int i, minload, load, best_cpu, node = 0;
1058 best_cpu = task_cpu(p);
1059 if (cpu_rq(best_cpu)->nr_running <= 2)
1063 for_each_node_with_cpus(i) {
1065 * Node load is always divided by nr_cpus_node to normalise
1066 * load values in case cpu count differs from node to node.
1067 * We first multiply node_nr_running by 10 to get a little
1068 * better resolution.
1070 load = 10 * atomic_read(&node_nr_running[i]) / nr_cpus_node(i);
1071 if (load < minload) {
1078 cpumask = node_to_cpumask(node);
1079 for (i = 0; i < NR_CPUS; ++i) {
1080 if (!cpu_isset(i, cpumask))
1082 if (cpu_rq(i)->nr_running < minload) {
1084 minload = cpu_rq(i)->nr_running;
1090 void sched_balance_exec(void)
1095 new_cpu = sched_best_cpu(current);
1096 if (new_cpu != smp_processor_id())
1097 sched_migrate_task(current, new_cpu);
1102 * Find the busiest node. All previous node loads contribute with a
1103 * geometrically deccaying weight to the load measure:
1104 * load_{t} = load_{t-1}/2 + nr_node_running_{t}
1105 * This way sudden load peaks are flattened out a bit.
1106 * Node load is divided by nr_cpus_node() in order to compare nodes
1107 * of different cpu count but also [first] multiplied by 10 to
1108 * provide better resolution.
1110 static int find_busiest_node(int this_node)
1112 int i, node = -1, load, this_load, maxload;
1114 if (!nr_cpus_node(this_node))
1116 this_load = maxload = (this_rq()->prev_node_load[this_node] >> 1)
1117 + (10 * atomic_read(&node_nr_running[this_node])
1118 / nr_cpus_node(this_node));
1119 this_rq()->prev_node_load[this_node] = this_load;
1120 for_each_node_with_cpus(i) {
1123 load = (this_rq()->prev_node_load[i] >> 1)
1124 + (10 * atomic_read(&node_nr_running[i])
1126 this_rq()->prev_node_load[i] = load;
1127 if (load > maxload && (100*load > NODE_THRESHOLD*this_load)) {
1135 #endif /* CONFIG_NUMA */
1140 * double_lock_balance - lock the busiest runqueue
1142 * this_rq is locked already. Recalculate nr_running if we have to
1143 * drop the runqueue lock.
1146 unsigned int double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest,
1147 int this_cpu, int idle,
1148 unsigned int nr_running)
1150 if (unlikely(!spin_trylock(&busiest->lock))) {
1151 if (busiest < this_rq) {
1152 spin_unlock(&this_rq->lock);
1153 spin_lock(&busiest->lock);
1154 spin_lock(&this_rq->lock);
1155 /* Need to recalculate nr_running */
1156 if (idle || (this_rq->nr_running >
1157 this_rq->prev_cpu_load[this_cpu]))
1158 nr_running = this_rq->nr_running;
1160 nr_running = this_rq->prev_cpu_load[this_cpu];
1162 spin_lock(&busiest->lock);
1168 * find_busiest_queue - find the busiest runqueue among the cpus in cpumask.
1171 runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle,
1172 int *imbalance, cpumask_t cpumask)
1174 int nr_running, load, max_load, i;
1175 runqueue_t *busiest, *rq_src;
1178 * We search all runqueues to find the most busy one.
1179 * We do this lockless to reduce cache-bouncing overhead,
1180 * we re-check the 'best' source CPU later on again, with
1183 * We fend off statistical fluctuations in runqueue lengths by
1184 * saving the runqueue length (as seen by the balancing CPU) during
1185 * the previous load-balancing operation and using the smaller one
1186 * of the current and saved lengths. If a runqueue is long enough
1187 * for a longer amount of time then we recognize it and pull tasks
1190 * The 'current runqueue length' is a statistical maximum variable,
1191 * for that one we take the longer one - to avoid fluctuations in
1192 * the other direction. So for a load-balance to happen it needs
1193 * stable long runqueue on the target CPU and stable short runqueue
1194 * on the local runqueue.
1196 * We make an exception if this CPU is about to become idle - in
1197 * that case we are less picky about moving a task across CPUs and
1198 * take what can be taken.
1200 if (idle || (this_rq->nr_running > this_rq->prev_cpu_load[this_cpu]))
1201 nr_running = this_rq->nr_running;
1203 nr_running = this_rq->prev_cpu_load[this_cpu];
1207 for (i = 0; i < NR_CPUS; i++) {
1208 if (!cpu_isset(i, cpumask))
1212 if (idle || (rq_src->nr_running < this_rq->prev_cpu_load[i]))
1213 load = rq_src->nr_running;
1215 load = this_rq->prev_cpu_load[i];
1216 this_rq->prev_cpu_load[i] = rq_src->nr_running;
1218 if ((load > max_load) && (rq_src != this_rq)) {
1224 if (likely(!busiest))
1227 *imbalance = max_load - nr_running;
1229 /* It needs an at least ~25% imbalance to trigger balancing. */
1230 if (!idle && ((*imbalance)*4 < max_load)) {
1235 nr_running = double_lock_balance(this_rq, busiest, this_cpu,
1238 * Make sure nothing changed since we checked the
1241 if (busiest->nr_running <= nr_running) {
1242 spin_unlock(&busiest->lock);
1250 * pull_task - move a task from a remote runqueue to the local runqueue.
1251 * Both runqueues must be locked.
1254 void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1255 runqueue_t *this_rq, int this_cpu)
1257 dequeue_task(p, src_array);
1258 nr_running_dec(src_rq);
1259 set_task_cpu(p, this_cpu);
1260 nr_running_inc(this_rq);
1261 enqueue_task(p, this_rq->active);
1262 p->timestamp = sched_clock() -
1263 (src_rq->timestamp_last_tick - p->timestamp);
1265 * Note that idle threads have a prio of MAX_PRIO, for this test
1266 * to be always true for them.
1268 if (TASK_PREEMPTS_CURR(p, this_rq))
1273 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1276 int can_migrate_task(task_t *tsk, runqueue_t *rq, int this_cpu, int idle)
1278 unsigned long delta = rq->timestamp_last_tick - tsk->timestamp;
1281 * We do not migrate tasks that are:
1282 * 1) running (obviously), or
1283 * 2) cannot be migrated to this CPU due to cpus_allowed, or
1284 * 3) are cache-hot on their current CPU.
1286 if (task_running(rq, tsk))
1288 if (!cpu_isset(this_cpu, tsk->cpus_allowed))
1290 if (!idle && (delta <= JIFFIES_TO_NS(cache_decay_ticks)))
1296 * Current runqueue is empty, or rebalance tick: if there is an
1297 * imbalance (current runqueue is too short) then pull from
1298 * busiest runqueue(s).
1300 * We call this with the current runqueue locked,
1303 static void load_balance(runqueue_t *this_rq, int idle, cpumask_t cpumask)
1305 int imbalance, idx, this_cpu = smp_processor_id();
1306 runqueue_t *busiest;
1307 prio_array_t *array;
1308 struct list_head *head, *curr;
1311 if (cpu_is_offline(this_cpu))
1314 busiest = find_busiest_queue(this_rq, this_cpu, idle,
1315 &imbalance, cpumask);
1320 * We only want to steal a number of tasks equal to 1/2 the imbalance,
1321 * otherwise we'll just shift the imbalance to the new queue:
1326 * We first consider expired tasks. Those will likely not be
1327 * executed in the near future, and they are most likely to
1328 * be cache-cold, thus switching CPUs has the least effect
1331 if (busiest->expired->nr_active)
1332 array = busiest->expired;
1334 array = busiest->active;
1337 /* Start searching at priority 0: */
/* First pass scans from priority 0; subsequent passes continue from idx. */
1341 idx = sched_find_first_bit(array->bitmap);
1343 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
/* Expired array exhausted: fall back to the active array once. */
1344 if (idx >= MAX_PRIO) {
1345 if (array == busiest->expired) {
1346 array = busiest->active;
1352 head = array->queue + idx;
1355 tmp = list_entry(curr, task_t, run_list);
1359 if (!can_migrate_task(tmp, busiest, this_cpu, idle)) {
1365 pull_task(busiest, array, tmp, this_rq, this_cpu);
1367 /* Only migrate one task if we are idle */
1368 if (!idle && --imbalance) {
1375 spin_unlock(&busiest->lock);
1381 * One of the idle_cpu_tick() and busy_cpu_tick() functions will
1382 * get called every timer tick, on every CPU. Our balancing action
1383 * frequency and balancing aggressiveness depends on whether the CPU is
1386 * busy-rebalance every 200 msecs. idle-rebalance every 1 msec. (or on
1387 * systems with HZ=100, every 10 msecs.)
1389 * On NUMA, do a node-rebalance every 400 msecs.
/* The "?: 1" GNU extension guards against a zero interval when HZ is small. */
1391 #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
1392 #define BUSY_REBALANCE_TICK (HZ/5 ?: 1)
1393 #define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 5)
1394 #define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2)
/* Rebalance this CPU against the busiest NUMA node (plus ourselves). */
1397 static void balance_node(runqueue_t *this_rq, int idle, int this_cpu)
1399 int node = find_busiest_node(cpu_to_node(this_cpu));
1402 cpumask_t cpumask = node_to_cpumask(node);
/* Include ourselves so load_balance() can compare against this_rq. */
1403 cpu_set(this_cpu, cpumask);
1404 spin_lock(&this_rq->lock);
1405 load_balance(this_rq, idle, cpumask);
1406 spin_unlock(&this_rq->lock);
/* Called from scheduler_tick(); picks idle vs busy rebalance cadence. */
1411 static void rebalance_tick(runqueue_t *this_rq, int idle)
1414 int this_cpu = smp_processor_id();
1416 unsigned long j = jiffies;
1419 * First do inter-node rebalancing, then intra-node rebalancing,
1420 * if both events happen in the same tick. The inter-node
1421 * rebalancing does not necessarily have to create a perfect
1422 * balance within the node, since we load-balance the most loaded
1423 * node with the current CPU. (ie. other CPUs in the local node
1424 * are not balanced.)
/* Idle CPU: rebalance aggressively (every IDLE_*_TICK jiffies). */
1428 if (!(j % IDLE_NODE_REBALANCE_TICK))
1429 balance_node(this_rq, idle, this_cpu);
1431 if (!(j % IDLE_REBALANCE_TICK)) {
1432 spin_lock(&this_rq->lock);
1433 load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
1434 spin_unlock(&this_rq->lock);
/* Busy CPU: rebalance rarely to limit cache/lock disturbance. */
1439 if (!(j % BUSY_NODE_REBALANCE_TICK))
1440 balance_node(this_rq, idle, this_cpu);
1442 if (!(j % BUSY_REBALANCE_TICK)) {
1443 spin_lock(&this_rq->lock);
1444 load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
1445 spin_unlock(&this_rq->lock);
1450 * on UP we do not need to balance between CPUs:
/* UP stub: same signature as the SMP version; body is empty (elided here). */
1452 static inline void rebalance_tick(runqueue_t *this_rq, int idle)
/* Per-CPU kernel statistics (user/system/irq/iowait tick accounting). */
1457 DEFINE_PER_CPU(struct kernel_stat, kstat);
1459 EXPORT_PER_CPU_SYMBOL(kstat);
1462 * We place interactive tasks back into the active array, if possible.
1464 * To guarantee that this does not starve expired tasks we ignore the
1465 * interactivity of a task if the first expired task had to wait more
1466 * than a 'reasonable' amount of time. This deadline timeout is
1467 * load-dependent, as the frequency of array switched decreases with
1468 * increasing number of running tasks. We also ignore the interactivity
1469 * if a better static_prio task has expired:
/* True when expired tasks have waited too long (scaled by nr_running),
 * or a higher-static-priority task already sits in the expired array. */
1471 #define EXPIRED_STARVING(rq) \
1472 ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
1473 (jiffies - (rq)->expired_timestamp >= \
1474 STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
1475 ((rq)->curr->static_prio > (rq)->best_expired_prio))
1478 * This function gets called by the timer code, with HZ frequency.
1479 * We call it with interrupts disabled.
1481 * It also gets called by the fork code, when changing the parent's
1484 void scheduler_tick(int user_ticks, int sys_ticks)
1486 int cpu = smp_processor_id();
1487 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1488 runqueue_t *rq = this_rq();
1489 task_t *p = current;
1491 rq->timestamp_last_tick = sched_clock();
1493 if (rcu_pending(cpu))
1494 rcu_check_callbacks(cpu, user_ticks);
1496 /* note: this timer irq context must be accounted for as well */
/* hardirq_count() includes this timer irq itself, hence the offset. */
1497 if (hardirq_count() - HARDIRQ_OFFSET) {
1498 cpustat->irq += sys_ticks;
1500 } else if (softirq_count()) {
1501 cpustat->softirq += sys_ticks;
/* Idle task: account idle/iowait time and rebalance in idle mode. */
1505 if (p == rq->idle) {
1506 if (atomic_read(&rq->nr_iowait) > 0)
1507 cpustat->iowait += sys_ticks;
1509 cpustat->idle += sys_ticks;
1510 rebalance_tick(rq, 1);
/* Positive nice (lowered priority) time is accounted separately. */
1513 if (TASK_NICE(p) > 0)
1514 cpustat->nice += user_ticks;
1516 cpustat->user += user_ticks;
1517 cpustat->system += sys_ticks;
1519 /* Task might have expired already, but not scheduled off yet */
1520 if (p->array != rq->active) {
1521 set_tsk_need_resched(p);
1524 spin_lock(&rq->lock);
1526 * The task was running during this tick - update the
1527 * time slice counter. Note: we do not update a thread's
1528 * priority until it either goes to sleep or uses up its
1529 * timeslice. This makes it possible for interactive tasks
1530 * to use up their timeslices at their highest priority levels.
1532 if (unlikely(rt_task(p))) {
1534 * RR tasks need a special form of timeslice management.
1535 * FIFO tasks have no timeslices.
1537 if ((p->policy == SCHED_RR) && !--p->time_slice) {
1538 p->time_slice = task_timeslice(p);
1539 p->first_time_slice = 0;
1540 set_tsk_need_resched(p);
1542 /* put it at the end of the queue: */
1543 dequeue_task(p, rq->active);
1544 enqueue_task(p, rq->active);
/* Non-RT task used up its slice: recompute priority and requeue. */
1548 if (!--p->time_slice) {
1549 dequeue_task(p, rq->active);
1550 set_tsk_need_resched(p);
1551 p->prio = effective_prio(p);
1552 p->time_slice = task_timeslice(p);
1553 p->first_time_slice = 0;
1555 if (!rq->expired_timestamp)
1556 rq->expired_timestamp = jiffies;
/* Interactive tasks go back to the active array unless that would
 * starve the expired array (see EXPIRED_STARVING above). */
1557 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
1558 enqueue_task(p, rq->expired);
1559 if (p->static_prio < rq->best_expired_prio)
1560 rq->best_expired_prio = p->static_prio;
1562 enqueue_task(p, rq->active);
1565 * Prevent a too long timeslice allowing a task to monopolize
1566 * the CPU. We do this by splitting up the timeslice into
1569 * Note: this does not mean the task's timeslices expire or
1570 * get lost in any way, they just might be preempted by
1571 * another task of equal priority. (one with higher
1572 * priority would have preempted this task already.) We
1573 * requeue this task to the end of the list on this priority
1574 * level, which is in essence a round-robin of tasks with
1577 * This only applies to tasks in the interactive
1578 * delta range with at least TIMESLICE_GRANULARITY to requeue.
1580 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
1581 p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
1582 (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
1583 (p->array == rq->active)) {
1585 dequeue_task(p, rq->active);
1586 set_tsk_need_resched(p);
1587 p->prio = effective_prio(p);
1588 enqueue_task(p, rq->active);
1592 spin_unlock(&rq->lock);
/* Not idle here: rebalance in busy mode (longer intervals). */
1594 rebalance_tick(rq, 0);
1598 * schedule() is the main scheduler function.
1600 asmlinkage void __sched schedule(void)
1603 task_t *prev, *next;
1605 prio_array_t *array;
1606 struct list_head *queue;
1607 unsigned long long now;
1608 unsigned long run_time;
1612 * Test if we are atomic. Since do_exit() needs to call into
1613 * schedule() atomically, we ignore that path for now.
1614 * Otherwise, whine if we are scheduling when we should not be.
1616 if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) {
1617 if (unlikely(in_atomic())) {
1618 printk(KERN_ERR "bad: scheduling while atomic!\n");
1628 release_kernel_lock(prev);
1629 now = sched_clock();
/* Clamp the charged run time so one long burst cannot wipe out more
 * than NS_MAX_SLEEP_AVG of accumulated sleep credit. */
1630 if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
1631 run_time = now - prev->timestamp;
1633 run_time = NS_MAX_SLEEP_AVG;
1636 * Tasks with interactive credits get charged less run_time
1637 * at high sleep_avg to delay them losing their interactive
1640 if (HIGH_CREDIT(prev))
1641 run_time /= (CURRENT_BONUS(prev) ? : 1);
1643 spin_lock_irq(&rq->lock);
1646 * if entering off of a kernel preemption go straight
1647 * to picking the next task.
1649 switch_count = &prev->nivcsw;
1650 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
1651 switch_count = &prev->nvcsw;
/* A pending signal turns an interruptible sleep into a wakeup. */
1652 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
1653 unlikely(signal_pending(prev))))
1654 prev->state = TASK_RUNNING;
1656 deactivate_task(prev, rq);
/* Runqueue empty: try to pull work before resorting to the idle task. */
1659 if (unlikely(!rq->nr_running)) {
1661 load_balance(rq, 1, cpu_to_node_mask(smp_processor_id()));
1663 if (!rq->nr_running) {
1665 rq->expired_timestamp = 0;
1671 if (unlikely(!array->nr_active)) {
1673 * Switch the active and expired arrays.
1675 rq->active = rq->expired;
1676 rq->expired = array;
1678 rq->expired_timestamp = 0;
1679 rq->best_expired_prio = MAX_PRIO;
1682 idx = sched_find_first_bit(array->bitmap);
1683 queue = array->queue + idx;
1684 next = list_entry(queue->next, task_t, run_list);
/* Freshly-woken non-RT task: credit (part of) its wait as sleep. */
1686 if (!rt_task(next) && next->activated > 0) {
1687 unsigned long long delta = now - next->timestamp;
/* activated == 1 means woken while already on a runqueue: its wait
 * time is only partially credited (ON_RUNQUEUE_WEIGHT percent). */
1689 if (next->activated == 1)
1690 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
1692 array = next->array;
1693 dequeue_task(next, array);
1694 recalc_task_prio(next, next->timestamp + delta);
1695 enqueue_task(next, array);
1697 next->activated = 0;
1700 clear_tsk_need_resched(prev);
1701 RCU_qsctr(task_cpu(prev))++;
1703 prev->sleep_avg -= run_time;
1704 if ((long)prev->sleep_avg <= 0) {
1705 prev->sleep_avg = 0;
1706 if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev)))
1707 prev->interactive_credit--;
1709 add_delay_ts(prev,runcpu_total,prev->timestamp,now);
1710 prev->timestamp = now;
1712 if (likely(prev != next)) {
1713 add_delay_ts(next,waitcpu_total,next->timestamp,now);
1714 inc_delay(next,runs);
1715 next->timestamp = now;
1720 prepare_arch_switch(rq, next);
/* context_switch() returns the task we switched away from. */
1721 prev = context_switch(rq, prev, next);
1724 finish_task_switch(prev);
1726 spin_unlock_irq(&rq->lock);
1728 reacquire_kernel_lock(current);
1729 preempt_enable_no_resched();
/* A resched may have been requested while we were switching: retry. */
1730 if (test_thread_flag(TIF_NEED_RESCHED))
1734 EXPORT_SYMBOL(schedule);
1736 #ifdef CONFIG_PREEMPT
1738 * this is is the entry point to schedule() from in-kernel preemption
1739 * off of preempt_enable. Kernel preemptions off return from interrupt
1740 * occur there and call schedule directly.
1742 asmlinkage void __sched preempt_schedule(void)
1744 struct thread_info *ti = current_thread_info();
1747 * If there is a non-zero preempt_count or interrupts are disabled,
1748 * we do not want to preempt the current task. Just return..
1750 if (unlikely(ti->preempt_count || irqs_disabled()))
/* PREEMPT_ACTIVE tells schedule() this is a kernel preemption, so the
 * (possibly non-running) task must not be deactivated. */
1754 ti->preempt_count = PREEMPT_ACTIVE;
1756 ti->preempt_count = 0;
1758 /* we could miss a preemption opportunity between schedule and now */
1760 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
1764 EXPORT_SYMBOL(preempt_schedule);
1765 #endif /* CONFIG_PREEMPT */
1767 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync)
1769 task_t *p = curr->task;
1770 return try_to_wake_up(p, mode, sync);
1773 EXPORT_SYMBOL(default_wake_function);
1776 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
1777 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
1778 * number) then we wake all the non-exclusive tasks and one exclusive task.
1780 * There are circumstances in which we can try to wake a task which has already
1781 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
1782 * zero in this (rare) case, and we handle it by continuing to scan the queue.
1784 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
1785 int nr_exclusive, int sync)
1787 struct list_head *tmp, *next;
/* _safe variant: the callback may remove curr from the list. */
1789 list_for_each_safe(tmp, next, &q->task_list) {
1792 curr = list_entry(tmp, wait_queue_t, task_list);
/* Snapshot flags before the callback, which may free the entry. */
1793 flags = curr->flags;
1794 if (curr->func(curr, mode, sync) &&
1795 (flags & WQ_FLAG_EXCLUSIVE) &&
1802 * __wake_up - wake up threads blocked on a waitqueue.
1804 * @mode: which threads
1805 * @nr_exclusive: how many wake-one or wake-many threads to wake up
1807 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
1809 unsigned long flags;
/* Non-sync wakeup (sync == 0): woken tasks may migrate/preempt. */
1811 spin_lock_irqsave(&q->lock, flags);
1812 __wake_up_common(q, mode, nr_exclusive, 0);
1813 spin_unlock_irqrestore(&q->lock, flags);
1816 EXPORT_SYMBOL(__wake_up);
1819 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
/* Wakes exactly one exclusive waiter (nr_exclusive fixed at 1). */
1821 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
1823 __wake_up_common(q, mode, 1, 0);
1827 * __wake_up_sync - sync wake up threads blocked on a waitqueue.
1829 * @mode: which threads
1830 * @nr_exclusive: how many wake-one or wake-many threads to wake up
1832 * The sync wakeup differs that the waker knows that it will schedule
1833 * away soon, so while the target thread will be woken up, it will not
1834 * be migrated to another CPU - ie. the two threads are 'synchronized'
1835 * with each other. This can prevent needless bouncing between CPUs.
1837 * On UP it can prevent extra preemption.
1839 void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
1841 unsigned long flags;
1846 spin_lock_irqsave(&q->lock, flags);
/* sync semantics only make sense for exclusive wakeups. */
1847 if (likely(nr_exclusive))
1848 __wake_up_common(q, mode, nr_exclusive, 1);
1850 __wake_up_common(q, mode, nr_exclusive, 0);
1851 spin_unlock_irqrestore(&q->lock, flags);
1853 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
/* Signal one completion: bump x->done (elided here) and wake one waiter. */
1855 void fastcall complete(struct completion *x)
1857 unsigned long flags;
1859 spin_lock_irqsave(&x->wait.lock, flags);
1861 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
1863 spin_unlock_irqrestore(&x->wait.lock, flags);
1865 EXPORT_SYMBOL(complete);
/* Signal "everyone": make done effectively inexhaustible and wake all. */
1867 void fastcall complete_all(struct completion *x)
1869 unsigned long flags;
1871 spin_lock_irqsave(&x->wait.lock, flags);
/* UINT_MAX/2 is large enough that every current waiter sees done > 0. */
1872 x->done += UINT_MAX/2;
1873 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
1875 spin_unlock_irqrestore(&x->wait.lock, flags);
1877 EXPORT_SYMBOL(complete_all);
/* Block (uninterruptibly) until complete() is called on x. */
1879 void fastcall __sched wait_for_completion(struct completion *x)
1882 spin_lock_irq(&x->wait.lock);
1884 DECLARE_WAITQUEUE(wait, current);
/* Exclusive waiter queued at the tail: FIFO wake order, one per complete(). */
1886 wait.flags |= WQ_FLAG_EXCLUSIVE;
1887 __add_wait_queue_tail(&x->wait, &wait);
1889 __set_current_state(TASK_UNINTERRUPTIBLE);
1890 spin_unlock_irq(&x->wait.lock);
1892 spin_lock_irq(&x->wait.lock);
1894 __remove_wait_queue(&x->wait, &wait);
1897 spin_unlock_irq(&x->wait.lock);
1899 EXPORT_SYMBOL(wait_for_completion);
/* Shared boilerplate for the legacy sleep_on() family below. */
1901 #define SLEEP_ON_VAR \
1902 unsigned long flags; \
1903 wait_queue_t wait; \
1904 init_waitqueue_entry(&wait, current);
/* Queue ourselves, then drop the lock (irqs stay disabled via flags). */
1906 #define SLEEP_ON_HEAD \
1907 spin_lock_irqsave(&q->lock,flags); \
1908 __add_wait_queue(q, &wait); \
1909 spin_unlock(&q->lock);
/* After waking: dequeue and restore the interrupt state saved above. */
1911 #define SLEEP_ON_TAIL \
1912 spin_lock_irq(&q->lock); \
1913 __remove_wait_queue(q, &wait); \
1914 spin_unlock_irqrestore(&q->lock, flags);
/* Legacy sleep primitives; bodies use the SLEEP_ON_* macros above
 * (several lines elided in this extract). */
1916 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
1920 current->state = TASK_INTERRUPTIBLE;
1927 EXPORT_SYMBOL(interruptible_sleep_on);
1929 long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
1933 current->state = TASK_INTERRUPTIBLE;
1936 timeout = schedule_timeout(timeout);
1942 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
1944 void fastcall __sched sleep_on(wait_queue_head_t *q)
1948 current->state = TASK_UNINTERRUPTIBLE;
1955 EXPORT_SYMBOL(sleep_on);
1957 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
1961 current->state = TASK_UNINTERRUPTIBLE;
1964 timeout = schedule_timeout(timeout);
1970 EXPORT_SYMBOL(sleep_on_timeout);
/* Change p's nice value [-20..19]; requeues and reschedules as needed. */
1972 void set_user_nice(task_t *p, long nice)
1974 unsigned long flags;
1975 prio_array_t *array;
1977 int old_prio, new_prio, delta;
/* No-op for an unchanged or out-of-range nice value. */
1979 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
1982 * We have to be careful, if called from sys_setpriority(),
1983 * the task might be in the middle of scheduling on another CPU.
1985 rq = task_rq_lock(p, &flags);
1987 * The RT priorities are set via setscheduler(), but we still
1988 * allow the 'normal' nice value to be set - but as expected
1989 * it wont have any effect on scheduling until the task is
1993 p->static_prio = NICE_TO_PRIO(nice);
/* Dequeue while on a runqueue so the priority change requeues cleanly. */
1998 dequeue_task(p, array);
2001 new_prio = NICE_TO_PRIO(nice);
2002 delta = new_prio - old_prio;
2003 p->static_prio = NICE_TO_PRIO(nice);
2007 enqueue_task(p, array);
2009 * If the task increased its priority or is running and
2010 * lowered its priority, then reschedule its CPU:
2012 if (delta < 0 || (delta > 0 && task_running(rq, p)))
2013 resched_task(rq->curr);
2016 task_rq_unlock(rq, &flags);
2019 EXPORT_SYMBOL(set_user_nice);
2024 * sys_nice - change the priority of the current process.
2025 * @increment: priority increment
2027 * sys_setpriority is a more generic, but much slower function that
2028 * does similar things.
2030 asmlinkage long sys_nice(int increment)
2036 * Setpriority might change our priority at the same moment.
2037 * We don't have to worry. Conceptually one call occurs first
2038 * and we have a single winner.
/* Lowering nice (raising priority) requires CAP_SYS_NICE. */
2040 if (increment < 0) {
2041 if (!capable(CAP_SYS_NICE))
2043 if (increment < -40)
2049 nice = PRIO_TO_NICE(current->static_prio) + increment;
2055 retval = security_task_setnice(current, nice);
2059 set_user_nice(current, nice);
2066 * task_prio - return the priority value of a given task.
2067 * @p: the task in question.
2069 * This is the priority value as seen by users in /proc.
2070 * RT tasks are offset by -200. Normal tasks are centered
2071 * around 0, value goes from -16 to +15.
2073 int task_prio(task_t *p)
2075 return p->prio - MAX_RT_PRIO;
2079 * task_nice - return the nice value of a given task.
2080 * @p: the task in question.
2082 int task_nice(task_t *p)
2084 return TASK_NICE(p);
2087 EXPORT_SYMBOL(task_nice);
2090 * idle_cpu - is a given cpu idle currently?
2091 * @cpu: the processor in question.
2093 int idle_cpu(int cpu)
2095 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
2098 EXPORT_SYMBOL_GPL(idle_cpu);
2101 * find_process_by_pid - find a process with a matching PID value.
2102 * @pid: the pid in question.
2104 static inline task_t *find_process_by_pid(pid_t pid)
2106 return pid ? find_task_by_pid(pid) : current;
2109 /* Actually do priority change: must hold rq lock. */
2110 static void __setscheduler(struct task_struct *p, int policy, int prio)
2114 p->rt_priority = prio;
/* RT policies map rt_priority onto the [0..MAX_USER_RT_PRIO-1) band
 * (higher rt_priority -> numerically lower prio). */
2115 if (policy != SCHED_NORMAL)
2116 p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
2118 p->prio = p->static_prio;
2122 * setscheduler - change the scheduling policy and/or RT priority of a thread.
2124 static int setscheduler(pid_t pid, int policy, struct sched_param __user *param)
2126 struct sched_param lp;
2127 int retval = -EINVAL;
2129 prio_array_t *array;
2130 unsigned long flags;
2134 if (!param || pid < 0)
2138 if (copy_from_user(&lp, param, sizeof(struct sched_param)))
2142 * We play safe to avoid deadlocks.
2144 read_lock_irq(&tasklist_lock);
2146 p = find_process_by_pid(pid);
2150 goto out_unlock_tasklist;
2153 * To be able to change p->policy safely, the apropriate
2154 * runqueue lock must be held.
2156 rq = task_rq_lock(p, &flags);
/* policy < 0 (from sys_sched_setparam) means "keep current policy". */
2162 if (policy != SCHED_FIFO && policy != SCHED_RR &&
2163 policy != SCHED_NORMAL)
2168 * Valid priorities for SCHED_FIFO and SCHED_RR are
2169 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
2172 if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1)
2174 if ((policy == SCHED_NORMAL) != (lp.sched_priority == 0))
/* RT policies and changing other users' tasks need CAP_SYS_NICE. */
2178 if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
2179 !capable(CAP_SYS_NICE))
2181 if ((current->euid != p->euid) && (current->euid != p->uid) &&
2182 !capable(CAP_SYS_NICE))
2185 retval = security_task_setscheduler(p, policy, &lp);
/* Dequeue while changing priority, then requeue under the new one. */
2191 deactivate_task(p, task_rq(p));
2194 __setscheduler(p, policy, lp.sched_priority);
2196 __activate_task(p, task_rq(p));
2198 * Reschedule if we are currently running on this runqueue and
2199 * our priority decreased, or if we are not currently running on
2200 * this runqueue and our priority is higher than the current's
2202 if (task_running(rq, p)) {
2203 if (p->prio > oldprio)
2204 resched_task(rq->curr);
2205 } else if (p->prio < rq->curr->prio)
2206 resched_task(rq->curr);
2210 task_rq_unlock(rq, &flags);
2211 out_unlock_tasklist:
2212 read_unlock_irq(&tasklist_lock);
2219 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
2220 * @pid: the pid in question.
2221 * @policy: new policy
2222 * @param: structure containing the new RT priority.
/* Thin syscall wrapper; all validation happens in setscheduler(). */
2224 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
2225 struct sched_param __user *param)
2227 return setscheduler(pid, policy, param);
2231 * sys_sched_setparam - set/change the RT priority of a thread
2232 * @pid: the pid in question.
2233 * @param: structure containing the new RT priority.
/* policy == -1 tells setscheduler() to keep the current policy. */
2235 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
2237 return setscheduler(pid, -1, param);
2241 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
2242 * @pid: the pid in question.
2244 asmlinkage long sys_sched_getscheduler(pid_t pid)
2246 int retval = -EINVAL;
2253 read_lock(&tasklist_lock);
2254 p = find_process_by_pid(pid);
/* On success the return value is p->policy (assignment elided here). */
2256 retval = security_task_getscheduler(p);
2260 read_unlock(&tasklist_lock);
2267 * sys_sched_getparam - get the RT priority of a thread
2268 * @pid: the pid in question.
2269 * @param: structure containing the RT priority.
2271 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
2273 struct sched_param lp;
2274 int retval = -EINVAL;
2277 if (!param || pid < 0)
2280 read_lock(&tasklist_lock);
2281 p = find_process_by_pid(pid);
2286 retval = security_task_getscheduler(p);
2290 lp.sched_priority = p->rt_priority;
/* Drop tasklist_lock before copy_to_user, which may sleep on a fault. */
2291 read_unlock(&tasklist_lock);
2294 * This one might sleep, we cannot do it with a spinlock held ...
2296 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
2302 read_unlock(&tasklist_lock);
2307 * sys_sched_setaffinity - set the cpu affinity of a process
2308 * @pid: pid of the process
2309 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
2310 * @user_mask_ptr: user-space pointer to the new cpu mask
2312 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
2313 unsigned long __user *user_mask_ptr)
2319 if (len < sizeof(new_mask))
2322 if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask)))
2326 read_lock(&tasklist_lock);
2328 p = find_process_by_pid(pid);
/* Task not found: release everything (cpu hotplug lock taken earlier,
 * acquisition elided in this extract). */
2330 read_unlock(&tasklist_lock);
2331 unlock_cpu_hotplug();
2336 * It is not safe to call set_cpus_allowed with the
2337 * tasklist_lock held. We will bump the task_struct's
2338 * usage count and then drop tasklist_lock.
2341 read_unlock(&tasklist_lock);
/* Only the owner (euid matches) or CAP_SYS_NICE may change affinity. */
2344 if ((current->euid != p->euid) && (current->euid != p->uid) &&
2345 !capable(CAP_SYS_NICE))
2348 retval = set_cpus_allowed(p, new_mask);
2352 unlock_cpu_hotplug();
2357 * sys_sched_getaffinity - get the cpu affinity of a process
2358 * @pid: pid of the process
2359 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
2360 * @user_mask_ptr: user-space pointer to hold the current cpu mask
2362 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
2363 unsigned long __user *user_mask_ptr)
2365 unsigned int real_len;
2370 real_len = sizeof(mask);
2374 read_lock(&tasklist_lock);
2377 p = find_process_by_pid(pid);
/* Report only CPUs that can ever exist, not the raw allowed mask. */
2382 cpus_and(mask, p->cpus_allowed, cpu_possible_map);
2385 read_unlock(&tasklist_lock);
2388 if (copy_to_user(user_mask_ptr, &mask, real_len))
2394 * sys_sched_yield - yield the current processor to other threads.
2396 * this function yields the current CPU by moving the calling thread
2397 * to the expired array. If there are no other threads running on this
2398 * CPU then this function will return.
2400 asmlinkage long sys_sched_yield(void)
2402 runqueue_t *rq = this_rq_lock();
2403 prio_array_t *array = current->array;
2406 * We implement yielding by moving the task into the expired
2409 * (special rule: RT tasks will just roundrobin in the active
2412 if (likely(!rt_task(current))) {
2413 dequeue_task(current, array);
2414 enqueue_task(current, rq->expired);
2416 list_del(¤t->run_list);
2417 list_add_tail(¤t->run_list, array->queue + current->prio);
2420 * Since we are going to call schedule() anyway, there's
2421 * no need to preempt:
2423 _raw_spin_unlock(&rq->lock);
2424 preempt_enable_no_resched();
/* Voluntary preemption point: mark runnable and reschedule (the
 * schedule() call itself is elided in this extract). */
2431 void __sched __cond_resched(void)
2433 set_current_state(TASK_RUNNING);
2437 EXPORT_SYMBOL(__cond_resched);
2440 * yield - yield the current processor to other threads.
2442 * this is a shortcut for kernel-space yielding - it marks the
2443 * thread runnable and calls sys_sched_yield().
2445 void __sched yield(void)
2447 set_current_state(TASK_RUNNING);
2451 EXPORT_SYMBOL(yield);
2454 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
2455 * that process accounting knows that this is a task in IO wait state.
2457 * But don't do that if it is a deliberate, throttling IO wait (this task
2458 * has set its backing_dev_info: the queue against which it should throttle)
2460 void __sched io_schedule(void)
2462 struct runqueue *rq = this_rq();
2463 def_delay_var(dstart);
2465 start_delay_set(dstart,PF_IOWAIT);
/* nr_iowait is bumped across the schedule() (call elided here) so the
 * tick accounting attributes the idle time to iowait. */
2466 atomic_inc(&rq->nr_iowait);
2468 atomic_dec(&rq->nr_iowait);
2469 add_io_delay(dstart);
2472 EXPORT_SYMBOL(io_schedule);
/* Timed variant of io_schedule(); returns schedule_timeout()'s result. */
2474 long __sched io_schedule_timeout(long timeout)
2476 struct runqueue *rq = this_rq();
2478 def_delay_var(dstart);
2480 start_delay_set(dstart,PF_IOWAIT);
2481 atomic_inc(&rq->nr_iowait);
2482 ret = schedule_timeout(timeout);
2483 atomic_dec(&rq->nr_iowait);
2484 add_io_delay(dstart);
2489 * sys_sched_get_priority_max - return maximum RT priority.
2490 * @policy: scheduling class.
2492 * this syscall returns the maximum rt_priority that can be used
2493 * by a given scheduling class.
2495 asmlinkage long sys_sched_get_priority_max(int policy)
/* For SCHED_FIFO/SCHED_RR; SCHED_NORMAL's maximum (0) is elided here. */
2502 ret = MAX_USER_RT_PRIO-1;
2512 * sys_sched_get_priority_min - return minimum RT priority.
2513 * @policy: scheduling class.
2515 * this syscall returns the minimum rt_priority that can be used
2516 * by a given scheduling class.
/* NOTE(review): the function body is elided in this extract. */
2518 asmlinkage long sys_sched_get_priority_min(int policy)
2534 * sys_sched_rr_get_interval - return the default timeslice of a process.
2535 * @pid: pid of the process.
2536 * @interval: userspace pointer to the timeslice value.
2538 * this syscall writes the default timeslice value of a given process
2539 * into the user-space timespec buffer. A value of '0' means infinity.
2542 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
2544 int retval = -EINVAL;
2552 read_lock(&tasklist_lock);
2553 p = find_process_by_pid(pid);
2557 retval = security_task_getscheduler(p);
/* SCHED_FIFO has no timeslice, reported as 0 ("infinity"). */
2561 jiffies_to_timespec(p->policy & SCHED_FIFO ?
2562 0 : task_timeslice(p), &t);
/* Drop the lock before copy_to_user, which may sleep. */
2563 read_unlock(&tasklist_lock);
2564 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
2568 read_unlock(&tasklist_lock);
2572 static inline struct task_struct *eldest_child(struct task_struct *p)
2574 if (list_empty(&p->children)) return NULL;
2575 return list_entry(p->children.next,struct task_struct,sibling);
2578 static inline struct task_struct *older_sibling(struct task_struct *p)
2580 if (p->sibling.prev==&p->parent->children) return NULL;
2581 return list_entry(p->sibling.prev,struct task_struct,sibling);
2584 static inline struct task_struct *younger_sibling(struct task_struct *p)
2586 if (p->sibling.next==&p->parent->children) return NULL;
2587 return list_entry(p->sibling.next,struct task_struct,sibling);
/* Print one line of SysRq-T style task state for p. */
2590 static void show_task(task_t * p)
2594 unsigned long free = 0;
2595 static const char *stat_nam[] = { "R", "S", "D", "T", "Z", "W" };
2597 printk("%-13.13s ", p->comm);
/* state is a bitmask; __ffs picks the lowest set bit, 0 == running. */
2598 state = p->state ? __ffs(p->state) + 1 : 0;
2599 if (state < ARRAY_SIZE(stat_nam))
2600 printk(stat_nam[state]);
2603 #if (BITS_PER_LONG == 32)
2604 if (state == TASK_RUNNING)
2605 printk(" running ");
2607 printk(" %08lX ", thread_saved_pc(p));
2609 if (state == TASK_RUNNING)
2610 printk(" running task ");
2612 printk(" %016lx ", thread_saved_pc(p));
2614 #ifdef CONFIG_DEBUG_STACK_USAGE
/* Estimate free stack by scanning up from the bottom of the stack
 * (which sits just above thread_info). */
2616 unsigned long * n = (unsigned long *) (p->thread_info+1);
2619 free = (unsigned long) n - (unsigned long)(p->thread_info+1);
2622 printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
2623 if ((relative = eldest_child(p)))
2624 printk("%5d ", relative->pid);
2627 if ((relative = younger_sibling(p)))
2628 printk("%7d", relative->pid);
2631 if ((relative = older_sibling(p)))
2632 printk(" %5d", relative->pid);
2636 printk(" (L-TLB)\n");
2638 printk(" (NOTLB)\n");
/* Dumping the stack of the running task would be unreliable. */
2640 if (state != TASK_RUNNING)
2641 show_stack(p, NULL);
/* Dump one show_task() line for every thread in the system. */
2644 void show_state(void)
2648 #if (BITS_PER_LONG == 32)
2651 printk(" task PC pid father child younger older\n");
2655 printk(" task PC pid father child younger older\n");
2657 read_lock(&tasklist_lock);
2658 do_each_thread(g, p) {
2660 * reset the NMI-timeout, listing all files on a slow
2661 * console might take alot of time:
2663 touch_nmi_watchdog();
2665 } while_each_thread(g, p);
2667 read_unlock(&tasklist_lock);
/* Turn the boot-time task "idle" into cpu's idle thread. The task may
 * currently live on another CPU's runqueue, hence the double lock. */
2670 void __init init_idle(task_t *idle, int cpu)
2672 runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle));
2673 unsigned long flags;
2675 local_irq_save(flags);
2676 double_rq_lock(idle_rq, rq);
2678 idle_rq->curr = idle_rq->idle = idle;
2679 deactivate_task(idle, rq);
/* MAX_PRIO: the idle thread must always lose to any runnable task. */
2681 idle->prio = MAX_PRIO;
2682 idle->state = TASK_RUNNING;
2683 set_task_cpu(idle, cpu);
2684 double_rq_unlock(idle_rq, rq);
2685 set_tsk_need_resched(idle);
2686 local_irq_restore(flags);
2688 /* Set the preempt count _outside_ the spinlocks! */
2689 #ifdef CONFIG_PREEMPT
2690 idle->thread_info->preempt_count = (idle->lock_depth >= 0);
2692 idle->thread_info->preempt_count = 0;
2697 * In a system that switches off the HZ timer idle_cpu_mask
2698 * indicates which cpus entered this state. This is used
2699 * in the rcu update to wait only for active cpus. For systems
2700 * which do not switch off the HZ timer idle_cpu_mask should
2701 * always be CPU_MASK_NONE.
2703 cpumask_t idle_cpu_mask = CPU_MASK_NONE;
2707 * This is how migration works:
2709 * 1) we queue a migration_req_t structure in the source CPU's
2710 * runqueue and wake up that CPU's migration thread.
2711 * 2) we down() the locked semaphore => thread blocks.
2712 * 3) migration thread wakes up (implicitly it forces the migrated
2713 * thread off the CPU)
2714 * 4) it gets the migration request and checks whether the migrated
2715 * task is still in the wrong runqueue.
2716 * 5) if it's in the wrong runqueue then the migration thread removes
2717 * it and puts it into the right queue.
2718 * 6) migration thread up()s the semaphore.
2719 * 7) we wake up and the migration is done.
2723 * Change a given task's CPU affinity. Migrate the thread to a
2724 * proper CPU and schedule it away if the CPU it's executing on
2725 * is removed from the allowed bitmask.
2727 * NOTE: the caller must have a valid reference to the task, the
2728 * task must not exit() & deallocate itself prematurely. The
2729 * call is not atomic; no spinlocks may be held.
2731 int set_cpus_allowed(task_t *p, cpumask_t new_mask)
2733 unsigned long flags;
2735 migration_req_t req;
2738 rq = task_rq_lock(p, &flags);
/* Reject a mask containing no online CPUs. */
2739 if (any_online_cpu(new_mask) == NR_CPUS) {
/* Nonzero return means the task is running on a now-disallowed CPU
 * and must be pushed off by the migration thread. */
2744 if (__set_cpus_allowed(p, new_mask, &req)) {
2745 /* Need help from migration thread: drop lock and wait. */
2746 task_rq_unlock(rq, &flags);
2747 wake_up_process(rq->migration_thread);
2748 wait_for_completion(&req.done);
2752 task_rq_unlock(rq, &flags);
2756 EXPORT_SYMBOL_GPL(set_cpus_allowed);
2758 /* Move (not current) task off this cpu, onto dest cpu. */
2759 static void move_task_away(struct task_struct *p, int dest_cpu)
2761 runqueue_t *rq_dest;
2763 rq_dest = cpu_rq(dest_cpu);
/* double_rq_lock() takes both runqueue locks in a fixed order, avoiding
 * an ABBA deadlock between the source and destination runqueues. */
2765 double_rq_lock(this_rq(), rq_dest);
/* Someone (or the task itself) already moved it off this CPU. */
2766 if (task_cpu(p) != smp_processor_id())
2767 goto out; /* Already moved */
2769 set_task_cpu(p, dest_cpu);
/* Requeue: off the local runqueue, onto the destination's. */
2771 deactivate_task(p, this_rq());
2772 activate_task(p, rq_dest);
/* Preempt the destination CPU's current task if p has better priority
 * (lower prio value = higher priority). */
2773 if (p->prio < rq_dest->curr->prio)
2774 resched_task(rq_dest->curr);
/* Rebase p's timestamp onto the destination CPU's tick clock -
 * presumably because per-CPU timestamps are not comparable; confirm. */
2776 p->timestamp = rq_dest->timestamp_last_tick;
2779 double_rq_unlock(this_rq(), rq_dest);
2783 * migration_thread - this is a highprio system thread that performs
2784 * thread migration by bumping thread off CPU then 'pushing' onto
2787 static int migration_thread(void * data)
2790 int cpu = (long)data;
/* Sanity: we must be the thread registered for this CPU's runqueue. */
2793 BUG_ON(rq->migration_thread != current);
2795 while (!kthread_should_stop()) {
2796 struct list_head *head;
2797 migration_req_t *req;
/* Co-operate with the software-suspend freezer. */
2799 if (current->flags & PF_FREEZE)
2800 refrigerator(PF_FREEZE);
2802 spin_lock_irq(&rq->lock);
2803 head = &rq->migration_queue;
/* Mark ourselves sleeping *before* checking the queue, so a wakeup
 * arriving between the emptiness check and the schedule is not lost. */
2804 current->state = TASK_INTERRUPTIBLE;
2805 if (list_empty(head)) {
2806 spin_unlock_irq(&rq->lock);
2810 req = list_entry(head->next, migration_req_t, list);
2811 list_del_init(head->next);
/* NOTE(review): released with spin_unlock(), not spin_unlock_irq() -
 * interrupts seemingly stay disabled across move_task_away(); the
 * re-enable point is not visible in this view - confirm. */
2812 spin_unlock(&rq->lock);
2814 move_task_away(req->task,
2815 any_online_cpu(req->task->cpus_allowed));
/* Release the requester sleeping in set_cpus_allowed(). */
2817 complete(&req->done);
2822 #ifdef CONFIG_HOTPLUG_CPU
2823 /* migrate_all_tasks - function to migrate all the tasks from the
2824 * current cpu caller must have already scheduled this to the target
2825 * cpu via set_cpus_allowed. Machine is stopped. */
2826 void migrate_all_tasks(void)
2828 struct task_struct *tsk, *t;
2829 int dest_cpu, src_cpu;
2832 /* We're nailed to this CPU. */
2833 src_cpu = smp_processor_id();
2835 /* Not required, but here for neatness. */
2836 write_lock(&tasklist_lock);
2838 /* watch out for per node tasks, let's stay on this node */
2839 node = cpu_to_node(src_cpu);
2841 do_each_thread(t, tsk) {
/* Only tasks still assigned to the dying CPU need to move. */
2846 if (task_cpu(tsk) != src_cpu)
2849 /* Figure out where this task should go (attempting to
2850 * keep it on-node), and check if it can be migrated
2851 * as-is. NOTE that kernel threads bound to more than
2852 * one online cpu will be migrated. */
2853 mask = node_to_cpumask(node);
2854 cpus_and(mask, mask, tsk->cpus_allowed);
2855 dest_cpu = any_online_cpu(mask);
/* No allowed CPU on this node: fall back to any allowed online CPU. */
2856 if (dest_cpu == NR_CPUS)
2857 dest_cpu = any_online_cpu(tsk->cpus_allowed);
/* Still nowhere to go: forcibly widen the affinity to all CPUs
 * (clear followed by complement == every bit set) and retry. */
2858 if (dest_cpu == NR_CPUS) {
2859 cpus_clear(tsk->cpus_allowed);
2860 cpus_complement(tsk->cpus_allowed);
2861 dest_cpu = any_online_cpu(tsk->cpus_allowed);
2863 /* Don't tell them about moving exiting tasks
2864 or kernel threads (both mm NULL), since
2865 they never leave kernel. */
2866 if (tsk->mm && printk_ratelimit())
2867 printk(KERN_INFO "process %d (%s) no "
2868 "longer affine to cpu%d\n",
2869 tsk->pid, tsk->comm, src_cpu);
2872 move_task_away(tsk, dest_cpu);
2873 } while_each_thread(t, tsk);
2875 write_unlock(&tasklist_lock);
2877 #endif /* CONFIG_HOTPLUG_CPU */
2880 * migration_call - callback that gets triggered when a CPU is added.
2881 * Here we can start up the necessary migration thread for the new CPU.
2883 static int migration_call(struct notifier_block *nfb, unsigned long action,
2886 int cpu = (long)hcpu;
2887 struct task_struct *p;
2888 struct runqueue *rq;
2889 unsigned long flags;
2892 case CPU_UP_PREPARE:
/* Create (but don't start) the per-CPU migration thread, and pin it
 * to the incoming CPU before it ever runs. */
2893 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
2896 kthread_bind(p, cpu);
2897 /* Must be high prio: stop_machine expects to yield to it. */
2898 rq = task_rq_lock(p, &flags);
2899 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
2900 task_rq_unlock(rq, &flags);
/* Publish the thread so wakeups/requests can reach it. */
2901 cpu_rq(cpu)->migration_thread = p;
2904 /* Strictly unnecessary, as first user will wake it. */
2905 wake_up_process(cpu_rq(cpu)->migration_thread);
2907 #ifdef CONFIG_HOTPLUG_CPU
2908 case CPU_UP_CANCELED:
2909 /* Unbind it from offline cpu so it can run. Fall thru. */
2910 kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
/* NOTE(review): the teardown below appears shared with a CPU_DEAD case;
 * the switch statement, other case labels and breaks are not visible in
 * this view - confirm against the full source. */
2912 kthread_stop(cpu_rq(cpu)->migration_thread);
2913 cpu_rq(cpu)->migration_thread = NULL;
/* The dead CPU must have been drained by migrate_all_tasks() already. */
2914 BUG_ON(cpu_rq(cpu)->nr_running != 0);
/* Hook migration_call() into the CPU hotplug notifier chain. */
2921 static struct notifier_block __devinitdata migration_notifier = {
2922 .notifier_call = migration_call,
2925 int __init migration_init(void)
2927 void *cpu = (void *)(long)smp_processor_id();
2928 /* Start one for boot CPU. */
/* The boot CPU came up before the notifier existed, so replay the
 * UP_PREPARE and ONLINE events for it by hand. */
2929 migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
2930 migration_call(&migration_notifier, CPU_ONLINE, cpu);
/* All later CPUs get their migration threads via the notifier chain. */
2931 register_cpu_notifier(&migration_notifier);
2937 * The 'big kernel lock'
2939 * This spinlock is taken and released recursively by lock_kernel()
2940 * and unlock_kernel(). It is transparently dropped and reacquired
2941 * over schedule(). It is used to protect legacy code that hasn't
2942 * been migrated to a proper locking design yet.
2944 * Don't use in new code.
2946 * Note: spinlock debugging needs this even on !CONFIG_SMP.
2948 spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
2949 EXPORT_SYMBOL(kernel_flag);
/*
 * sched_init - boot-time initialization: set up every CPU's runqueue and
 * turn the boot thread into a proper scheduler citizen. Runs once, early.
 */
2951 void __init sched_init(void)
2956 for (i = 0; i < NR_CPUS; i++) {
2957 prio_array_t *array;
/* Each runqueue owns two priority arrays: tasks run from 'active'
 * and fall into 'expired' when their timeslice is used up. */
2960 rq->active = rq->arrays;
2961 rq->expired = rq->arrays + 1;
2962 rq->best_expired_prio = MAX_PRIO;
2964 spin_lock_init(&rq->lock);
2965 INIT_LIST_HEAD(&rq->migration_queue);
2966 atomic_set(&rq->nr_iowait, 0);
2967 nr_running_init(rq);
/* Empty all MAX_PRIO queues and clear the priority bitmaps of both
 * arrays. */
2969 for (j = 0; j < 2; j++) {
2970 array = rq->arrays + j;
2971 for (k = 0; k < MAX_PRIO; k++) {
2972 INIT_LIST_HEAD(array->queue + k);
2973 __clear_bit(k, array->bitmap);
2975 // delimiter for bitsearch
2976 __set_bit(MAX_PRIO, array->bitmap);
2980 * We have to do a little magic to get the first
2981 * thread right in SMP mode.
2986 set_task_cpu(current, smp_processor_id());
2987 wake_up_forked_process(current);
2992 * The boot idle thread does lazy MMU switching as well:
/* Hold a reference on init_mm so lazy TLB switching can borrow it. */
2994 atomic_inc(&init_mm.mm_count);
2995 enter_lazy_tlb(&init_mm, current);
2998 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
/*
 * __might_sleep - print a rate-limited (once per HZ) warning when a
 * function that may sleep is entered in atomic context or with irqs off.
 * @file, @line: the call site, for the diagnostic message.
 */
2999 void __might_sleep(char *file, int line)
3001 #if defined(in_atomic)
3002 static unsigned long prev_jiffy; /* ratelimiting */
/* Only complain once the system is fully booted; early boot legitimately
 * runs with interrupts disabled. */
3004 if ((in_atomic() || irqs_disabled()) &&
3005 system_state == SYSTEM_RUNNING) {
/* Rate limit: stay quiet if we already warned within the last HZ. */
3006 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
3008 prev_jiffy = jiffies;
3009 printk(KERN_ERR "Debug: sleeping function called from invalid"
3010 " context at %s:%d\n", file, line);
3011 printk("in_atomic():%d, irqs_disabled():%d\n",
3012 in_atomic(), irqs_disabled());
3017 EXPORT_SYMBOL(__might_sleep);
3021 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
3023 * This could be a long-held lock. If another CPU holds it for a long time,
3024 * and that CPU is not asked to reschedule then *this* CPU will spin on the
3025 * lock for a long time, even if *this* CPU is asked to reschedule.
3027 * So what we do here, in the slow (contended) path is to spin on the lock by
3028 * hand while permitting preemption.
3030 * Called inside preempt_disable().
3032 void __sched __preempt_spin_lock(spinlock_t *lock)
/* preempt_count() > 1 means preemption was already disabled before the
 * caller's own preempt_disable(): we cannot be preempted anyway, so a
 * plain raw spin is the only option. */
3034 if (preempt_count() > 1) {
3035 _raw_spin_lock(lock);
/* Busy-wait while the lock is held, then race for it with trylock.
 * NOTE(review): the preemption-enable/disable bracketing of this loop is
 * not visible in this view - confirm against the full source. */
3040 while (spin_is_locked(lock))
3043 } while (!_raw_spin_trylock(lock));
3046 EXPORT_SYMBOL(__preempt_spin_lock);
/*
 * rwlock analogue of __preempt_spin_lock(): spin for a contended write
 * lock by hand so the CPU is not pinned with preemption disabled.
 */
3048 void __sched __preempt_write_lock(rwlock_t *lock)
/* Nested preempt-disable: preemption impossible, take the raw lock. */
3050 if (preempt_count() > 1) {
3051 _raw_write_lock(lock);
/* Busy-wait while held, then race for the lock with trylock. */
3057 while (rwlock_is_locked(lock))
3060 } while (!_raw_write_trylock(lock));
3063 EXPORT_SYMBOL(__preempt_write_lock);
3064 #endif /* defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) */
3066 #ifdef CONFIG_DELAY_ACCT
3067 int task_running_sys(struct task_struct *p)
3069 return task_running(task_rq(p),p);
3071 EXPORT_SYMBOL(task_running_sys);