1 /*
2  *  kernel/sched.c
3  *
4  *  Kernel scheduler and related syscalls
5  *
6  *  Copyright (C) 1991-2002  Linus Torvalds
7  *
8  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
9  *              make semaphores SMP safe
10  *  1998-11-19  Implemented schedule_timeout() and related stuff
11  *              by Andrea Arcangeli
12  *  2002-01-04  New ultra-scalable O(1) scheduler by Ingo Molnar:
13  *              hybrid priority-list and round-robin design with
14  *              an array-switch method of distributing timeslices
15  *              and per-CPU runqueues.  Cleanups and useful suggestions
16  *              by Davide Libenzi, preemptible kernel bits by Robert Love.
17  *  2003-09-03  Interactivity tuning by Con Kolivas.
18  */
19
20 #include <linux/mm.h>
21 #include <linux/module.h>
22 #include <linux/nmi.h>
23 #include <linux/init.h>
24 #include <asm/uaccess.h>
25 #include <linux/highmem.h>
26 #include <linux/smp_lock.h>
27 #include <asm/mmu_context.h>
28 #include <linux/interrupt.h>
29 #include <linux/completion.h>
30 #include <linux/kernel_stat.h>
31 #include <linux/security.h>
32 #include <linux/notifier.h>
33 #include <linux/suspend.h>
34 #include <linux/blkdev.h>
35 #include <linux/delay.h>
36 #include <linux/smp.h>
37 #include <linux/timer.h>
38 #include <linux/rcupdate.h>
39 #include <linux/cpu.h>
40 #include <linux/percpu.h>
41 #include <linux/kthread.h>
42
43 #ifdef CONFIG_NUMA
44 #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
45 #else
46 #define cpu_to_node_mask(cpu) (cpu_online_map)
47 #endif
48
49 /*
50  * Convert user-nice values [ -20 ... 0 ... 19 ]
51  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
52  * and back.
53  */
54 #define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
55 #define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)
56 #define TASK_NICE(p)            PRIO_TO_NICE((p)->static_prio)
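/*
 * Worked example (editorial illustration, not part of the original file;
 * it assumes the usual MAX_RT_PRIO of 100 and MAX_PRIO of 140 from this
 * era's <linux/sched.h>):
 *
 *   NICE_TO_PRIO(-20) == 100 + (-20) + 20 == 100   (highest non-RT prio)
 *   NICE_TO_PRIO(  0) == 100 +    0  + 20 == 120   (default)
 *   NICE_TO_PRIO(+19) == 100 +   19  + 20 == 139   == MAX_PRIO - 1
 *
 * PRIO_TO_NICE() simply inverts the mapping, e.g. PRIO_TO_NICE(120) == 0.
 */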
57
58 /*
59  * 'User priority' is the nice value converted to something we
60  * can work with better when scaling various scheduler parameters;
61  * it's a [ 0 ... 39 ] range.
62  */
63 #define USER_PRIO(p)            ((p)-MAX_RT_PRIO)
64 #define TASK_USER_PRIO(p)       USER_PRIO((p)->static_prio)
65 #define MAX_USER_PRIO           (USER_PRIO(MAX_PRIO))
66 #define AVG_TIMESLICE   (MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) *\
67                         (MAX_PRIO-1-NICE_TO_PRIO(0))/(MAX_USER_PRIO - 1)))
68
69 /*
70  * Some helpers for converting nanosecond timing to jiffy resolution
71  */
72 #define NS_TO_JIFFIES(TIME)     ((TIME) / (1000000000 / HZ))
73 #define JIFFIES_TO_NS(TIME)     ((TIME) * (1000000000 / HZ))
74
75 /*
76  * These are the 'tuning knobs' of the scheduler:
77  *
78  * Minimum timeslice is 10 msecs, default timeslice is 100 msecs,
79  * maximum timeslice is 200 msecs. Timeslices get refilled after
80  * they expire.
81  */
82 #define MIN_TIMESLICE           ( 10 * HZ / 1000)
83 #define MAX_TIMESLICE           (200 * HZ / 1000)
84 #define ON_RUNQUEUE_WEIGHT       30
85 #define CHILD_PENALTY            95
86 #define PARENT_PENALTY          100
87 #define EXIT_WEIGHT               3
88 #define PRIO_BONUS_RATIO         25
89 #define MAX_BONUS               (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
90 #define INTERACTIVE_DELTA         2
91 #define MAX_SLEEP_AVG           (AVG_TIMESLICE * MAX_BONUS)
92 #define STARVATION_LIMIT        (MAX_SLEEP_AVG)
93 #define NS_MAX_SLEEP_AVG        (JIFFIES_TO_NS(MAX_SLEEP_AVG))
94 #define NODE_THRESHOLD          125
95 #define CREDIT_LIMIT            100
96
97 /*
98  * If a task is 'interactive' then we reinsert it in the active
99  * array after it has expired its current timeslice. (it will not
100  * continue to run immediately, it will still round-robin with
101  * other interactive tasks.)
102  *
103  * This part scales the interactivity limit depending on niceness.
104  *
105  * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
106  * Here are a few examples of different nice levels:
107  *
108  *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
109  *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
110  *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
111  *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
112  *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
113  *
114  * (the X axis represents the possible -5 ... 0 ... +5 dynamic
115  *  priority range a task can explore, a value of '1' means the
116  *  task is rated interactive.)
117  *
118  * I.e. nice +19 tasks can never get 'interactive' enough to be
119  * reinserted into the active array, and only nice -20 tasks that are
120  * heavy CPU hogs will ever be expired. Default nice 0 tasks are somewhere
121  * in between: it takes some effort for them to get interactive, but it's not
122  * too hard.
123  */
124
125 #define CURRENT_BONUS(p) \
126         (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
127                 MAX_SLEEP_AVG)
128
129 #ifdef CONFIG_SMP
130 #define TIMESLICE_GRANULARITY(p)        (MIN_TIMESLICE * \
131                 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
132                         num_online_cpus())
133 #else
134 #define TIMESLICE_GRANULARITY(p)        (MIN_TIMESLICE * \
135                 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
136 #endif
137
138 #define SCALE(v1,v1_max,v2_max) \
139         (v1) * (v2_max) / (v1_max)
140
141 #define DELTA(p) \
142         (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \
143                 INTERACTIVE_DELTA)
144
145 #define TASK_INTERACTIVE(p) \
146         ((p)->prio <= (p)->static_prio - DELTA(p))
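/*
 * Worked example (editorial illustration, not in the original source;
 * with the usual MAX_PRIO/MAX_RT_PRIO values, MAX_USER_PRIO is 40 and
 * MAX_USER_PRIO*PRIO_BONUS_RATIO/100 is 10):
 *
 * For a nice 0 task, TASK_NICE(p) == 0, so
 *   DELTA(p) == SCALE(0, 40, 10) + INTERACTIVE_DELTA == 0 + 2 == 2,
 * and TASK_INTERACTIVE(p) holds only while p->prio <= p->static_prio - 2,
 * i.e. while the task carries a dynamic priority bonus of at least 2 -
 * which via CURRENT_BONUS() means a sleep_avg of at least 7/10 of
 * MAX_SLEEP_AVG.  This is the [1,1,1,1,0,0,0,0,0,0,0] row in the table
 * above.
 */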
147
148 #define INTERACTIVE_SLEEP(p) \
149         (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
150                 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
151
152 #define HIGH_CREDIT(p) \
153         ((p)->interactive_credit > CREDIT_LIMIT)
154
155 #define LOW_CREDIT(p) \
156         ((p)->interactive_credit < -CREDIT_LIMIT)
157
158 #define TASK_PREEMPTS_CURR(p, rq) \
159         ((p)->prio < (rq)->curr->prio)
160
161 /*
162  * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
163  * to time slice values.
164  *
165  * The higher a thread's priority, the bigger timeslices
166  * it gets during one round of execution. But even the lowest
167  * priority thread gets MIN_TIMESLICE worth of execution time.
168  *
169  * task_timeslice() is the interface that is used by the scheduler.
170  */
171
172 #define BASE_TIMESLICE(p) (MIN_TIMESLICE + \
173                 ((MAX_TIMESLICE - MIN_TIMESLICE) * \
174                         (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1)))
175
176 static inline unsigned int task_timeslice(task_t *p)
177 {
178         return BASE_TIMESLICE(p);
179 }
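/*
 * Worked example (editorial illustration, not in the original source;
 * assuming HZ=1000 and the usual static priority range 100..139):
 *
 *   nice -20 (static_prio 100): 10 + 190 * 39/39 == 200 jiffies ~ 200 msecs
 *   nice   0 (static_prio 120): 10 + 190 * 19/39 == 102 jiffies ~ 100 msecs
 *   nice +19 (static_prio 139): 10 + 190 *  0/39 ==  10 jiffies ~  10 msecs
 *
 * which matches the MIN/MAX/default timeslice values quoted in the
 * 'tuning knobs' comment above.
 */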
180
181 /*
182  * These are the runqueue data structures:
183  */
184
185 #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
186
187 typedef struct runqueue runqueue_t;
188
189 struct prio_array {
190         int nr_active;
191         unsigned long bitmap[BITMAP_SIZE];
192         struct list_head queue[MAX_PRIO];
193 };
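/*
 * Editorial sketch (not part of the original file): how the bitmap plus
 * the per-priority lists give an O(1) "pick the highest-priority runnable
 * task" operation.  The helper name is made up for illustration and it
 * assumes the array is non-empty (nr_active > 0), just as schedule()
 * does after checking rq->nr_running.
 */
#if 0
static task_t *prio_array_first_task(prio_array_t *array)
{
	/* lowest set bit == numerically smallest prio == highest priority */
	int idx = sched_find_first_bit(array->bitmap);
	struct list_head *queue = array->queue + idx;

	return list_entry(queue->next, task_t, run_list);
}
#endif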
194
195 /*
196  * This is the main, per-CPU runqueue data structure.
197  *
198  * Locking rule: in places that lock multiple runqueues (such as the
199  * load balancing or the thread migration code), the lock-acquire
200  * operations must be ordered by ascending runqueue address.
201  */
202 struct runqueue {
203         spinlock_t lock;
204         unsigned long long nr_switches;
205         unsigned long nr_running, expired_timestamp, nr_uninterruptible,
206                 timestamp_last_tick;
207         task_t *curr, *idle;
208         struct mm_struct *prev_mm;
209         prio_array_t *active, *expired, arrays[2];
210         int best_expired_prio, prev_cpu_load[NR_CPUS];
211 #ifdef CONFIG_NUMA
212         atomic_t *node_nr_running;
213         int prev_node_load[MAX_NUMNODES];
214 #endif
215         task_t *migration_thread;
216         struct list_head migration_queue;
217
218         atomic_t nr_iowait;
219 };
220
221 static DEFINE_PER_CPU(struct runqueue, runqueues);
222
223 #define cpu_rq(cpu)             (&per_cpu(runqueues, (cpu)))
224 #define this_rq()               (&__get_cpu_var(runqueues))
225 #define task_rq(p)              cpu_rq(task_cpu(p))
226 #define cpu_curr(cpu)           (cpu_rq(cpu)->curr)
227
228 extern unsigned long __scheduling_functions_start_here;
229 extern unsigned long __scheduling_functions_end_here;
230 const unsigned long scheduling_functions_start_here =
231                         (unsigned long)&__scheduling_functions_start_here;
232 const unsigned long scheduling_functions_end_here =
233                         (unsigned long)&__scheduling_functions_end_here;
234
235 /*
236  * Default context-switch locking:
237  */
238 #ifndef prepare_arch_switch
239 # define prepare_arch_switch(rq, next)  do { } while (0)
240 # define finish_arch_switch(rq, next)   spin_unlock_irq(&(rq)->lock)
241 # define task_running(rq, p)            ((rq)->curr == (p))
242 #endif
243
244 #ifdef CONFIG_NUMA
245
246 /*
247  * Keep track of running tasks.
248  */
249
250 static atomic_t node_nr_running[MAX_NUMNODES] ____cacheline_maxaligned_in_smp =
251         {[0 ...MAX_NUMNODES-1] = ATOMIC_INIT(0)};
252
253 static inline void nr_running_init(struct runqueue *rq)
254 {
255         rq->node_nr_running = &node_nr_running[0];
256 }
257
258 static inline void nr_running_inc(runqueue_t *rq)
259 {
260         atomic_inc(rq->node_nr_running);
261         rq->nr_running++;
262 }
263
264 static inline void nr_running_dec(runqueue_t *rq)
265 {
266         atomic_dec(rq->node_nr_running);
267         rq->nr_running--;
268 }
269
270 __init void node_nr_running_init(void)
271 {
272         int i;
273
274         for (i = 0; i < NR_CPUS; i++) {
275                 if (cpu_possible(i))
276                         cpu_rq(i)->node_nr_running =
277                                 &node_nr_running[cpu_to_node(i)];
278         }
279 }
280
281 #else /* !CONFIG_NUMA */
282
283 # define nr_running_init(rq)    do { } while (0)
284 # define nr_running_inc(rq)     do { (rq)->nr_running++; } while (0)
285 # define nr_running_dec(rq)     do { (rq)->nr_running--; } while (0)
286
287 #endif /* CONFIG_NUMA */
288
289 /*
290  * task_rq_lock - lock the runqueue a given task resides on and disable
291  * interrupts.  Note the ordering: we can safely lookup the task_rq without
292  * explicitly disabling preemption.
293  */
294 static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
295 {
296         struct runqueue *rq;
297
298 repeat_lock_task:
299         local_irq_save(*flags);
300         rq = task_rq(p);
301         spin_lock(&rq->lock);
302         if (unlikely(rq != task_rq(p))) {
303                 spin_unlock_irqrestore(&rq->lock, *flags);
304                 goto repeat_lock_task;
305         }
306         return rq;
307 }
308
309 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
310 {
311         spin_unlock_irqrestore(&rq->lock, *flags);
312 }
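/*
 * Editorial usage sketch (not part of the original file): the typical
 * lock/unlock pairing around a task's runqueue, as used by e.g.
 * wait_task_inactive() and sched_exit() further down.  The function
 * name is made up for illustration.
 */
#if 0
static void example_inspect_task(task_t *p)
{
	unsigned long flags;
	runqueue_t *rq;

	rq = task_rq_lock(p, &flags);
	/* while rq->lock is held, p cannot migrate to another runqueue */
	task_rq_unlock(rq, &flags);
}
#endif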
313
314 /*
315  * this_rq_lock - lock this CPU's runqueue and disable interrupts.
316  */
317 static inline runqueue_t *this_rq_lock(void)
318 {
319         runqueue_t *rq;
320
321         local_irq_disable();
322         rq = this_rq();
323         spin_lock(&rq->lock);
324
325         return rq;
326 }
327
328 static inline void rq_unlock(runqueue_t *rq)
329 {
330         spin_unlock_irq(&rq->lock);
331 }
332
333 /*
334  * Adding/removing a task to/from a priority array:
335  */
336 static inline void dequeue_task(struct task_struct *p, prio_array_t *array)
337 {
338         array->nr_active--;
339         list_del(&p->run_list);
340         if (list_empty(array->queue + p->prio))
341                 __clear_bit(p->prio, array->bitmap);
342 }
343
344 static inline void enqueue_task(struct task_struct *p, prio_array_t *array)
345 {
346         list_add_tail(&p->run_list, array->queue + p->prio);
347         __set_bit(p->prio, array->bitmap);
348         array->nr_active++;
349         p->array = array;
350 }
351
352 /*
353  * effective_prio - return the priority that is based on the static
354  * priority but is modified by bonuses/penalties.
355  *
356  * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
357  * into the -5 ... 0 ... +5 bonus/penalty range.
358  *
359  * We use 25% of the full 0...39 priority range so that:
360  *
361  * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
362  * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
363  *
364  * Both properties are important to certain workloads.
365  */
366 static int effective_prio(task_t *p)
367 {
368         int bonus, prio;
369
370         if (rt_task(p))
371                 return p->prio;
372
373         bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
374
375         prio = p->static_prio - bonus;
376         if (prio < MAX_RT_PRIO)
377                 prio = MAX_RT_PRIO;
378         if (prio > MAX_PRIO-1)
379                 prio = MAX_PRIO-1;
380         return prio;
381 }
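/*
 * Worked example (editorial illustration, not in the original source;
 * assuming the usual MAX_RT_PRIO of 100, MAX_BONUS is 10, so the bonus
 * term spans -5 .. +5):
 *
 * A nice 0 task has static_prio 120.  With sleep_avg at NS_MAX_SLEEP_AVG
 * it gets CURRENT_BONUS == 10, i.e. bonus == +5 and prio == 115; with
 * sleep_avg == 0 it gets bonus == -5 and prio == 125.  The clamping to
 * MAX_RT_PRIO only matters for fully boosted nice -20 tasks, and the
 * clamping to MAX_PRIO-1 for fully penalized nice +19 tasks.
 */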
382
383 /*
384  * __activate_task - move a task to the runqueue.
385  */
386 static inline void __activate_task(task_t *p, runqueue_t *rq)
387 {
388         enqueue_task(p, rq->active);
389         nr_running_inc(rq);
390 }
391
392 static void recalc_task_prio(task_t *p, unsigned long long now)
393 {
394         unsigned long long __sleep_time = now - p->timestamp;
395         unsigned long sleep_time;
396
397         if (__sleep_time > NS_MAX_SLEEP_AVG)
398                 sleep_time = NS_MAX_SLEEP_AVG;
399         else
400                 sleep_time = (unsigned long)__sleep_time;
401
402         if (likely(sleep_time > 0)) {
403                 /*
404                  * User tasks that sleep a long time are categorised as
405                  * idle: they are given just enough sleep_avg to keep their
406                  * interactive status, which prevents them from suddenly
407                  * becoming CPU hogs and starving other processes.
408                  */
409                 if (p->mm && p->activated != -1 &&
410                         sleep_time > INTERACTIVE_SLEEP(p)) {
411                                 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
412                                                 AVG_TIMESLICE);
413                                 if (!HIGH_CREDIT(p))
414                                         p->interactive_credit++;
415                 } else {
416                         /*
417                          * The lower a task's sleep avg is, the more
418                          * rapidly it will rise with sleep time.
419                          */
420                         sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
421
422                         /*
423                          * Tasks with low interactive_credit are limited to
424                          * one timeslice worth of sleep avg bonus.
425                          */
426                         if (LOW_CREDIT(p) &&
427                             sleep_time > JIFFIES_TO_NS(task_timeslice(p)))
428                                 sleep_time = JIFFIES_TO_NS(task_timeslice(p));
429
430                         /*
431                          * Non-high-credit tasks waking from uninterruptible
432                          * sleep are limited in their sleep_avg rise, as they
433                          * are likely to be CPU hogs waiting on I/O.
434                          */
435                         if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm) {
436                                 if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
437                                         sleep_time = 0;
438                                 else if (p->sleep_avg + sleep_time >=
439                                                 INTERACTIVE_SLEEP(p)) {
440                                         p->sleep_avg = INTERACTIVE_SLEEP(p);
441                                         sleep_time = 0;
442                                 }
443                         }
444
445                         /*
446                          * This code gives a bonus to interactive tasks.
447                          *
448                          * The boost works by updating the 'average sleep time'
449                          * value here, based on ->timestamp. The more time a
450                          * task spends sleeping, the higher the average gets -
451                          * and the higher the priority boost gets as well.
452                          */
453                         p->sleep_avg += sleep_time;
454
455                         if (p->sleep_avg > NS_MAX_SLEEP_AVG) {
456                                 p->sleep_avg = NS_MAX_SLEEP_AVG;
457                                 if (!HIGH_CREDIT(p))
458                                         p->interactive_credit++;
459                         }
460                 }
461         }
462
463         p->prio = effective_prio(p);
464 }
465
466 /*
467  * activate_task - move a task to the runqueue and do priority recalculation
468  *
469  * Update all the scheduling statistics stuff. (sleep average
470  * calculation, priority modifiers, etc.)
471  */
472 static inline void activate_task(task_t *p, runqueue_t *rq)
473 {
474         unsigned long long now = sched_clock();
475
476         recalc_task_prio(p, now);
477
478         /*
479          * This checks to make sure it's not an uninterruptible task
480          * that is now waking up.
481          */
482         if (!p->activated) {
483                 /*
484                  * Tasks which were woken up by interrupts (ie. hw events)
485                  * are most likely of interactive nature. So we give them
486                  * the credit of extending their sleep time to the period
487                  * of time they spend on the runqueue, waiting for execution
488                  * on a CPU, first time around:
489                  */
490                 if (in_interrupt())
491                         p->activated = 2;
492                 else {
493                         /*
494                          * Normal first-time wakeups get a credit too for
495                          * on-runqueue time, but it will be weighted down:
496                          */
497                         p->activated = 1;
498                 }
499         }
500         p->timestamp = now;
501
502         __activate_task(p, rq);
503 }
504
505 /*
506  * deactivate_task - remove a task from the runqueue.
507  */
508 static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
509 {
510         nr_running_dec(rq);
511         if (p->state == TASK_UNINTERRUPTIBLE)
512                 rq->nr_uninterruptible++;
513         dequeue_task(p, p->array);
514         p->array = NULL;
515 }
516
517 /*
518  * resched_task - mark a task 'to be rescheduled now'.
519  *
520  * On UP this means the setting of the need_resched flag, on SMP it
521  * might also involve a cross-CPU call to trigger the scheduler on
522  * the target CPU.
523  */
524 static inline void resched_task(task_t *p)
525 {
526 #ifdef CONFIG_SMP
527         int need_resched, nrpolling;
528
529         preempt_disable();
530         /* minimise the chance of sending an interrupt to poll_idle() */
531         nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
532         need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
533         nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
534
535         if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
536                 smp_send_reschedule(task_cpu(p));
537         preempt_enable();
538 #else
539         set_tsk_need_resched(p);
540 #endif
541 }
542
543 /**
544  * task_curr - is this task currently executing on a CPU?
545  * @p: the task in question.
546  */
547 inline int task_curr(task_t *p)
548 {
549         return cpu_curr(task_cpu(p)) == p;
550 }
551
552 #ifdef CONFIG_SMP
553 typedef struct {
554         struct list_head list;
555         task_t *task;
556         struct completion done;
557 } migration_req_t;
558
559 /*
560  * The task's runqueue lock must be held, and the new mask must be valid.
561  * Returns true if you have to wait for migration thread.
562  */
563 static int __set_cpus_allowed(task_t *p, cpumask_t new_mask,
564                                 migration_req_t *req)
565 {
566         runqueue_t *rq = task_rq(p);
567
568         p->cpus_allowed = new_mask;
569         /*
570          * Can the task run on the task's current CPU? If not then
571          * migrate the thread off to a proper CPU.
572          */
573         if (cpu_isset(task_cpu(p), new_mask))
574                 return 0;
575
576         /*
577          * If the task is not on a runqueue (and not running), then
578          * it is sufficient to simply update the task's cpu field.
579          */
580         if (!p->array && !task_running(rq, p)) {
581                 set_task_cpu(p, any_online_cpu(p->cpus_allowed));
582                 return 0;
583         }
584
585         init_completion(&req->done);
586         req->task = p;
587         list_add(&req->list, &rq->migration_queue);
588         return 1;
589 }
590
591 /*
592  * wait_task_inactive - wait for a thread to unschedule.
593  *
594  * The caller must ensure that the task *will* unschedule sometime soon,
595  * else this function might spin for a *long* time. This function can't
596  * be called with interrupts off, or it may introduce deadlock with
597  * smp_call_function() if an IPI is sent by the same process we are
598  * waiting to become inactive.
599  */
600 void wait_task_inactive(task_t * p)
601 {
602         unsigned long flags;
603         runqueue_t *rq;
604         int preempted;
605
606 repeat:
607         rq = task_rq_lock(p, &flags);
608         /* Must be off runqueue entirely, not preempted. */
609         if (unlikely(p->array)) {
610                 /* If it's preempted, we yield.  It could be a while. */
611                 preempted = !task_running(rq, p);
612                 task_rq_unlock(rq, &flags);
613                 cpu_relax();
614                 if (preempted)
615                         yield();
616                 goto repeat;
617         }
618         task_rq_unlock(rq, &flags);
619 }
620
621 /***
622  * kick_process - kick a running thread to enter/exit the kernel
623  * @p: the to-be-kicked thread
624  *
625  * Cause a process which is running on another CPU to enter
626  * kernel-mode, without any delay. (to get signals handled.)
627  */
628 void kick_process(task_t *p)
629 {
630         int cpu;
631
632         preempt_disable();
633         cpu = task_cpu(p);
634         if ((cpu != smp_processor_id()) && task_curr(p))
635                 smp_send_reschedule(cpu);
636         preempt_enable();
637 }
638
639 EXPORT_SYMBOL_GPL(kick_process);
640
641 #endif
642
643 /***
644  * try_to_wake_up - wake up a thread
645  * @p: the to-be-woken-up thread
646  * @state: the mask of task states that can be woken
647  * @sync: do a synchronous wakeup?
648  *
649  * Put it on the run-queue if it's not already there. The "current"
650  * thread is always on the run-queue (except when the actual
651  * re-schedule is in progress), and as such you're allowed to do
652  * the simpler "current->state = TASK_RUNNING" to mark yourself
653  * runnable without the overhead of this.
654  *
655  * returns failure only if the task is already active.
656  */
657 static int try_to_wake_up(task_t * p, unsigned int state, int sync)
658 {
659         unsigned long flags;
660         int success = 0;
661         long old_state;
662         runqueue_t *rq;
663
664 repeat_lock_task:
665         rq = task_rq_lock(p, &flags);
666         old_state = p->state;
667         if (old_state & state) {
668                 if (!p->array) {
669                         /*
670                          * Fast-migrate the task if it's not running or runnable
671                          * currently. Do not violate hard affinity.
672                          */
673                         if (unlikely(sync && !task_running(rq, p) &&
674                                 (task_cpu(p) != smp_processor_id()) &&
675                                         cpu_isset(smp_processor_id(),
676                                                         p->cpus_allowed) &&
677                                         !cpu_is_offline(smp_processor_id()))) {
678                                 set_task_cpu(p, smp_processor_id());
679                                 task_rq_unlock(rq, &flags);
680                                 goto repeat_lock_task;
681                         }
682                         if (old_state == TASK_UNINTERRUPTIBLE) {
683                                 rq->nr_uninterruptible--;
684                                 /*
685                                  * Tasks on involuntary sleep don't earn
686                                  * sleep_avg beyond just interactive state.
687                                  */
688                                 p->activated = -1;
689                         }
690                         if (sync && (task_cpu(p) == smp_processor_id()))
691                                 __activate_task(p, rq);
692                         else {
693                                 activate_task(p, rq);
694                                 if (TASK_PREEMPTS_CURR(p, rq))
695                                         resched_task(rq->curr);
696                         }
697                         success = 1;
698                 }
699                 p->state = TASK_RUNNING;
700         }
701         task_rq_unlock(rq, &flags);
702
703         return success;
704 }
705 int fastcall wake_up_process(task_t * p)
706 {
707         return try_to_wake_up(p, TASK_STOPPED |
708                                  TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
709 }
710
711 EXPORT_SYMBOL(wake_up_process);
712
713 int fastcall wake_up_state(task_t *p, unsigned int state)
714 {
715         return try_to_wake_up(p, state, 0);
716 }
717
718 /*
719  * Perform scheduler related setup for a newly forked process p.
720  * p is forked by current.
721  */
722 void fastcall sched_fork(task_t *p)
723 {
724         /*
725          * We mark the process as running here, but have not actually
726          * inserted it onto the runqueue yet. This guarantees that
727          * nobody will actually run it, and a signal or other external
728          * event cannot wake it up and insert it on the runqueue either.
729          */
730         p->state = TASK_RUNNING;
731         INIT_LIST_HEAD(&p->run_list);
732         p->array = NULL;
733         spin_lock_init(&p->switch_lock);
734 #ifdef CONFIG_PREEMPT
735         /*
736          * During context-switch we hold precisely one spinlock, which
737          * schedule_tail drops. (in the common case it's this_rq()->lock,
738          * but it also can be p->switch_lock.) So we compensate with a count
739          * of 1. Also, we want to start with kernel preemption disabled.
740          */
741         p->thread_info->preempt_count = 1;
742 #endif
743         /*
744          * Share the timeslice between parent and child, so that the
745          * total amount of pending timeslices in the system doesn't change,
746          * resulting in more scheduling fairness.
747          */
748         local_irq_disable();
749         p->time_slice = (current->time_slice + 1) >> 1;
750         /*
751          * The remainder of the first timeslice might be recovered by
752          * the parent if the child exits early enough.
753          */
754         p->first_time_slice = 1;
755         current->time_slice >>= 1;
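        /*
         * Editorial note (not in the original source): the split above
         * preserves the parent's remaining timeslice, e.g. a parent with
         * 7 ticks left gives (7 + 1) >> 1 == 4 ticks to the child and
         * keeps 7 >> 1 == 3 for itself.
         */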
756         p->timestamp = sched_clock();
757         if (!current->time_slice) {
758                 /*
759                  * This case is rare, it happens when the parent has only
760                  * a single jiffy left from its timeslice. Taking the
761                  * runqueue lock is not a problem.
762                  */
763                 current->time_slice = 1;
764                 preempt_disable();
765                 scheduler_tick(0, 0);
766                 local_irq_enable();
767                 preempt_enable();
768         } else
769                 local_irq_enable();
770 }
771
772 /*
773  * wake_up_forked_process - wake up a freshly forked process.
774  *
775  * This function will do some initial scheduler statistics housekeeping
776  * that must be done for every newly created process.
777  */
778 void fastcall wake_up_forked_process(task_t * p)
779 {
780         unsigned long flags;
781         runqueue_t *rq = task_rq_lock(current, &flags);
782
783         BUG_ON(p->state != TASK_RUNNING);
784
785         /*
786          * We decrease the sleep average of forking parents
787          * and children as well, to keep max-interactive tasks
788          * from forking tasks that are max-interactive.
789          */
790         current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
791                 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
792
793         p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
794                 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
795
796         p->interactive_credit = 0;
797
798         p->prio = effective_prio(p);
799         set_task_cpu(p, smp_processor_id());
800
801         if (unlikely(!current->array))
802                 __activate_task(p, rq);
803         else {
804                 p->prio = current->prio;
805                 list_add_tail(&p->run_list, &current->run_list);
806                 p->array = current->array;
807                 p->array->nr_active++;
808                 nr_running_inc(rq);
809         }
810         task_rq_unlock(rq, &flags);
811 }
812
813 /*
814  * Potentially available exiting-child timeslices are
815  * retrieved here - this way the parent does not get
816  * penalized for creating too many threads.
817  *
818  * (this cannot be used to 'generate' timeslices
819  * artificially, because any timeslice recovered here
820  * was given away by the parent in the first place.)
821  */
822 void fastcall sched_exit(task_t * p)
823 {
824         unsigned long flags;
825         runqueue_t *rq;
826
827         local_irq_save(flags);
828         if (p->first_time_slice) {
829                 p->parent->time_slice += p->time_slice;
830                 if (unlikely(p->parent->time_slice > MAX_TIMESLICE))
831                         p->parent->time_slice = MAX_TIMESLICE;
832         }
833         local_irq_restore(flags);
834         /*
835          * If the child was a (relative-) CPU hog then decrease
836          * the sleep_avg of the parent as well.
837          */
838         rq = task_rq_lock(p->parent, &flags);
839         if (p->sleep_avg < p->parent->sleep_avg)
840                 p->parent->sleep_avg = p->parent->sleep_avg /
841                 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
842                 (EXIT_WEIGHT + 1);
843         task_rq_unlock(rq, &flags);
844 }
845
846 /**
847  * finish_task_switch - clean up after a task-switch
848  * @prev: the thread we just switched away from.
849  *
850  * We enter this with the runqueue still locked, and finish_arch_switch()
851  * will unlock it along with doing any other architecture-specific cleanup
852  * actions.
853  *
854  * Note that we may have delayed dropping an mm in context_switch(). If
855  * so, we finish that here outside of the runqueue lock.  (Doing it
856  * with the lock held can cause deadlocks; see schedule() for
857  * details.)
858  */
859 static inline void finish_task_switch(task_t *prev)
860 {
861         runqueue_t *rq = this_rq();
862         struct mm_struct *mm = rq->prev_mm;
863         unsigned long prev_task_flags;
864
865         rq->prev_mm = NULL;
866
867         /*
868          * A task struct has one reference for the use as "current".
869          * If a task dies, then it sets TASK_ZOMBIE in tsk->state and calls
870          * schedule one last time. The schedule call will never return,
871          * and the scheduled task must drop that reference.
872          * The test for TASK_ZOMBIE must occur while the runqueue locks are
873          * still held, otherwise prev could be scheduled on another cpu, die
874          * there before we look at prev->state, and then the reference would
875          * be dropped twice.
876          *              Manfred Spraul <manfred@colorfullife.com>
877          */
878         prev_task_flags = prev->flags;
879         finish_arch_switch(rq, prev);
880         if (mm)
881                 mmdrop(mm);
882         if (unlikely(prev_task_flags & PF_DEAD))
883                 put_task_struct(prev);
884 }
885
886 /**
887  * schedule_tail - first thing a freshly forked thread must call.
888  * @prev: the thread we just switched away from.
889  */
890 asmlinkage void schedule_tail(task_t *prev)
891 {
892         finish_task_switch(prev);
893
894         if (current->set_child_tid)
895                 put_user(current->pid, current->set_child_tid);
896 }
897
898 /*
899  * context_switch - switch to the new MM and the new
900  * thread's register state.
901  */
902 static inline
903 task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
904 {
905         struct mm_struct *mm = next->mm;
906         struct mm_struct *oldmm = prev->active_mm;
907
908         if (unlikely(!mm)) {
909                 next->active_mm = oldmm;
910                 atomic_inc(&oldmm->mm_count);
911                 enter_lazy_tlb(oldmm, next);
912         } else
913                 switch_mm(oldmm, mm, next);
914
915         if (unlikely(!prev->mm)) {
916                 prev->active_mm = NULL;
917                 WARN_ON(rq->prev_mm);
918                 rq->prev_mm = oldmm;
919         }
920
921         /* Here we just switch the register state and the stack. */
922         switch_to(prev, next, prev);
923
924         return prev;
925 }
926
927 /*
928  * nr_running, nr_uninterruptible and nr_context_switches:
929  *
930  * externally visible scheduler statistics: current number of runnable
931  * threads, current number of uninterruptible-sleeping threads, total
932  * number of context switches performed since bootup.
933  */
934 unsigned long nr_running(void)
935 {
936         unsigned long i, sum = 0;
937
938         for (i = 0; i < NR_CPUS; i++)
939                 sum += cpu_rq(i)->nr_running;
940
941         return sum;
942 }
943
944 unsigned long nr_uninterruptible(void)
945 {
946         unsigned long i, sum = 0;
947
948         for_each_cpu(i)
949                 sum += cpu_rq(i)->nr_uninterruptible;
950
951         return sum;
952 }
953
954 unsigned long long nr_context_switches(void)
955 {
956         unsigned long long i, sum = 0;
957
958         for_each_cpu(i)
959                 sum += cpu_rq(i)->nr_switches;
960
961         return sum;
962 }
963
964 unsigned long nr_iowait(void)
965 {
966         unsigned long i, sum = 0;
967
968         for_each_cpu(i)
969                 sum += atomic_read(&cpu_rq(i)->nr_iowait);
970
971         return sum;
972 }
973
974 /*
975  * double_rq_lock - safely lock two runqueues
976  *
977  * Note this does not disable interrupts like task_rq_lock,
978  * you need to do so manually before calling.
979  */
980 static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
981 {
982         if (rq1 == rq2)
983                 spin_lock(&rq1->lock);
984         else {
985                 if (rq1 < rq2) {
986                         spin_lock(&rq1->lock);
987                         spin_lock(&rq2->lock);
988                 } else {
989                         spin_lock(&rq2->lock);
990                         spin_lock(&rq1->lock);
991                 }
992         }
993 }
994
995 /*
996  * double_rq_unlock - safely unlock two runqueues
997  *
998  * Note this does not restore interrupts like task_rq_unlock,
999  * you need to do so manually after calling.
1000  */
1001 static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
1002 {
1003         spin_unlock(&rq1->lock);
1004         if (rq1 != rq2)
1005                 spin_unlock(&rq2->lock);
1006 }
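/*
 * Editorial usage sketch (not part of the original file): callers must
 * disable interrupts themselves around the lock/unlock pair, as the
 * comments above point out.  The function name is made up for
 * illustration.
 */
#if 0
static void example_move_between(runqueue_t *rq1, runqueue_t *rq2)
{
	unsigned long flags;

	local_irq_save(flags);
	double_rq_lock(rq1, rq2);
	/* ... move tasks between rq1 and rq2 here ... */
	double_rq_unlock(rq1, rq2);
	local_irq_restore(flags);
}
#endif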
1007
1008 #ifdef CONFIG_NUMA
1009 /*
1010  * If dest_cpu is allowed for this process, migrate the task to it.
1011  * This is accomplished by forcing the cpus_allowed mask to only
1012  * allow dest_cpu, which will force the task onto dest_cpu.  Then
1013  * the cpus_allowed mask is restored.
1014  */
1015 static void sched_migrate_task(task_t *p, int dest_cpu)
1016 {
1017         runqueue_t *rq;
1018         migration_req_t req;
1019         unsigned long flags;
1020         cpumask_t old_mask, new_mask = cpumask_of_cpu(dest_cpu);
1021
1022         lock_cpu_hotplug();
1023         rq = task_rq_lock(p, &flags);
1024         old_mask = p->cpus_allowed;
1025         if (!cpu_isset(dest_cpu, old_mask) || !cpu_online(dest_cpu))
1026                 goto out;
1027
1028         /* force the process onto the specified CPU */
1029         if (__set_cpus_allowed(p, new_mask, &req)) {
1030                 /* Need to wait for migration thread. */
1031                 task_rq_unlock(rq, &flags);
1032                 wake_up_process(rq->migration_thread);
1033                 wait_for_completion(&req.done);
1034
1035                 /* If we raced with sys_sched_setaffinity, don't
1036                  * restore mask. */
1037                 rq = task_rq_lock(p, &flags);
1038                 if (likely(cpus_equal(p->cpus_allowed, new_mask))) {
1039                         /* Restore old mask: won't need migration
1040                          * thread, since current cpu is allowed. */
1041                         BUG_ON(__set_cpus_allowed(p, old_mask, NULL));
1042                 }
1043         }
1044 out:
1045         task_rq_unlock(rq, &flags);
1046         unlock_cpu_hotplug();
1047 }
1048
1049 /*
1050  * Find the least loaded CPU.  Slightly favor the current CPU by
1051  * returning it right away if it is running no more than two tasks.
1052  */
1053 static int sched_best_cpu(struct task_struct *p)
1054 {
1055         int i, minload, load, best_cpu, node = 0;
1056         cpumask_t cpumask;
1057
1058         best_cpu = task_cpu(p);
1059         if (cpu_rq(best_cpu)->nr_running <= 2)
1060                 return best_cpu;
1061
1062         minload = 10000000;
1063         for_each_node_with_cpus(i) {
1064                 /*
1065                  * Node load is always divided by nr_cpus_node to normalise
1066                  * load values in case cpu count differs from node to node.
1067                  * We first multiply node_nr_running by 10 to get a little
1068                  * better resolution.
1069                  */
1070                 load = 10 * atomic_read(&node_nr_running[i]) / nr_cpus_node(i);
1071                 if (load < minload) {
1072                         minload = load;
1073                         node = i;
1074                 }
1075         }
1076
1077         minload = 10000000;
1078         cpumask = node_to_cpumask(node);
1079         for (i = 0; i < NR_CPUS; ++i) {
1080                 if (!cpu_isset(i, cpumask))
1081                         continue;
1082                 if (cpu_rq(i)->nr_running < minload) {
1083                         best_cpu = i;
1084                         minload = cpu_rq(i)->nr_running;
1085                 }
1086         }
1087         return best_cpu;
1088 }
1089
1090 void sched_balance_exec(void)
1091 {
1092         int new_cpu;
1093
1094         if (numnodes > 1) {
1095                 new_cpu = sched_best_cpu(current);
1096                 if (new_cpu != smp_processor_id())
1097                         sched_migrate_task(current, new_cpu);
1098         }
1099 }
1100
1101 /*
1102  * Find the busiest node. All previous node loads contribute with a
1103  * geometrically decaying weight to the load measure:
1104  *      load_{t} = load_{t-1}/2 + nr_node_running_{t}
1105  * This way sudden load peaks are flattened out a bit.
1106  * Node load is divided by nr_cpus_node() in order to compare nodes
1107  * of different cpu count, but is also first multiplied by 10 to
1108  * provide better resolution.
1109  */
1110 static int find_busiest_node(int this_node)
1111 {
1112         int i, node = -1, load, this_load, maxload;
1113
1114         if (!nr_cpus_node(this_node))
1115                 return node;
1116         this_load = maxload = (this_rq()->prev_node_load[this_node] >> 1)
1117                 + (10 * atomic_read(&node_nr_running[this_node])
1118                 / nr_cpus_node(this_node));
1119         this_rq()->prev_node_load[this_node] = this_load;
1120         for_each_node_with_cpus(i) {
1121                 if (i == this_node)
1122                         continue;
1123                 load = (this_rq()->prev_node_load[i] >> 1)
1124                         + (10 * atomic_read(&node_nr_running[i])
1125                         / nr_cpus_node(i));
1126                 this_rq()->prev_node_load[i] = load;
1127                 if (load > maxload && (100*load > NODE_THRESHOLD*this_load)) {
1128                         maxload = load;
1129                         node = i;
1130                 }
1131         }
1132         return node;
1133 }
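/*
 * Worked example (editorial illustration, not in the original source):
 * with per-call contributions 10*node_nr_running/nr_cpus_node of
 * 40, 40, 0, 0 on successive invocations, the decaying load evolves as
 * 40, 40/2+40 = 60, 60/2+0 = 30, 30/2+0 = 15 - a short spike fades away
 * within a few rebalance intervals.  With NODE_THRESHOLD at 125, a remote
 * node is only reported as busiest if 100*load > 125*this_load, i.e. if
 * it is more than 25% busier than the local node.
 */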
1134
1135 #endif /* CONFIG_NUMA */
1136
1137 #ifdef CONFIG_SMP
1138
1139 /*
1140  * double_lock_balance - lock the busiest runqueue
1141  *
1142  * this_rq is locked already. Recalculate nr_running if we have to
1143  * drop the runqueue lock.
1144  */
1145 static inline
1146 unsigned int double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest,
1147                                  int this_cpu, int idle,
1148                                  unsigned int nr_running)
1149 {
1150         if (unlikely(!spin_trylock(&busiest->lock))) {
1151                 if (busiest < this_rq) {
1152                         spin_unlock(&this_rq->lock);
1153                         spin_lock(&busiest->lock);
1154                         spin_lock(&this_rq->lock);
1155                         /* Need to recalculate nr_running */
1156                         if (idle || (this_rq->nr_running >
1157                                         this_rq->prev_cpu_load[this_cpu]))
1158                                 nr_running = this_rq->nr_running;
1159                         else
1160                                 nr_running = this_rq->prev_cpu_load[this_cpu];
1161                 } else
1162                         spin_lock(&busiest->lock);
1163         }
1164         return nr_running;
1165 }
1166
1167 /*
1168  * find_busiest_queue - find the busiest runqueue among the cpus in cpumask.
1169  */
1170 static inline
1171 runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle,
1172                                int *imbalance, cpumask_t cpumask)
1173 {
1174         int nr_running, load, max_load, i;
1175         runqueue_t *busiest, *rq_src;
1176
1177         /*
1178          * We search all runqueues to find the most busy one.
1179          * We do this lockless to reduce cache-bouncing overhead,
1180          * we re-check the 'best' source CPU later on again, with
1181          * the lock held.
1182          *
1183          * We fend off statistical fluctuations in runqueue lengths by
1184          * saving the runqueue length (as seen by the balancing CPU) during
1185          * the previous load-balancing operation and using the smaller one
1186          * of the current and saved lengths. If a runqueue stays long
1187          * enough over a longer period of time then we recognize it and
1188          * pull tasks from it.
1189          *
1190          * The 'current runqueue length' is a statistical maximum variable;
1191          * for that one we take the longer one - to avoid fluctuations in
1192          * the other direction. So for a load-balance to happen it needs
1193          * stable long runqueue on the target CPU and stable short runqueue
1194          * on the local runqueue.
1195          *
1196          * We make an exception if this CPU is about to become idle - in
1197          * that case we are less picky about moving a task across CPUs and
1198          * take what can be taken.
1199          */
1200         if (idle || (this_rq->nr_running > this_rq->prev_cpu_load[this_cpu]))
1201                 nr_running = this_rq->nr_running;
1202         else
1203                 nr_running = this_rq->prev_cpu_load[this_cpu];
1204
1205         busiest = NULL;
1206         max_load = 1;
1207         for (i = 0; i < NR_CPUS; i++) {
1208                 if (!cpu_isset(i, cpumask))
1209                         continue;
1210
1211                 rq_src = cpu_rq(i);
1212                 if (idle || (rq_src->nr_running < this_rq->prev_cpu_load[i]))
1213                         load = rq_src->nr_running;
1214                 else
1215                         load = this_rq->prev_cpu_load[i];
1216                 this_rq->prev_cpu_load[i] = rq_src->nr_running;
1217
1218                 if ((load > max_load) && (rq_src != this_rq)) {
1219                         busiest = rq_src;
1220                         max_load = load;
1221                 }
1222         }
1223
1224         if (likely(!busiest))
1225                 goto out;
1226
1227         *imbalance = max_load - nr_running;
1228
1229         /* It needs at least a ~25% imbalance to trigger balancing. */
1230         if (!idle && ((*imbalance)*4 < max_load)) {
1231                 busiest = NULL;
1232                 goto out;
1233         }
1234
1235         nr_running = double_lock_balance(this_rq, busiest, this_cpu,
1236                                          idle, nr_running);
1237         /*
1238          * Make sure nothing changed since we checked the
1239          * runqueue length.
1240          */
1241         if (busiest->nr_running <= nr_running) {
1242                 spin_unlock(&busiest->lock);
1243                 busiest = NULL;
1244         }
1245 out:
1246         return busiest;
1247 }
1248
1249 /*
1250  * pull_task - move a task from a remote runqueue to the local runqueue.
1251  * Both runqueues must be locked.
1252  */
1253 static inline
1254 void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1255                runqueue_t *this_rq, int this_cpu)
1256 {
1257         dequeue_task(p, src_array);
1258         nr_running_dec(src_rq);
1259         set_task_cpu(p, this_cpu);
1260         nr_running_inc(this_rq);
1261         enqueue_task(p, this_rq->active);
1262         p->timestamp = sched_clock() -
1263                                 (src_rq->timestamp_last_tick - p->timestamp);
1264         /*
1265          * Note that idle threads have a prio of MAX_PRIO, so this test
1266          * is always true for them.
1267          */
1268         if (TASK_PREEMPTS_CURR(p, this_rq))
1269                 set_need_resched();
1270 }
1271
1272 /*
1273  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1274  */
1275 static inline
1276 int can_migrate_task(task_t *tsk, runqueue_t *rq, int this_cpu, int idle)
1277 {
1278         unsigned long delta = rq->timestamp_last_tick - tsk->timestamp;
1279
1280         /*
1281          * We do not migrate tasks that are:
1282          * 1) running (obviously), or
1283          * 2) cannot be migrated to this CPU due to cpus_allowed, or
1284          * 3) are cache-hot on their current CPU.
1285          */
1286         if (task_running(rq, tsk))
1287                 return 0;
1288         if (!cpu_isset(this_cpu, tsk->cpus_allowed))
1289                 return 0;
1290         if (!idle && (delta <= JIFFIES_TO_NS(cache_decay_ticks)))
1291                 return 0;
1292         return 1;
1293 }
1294
1295 /*
1296  * Current runqueue is empty, or rebalance tick: if there is an
1297  * imbalance (current runqueue is too short) then pull from
1298  * busiest runqueue(s).
1299  *
1300  * We call this with the current runqueue locked,
1301  * irqs disabled.
1302  */
1303 static void load_balance(runqueue_t *this_rq, int idle, cpumask_t cpumask)
1304 {
1305         int imbalance, idx, this_cpu = smp_processor_id();
1306         runqueue_t *busiest;
1307         prio_array_t *array;
1308         struct list_head *head, *curr;
1309         task_t *tmp;
1310
1311         if (cpu_is_offline(this_cpu))
1312                 goto out;
1313
1314         busiest = find_busiest_queue(this_rq, this_cpu, idle,
1315                                      &imbalance, cpumask);
1316         if (!busiest)
1317                 goto out;
1318
1319         /*
1320          * We only want to steal a number of tasks equal to 1/2 the imbalance,
1321          * otherwise we'll just shift the imbalance to the new queue:
1322          */
1323         imbalance /= 2;
1324
1325         /*
1326          * We first consider expired tasks. Those will likely not be
1327          * executed in the near future, and they are most likely to
1328          * be cache-cold, thus switching CPUs has the least effect
1329          * on them.
1330          */
1331         if (busiest->expired->nr_active)
1332                 array = busiest->expired;
1333         else
1334                 array = busiest->active;
1335
1336 new_array:
1337         /* Start searching at priority 0: */
1338         idx = 0;
1339 skip_bitmap:
1340         if (!idx)
1341                 idx = sched_find_first_bit(array->bitmap);
1342         else
1343                 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1344         if (idx >= MAX_PRIO) {
1345                 if (array == busiest->expired) {
1346                         array = busiest->active;
1347                         goto new_array;
1348                 }
1349                 goto out_unlock;
1350         }
1351
1352         head = array->queue + idx;
1353         curr = head->prev;
1354 skip_queue:
1355         tmp = list_entry(curr, task_t, run_list);
1356
1357         curr = curr->prev;
1358
1359         if (!can_migrate_task(tmp, busiest, this_cpu, idle)) {
1360                 if (curr != head)
1361                         goto skip_queue;
1362                 idx++;
1363                 goto skip_bitmap;
1364         }
1365         pull_task(busiest, array, tmp, this_rq, this_cpu);
1366
1367         /* Only migrate one task if we are idle */
1368         if (!idle && --imbalance) {
1369                 if (curr != head)
1370                         goto skip_queue;
1371                 idx++;
1372                 goto skip_bitmap;
1373         }
1374 out_unlock:
1375         spin_unlock(&busiest->lock);
1376 out:
1377         ;
1378 }
1379
1380 /*
1381  * The rebalance_tick() function below gets called every timer tick,
1382  * on every CPU. Our balancing action frequency and balancing
1383  * aggressiveness depend on whether the CPU is
1384  * idle or not.
1385  *
1386  * busy-rebalance every 200 msecs. idle-rebalance every 1 msec. (or on
1387  * systems with HZ=100, every 10 msecs.)
1388  *
1389  * On NUMA, do a node-rebalance every 400 msecs.
1390  */
1391 #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
1392 #define BUSY_REBALANCE_TICK (HZ/5 ?: 1)
1393 #define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 5)
1394 #define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2)
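/*
 * Worked example (editorial illustration, not in the original source):
 * with HZ=1000, IDLE_REBALANCE_TICK == 1 jiffy (1 msec) and
 * BUSY_REBALANCE_TICK == 200 jiffies (200 msecs); the node variants are
 * 5 msecs and 400 msecs respectively.  With HZ=100, HZ/1000 evaluates to
 * 0 and the "?: 1" fallback gives 1 jiffy, i.e. a 10 msec idle-rebalance,
 * while HZ/5 == 20 jiffies still gives a 200 msec busy-rebalance.
 */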
1395
1396 #ifdef CONFIG_NUMA
1397 static void balance_node(runqueue_t *this_rq, int idle, int this_cpu)
1398 {
1399         int node = find_busiest_node(cpu_to_node(this_cpu));
1400
1401         if (node >= 0) {
1402                 cpumask_t cpumask = node_to_cpumask(node);
1403                 cpu_set(this_cpu, cpumask);
1404                 spin_lock(&this_rq->lock);
1405                 load_balance(this_rq, idle, cpumask);
1406                 spin_unlock(&this_rq->lock);
1407         }
1408 }
1409 #endif
1410
1411 static void rebalance_tick(runqueue_t *this_rq, int idle)
1412 {
1413 #ifdef CONFIG_NUMA
1414         int this_cpu = smp_processor_id();
1415 #endif
1416         unsigned long j = jiffies;
1417
1418         /*
1419          * First do inter-node rebalancing, then intra-node rebalancing,
1420          * if both events happen in the same tick. The inter-node
1421          * rebalancing does not necessarily have to create a perfect
1422          * balance within the node, since we load-balance the most loaded
1423          * node with the current CPU. (ie. other CPUs in the local node
1424          * are not balanced.)
1425          */
1426         if (idle) {
1427 #ifdef CONFIG_NUMA
1428                 if (!(j % IDLE_NODE_REBALANCE_TICK))
1429                         balance_node(this_rq, idle, this_cpu);
1430 #endif
1431                 if (!(j % IDLE_REBALANCE_TICK)) {
1432                         spin_lock(&this_rq->lock);
1433                         load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
1434                         spin_unlock(&this_rq->lock);
1435                 }
1436                 return;
1437         }
1438 #ifdef CONFIG_NUMA
1439         if (!(j % BUSY_NODE_REBALANCE_TICK))
1440                 balance_node(this_rq, idle, this_cpu);
1441 #endif
1442         if (!(j % BUSY_REBALANCE_TICK)) {
1443                 spin_lock(&this_rq->lock);
1444                 load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
1445                 spin_unlock(&this_rq->lock);
1446         }
1447 }
1448 #else
1449 /*
1450  * on UP we do not need to balance between CPUs:
1451  */
1452 static inline void rebalance_tick(runqueue_t *this_rq, int idle)
1453 {
1454 }
1455 #endif
1456
1457 DEFINE_PER_CPU(struct kernel_stat, kstat);
1458
1459 EXPORT_PER_CPU_SYMBOL(kstat);
1460
1461 /*
1462  * We place interactive tasks back into the active array, if possible.
1463  *
1464  * To guarantee that this does not starve expired tasks we ignore the
1465  * interactivity of a task if the first expired task had to wait more
1466  * than a 'reasonable' amount of time. This deadline timeout is
1467  * load-dependent, as the frequency of array switches decreases with
1468  * increasing number of running tasks. We also ignore the interactivity
1469  * if a better static_prio task has expired:
1470  */
1471 #define EXPIRED_STARVING(rq) \
1472         ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
1473                 (jiffies - (rq)->expired_timestamp >= \
1474                         STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
1475                         ((rq)->curr->static_prio > (rq)->best_expired_prio))
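/*
 * Worked example (editorial illustration, not in the original source;
 * assuming HZ=1000 and the usual priority constants, MAX_SLEEP_AVG works
 * out to roughly 1020 jiffies, i.e. about one second):
 * STARVATION_LIMIT == MAX_SLEEP_AVG, so with 4 runnable tasks the
 * interactivity reinsertion is overridden once the first expired task
 * has been waiting for more than about 4 seconds, or as soon as the
 * running task has a worse static_prio than the best expired one.
 */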
1476
1477 /*
1478  * This function gets called by the timer code, with HZ frequency.
1479  * We call it with interrupts disabled.
1480  *
1481  * It also gets called by the fork code, when changing the parent's
1482  * timeslices.
1483  */
1484 void scheduler_tick(int user_ticks, int sys_ticks)
1485 {
1486         int cpu = smp_processor_id();
1487         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1488         runqueue_t *rq = this_rq();
1489         task_t *p = current;
1490
1491         rq->timestamp_last_tick = sched_clock();
1492
1493         if (rcu_pending(cpu))
1494                 rcu_check_callbacks(cpu, user_ticks);
1495
1496         /* note: this timer irq context must be accounted for as well */
1497         if (hardirq_count() - HARDIRQ_OFFSET) {
1498                 cpustat->irq += sys_ticks;
1499                 sys_ticks = 0;
1500         } else if (softirq_count()) {
1501                 cpustat->softirq += sys_ticks;
1502                 sys_ticks = 0;
1503         }
1504
1505         if (p == rq->idle) {
1506                 if (atomic_read(&rq->nr_iowait) > 0)
1507                         cpustat->iowait += sys_ticks;
1508                 else
1509                         cpustat->idle += sys_ticks;
1510                 rebalance_tick(rq, 1);
1511                 return;
1512         }
1513         if (TASK_NICE(p) > 0)
1514                 cpustat->nice += user_ticks;
1515         else
1516                 cpustat->user += user_ticks;
1517         cpustat->system += sys_ticks;
1518
1519         /* Task might have expired already, but not scheduled off yet */
1520         if (p->array != rq->active) {
1521                 set_tsk_need_resched(p);
1522                 goto out;
1523         }
1524         spin_lock(&rq->lock);
1525         /*
1526          * The task was running during this tick - update the
1527          * time slice counter. Note: we do not update a thread's
1528          * priority until it either goes to sleep or uses up its
1529          * timeslice. This makes it possible for interactive tasks
1530          * to use up their timeslices at their highest priority levels.
1531          */
1532         if (unlikely(rt_task(p))) {
1533                 /*
1534                  * RR tasks need a special form of timeslice management.
1535                  * FIFO tasks have no timeslices.
1536                  */
1537                 if ((p->policy == SCHED_RR) && !--p->time_slice) {
1538                         p->time_slice = task_timeslice(p);
1539                         p->first_time_slice = 0;
1540                         set_tsk_need_resched(p);
1541
1542                         /* put it at the end of the queue: */
1543                         dequeue_task(p, rq->active);
1544                         enqueue_task(p, rq->active);
1545                 }
1546                 goto out_unlock;
1547         }
1548         if (!--p->time_slice) {
1549                 dequeue_task(p, rq->active);
1550                 set_tsk_need_resched(p);
1551                 p->prio = effective_prio(p);
1552                 p->time_slice = task_timeslice(p);
1553                 p->first_time_slice = 0;
1554
1555                 if (!rq->expired_timestamp)
1556                         rq->expired_timestamp = jiffies;
1557                 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
1558                         enqueue_task(p, rq->expired);
1559                         if (p->static_prio < rq->best_expired_prio)
1560                                 rq->best_expired_prio = p->static_prio;
1561                 } else
1562                         enqueue_task(p, rq->active);
1563         } else {
1564                 /*
1565                  * Prevent too long a timeslice from allowing a task to monopolize
1566                  * the CPU. We do this by splitting up the timeslice into
1567                  * smaller pieces.
1568                  *
1569                  * Note: this does not mean the task's timeslices expire or
1570                  * get lost in any way, they just might be preempted by
1571                  * another task of equal priority. (one with higher
1572                  * priority would have preempted this task already.) We
1573                  * requeue this task to the end of the list on this priority
1574                  * level, which is in essence a round-robin of tasks with
1575                  * equal priority.
1576                  *
1577                  * This only applies to tasks in the interactive
1578                  * delta range with at least TIMESLICE_GRANULARITY left to requeue.
1579                  */
1580                 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
1581                         p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
1582                         (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
1583                         (p->array == rq->active)) {
1584
1585                         dequeue_task(p, rq->active);
1586                         set_tsk_need_resched(p);
1587                         p->prio = effective_prio(p);
1588                         enqueue_task(p, rq->active);
1589                 }
1590         }
1591 out_unlock:
1592         spin_unlock(&rq->lock);
1593 out:
1594         rebalance_tick(rq, 0);
1595 }
1596
1597 /*
1598  * schedule() is the main scheduler function.
1599  */
1600 asmlinkage void __sched schedule(void)
1601 {
1602         long *switch_count;
1603         task_t *prev, *next;
1604         runqueue_t *rq;
1605         prio_array_t *array;
1606         struct list_head *queue;
1607         unsigned long long now;
1608         unsigned long run_time;
1609         int idx;
1610
1611         /*
1612          * Test if we are atomic.  Since do_exit() needs to call into
1613          * schedule() atomically, we ignore that path for now.
1614          * Otherwise, whine if we are scheduling when we should not be.
1615          */
1616         if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) {
1617                 if (unlikely(in_atomic())) {
1618                         printk(KERN_ERR "bad: scheduling while atomic!\n");
1619                         dump_stack();
1620                 }
1621         }
1622
1623 need_resched:
1624         preempt_disable();
1625         prev = current;
1626         rq = this_rq();
1627
1628         release_kernel_lock(prev);
1629         now = sched_clock();
1630         if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
1631                 run_time = now - prev->timestamp;
1632         else
1633                 run_time = NS_MAX_SLEEP_AVG;
1634
1635         /*
1636          * Tasks with interactive credits get charged less run_time
1637          * at high sleep_avg, to delay the loss of their interactive
1638          * status.
1639          */
1640         if (HIGH_CREDIT(prev))
1641                 run_time /= (CURRENT_BONUS(prev) ? : 1);
1642
1643         spin_lock_irq(&rq->lock);
1644
1645         /*
1646          * if entering off of a kernel preemption, go straight
1647          * to picking the next task.
1648          */
1649         switch_count = &prev->nivcsw;
1650         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
1651                 switch_count = &prev->nvcsw;
1652                 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
1653                                 unlikely(signal_pending(prev))))
1654                         prev->state = TASK_RUNNING;
1655                 else
1656                         deactivate_task(prev, rq);
1657         }
1658
1659         if (unlikely(!rq->nr_running)) {
1660 #ifdef CONFIG_SMP
1661                 load_balance(rq, 1, cpu_to_node_mask(smp_processor_id()));
1662 #endif
1663                 if (!rq->nr_running) {
1664                         next = rq->idle;
1665                         rq->expired_timestamp = 0;
1666                         goto switch_tasks;
1667                 }
1668         }
1669
1670         array = rq->active;
1671         if (unlikely(!array->nr_active)) {
1672                 /*
1673                  * Switch the active and expired arrays.
1674                  */
1675                 rq->active = rq->expired;
1676                 rq->expired = array;
1677                 array = rq->active;
1678                 rq->expired_timestamp = 0;
1679                 rq->best_expired_prio = MAX_PRIO;
1680         }
1681
1682         idx = sched_find_first_bit(array->bitmap);
1683         queue = array->queue + idx;
1684         next = list_entry(queue->next, task_t, run_list);
1685
1686         if (!rt_task(next) && next->activated > 0) {
1687                 unsigned long long delta = now - next->timestamp;
1688
1689                 if (next->activated == 1)
1690                         delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
1691
1692                 array = next->array;
1693                 dequeue_task(next, array);
1694                 recalc_task_prio(next, next->timestamp + delta);
1695                 enqueue_task(next, array);
1696         }
1697         next->activated = 0;
1698 switch_tasks:
1699         prefetch(next);
1700         clear_tsk_need_resched(prev);
1701         RCU_qsctr(task_cpu(prev))++;
1702
1703         prev->sleep_avg -= run_time;
1704         if ((long)prev->sleep_avg <= 0) {
1705                 prev->sleep_avg = 0;
1706                 if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev)))
1707                         prev->interactive_credit--;
1708         }
1709         prev->timestamp = now;
1710
1711         if (likely(prev != next)) {
1712                 next->timestamp = now;
1713                 rq->nr_switches++;
1714                 rq->curr = next;
1715                 ++*switch_count;
1716
1717                 prepare_arch_switch(rq, next);
1718                 prev = context_switch(rq, prev, next);
1719                 barrier();
1720
1721                 finish_task_switch(prev);
1722         } else
1723                 spin_unlock_irq(&rq->lock);
1724
1725         reacquire_kernel_lock(current);
1726         preempt_enable_no_resched();
1727         if (test_thread_flag(TIF_NEED_RESCHED))
1728                 goto need_resched;
1729 }
1730
1731 EXPORT_SYMBOL(schedule);
1732
1733 #ifdef CONFIG_PREEMPT
1734 /*
1735  * this is the entry point to schedule() from in-kernel preemption
1736  * off of preempt_enable().  Kernel preemption off of return-from-interrupt
1737  * is handled by the low-level entry code, which calls schedule() directly.
1738  */
1739 asmlinkage void __sched preempt_schedule(void)
1740 {
1741         struct thread_info *ti = current_thread_info();
1742
1743         /*
1744          * If there is a non-zero preempt_count or interrupts are disabled,
1745  * we do not want to preempt the current task.  Just return.
1746          */
1747         if (unlikely(ti->preempt_count || irqs_disabled()))
1748                 return;
1749
1750 need_resched:
1751         ti->preempt_count = PREEMPT_ACTIVE;
1752         schedule();
1753         ti->preempt_count = 0;
1754
1755         /* we could miss a preemption opportunity between schedule and now */
1756         barrier();
1757         if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
1758                 goto need_resched;
1759 }
1760
1761 EXPORT_SYMBOL(preempt_schedule);
1762 #endif /* CONFIG_PREEMPT */
1763
1764 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync)
1765 {
1766         task_t *p = curr->task;
1767         return try_to_wake_up(p, mode, sync);
1768 }
1769
1770 EXPORT_SYMBOL(default_wake_function);
1771
1772 /*
1773  * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
1774  * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small +ve
1775  * number) then we wake all the non-exclusive tasks and one exclusive task.
1776  *
1777  * There are circumstances in which we can try to wake a task which has already
1778  * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
1779  * zero in this (rare) case, and we handle it by continuing to scan the queue.
1780  */
1781 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
1782                              int nr_exclusive, int sync)
1783 {
1784         struct list_head *tmp, *next;
1785
1786         list_for_each_safe(tmp, next, &q->task_list) {
1787                 wait_queue_t *curr;
1788                 unsigned flags;
1789                 curr = list_entry(tmp, wait_queue_t, task_list);
1790                 flags = curr->flags;
1791                 if (curr->func(curr, mode, sync) &&
1792                     (flags & WQ_FLAG_EXCLUSIVE) &&
1793                     !--nr_exclusive)
1794                         break;
1795         }
1796 }
1797
1798 /**
1799  * __wake_up - wake up threads blocked on a waitqueue.
1800  * @q: the waitqueue
1801  * @mode: which threads
1802  * @nr_exclusive: how many wake-one or wake-many threads to wake up
1803  */
1804 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
1805 {
1806         unsigned long flags;
1807
1808         spin_lock_irqsave(&q->lock, flags);
1809         __wake_up_common(q, mode, nr_exclusive, 0);
1810         spin_unlock_irqrestore(&q->lock, flags);
1811 }
1812
1813 EXPORT_SYMBOL(__wake_up);
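
/*
 * Illustrative waiter/waker pairing that ends up in __wake_up_common()
 * above; 'my_queue' and 'condition' are hypothetical names, not part of
 * this file:
 *
 *	static DECLARE_WAIT_QUEUE_HEAD(my_queue);
 *
 * The waiter:
 *
 *	DECLARE_WAITQUEUE(wait, current);
 *
 *	add_wait_queue(&my_queue, &wait);
 *	for (;;) {
 *		set_current_state(TASK_INTERRUPTIBLE);
 *		if (condition)
 *			break;
 *		schedule();
 *	}
 *	set_current_state(TASK_RUNNING);
 *	remove_wait_queue(&my_queue, &wait);
 *
 * The waker:
 *
 *	condition = 1;
 *	wake_up(&my_queue);
 *
 * Setting the task state before testing the condition is what makes a
 * wake_up() issued in between impossible to lose.
 */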
1814
1815 /*
1816  * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
1817  */
1818 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
1819 {
1820         __wake_up_common(q, mode, 1, 0);
1821 }
1822
1823 /**
1824  * __wake_up_sync - wake up threads blocked on a waitqueue.
1825  * @q: the waitqueue
1826  * @mode: which threads
1827  * @nr_exclusive: how many wake-one or wake-many threads to wake up
1828  *
1829  * The sync wakeup differs in that the waker knows that it will schedule
1830  * away soon, so while the target thread will be woken up, it will not
1831  * be migrated to another CPU - i.e. the two threads are 'synchronized'
1832  * with each other. This can prevent needless bouncing between CPUs.
1833  *
1834  * On UP it can prevent extra preemption.
1835  */
1836 void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
1837 {
1838         unsigned long flags;
1839
1840         if (unlikely(!q))
1841                 return;
1842
1843         spin_lock_irqsave(&q->lock, flags);
1844         if (likely(nr_exclusive))
1845                 __wake_up_common(q, mode, nr_exclusive, 1);
1846         else
1847                 __wake_up_common(q, mode, nr_exclusive, 0);
1848         spin_unlock_irqrestore(&q->lock, flags);
1849 }
1850 EXPORT_SYMBOL_GPL(__wake_up_sync);      /* For internal use only */
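
/*
 * Sketch of where the sync variant pays off: a producer that wakes its
 * consumer and then immediately blocks itself ('consumer_queue',
 * 'producer_queue' and the predicates are hypothetical):
 *
 *	add_data_to_buffer();
 *	wake_up_interruptible_sync(&consumer_queue);
 *	wait_event_interruptible(producer_queue, buffer_has_room());
 *
 * Because the waker is about to schedule away anyway, the woken consumer
 * is not migrated to another CPU; it can simply run here once the
 * producer sleeps, avoiding needless bouncing.
 */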
1851
1852 void fastcall complete(struct completion *x)
1853 {
1854         unsigned long flags;
1855
1856         spin_lock_irqsave(&x->wait.lock, flags);
1857         x->done++;
1858         __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
1859                          1, 0);
1860         spin_unlock_irqrestore(&x->wait.lock, flags);
1861 }
1862 EXPORT_SYMBOL(complete);
1863
1864 void fastcall complete_all(struct completion *x)
1865 {
1866         unsigned long flags;
1867
1868         spin_lock_irqsave(&x->wait.lock, flags);
1869         x->done += UINT_MAX/2;
1870         __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
1871                          0, 0);
1872         spin_unlock_irqrestore(&x->wait.lock, flags);
1873 }
1874 EXPORT_SYMBOL(complete_all);
1875
1876 void fastcall __sched wait_for_completion(struct completion *x)
1877 {
1878         might_sleep();
1879         spin_lock_irq(&x->wait.lock);
1880         if (!x->done) {
1881                 DECLARE_WAITQUEUE(wait, current);
1882
1883                 wait.flags |= WQ_FLAG_EXCLUSIVE;
1884                 __add_wait_queue_tail(&x->wait, &wait);
1885                 do {
1886                         __set_current_state(TASK_UNINTERRUPTIBLE);
1887                         spin_unlock_irq(&x->wait.lock);
1888                         schedule();
1889                         spin_lock_irq(&x->wait.lock);
1890                 } while (!x->done);
1891                 __remove_wait_queue(&x->wait, &wait);
1892         }
1893         x->done--;
1894         spin_unlock_irq(&x->wait.lock);
1895 }
1896 EXPORT_SYMBOL(wait_for_completion);
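
/*
 * A minimal usage sketch for the completion API exported above;
 * 'my_done' is a hypothetical name:
 *
 *	static DECLARE_COMPLETION(my_done);
 *
 * One context blocks until the event has happened:
 *
 *	wait_for_completion(&my_done);
 *
 * and another signals it exactly once:
 *
 *	complete(&my_done);
 *
 * complete_all() above instead releases every present and future waiter,
 * which is why it bumps 'done' by UINT_MAX/2 rather than by one.
 */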
1897
1898 #define SLEEP_ON_VAR                                    \
1899         unsigned long flags;                            \
1900         wait_queue_t wait;                              \
1901         init_waitqueue_entry(&wait, current);
1902
1903 #define SLEEP_ON_HEAD                                   \
1904         spin_lock_irqsave(&q->lock,flags);              \
1905         __add_wait_queue(q, &wait);                     \
1906         spin_unlock(&q->lock);
1907
1908 #define SLEEP_ON_TAIL                                   \
1909         spin_lock_irq(&q->lock);                        \
1910         __remove_wait_queue(q, &wait);                  \
1911         spin_unlock_irqrestore(&q->lock, flags);
1912
1913 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
1914 {
1915         SLEEP_ON_VAR
1916
1917         current->state = TASK_INTERRUPTIBLE;
1918
1919         SLEEP_ON_HEAD
1920         schedule();
1921         SLEEP_ON_TAIL
1922 }
1923
1924 EXPORT_SYMBOL(interruptible_sleep_on);
1925
1926 long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
1927 {
1928         SLEEP_ON_VAR
1929
1930         current->state = TASK_INTERRUPTIBLE;
1931
1932         SLEEP_ON_HEAD
1933         timeout = schedule_timeout(timeout);
1934         SLEEP_ON_TAIL
1935
1936         return timeout;
1937 }
1938
1939 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
1940
1941 void fastcall __sched sleep_on(wait_queue_head_t *q)
1942 {
1943         SLEEP_ON_VAR
1944
1945         current->state = TASK_UNINTERRUPTIBLE;
1946
1947         SLEEP_ON_HEAD
1948         schedule();
1949         SLEEP_ON_TAIL
1950 }
1951
1952 EXPORT_SYMBOL(sleep_on);
1953
1954 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
1955 {
1956         SLEEP_ON_VAR
1957
1958         current->state = TASK_UNINTERRUPTIBLE;
1959
1960         SLEEP_ON_HEAD
1961         timeout = schedule_timeout(timeout);
1962         SLEEP_ON_TAIL
1963
1964         return timeout;
1965 }
1966
1967 EXPORT_SYMBOL(sleep_on_timeout);
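
/*
 * Typical legacy pairing for the sleep_on() family above ('my_queue' is
 * hypothetical):
 *
 *	interruptible_sleep_on(&my_queue);	(sleeper)
 *	wake_up_interruptible(&my_queue);	(waker)
 *
 * Note that any condition check done by the caller and the actual sleep
 * are not atomic here: a wake_up() issued in between is simply lost.
 * New code should therefore prefer wait_event()/wait_event_interruptible()
 * or an explicit wait-queue loop.
 */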
1968
1969 void set_user_nice(task_t *p, long nice)
1970 {
1971         unsigned long flags;
1972         prio_array_t *array;
1973         runqueue_t *rq;
1974         int old_prio, new_prio, delta;
1975
1976         if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
1977                 return;
1978         /*
1979          * We have to be careful, if called from sys_setpriority(),
1980          * the task might be in the middle of scheduling on another CPU.
1981          */
1982         rq = task_rq_lock(p, &flags);
1983         /*
1984          * The RT priorities are set via setscheduler(), but we still
1985          * allow the 'normal' nice value to be set - but as expected
1986  * it won't have any effect on scheduling until the task becomes
1987  * SCHED_NORMAL again:
1988          */
1989         if (rt_task(p)) {
1990                 p->static_prio = NICE_TO_PRIO(nice);
1991                 goto out_unlock;
1992         }
1993         array = p->array;
1994         if (array)
1995                 dequeue_task(p, array);
1996
1997         old_prio = p->prio;
1998         new_prio = NICE_TO_PRIO(nice);
1999         delta = new_prio - old_prio;
2000         p->static_prio = NICE_TO_PRIO(nice);
2001         p->prio += delta;
2002
2003         if (array) {
2004                 enqueue_task(p, array);
2005                 /*
2006                  * If the task increased its priority or is running and
2007                  * lowered its priority, then reschedule its CPU:
2008                  */
2009                 if (delta < 0 || (delta > 0 && task_running(rq, p)))
2010                         resched_task(rq->curr);
2011         }
2012 out_unlock:
2013         task_rq_unlock(rq, &flags);
2014 }
2015
2016 EXPORT_SYMBOL(set_user_nice);
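
/*
 * Worked example, assuming MAX_RT_PRIO=100 so that NICE_TO_PRIO(0) == 120:
 * renicing a task from nice 0 to nice 5 moves static_prio from 120 to 125,
 * and the dynamic ->prio follows via the delta computed above.  The
 * reschedule is only needed when the change can matter immediately: the
 * task's priority improved (delta < 0), or it is currently running and its
 * priority got worse (delta > 0).
 */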
2017
2018 #ifndef __alpha__
2019
2020 /*
2021  * sys_nice - change the priority of the current process.
2022  * @increment: priority increment
2023  *
2024  * sys_setpriority is a more generic, but much slower function that
2025  * does similar things.
2026  */
2027 asmlinkage long sys_nice(int increment)
2028 {
2029         int retval;
2030         long nice;
2031
2032         /*
2033          * Setpriority might change our priority at the same moment.
2034          * We don't have to worry. Conceptually one call occurs first
2035          * and we have a single winner.
2036          */
2037         if (increment < 0) {
2038                 if (!capable(CAP_SYS_NICE))
2039                         return -EPERM;
2040                 if (increment < -40)
2041                         increment = -40;
2042         }
2043         if (increment > 40)
2044                 increment = 40;
2045
2046         nice = PRIO_TO_NICE(current->static_prio) + increment;
2047         if (nice < -20)
2048                 nice = -20;
2049         if (nice > 19)
2050                 nice = 19;
2051
2052         retval = security_task_setnice(current, nice);
2053         if (retval)
2054                 return retval;
2055
2056         set_user_nice(current, nice);
2057         return 0;
2058 }
2059
2060 #endif
2061
2062 /**
2063  * task_prio - return the priority value of a given task.
2064  * @p: the task in question.
2065  *
2066  * This is the priority value as seen by users in /proc.
2067  * RT tasks are offset by -200. Normal tasks are centered
2068  * around 0, value goes from -16 to +15.
2069  */
2070 int task_prio(task_t *p)
2071 {
2072         return p->prio - MAX_RT_PRIO;
2073 }
2074
2075 /**
2076  * task_nice - return the nice value of a given task.
2077  * @p: the task in question.
2078  */
2079 int task_nice(task_t *p)
2080 {
2081         return TASK_NICE(p);
2082 }
2083
2084 EXPORT_SYMBOL(task_nice);
2085
2086 /**
2087  * idle_cpu - is a given cpu idle currently?
2088  * @cpu: the processor in question.
2089  */
2090 int idle_cpu(int cpu)
2091 {
2092         return cpu_curr(cpu) == cpu_rq(cpu)->idle;
2093 }
2094
2095 EXPORT_SYMBOL_GPL(idle_cpu);
2096
2097 /**
2098  * find_process_by_pid - find a process with a matching PID value.
2099  * @pid: the pid in question.
2100  */
2101 static inline task_t *find_process_by_pid(pid_t pid)
2102 {
2103         return pid ? find_task_by_pid(pid) : current;
2104 }
2105
2106 /* Actually do priority change: must hold rq lock. */
2107 static void __setscheduler(struct task_struct *p, int policy, int prio)
2108 {
2109         BUG_ON(p->array);
2110         p->policy = policy;
2111         p->rt_priority = prio;
2112         if (policy != SCHED_NORMAL)
2113                 p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
2114         else
2115                 p->prio = p->static_prio;
2116 }
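
/*
 * Worked mapping for __setscheduler(), assuming MAX_USER_RT_PRIO=100:
 * SCHED_FIFO/SCHED_RR with rt_priority 1 yields p->prio = 98, and
 * rt_priority 99 yields p->prio = 0 (the strongest level), while
 * SCHED_NORMAL simply falls back to the nice-derived static_prio.
 */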
2117
2118 /*
2119  * setscheduler - change the scheduling policy and/or RT priority of a thread.
2120  */
2121 static int setscheduler(pid_t pid, int policy, struct sched_param __user *param)
2122 {
2123         struct sched_param lp;
2124         int retval = -EINVAL;
2125         int oldprio;
2126         prio_array_t *array;
2127         unsigned long flags;
2128         runqueue_t *rq;
2129         task_t *p;
2130
2131         if (!param || pid < 0)
2132                 goto out_nounlock;
2133
2134         retval = -EFAULT;
2135         if (copy_from_user(&lp, param, sizeof(struct sched_param)))
2136                 goto out_nounlock;
2137
2138         /*
2139          * We play safe to avoid deadlocks.
2140          */
2141         read_lock_irq(&tasklist_lock);
2142
2143         p = find_process_by_pid(pid);
2144
2145         retval = -ESRCH;
2146         if (!p)
2147                 goto out_unlock_tasklist;
2148
2149         /*
2150          * To be able to change p->policy safely, the appropriate
2151          * runqueue lock must be held.
2152          */
2153         rq = task_rq_lock(p, &flags);
2154
2155         if (policy < 0)
2156                 policy = p->policy;
2157         else {
2158                 retval = -EINVAL;
2159                 if (policy != SCHED_FIFO && policy != SCHED_RR &&
2160                                 policy != SCHED_NORMAL)
2161                         goto out_unlock;
2162         }
2163
2164         /*
2165          * Valid priorities for SCHED_FIFO and SCHED_RR are
2166          * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
2167          */
2168         retval = -EINVAL;
2169         if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1)
2170                 goto out_unlock;
2171         if ((policy == SCHED_NORMAL) != (lp.sched_priority == 0))
2172                 goto out_unlock;
2173
2174         retval = -EPERM;
2175         if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
2176             !capable(CAP_SYS_NICE))
2177                 goto out_unlock;
2178         if ((current->euid != p->euid) && (current->euid != p->uid) &&
2179             !capable(CAP_SYS_NICE))
2180                 goto out_unlock;
2181
2182         retval = security_task_setscheduler(p, policy, &lp);
2183         if (retval)
2184                 goto out_unlock;
2185
2186         array = p->array;
2187         if (array)
2188                 deactivate_task(p, task_rq(p));
2189         retval = 0;
2190         oldprio = p->prio;
2191         __setscheduler(p, policy, lp.sched_priority);
2192         if (array) {
2193                 __activate_task(p, task_rq(p));
2194                 /*
2195                  * Reschedule if we are currently running on this runqueue and
2196                  * our priority decreased, or if we are not currently running on
2197                  * this runqueue and our priority is higher than the current's
2198                  */
2199                 if (task_running(rq, p)) {
2200                         if (p->prio > oldprio)
2201                                 resched_task(rq->curr);
2202                 } else if (p->prio < rq->curr->prio)
2203                         resched_task(rq->curr);
2204         }
2205
2206 out_unlock:
2207         task_rq_unlock(rq, &flags);
2208 out_unlock_tasklist:
2209         read_unlock_irq(&tasklist_lock);
2210
2211 out_nounlock:
2212         return retval;
2213 }
2214
2215 /**
2216  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
2217  * @pid: the pid in question.
2218  * @policy: new policy
2219  * @param: structure containing the new RT priority.
2220  */
2221 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
2222                                        struct sched_param __user *param)
2223 {
2224         return setscheduler(pid, policy, param);
2225 }
2226
2227 /**
2228  * sys_sched_setparam - set/change the RT priority of a thread
2229  * @pid: the pid in question.
2230  * @param: structure containing the new RT priority.
2231  */
2232 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
2233 {
2234         return setscheduler(pid, -1, param);
2235 }
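
/*
 * Userspace view of the two syscalls above, through the usual glibc
 * wrappers; passing pid 0 targets the calling thread, and the priorities
 * are purely illustrative:
 *
 *	#include <sched.h>
 *	#include <stdio.h>
 *
 *	struct sched_param param = { .sched_priority = 50 };
 *
 *	if (sched_setscheduler(0, SCHED_FIFO, &param) < 0)
 *		perror("sched_setscheduler");
 *
 *	param.sched_priority = 10;
 *	if (sched_setparam(0, &param) < 0)
 *		perror("sched_setparam");
 *
 * Both paths funnel into setscheduler() above, so the same permission
 * and priority-range checks apply.
 */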
2236
2237 /**
2238  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
2239  * @pid: the pid in question.
2240  */
2241 asmlinkage long sys_sched_getscheduler(pid_t pid)
2242 {
2243         int retval = -EINVAL;
2244         task_t *p;
2245
2246         if (pid < 0)
2247                 goto out_nounlock;
2248
2249         retval = -ESRCH;
2250         read_lock(&tasklist_lock);
2251         p = find_process_by_pid(pid);
2252         if (p) {
2253                 retval = security_task_getscheduler(p);
2254                 if (!retval)
2255                         retval = p->policy;
2256         }
2257         read_unlock(&tasklist_lock);
2258
2259 out_nounlock:
2260         return retval;
2261 }
2262
2263 /**
2264  * sys_sched_getparam - get the RT priority of a thread
2265  * @pid: the pid in question.
2266  * @param: structure containing the RT priority.
2267  */
2268 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
2269 {
2270         struct sched_param lp;
2271         int retval = -EINVAL;
2272         task_t *p;
2273
2274         if (!param || pid < 0)
2275                 goto out_nounlock;
2276
2277         read_lock(&tasklist_lock);
2278         p = find_process_by_pid(pid);
2279         retval = -ESRCH;
2280         if (!p)
2281                 goto out_unlock;
2282
2283         retval = security_task_getscheduler(p);
2284         if (retval)
2285                 goto out_unlock;
2286
2287         lp.sched_priority = p->rt_priority;
2288         read_unlock(&tasklist_lock);
2289
2290         /*
2291          * This one might sleep; we cannot do it with a spinlock held ...
2292          */
2293         retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
2294
2295 out_nounlock:
2296         return retval;
2297
2298 out_unlock:
2299         read_unlock(&tasklist_lock);
2300         return retval;
2301 }
2302
2303 /**
2304  * sys_sched_setaffinity - set the cpu affinity of a process
2305  * @pid: pid of the process
2306  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
2307  * @user_mask_ptr: user-space pointer to the new cpu mask
2308  */
2309 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
2310                                       unsigned long __user *user_mask_ptr)
2311 {
2312         cpumask_t new_mask;
2313         int retval;
2314         task_t *p;
2315
2316         if (len < sizeof(new_mask))
2317                 return -EINVAL;
2318
2319         if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask)))
2320                 return -EFAULT;
2321
2322         lock_cpu_hotplug();
2323         read_lock(&tasklist_lock);
2324
2325         p = find_process_by_pid(pid);
2326         if (!p) {
2327                 read_unlock(&tasklist_lock);
2328                 unlock_cpu_hotplug();
2329                 return -ESRCH;
2330         }
2331
2332         /*
2333          * It is not safe to call set_cpus_allowed with the
2334          * tasklist_lock held.  We will bump the task_struct's
2335          * usage count and then drop tasklist_lock.
2336          */
2337         get_task_struct(p);
2338         read_unlock(&tasklist_lock);
2339
2340         retval = -EPERM;
2341         if ((current->euid != p->euid) && (current->euid != p->uid) &&
2342                         !capable(CAP_SYS_NICE))
2343                 goto out_unlock;
2344
2345         retval = set_cpus_allowed(p, new_mask);
2346
2347 out_unlock:
2348         put_task_struct(p);
2349         unlock_cpu_hotplug();
2350         return retval;
2351 }
2352
2353 /**
2354  * sys_sched_getaffinity - get the cpu affinity of a process
2355  * @pid: pid of the process
2356  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
2357  * @user_mask_ptr: user-space pointer to hold the current cpu mask
2358  */
2359 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
2360                                       unsigned long __user *user_mask_ptr)
2361 {
2362         unsigned int real_len;
2363         cpumask_t mask;
2364         int retval;
2365         task_t *p;
2366
2367         real_len = sizeof(mask);
2368         if (len < real_len)
2369                 return -EINVAL;
2370
2371         read_lock(&tasklist_lock);
2372
2373         retval = -ESRCH;
2374         p = find_process_by_pid(pid);
2375         if (!p)
2376                 goto out_unlock;
2377
2378         retval = 0;
2379         cpus_and(mask, p->cpus_allowed, cpu_possible_map);
2380
2381 out_unlock:
2382         read_unlock(&tasklist_lock);
2383         if (retval)
2384                 return retval;
2385         if (copy_to_user(user_mask_ptr, &mask, real_len))
2386                 return -EFAULT;
2387         return real_len;
2388 }
2389
2390 /**
2391  * sys_sched_yield - yield the current processor to other threads.
2392  *
2393  * this function yields the current CPU by moving the calling thread
2394  * to the expired array. If there are no other threads running on this
2395  * CPU then this function will return.
2396  */
2397 asmlinkage long sys_sched_yield(void)
2398 {
2399         runqueue_t *rq = this_rq_lock();
2400         prio_array_t *array = current->array;
2401
2402         /*
2403          * We implement yielding by moving the task into the expired
2404          * queue.
2405          *
2406          * (special rule: RT tasks will just roundrobin in the active
2407          *  array.)
2408          */
2409         if (likely(!rt_task(current))) {
2410                 dequeue_task(current, array);
2411                 enqueue_task(current, rq->expired);
2412         } else {
2413                 list_del(&current->run_list);
2414                 list_add_tail(&current->run_list, array->queue + current->prio);
2415         }
2416         /*
2417          * Since we are going to call schedule() anyway, there's
2418          * no need to preempt:
2419          */
2420         _raw_spin_unlock(&rq->lock);
2421         preempt_enable_no_resched();
2422
2423         schedule();
2424
2425         return 0;
2426 }
2427
2428 void __sched __cond_resched(void)
2429 {
2430         set_current_state(TASK_RUNNING);
2431         schedule();
2432 }
2433
2434 EXPORT_SYMBOL(__cond_resched);
2435
2436 /**
2437  * yield - yield the current processor to other threads.
2438  *
2439  * this is a shortcut for kernel-space yielding - it marks the
2440  * thread runnable and calls sys_sched_yield().
2441  */
2442 void __sched yield(void)
2443 {
2444         set_current_state(TASK_RUNNING);
2445         sys_sched_yield();
2446 }
2447
2448 EXPORT_SYMBOL(yield);
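
/*
 * In-kernel usage sketch for yield(): a polling loop that must not hog
 * the CPU while other runnable threads make progress ('resource_ready'
 * is a hypothetical predicate):
 *
 *	while (!resource_ready())
 *		yield();
 *
 * Since sys_sched_yield() drops the caller into the expired array, the
 * loop still runs promptly once nothing else is runnable on this CPU.
 */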
2449
2450 /*
2451  * This task is about to go to sleep on IO.  Increment rq->nr_iowait so
2452  * that process accounting knows that this is a task in IO wait state.
2453  *
2454  * But don't do that if it is a deliberate, throttling IO wait (this task
2455  * has set its backing_dev_info: the queue against which it should throttle)
2456  */
2457 void __sched io_schedule(void)
2458 {
2459         struct runqueue *rq = this_rq();
2460
2461         atomic_inc(&rq->nr_iowait);
2462         schedule();
2463         atomic_dec(&rq->nr_iowait);
2464 }
2465
2466 EXPORT_SYMBOL(io_schedule);
2467
2468 long __sched io_schedule_timeout(long timeout)
2469 {
2470         struct runqueue *rq = this_rq();
2471         long ret;
2472
2473         atomic_inc(&rq->nr_iowait);
2474         ret = schedule_timeout(timeout);
2475         atomic_dec(&rq->nr_iowait);
2476         return ret;
2477 }
2478
2479 /**
2480  * sys_sched_get_priority_max - return maximum RT priority.
2481  * @policy: scheduling class.
2482  *
2483  * this syscall returns the maximum rt_priority that can be used
2484  * by a given scheduling class.
2485  */
2486 asmlinkage long sys_sched_get_priority_max(int policy)
2487 {
2488         int ret = -EINVAL;
2489
2490         switch (policy) {
2491         case SCHED_FIFO:
2492         case SCHED_RR:
2493                 ret = MAX_USER_RT_PRIO-1;
2494                 break;
2495         case SCHED_NORMAL:
2496                 ret = 0;
2497                 break;
2498         }
2499         return ret;
2500 }
2501
2502 /**
2503  * sys_sched_get_priority_min - return minimum RT priority.
2504  * @policy: scheduling class.
2505  *
2506  * this syscall returns the minimum rt_priority that can be used
2507  * by a given scheduling class.
2508  */
2509 asmlinkage long sys_sched_get_priority_min(int policy)
2510 {
2511         int ret = -EINVAL;
2512
2513         switch (policy) {
2514         case SCHED_FIFO:
2515         case SCHED_RR:
2516                 ret = 1;
2517                 break;
2518         case SCHED_NORMAL:
2519                 ret = 0;
2520         }
2521         return ret;
2522 }
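
/*
 * Userspace sketch of querying the valid range before picking an RT
 * priority (the midpoint choice is purely illustrative; pid 0 means the
 * calling thread):
 *
 *	#include <sched.h>
 *
 *	int lo = sched_get_priority_min(SCHED_RR);
 *	int hi = sched_get_priority_max(SCHED_RR);
 *	struct sched_param param = { .sched_priority = (lo + hi) / 2 };
 *
 *	sched_setscheduler(0, SCHED_RR, &param);
 *
 * With the ranges returned above this selects priority 50 out of 1..99.
 */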
2523
2524 /**
2525  * sys_sched_rr_get_interval - return the default timeslice of a process.
2526  * @pid: pid of the process.
2527  * @interval: userspace pointer to the timeslice value.
2528  *
2529  * this syscall writes the default timeslice value of a given process
2530  * into the user-space timespec buffer. A value of '0' means infinity.
2531  */
2532 asmlinkage
2533 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
2534 {
2535         int retval = -EINVAL;
2536         struct timespec t;
2537         task_t *p;
2538
2539         if (pid < 0)
2540                 goto out_nounlock;
2541
2542         retval = -ESRCH;
2543         read_lock(&tasklist_lock);
2544         p = find_process_by_pid(pid);
2545         if (!p)
2546                 goto out_unlock;
2547
2548         retval = security_task_getscheduler(p);
2549         if (retval)
2550                 goto out_unlock;
2551
2552         jiffies_to_timespec(p->policy == SCHED_FIFO ?
2553                                 0 : task_timeslice(p), &t);
2554         read_unlock(&tasklist_lock);
2555         retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
2556 out_nounlock:
2557         return retval;
2558 out_unlock:
2559         read_unlock(&tasklist_lock);
2560         return retval;
2561 }
2562
2563 static inline struct task_struct *eldest_child(struct task_struct *p)
2564 {
2565         if (list_empty(&p->children)) return NULL;
2566         return list_entry(p->children.next,struct task_struct,sibling);
2567 }
2568
2569 static inline struct task_struct *older_sibling(struct task_struct *p)
2570 {
2571         if (p->sibling.prev==&p->parent->children) return NULL;
2572         return list_entry(p->sibling.prev,struct task_struct,sibling);
2573 }
2574
2575 static inline struct task_struct *younger_sibling(struct task_struct *p)
2576 {
2577         if (p->sibling.next==&p->parent->children) return NULL;
2578         return list_entry(p->sibling.next,struct task_struct,sibling);
2579 }
2580
2581 static void show_task(task_t * p)
2582 {
2583         task_t *relative;
2584         unsigned state;
2585         unsigned long free = 0;
2586         static const char *stat_nam[] = { "R", "S", "D", "T", "Z", "W" };
2587
2588         printk("%-13.13s ", p->comm);
2589         state = p->state ? __ffs(p->state) + 1 : 0;
2590         if (state < ARRAY_SIZE(stat_nam))
2591                 printk(stat_nam[state]);
2592         else
2593                 printk("?");
2594 #if (BITS_PER_LONG == 32)
2595         if (state == TASK_RUNNING)
2596                 printk(" running ");
2597         else
2598                 printk(" %08lX ", thread_saved_pc(p));
2599 #else
2600         if (state == TASK_RUNNING)
2601                 printk("  running task   ");
2602         else
2603                 printk(" %016lx ", thread_saved_pc(p));
2604 #endif
2605 #ifdef CONFIG_DEBUG_STACK_USAGE
2606         {
2607                 unsigned long * n = (unsigned long *) (p->thread_info+1);
2608                 while (!*n)
2609                         n++;
2610                 free = (unsigned long) n - (unsigned long)(p->thread_info+1);
2611         }
2612 #endif
2613         printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
2614         if ((relative = eldest_child(p)))
2615                 printk("%5d ", relative->pid);
2616         else
2617                 printk("      ");
2618         if ((relative = younger_sibling(p)))
2619                 printk("%7d", relative->pid);
2620         else
2621                 printk("       ");
2622         if ((relative = older_sibling(p)))
2623                 printk(" %5d", relative->pid);
2624         else
2625                 printk("      ");
2626         if (!p->mm)
2627                 printk(" (L-TLB)\n");
2628         else
2629                 printk(" (NOTLB)\n");
2630
2631         if (state != TASK_RUNNING)
2632                 show_stack(p, NULL);
2633 }
2634
2635 void show_state(void)
2636 {
2637         task_t *g, *p;
2638
2639 #if (BITS_PER_LONG == 32)
2640         printk("\n"
2641                "                                               sibling\n");
2642         printk("  task             PC      pid father child younger older\n");
2643 #else
2644         printk("\n"
2645                "                                                       sibling\n");
2646         printk("  task                 PC          pid father child younger older\n");
2647 #endif
2648         read_lock(&tasklist_lock);
2649         do_each_thread(g, p) {
2650                 /*
2651                  * reset the NMI-timeout, listing all tasks on a slow
2652                  * console might take a lot of time:
2653                  */
2654                 touch_nmi_watchdog();
2655                 show_task(p);
2656         } while_each_thread(g, p);
2657
2658         read_unlock(&tasklist_lock);
2659 }
2660
2661 void __init init_idle(task_t *idle, int cpu)
2662 {
2663         runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle));
2664         unsigned long flags;
2665
2666         local_irq_save(flags);
2667         double_rq_lock(idle_rq, rq);
2668
2669         idle_rq->curr = idle_rq->idle = idle;
2670         deactivate_task(idle, rq);
2671         idle->array = NULL;
2672         idle->prio = MAX_PRIO;
2673         idle->state = TASK_RUNNING;
2674         set_task_cpu(idle, cpu);
2675         double_rq_unlock(idle_rq, rq);
2676         set_tsk_need_resched(idle);
2677         local_irq_restore(flags);
2678
2679         /* Set the preempt count _outside_ the spinlocks! */
2680 #ifdef CONFIG_PREEMPT
2681         idle->thread_info->preempt_count = (idle->lock_depth >= 0);
2682 #else
2683         idle->thread_info->preempt_count = 0;
2684 #endif
2685 }
2686
2687 /*
2688  * In a system that switches off the HZ timer, idle_cpu_mask
2689  * indicates which cpus entered this state. This is used
2690  * in the rcu update to wait only for active cpus. For systems
2691  * which do not switch off the HZ timer, idle_cpu_mask should
2692  * always be CPU_MASK_NONE.
2693  */
2694 cpumask_t idle_cpu_mask = CPU_MASK_NONE;
2695
2696 #ifdef CONFIG_SMP
2697 /*
2698  * This is how migration works:
2699  *
2700  * 1) we queue a migration_req_t structure in the source CPU's
2701  *    runqueue and wake up that CPU's migration thread.
2702  * 2) we down() the locked semaphore => thread blocks.
2703  * 3) migration thread wakes up (implicitly it forces the migrated
2704  *    thread off the CPU)
2705  * 4) it gets the migration request and checks whether the migrated
2706  *    task is still in the wrong runqueue.
2707  * 5) if it's in the wrong runqueue then the migration thread removes
2708  *    it and puts it into the right queue.
2709  * 6) migration thread up()s the semaphore.
2710  * 7) we wake up and the migration is done.
2711  */
2712
2713 /*
2714  * Change a given task's CPU affinity. Migrate the thread to a
2715  * proper CPU and schedule it away if the CPU it's executing on
2716  * is removed from the allowed bitmask.
2717  *
2718  * NOTE: the caller must have a valid reference to the task, the
2719  * task must not exit() & deallocate itself prematurely.  The
2720  * call is not atomic; no spinlocks may be held.
2721  */
2722 int set_cpus_allowed(task_t *p, cpumask_t new_mask)
2723 {
2724         unsigned long flags;
2725         int ret = 0;
2726         migration_req_t req;
2727         runqueue_t *rq;
2728
2729         rq = task_rq_lock(p, &flags);
2730         if (any_online_cpu(new_mask) == NR_CPUS) {
2731                 ret = -EINVAL;
2732                 goto out;
2733         }
2734
2735         if (__set_cpus_allowed(p, new_mask, &req)) {
2736                 /* Need help from migration thread: drop lock and wait. */
2737                 task_rq_unlock(rq, &flags);
2738                 wake_up_process(rq->migration_thread);
2739                 wait_for_completion(&req.done);
2740                 return 0;
2741         }
2742 out:
2743         task_rq_unlock(rq, &flags);
2744         return ret;
2745 }
2746
2747 EXPORT_SYMBOL_GPL(set_cpus_allowed);
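
/*
 * In-kernel usage sketch for set_cpus_allowed(); the CPU number and the
 * task pointer 'p' are hypothetical.  As noted above, the caller must
 * hold a reference on the task and may not hold any spinlocks:
 *
 *	cpumask_t mask = CPU_MASK_NONE;
 *
 *	cpu_set(2, mask);
 *	if (set_cpus_allowed(p, mask) < 0)
 *		printk(KERN_WARNING "could not pin task to CPU 2\n");
 *
 * If the task is currently running on a CPU outside the new mask, the
 * migration thread of that CPU pushes it away and this call returns only
 * once the move has completed.
 */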
2748
2749 /* Move (not current) task off this cpu, onto dest cpu. */
2750 static void move_task_away(struct task_struct *p, int dest_cpu)
2751 {
2752         runqueue_t *rq_dest;
2753
2754         rq_dest = cpu_rq(dest_cpu);
2755
2756         double_rq_lock(this_rq(), rq_dest);
2757         if (task_cpu(p) != smp_processor_id())
2758                 goto out; /* Already moved */
2759
2760         set_task_cpu(p, dest_cpu);
2761         if (p->array) {
2762                 deactivate_task(p, this_rq());
2763                 activate_task(p, rq_dest);
2764                 if (p->prio < rq_dest->curr->prio)
2765                         resched_task(rq_dest->curr);
2766         }
2767         p->timestamp = rq_dest->timestamp_last_tick;
2768
2769 out:
2770         double_rq_unlock(this_rq(), rq_dest);
2771 }
2772
2773 /*
2774  * migration_thread - this is a highprio system thread that performs
2775  * thread migration by bumping the thread off its CPU and then
2776  * 'pushing' it onto another runqueue.
2777  */
2778 static int migration_thread(void * data)
2779 {
2780         runqueue_t *rq;
2781         int cpu = (long)data;
2782
2783         rq = cpu_rq(cpu);
2784         BUG_ON(rq->migration_thread != current);
2785
2786         while (!kthread_should_stop()) {
2787                 struct list_head *head;
2788                 migration_req_t *req;
2789
2790                 if (current->flags & PF_FREEZE)
2791                         refrigerator(PF_FREEZE);
2792
2793                 spin_lock_irq(&rq->lock);
2794                 head = &rq->migration_queue;
2795                 current->state = TASK_INTERRUPTIBLE;
2796                 if (list_empty(head)) {
2797                         spin_unlock_irq(&rq->lock);
2798                         schedule();
2799                         continue;
2800                 }
2801                 req = list_entry(head->next, migration_req_t, list);
2802                 list_del_init(head->next);
2803                 spin_unlock(&rq->lock);
2804
2805                 move_task_away(req->task,
2806                                any_online_cpu(req->task->cpus_allowed));
2807                 local_irq_enable();
2808                 complete(&req->done);
2809         }
2810         return 0;
2811 }
2812
2813 #ifdef CONFIG_HOTPLUG_CPU
2814 /* migrate_all_tasks - function to migrate all the tasks from the
2815  * current cpu.  The caller must have already scheduled this thread onto
2816  * the target cpu via set_cpus_allowed.  Machine is stopped.  */
2817 void migrate_all_tasks(void)
2818 {
2819         struct task_struct *tsk, *t;
2820         int dest_cpu, src_cpu;
2821         unsigned int node;
2822
2823         /* We're nailed to this CPU. */
2824         src_cpu = smp_processor_id();
2825
2826         /* Not required, but here for neatness. */
2827         write_lock(&tasklist_lock);
2828
2829         /* watch out for per node tasks, let's stay on this node */
2830         node = cpu_to_node(src_cpu);
2831
2832         do_each_thread(t, tsk) {
2833                 cpumask_t mask;
2834                 if (tsk == current)
2835                         continue;
2836
2837                 if (task_cpu(tsk) != src_cpu)
2838                         continue;
2839
2840                 /* Figure out where this task should go (attempting to
2841                  * keep it on-node), and check if it can be migrated
2842                  * as-is.  NOTE that kernel threads bound to more than
2843                  * one online cpu will be migrated. */
2844                 mask = node_to_cpumask(node);
2845                 cpus_and(mask, mask, tsk->cpus_allowed);
2846                 dest_cpu = any_online_cpu(mask);
2847                 if (dest_cpu == NR_CPUS)
2848                         dest_cpu = any_online_cpu(tsk->cpus_allowed);
2849                 if (dest_cpu == NR_CPUS) {
2850                         cpus_clear(tsk->cpus_allowed);
2851                         cpus_complement(tsk->cpus_allowed);
2852                         dest_cpu = any_online_cpu(tsk->cpus_allowed);
2853
2854                         /* Don't tell them about moving exiting tasks
2855                            or kernel threads (both mm NULL), since
2856                            they never leave the kernel. */
2857                         if (tsk->mm && printk_ratelimit())
2858                                 printk(KERN_INFO "process %d (%s) no "
2859                                        "longer affine to cpu%d\n",
2860                                        tsk->pid, tsk->comm, src_cpu);
2861                 }
2862
2863                 move_task_away(tsk, dest_cpu);
2864         } while_each_thread(t, tsk);
2865
2866         write_unlock(&tasklist_lock);
2867 }
2868 #endif /* CONFIG_HOTPLUG_CPU */
2869
2870 /*
2871  * migration_call - callback that gets triggered when a CPU is added.
2872  * Here we can start up the necessary migration thread for the new CPU.
2873  */
2874 static int migration_call(struct notifier_block *nfb, unsigned long action,
2875                           void *hcpu)
2876 {
2877         int cpu = (long)hcpu;
2878         struct task_struct *p;
2879         struct runqueue *rq;
2880         unsigned long flags;
2881
2882         switch (action) {
2883         case CPU_UP_PREPARE:
2884                 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
2885                 if (IS_ERR(p))
2886                         return NOTIFY_BAD;
2887                 kthread_bind(p, cpu);
2888                 /* Must be high prio: stop_machine expects to yield to it. */
2889                 rq = task_rq_lock(p, &flags);
2890                 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
2891                 task_rq_unlock(rq, &flags);
2892                 cpu_rq(cpu)->migration_thread = p;
2893                 break;
2894         case CPU_ONLINE:
2895                 /* Strictly unnecessary, as first user will wake it. */
2896                 wake_up_process(cpu_rq(cpu)->migration_thread);
2897                 break;
2898 #ifdef CONFIG_HOTPLUG_CPU
2899         case CPU_UP_CANCELED:
2900                 /* Unbind it from offline cpu so it can run.  Fall thru. */
2901                 kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
2902         case CPU_DEAD:
2903                 kthread_stop(cpu_rq(cpu)->migration_thread);
2904                 cpu_rq(cpu)->migration_thread = NULL;
2905                 BUG_ON(cpu_rq(cpu)->nr_running != 0);
2906                 break;
2907 #endif
2908         }
2909         return NOTIFY_OK;
2910 }
2911
2912 static struct notifier_block __devinitdata migration_notifier = {
2913         .notifier_call = migration_call,
2914 };
2915
2916 int __init migration_init(void)
2917 {
2918         void *cpu = (void *)(long)smp_processor_id();
2919         /* Start one for boot CPU. */
2920         migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
2921         migration_call(&migration_notifier, CPU_ONLINE, cpu);
2922         register_cpu_notifier(&migration_notifier);
2923         return 0;
2924 }
2925 #endif
2926
2927 /*
2928  * The 'big kernel lock'
2929  *
2930  * This spinlock is taken and released recursively by lock_kernel()
2931  * and unlock_kernel().  It is transparently dropped and reacquired
2932  * over schedule().  It is used to protect legacy code that hasn't
2933  * been migrated to a proper locking design yet.
2934  *
2935  * Don't use in new code.
2936  *
2937  * Note: spinlock debugging needs this even on !CONFIG_SMP.
2938  */
2939 spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
2940 EXPORT_SYMBOL(kernel_flag);
2941
2942 void __init sched_init(void)
2943 {
2944         runqueue_t *rq;
2945         int i, j, k;
2946
2947         for (i = 0; i < NR_CPUS; i++) {
2948                 prio_array_t *array;
2949
2950                 rq = cpu_rq(i);
2951                 rq->active = rq->arrays;
2952                 rq->expired = rq->arrays + 1;
2953                 rq->best_expired_prio = MAX_PRIO;
2954
2955                 spin_lock_init(&rq->lock);
2956                 INIT_LIST_HEAD(&rq->migration_queue);
2957                 atomic_set(&rq->nr_iowait, 0);
2958                 nr_running_init(rq);
2959
2960                 for (j = 0; j < 2; j++) {
2961                         array = rq->arrays + j;
2962                         for (k = 0; k < MAX_PRIO; k++) {
2963                                 INIT_LIST_HEAD(array->queue + k);
2964                                 __clear_bit(k, array->bitmap);
2965                         }
2966                         /* delimiter for bitsearch */
2967                         __set_bit(MAX_PRIO, array->bitmap);
2968                 }
2969         }
2970         /*
2971          * We have to do a little magic to get the first
2972          * thread right in SMP mode.
2973          */
2974         rq = this_rq();
2975         rq->curr = current;
2976         rq->idle = current;
2977         set_task_cpu(current, smp_processor_id());
2978         wake_up_forked_process(current);
2979
2980         init_timers();
2981
2982         /*
2983          * The boot idle thread does lazy MMU switching as well:
2984          */
2985         atomic_inc(&init_mm.mm_count);
2986         enter_lazy_tlb(&init_mm, current);
2987 }
2988
2989 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
2990 void __might_sleep(char *file, int line)
2991 {
2992 #if defined(in_atomic)
2993         static unsigned long prev_jiffy;        /* ratelimiting */
2994
2995         if ((in_atomic() || irqs_disabled()) &&
2996             system_state == SYSTEM_RUNNING) {
2997                 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
2998                         return;
2999                 prev_jiffy = jiffies;
3000                 printk(KERN_ERR "Debug: sleeping function called from invalid"
3001                                 " context at %s:%d\n", file, line);
3002                 printk("in_atomic():%d, irqs_disabled():%d\n",
3003                         in_atomic(), irqs_disabled());
3004                 dump_stack();
3005         }
3006 #endif
3007 }
3008 EXPORT_SYMBOL(__might_sleep);
3009 #endif
3010
3011
3012 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
3013 /*
3014  * This could be a long-held lock.  If another CPU holds it for a long time,
3015  * and that CPU is not asked to reschedule then *this* CPU will spin on the
3016  * lock for a long time, even if *this* CPU is asked to reschedule.
3017  *
3018  * So what we do here, in the slow (contended) path is to spin on the lock by
3019  * hand while permitting preemption.
3020  *
3021  * Called inside preempt_disable().
3022  */
3023 void __sched __preempt_spin_lock(spinlock_t *lock)
3024 {
3025         if (preempt_count() > 1) {
3026                 _raw_spin_lock(lock);
3027                 return;
3028         }
3029         do {
3030                 preempt_enable();
3031                 while (spin_is_locked(lock))
3032                         cpu_relax();
3033                 preempt_disable();
3034         } while (!_raw_spin_trylock(lock));
3035 }
3036
3037 EXPORT_SYMBOL(__preempt_spin_lock);
3038
3039 void __sched __preempt_write_lock(rwlock_t *lock)
3040 {
3041         if (preempt_count() > 1) {
3042                 _raw_write_lock(lock);
3043                 return;
3044         }
3045
3046         do {
3047                 preempt_enable();
3048                 while (rwlock_is_locked(lock))
3049                         cpu_relax();
3050                 preempt_disable();
3051         } while (!_raw_write_trylock(lock));
3052 }
3053
3054 EXPORT_SYMBOL(__preempt_write_lock);
3055 #endif /* defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) */