1 diff -Nurb linux-2.6.27-590/arch/Kconfig linux-2.6.27-591/arch/Kconfig
2 --- linux-2.6.27-590/arch/Kconfig 2010-01-29 16:29:46.000000000 -0500
3 +++ linux-2.6.27-591/arch/Kconfig 2010-01-29 16:30:22.000000000 -0500
9 + bool "Chopstix (PlanetLab)"
10 + depends on MODULES && OPROFILE
12 + Chopstix allows you to monitor various events by summarizing them
13 + in lossy data structures and transferring those structures
14 + into user space. If in doubt, say "N".
22 depends on KALLSYMS && MODULES
23 diff -Nurb linux-2.6.27-590/arch/x86/kernel/asm-offsets_32.c linux-2.6.27-591/arch/x86/kernel/asm-offsets_32.c
24 --- linux-2.6.27-590/arch/x86/kernel/asm-offsets_32.c 2008-10-09 18:13:53.000000000 -0400
25 +++ linux-2.6.27-591/arch/x86/kernel/asm-offsets_32.c 2010-01-29 16:45:48.000000000 -0500
27 #include <linux/signal.h>
28 #include <linux/personality.h>
29 #include <linux/suspend.h>
30 +#include <linux/arrays.h>
31 #include <linux/kbuild.h>
32 #include <asm/ucontext.h>
35 #include <linux/lguest.h>
36 #include "../../../drivers/lguest/lg.h"
39 +#define STACKOFFSET(sym, str, mem) \
40 + DEFINE(sym, offsetof(struct str, mem)-sizeof(struct str));
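+/*
+ * Unlike OFFSET(), STACKOFFSET() emits offsetof() minus the structure
+ * size, i.e. a negative, end-relative offset. entry_32.S applies these
+ * constants to a pointer that sits at the end of the structure, such
+ * as the event scratch area it builds just below the saved %ebp.
+ */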
42 /* workaround for a warning with -Wmissing-prototypes */
47 + unsigned long dcookie;
49 + unsigned int number;
54 OFFSET(IA32_SIGCONTEXT_ax, sigcontext, ax);
56 OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
59 + STACKOFFSET(TASK_thread, task_struct, thread);
60 + STACKOFFSET(THREAD_esp, thread_struct, sp);
61 + STACKOFFSET(EVENT_event_data, event, event_data);
62 + STACKOFFSET(EVENT_task, event, task);
63 + STACKOFFSET(EVENT_event_type, event, event_type);
64 + STACKOFFSET(SPEC_number, event_spec, number);
65 + DEFINE(EVENT_SIZE, sizeof(struct event));
66 + DEFINE(SPEC_SIZE, sizeof(struct event_spec));
67 + DEFINE(SPEC_EVENT_SIZE, sizeof(struct event_spec)+sizeof(struct event));
69 OFFSET(TI_task, thread_info, task);
70 OFFSET(TI_exec_domain, thread_info, exec_domain);
71 OFFSET(TI_flags, thread_info, flags);
72 diff -Nurb linux-2.6.27-590/arch/x86/kernel/asm-offsets_32.c.rej linux-2.6.27-591/arch/x86/kernel/asm-offsets_32.c.rej
73 --- linux-2.6.27-590/arch/x86/kernel/asm-offsets_32.c.rej 1969-12-31 19:00:00.000000000 -0500
74 +++ linux-2.6.27-591/arch/x86/kernel/asm-offsets_32.c.rej 2010-01-31 22:21:08.000000000 -0500
80 + STACKOFFSET(TASK_thread, task_struct, thread);
81 +- STACKOFFSET(THREAD_esp, thread_struct, esp);
82 + STACKOFFSET(EVENT_event_data, event, event_data);
83 + STACKOFFSET(EVENT_task, event, task);
84 + STACKOFFSET(EVENT_event_type, event, event_type);
88 + STACKOFFSET(TASK_thread, task_struct, thread);
89 ++ STACKOFFSET(THREAD_esp, thread_struct, sp);
90 + STACKOFFSET(EVENT_event_data, event, event_data);
91 + STACKOFFSET(EVENT_task, event, task);
92 + STACKOFFSET(EVENT_event_type, event, event_type);
93 diff -Nurb linux-2.6.27-590/arch/x86/kernel/entry_32.S linux-2.6.27-591/arch/x86/kernel/entry_32.S
94 --- linux-2.6.27-590/arch/x86/kernel/entry_32.S 2008-10-09 18:13:53.000000000 -0400
95 +++ linux-2.6.27-591/arch/x86/kernel/entry_32.S 2010-01-29 16:30:22.000000000 -0500
97 cmpl $(nr_syscalls), %eax
100 + /* Chopstix syscall probe, placed just before the syscall dispatch */
101 + /* Save and clobber: eax, ecx, ebp */
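+ /*
+ * Scratch area carved out below the saved frame pointer:
+ * struct event occupies [%ebp - EVENT_SIZE, %ebp);
+ * struct event_spec sits below it, at %ebp - SPEC_EVENT_SIZE.
+ * The EVENT_ and SPEC_ symbols are the end-relative constants
+ * generated by asm-offsets_32.c.
+ */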
106 + subl $SPEC_EVENT_SIZE, %esp
107 + movl rec_event, %ecx
110 + # struct event is first, just below %ebp
111 + movl %eax, (SPEC_number-EVENT_SIZE)(%ebp)
112 + leal -SPEC_EVENT_SIZE(%ebp), %eax
113 + movl %eax, EVENT_event_data(%ebp)
114 + movl $6, EVENT_event_type(%ebp)
115 + movl rec_event, %edx
117 + leal -EVENT_SIZE(%ebp), %eax
121 + addl $SPEC_EVENT_SIZE, %esp
127 call *sys_call_table(,%eax,4)
128 movl %eax,PT_EAX(%esp) # store the return value
130 diff -Nurb linux-2.6.27-590/arch/x86/mm/fault.c linux-2.6.27-591/arch/x86/mm/fault.c
131 --- linux-2.6.27-590/arch/x86/mm/fault.c 2010-01-29 16:29:46.000000000 -0500
132 +++ linux-2.6.27-591/arch/x86/mm/fault.c 2010-01-29 16:30:22.000000000 -0500
138 +extern void (*rec_event)(void *,unsigned int);
141 + unsigned long dcookie;
143 + unsigned char reason;
148 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
149 diff -Nurb linux-2.6.27-590/drivers/oprofile/cpu_buffer.c linux-2.6.27-591/drivers/oprofile/cpu_buffer.c
150 --- linux-2.6.27-590/drivers/oprofile/cpu_buffer.c 2008-10-09 18:13:53.000000000 -0400
151 +++ linux-2.6.27-591/drivers/oprofile/cpu_buffer.c 2010-01-29 16:30:22.000000000 -0500
153 #include <linux/oprofile.h>
154 #include <linux/vmalloc.h>
155 #include <linux/errno.h>
156 +#include <linux/arrays.h>
158 #include "event_buffer.h"
159 #include "cpu_buffer.h"
164 +#ifdef CONFIG_CHOPSTIX
168 + unsigned long dcookie;
172 +extern void (*rec_event)(void *,unsigned int);
176 add_sample(struct oprofile_cpu_buffer * cpu_buf,
177 unsigned long pc, unsigned long event)
180 entry->event = event;
181 increment_head(cpu_buf);
188 int is_kernel = !user_mode(regs);
189 unsigned long pc = profile_pc(regs);
192 +#ifdef CONFIG_CHOPSTIX
195 + struct event_spec espec;
196 + esig.task = current;
199 + esig.event_data = &espec;
200 + esig.event_type = event; /* index into the currently configured event array */
201 + /* make sure the counters are loaded in the order we want them to show up */
202 + (*rec_event)(&esig, 1);
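+ /* one Chopstix record per oprofile sample; count is 1 because each
+ * interrupt contributes a single occurrence */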
205 oprofile_add_ext_sample(pc, regs, event, is_kernel);
208 + oprofile_add_ext_sample(pc, regs, event, is_kernel);
214 void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event)
215 diff -Nurb linux-2.6.27-590/fs/bio.c linux-2.6.27-591/fs/bio.c
216 --- linux-2.6.27-590/fs/bio.c 2008-10-09 18:13:53.000000000 -0400
217 +++ linux-2.6.27-591/fs/bio.c 2010-01-31 22:21:09.000000000 -0500
219 #include <linux/workqueue.h>
220 #include <linux/blktrace_api.h>
221 #include <scsi/sg.h> /* for struct sg_iovec */
222 +#include <linux/arrays.h>
224 static struct kmem_cache *bio_slab __read_mostly;
232 * fs_bio_set is the bio_set containing bio and iovec memory pools used by
233 * IO code that does not need private memory pools.
234 @@ -1171,6 +1173,14 @@
240 + unsigned long dcookie;
242 + unsigned char reason;
245 +extern void (*rec_event)(void *,unsigned int);
247 * bio_endio - end I/O on a bio
249 @@ -1192,6 +1202,24 @@
250 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
255 + struct event event;
256 + struct event_spec espec;
259 + espec.reason = 1; /* response */
261 + eip = bio->bi_end_io;
262 + event.event_data = &espec;
264 + event.event_type = 3;
265 + /* index into the currently configured event array */
266 + /* make sure the counters are loaded in the order we want them to show up */
267 + (*rec_event)(&event, bytes_done);
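+ /* note that the count argument here is bytes_done, so the record is
+ * weighted by the amount of I/O completed rather than by occurrences */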
272 bio->bi_end_io(bio, error);
274 diff -Nurb linux-2.6.27-590/fs/exec.c linux-2.6.27-591/fs/exec.c
275 --- linux-2.6.27-590/fs/exec.c 2010-01-29 16:29:48.000000000 -0500
276 +++ linux-2.6.27-591/fs/exec.c 2010-01-29 16:45:48.000000000 -0500
278 #include <linux/fdtable.h>
279 #include <linux/mm.h>
280 #include <linux/stat.h>
281 +#include <linux/dcookies.h>
282 #include <linux/fcntl.h>
283 #include <linux/smp_lock.h>
284 #include <linux/swap.h>
289 + #ifdef CONFIG_CHOPSTIX
290 + unsigned long cookie;
291 + extern void (*rec_event)(void *, unsigned int);
292 + if (rec_event && !nd.path.dentry->d_cookie)
293 + get_dcookie(&nd.path, &cookie);
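+ /* the cookie value itself is discarded; get_dcookie() is called for
+ * its side effect of pinning a dcookie so the binary's path can later
+ * be resolved from user space */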
299 diff -Nurb linux-2.6.27-590/fs/exec.c.rej linux-2.6.27-591/fs/exec.c.rej
300 --- linux-2.6.27-590/fs/exec.c.rej 1969-12-31 19:00:00.000000000 -0500
301 +++ linux-2.6.27-591/fs/exec.c.rej 2010-01-31 22:21:18.000000000 -0500
305 + #include <linux/personality.h>
306 + #include <linux/binfmts.h>
307 + #include <linux/utsname.h>
308 +- /*#include <linux/pid_namespace.h>*/
309 + #include <linux/module.h>
310 + #include <linux/namei.h>
311 + #include <linux/proc_fs.h>
313 + #include <linux/personality.h>
314 + #include <linux/binfmts.h>
315 + #include <linux/utsname.h>
316 ++ #include <linux/pid_namespace.h>
317 + #include <linux/module.h>
318 + #include <linux/namei.h>
319 + #include <linux/proc_fs.h>
322 + #ifdef CONFIG_CHOPSTIX
323 + unsigned long cookie;
324 + extern void (*rec_event)(void *, unsigned int);
325 +- if (rec_event && !nd.dentry->d_cookie)
326 +- get_dcookie(nd.dentry, nd.mnt, &cookie);
331 + #ifdef CONFIG_CHOPSTIX
332 + unsigned long cookie;
333 + extern void (*rec_event)(void *, unsigned int);
334 ++ if (rec_event && !nd.path.dentry->d_cookie)
335 ++ get_dcookie(&nd.path, &cookie);
339 diff -Nurb linux-2.6.27-590/include/linux/arrays.h linux-2.6.27-591/include/linux/arrays.h
340 --- linux-2.6.27-590/include/linux/arrays.h 1969-12-31 19:00:00.000000000 -0500
341 +++ linux-2.6.27-591/include/linux/arrays.h 2010-01-29 16:30:22.000000000 -0500
343 +#ifndef __ARRAYS_H__
344 +#define __ARRAYS_H__
345 +#include <linux/list.h>
347 +#define SAMPLING_METHOD_DEFAULT 0
348 +#define SAMPLING_METHOD_LOG 1
350 +/* Every probe has an array handler */
352 +/* XXX - Optimize this structure */
354 +extern void (*rec_event)(void *,unsigned int);
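+/*
+ * One array_handler per probe: hash_func buckets an incoming sample,
+ * sampling_func decides (per sampling_method and threshold) whether to
+ * keep it, and kept samples accumulate in "arrays"/"epoch_samples"
+ * until they are serialized out to user space.
+ */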
355 +struct array_handler {
356 + struct list_head link;
357 + unsigned int (*hash_func)(void *);
358 + unsigned int (*sampling_func)(void *,int,void *);
359 + unsigned short size;
360 + unsigned int threshold;
361 + unsigned char **expcount;
362 + unsigned int sampling_method;
363 + unsigned int **arrays;
364 + unsigned int arraysize;
365 + unsigned int num_samples[2];
366 + void **epoch_samples; /* size-sized lists of samples */
367 + unsigned int (*serialize)(void *, void *);
368 + unsigned char code[5];
372 + struct list_head link;
374 + unsigned int count;
375 + unsigned int event_type;
376 + struct task_struct *task;
379 diff -Nurb linux-2.6.27-590/include/linux/sched.h linux-2.6.27-591/include/linux/sched.h
380 --- linux-2.6.27-590/include/linux/sched.h 2010-01-29 16:29:48.000000000 -0500
381 +++ linux-2.6.27-591/include/linux/sched.h 2010-02-01 16:41:30.000000000 -0500
382 @@ -1133,6 +1133,11 @@
383 cputime_t utime, stime, utimescaled, stimescaled;
385 cputime_t prev_utime, prev_stime;
387 + #ifdef CONFIG_CHOPSTIX
388 + unsigned long last_interrupted, last_ran_j;
391 unsigned long nvcsw, nivcsw; /* context switch counts */
392 struct timespec start_time; /* monotonic time */
393 struct timespec real_start_time; /* boot based time */
394 diff -Nurb linux-2.6.27-590/include/linux/sched.h.rej linux-2.6.27-591/include/linux/sched.h.rej
395 --- linux-2.6.27-590/include/linux/sched.h.rej 1969-12-31 19:00:00.000000000 -0500
396 +++ linux-2.6.27-591/include/linux/sched.h.rej 2010-01-29 16:30:22.000000000 -0500
401 + unsigned long sleep_avg;
402 + unsigned long long timestamp, last_ran;
403 + unsigned long long sched_time; /* sched_clock time spent running */
404 + enum sleep_type sleep_type;
408 + unsigned long sleep_avg;
409 + unsigned long long timestamp, last_ran;
410 ++ #ifdef CONFIG_CHOPSTIX
411 ++ unsigned long last_interrupted, last_ran_j;
414 + unsigned long long sched_time; /* sched_clock time spent running */
415 + enum sleep_type sleep_type;
417 diff -Nurb linux-2.6.27-590/kernel/sched.c linux-2.6.27-591/kernel/sched.c
418 --- linux-2.6.27-590/kernel/sched.c 2010-01-29 16:29:48.000000000 -0500
419 +++ linux-2.6.27-591/kernel/sched.c 2010-02-01 16:41:30.000000000 -0500
421 * 1998-11-19 Implemented schedule_timeout() and related stuff
422 * by Andrea Arcangeli
423 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
424 * hybrid priority-list and round-robin design with
426 * an array-switch method of distributing timeslices
427 * and per-CPU runqueues. Cleanups and useful suggestions
428 * by Davide Libenzi, preemptible kernel bits by Robert Love.
430 #include <linux/ftrace.h>
431 #include <linux/vs_sched.h>
432 #include <linux/vs_cvirt.h>
433 +#include <linux/arrays.h>
436 #include <asm/irq_regs.h>
438 #include "sched_cpupri.h"
440 +#define INTERRUPTIBLE -1
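+/*
+ * task_struct.last_interrupted holds INTERRUPTIBLE while the task is in
+ * an interruptible sleep, the jiffies value at which it entered an
+ * uninterruptible sleep, or 0 once it has been elected to run; see the
+ * probe in schedule() below.
+ */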
444 * Convert user-nice values [ -20 ... 0 ... 19 ]
445 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
446 @@ -2368,6 +2372,10 @@
447 INIT_HLIST_HEAD(&p->preempt_notifiers);
450 +#ifdef CONFIG_CHOPSTIX
451 + p->last_ran_j = jiffies;
452 + p->last_interrupted = INTERRUPTIBLE;
455 * We mark the process as running here, but have not actually
456 * inserted it onto the runqueue yet. This guarantees that
457 @@ -4428,6 +4436,29 @@
461 +void (*rec_event)(void *,unsigned int) = NULL;
462 +EXPORT_SYMBOL(rec_event);
463 +#ifdef CONFIG_CHOPSTIX
467 + unsigned long dcookie;
468 + unsigned int count;
469 + unsigned int reason;
472 +/* To support safe calling from asm */
473 +asmlinkage void rec_event_asm (struct event *event_signature_in, unsigned int count) {
474 + struct pt_regs *regs;
475 + struct event_spec *es = event_signature_in->event_data;
476 + regs = task_pt_regs(current);
477 + event_signature_in->task = current;
479 + event_signature_in->count = 1;
480 + (*rec_event)(event_signature_in, count);
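+ /* the task and count fields are filled in here so that the asm stub
+ * only has to assemble the event_data payload */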
485 * schedule() is the main scheduler function.
487 @@ -4482,6 +4513,61 @@
488 next = pick_next_task(rq, prev);
490 if (likely(prev != next)) {
492 +#ifdef CONFIG_CHOPSTIX
493 + /* Run only if the Chopstix module so decrees it */
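+ /* Record an event only when "next" waited at least HZ/10 jiffies:
+ * reason 0 means it was blocked, reason 1 means it was runnable but
+ * preempted. */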
495 + unsigned long diff;
496 + int sampling_reason;
497 + prev->last_ran_j = jiffies;
498 + if (next->last_interrupted != INTERRUPTIBLE) {
499 + if (next->last_interrupted != RUNNING) {
500 + diff = jiffies - next->last_interrupted;
501 + sampling_reason = 0; /* BLOCKING */
504 + diff = jiffies - next->last_ran_j;
505 + sampling_reason = 1; /* PREEMPTION */
508 + if (diff >= HZ/10) {
509 + struct event_spec {
511 + unsigned long dcookie;
512 + unsigned int count;
513 + unsigned int reason;
516 + struct event event;
517 + struct event_spec espec;
518 + struct pt_regs *regs;
519 + regs = task_pt_regs(current);
521 + espec.reason = sampling_reason;
522 + event.event_data = &espec;
525 + event.event_type = 2;
526 + /* index into the currently configured event array */
527 + /* make sure the counters are loaded in the order we want them to show up */
528 + (*rec_event)(&event, diff);
531 + /* next has been elected to run */
532 + next->last_interrupted = 0;
534 + /* An uninterruptible process just yielded. Record the current jiffies value */
535 + if (prev->state & TASK_UNINTERRUPTIBLE) {
536 + prev->last_interrupted = jiffies;
538 + /* An interruptible process just yielded, or it got preempted.
539 + * Mark it as interruptible */
540 + else if (prev->state & TASK_INTERRUPTIBLE) {
541 + prev->last_interrupted = INTERRUPTIBLE;
546 sched_info_switch(prev, next);
549 @@ -5369,6 +5455,7 @@
551 read_unlock(&tasklist_lock);
555 if ((current->euid != p->euid) && (current->euid != p->uid) &&
556 !capable(CAP_SYS_NICE))
557 @@ -9296,3 +9383,26 @@
558 .subsys_id = cpuacct_subsys_id,
560 #endif /* CONFIG_CGROUP_CPUACCT */
584 diff -Nurb linux-2.6.27-590/kernel/sched.c.orig linux-2.6.27-591/kernel/sched.c.orig
585 --- linux-2.6.27-590/kernel/sched.c.orig 1969-12-31 19:00:00.000000000 -0500
586 +++ linux-2.6.27-591/kernel/sched.c.orig 2010-01-31 22:21:08.000000000 -0500
591 + * Kernel scheduler and related syscalls
593 + * Copyright (C) 1991-2002 Linus Torvalds
595 + * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
596 + * make semaphores SMP safe
597 + * 1998-11-19 Implemented schedule_timeout() and related stuff
598 + * by Andrea Arcangeli
599 + * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
600 + * hybrid priority-list and round-robin design with
601 + * an array-switch method of distributing timeslices
602 + * and per-CPU runqueues. Cleanups and useful suggestions
603 + * by Davide Libenzi, preemptible kernel bits by Robert Love.
604 + * 2003-09-03 Interactivity tuning by Con Kolivas.
605 + * 2004-04-02 Scheduler domains code by Nick Piggin
606 + * 2007-04-15 Work begun on replacing all interactivity tuning with a
607 + * fair scheduling design by Con Kolivas.
608 + * 2007-05-05 Load balancing (smp-nice) and other improvements
609 + * by Peter Williams
610 + * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
611 + * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
612 + * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
613 + * Thomas Gleixner, Mike Kravetz
616 +#include <linux/mm.h>
617 +#include <linux/module.h>
618 +#include <linux/nmi.h>
619 +#include <linux/init.h>
620 +#include <linux/uaccess.h>
621 +#include <linux/highmem.h>
622 +#include <linux/smp_lock.h>
623 +#include <asm/mmu_context.h>
624 +#include <linux/interrupt.h>
625 +#include <linux/capability.h>
626 +#include <linux/completion.h>
627 +#include <linux/kernel_stat.h>
628 +#include <linux/debug_locks.h>
629 +#include <linux/security.h>
630 +#include <linux/notifier.h>
631 +#include <linux/profile.h>
632 +#include <linux/freezer.h>
633 +#include <linux/vmalloc.h>
634 +#include <linux/blkdev.h>
635 +#include <linux/delay.h>
636 +#include <linux/pid_namespace.h>
637 +#include <linux/smp.h>
638 +#include <linux/threads.h>
639 +#include <linux/timer.h>
640 +#include <linux/rcupdate.h>
641 +#include <linux/cpu.h>
642 +#include <linux/cpuset.h>
643 +#include <linux/percpu.h>
644 +#include <linux/kthread.h>
645 +#include <linux/seq_file.h>
646 +#include <linux/sysctl.h>
647 +#include <linux/syscalls.h>
648 +#include <linux/times.h>
649 +#include <linux/tsacct_kern.h>
650 +#include <linux/kprobes.h>
651 +#include <linux/delayacct.h>
652 +#include <linux/reciprocal_div.h>
653 +#include <linux/unistd.h>
654 +#include <linux/pagemap.h>
655 +#include <linux/hrtimer.h>
656 +#include <linux/tick.h>
657 +#include <linux/bootmem.h>
658 +#include <linux/debugfs.h>
659 +#include <linux/ctype.h>
660 +#include <linux/ftrace.h>
661 +#include <linux/vs_sched.h>
662 +#include <linux/vs_cvirt.h>
663 +#include <linux/arrays.h>
665 +#include <asm/tlb.h>
666 +#include <asm/irq_regs.h>
668 +#include "sched_cpupri.h"
670 +#define INTERRUPTIBLE -1
674 + * Convert user-nice values [ -20 ... 0 ... 19 ]
675 + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
678 +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
679 +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
680 +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
683 + * 'User priority' is the nice value converted to something we
684 + * can work with better when scaling various scheduler parameters,
685 + * it's a [ 0 ... 39 ] range.
687 +#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
688 +#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
689 +#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
692 + * Helpers for converting nanosecond timing to jiffy resolution
694 +#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
696 +#define NICE_0_LOAD SCHED_LOAD_SCALE
697 +#define NICE_0_SHIFT SCHED_LOAD_SHIFT
700 + * These are the 'tuning knobs' of the scheduler:
702 + * default timeslice is 100 msecs (used only for SCHED_RR tasks).
703 + * Timeslices get refilled after they expire.
705 +#define DEF_TIMESLICE (100 * HZ / 1000)
708 + * single value that denotes runtime == period, ie unlimited time.
710 +#define RUNTIME_INF ((u64)~0ULL)
714 + * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
715 + * Since cpu_power is a 'constant', we can use a reciprocal divide.
717 +static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
719 + return reciprocal_divide(load, sg->reciprocal_cpu_power);
723 + * Each time a sched group cpu_power is changed,
724 + * we must compute its reciprocal value
726 +static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
728 + sg->__cpu_power += val;
729 + sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
733 +static inline int rt_policy(int policy)
735 + if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
740 +static inline int task_has_rt_policy(struct task_struct *p)
742 + return rt_policy(p->policy);
746 + * This is the priority-queue data structure of the RT scheduling class:
748 +struct rt_prio_array {
749 + DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
750 + struct list_head queue[MAX_RT_PRIO];
753 +struct rt_bandwidth {
754 + /* nests inside the rq lock: */
755 + spinlock_t rt_runtime_lock;
758 + struct hrtimer rt_period_timer;
761 +static struct rt_bandwidth def_rt_bandwidth;
763 +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
765 +static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
767 + struct rt_bandwidth *rt_b =
768 + container_of(timer, struct rt_bandwidth, rt_period_timer);
774 + now = hrtimer_cb_get_time(timer);
775 + overrun = hrtimer_forward(timer, now, rt_b->rt_period);
780 + idle = do_sched_rt_period_timer(rt_b, overrun);
783 + return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
787 +void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
789 + rt_b->rt_period = ns_to_ktime(period);
790 + rt_b->rt_runtime = runtime;
792 + spin_lock_init(&rt_b->rt_runtime_lock);
794 + hrtimer_init(&rt_b->rt_period_timer,
795 + CLOCK_MONOTONIC, HRTIMER_MODE_REL);
796 + rt_b->rt_period_timer.function = sched_rt_period_timer;
797 + rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
800 +static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
804 + if (rt_b->rt_runtime == RUNTIME_INF)
807 + if (hrtimer_active(&rt_b->rt_period_timer))
810 + spin_lock(&rt_b->rt_runtime_lock);
812 + if (hrtimer_active(&rt_b->rt_period_timer))
815 + now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
816 + hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
817 + hrtimer_start(&rt_b->rt_period_timer,
818 + rt_b->rt_period_timer.expires,
821 + spin_unlock(&rt_b->rt_runtime_lock);
824 +#ifdef CONFIG_RT_GROUP_SCHED
825 +static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
827 + hrtimer_cancel(&rt_b->rt_period_timer);
832 + * sched_domains_mutex serializes calls to arch_init_sched_domains,
833 + * detach_destroy_domains and partition_sched_domains.
835 +static DEFINE_MUTEX(sched_domains_mutex);
837 +#ifdef CONFIG_GROUP_SCHED
839 +#include <linux/cgroup.h>
843 +static LIST_HEAD(task_groups);
845 +/* task group related information */
847 +#ifdef CONFIG_CGROUP_SCHED
848 + struct cgroup_subsys_state css;
851 +#ifdef CONFIG_FAIR_GROUP_SCHED
852 + /* schedulable entities of this group on each cpu */
853 + struct sched_entity **se;
854 + /* runqueue "owned" by this group on each cpu */
855 + struct cfs_rq **cfs_rq;
856 + unsigned long shares;
859 +#ifdef CONFIG_RT_GROUP_SCHED
860 + struct sched_rt_entity **rt_se;
861 + struct rt_rq **rt_rq;
863 + struct rt_bandwidth rt_bandwidth;
866 + struct rcu_head rcu;
867 + struct list_head list;
869 + struct task_group *parent;
870 + struct list_head siblings;
871 + struct list_head children;
874 +#ifdef CONFIG_USER_SCHED
878 + * Every UID task group (including init_task_group aka UID-0) will
879 + * be a child to this group.
881 +struct task_group root_task_group;
883 +#ifdef CONFIG_FAIR_GROUP_SCHED
884 +/* Default task group's sched entity on each cpu */
885 +static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
886 +/* Default task group's cfs_rq on each cpu */
887 +static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
888 +#endif /* CONFIG_FAIR_GROUP_SCHED */
890 +#ifdef CONFIG_RT_GROUP_SCHED
891 +static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
892 +static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
893 +#endif /* CONFIG_RT_GROUP_SCHED */
894 +#else /* !CONFIG_FAIR_GROUP_SCHED */
895 +#define root_task_group init_task_group
896 +#endif /* CONFIG_FAIR_GROUP_SCHED */
898 +/* task_group_lock serializes add/remove of task groups and also changes to
899 + * a task group's cpu shares.
901 +static DEFINE_SPINLOCK(task_group_lock);
903 +#ifdef CONFIG_FAIR_GROUP_SCHED
904 +#ifdef CONFIG_USER_SCHED
905 +# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
906 +#else /* !CONFIG_USER_SCHED */
907 +# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
908 +#endif /* CONFIG_USER_SCHED */
911 + * A weight of 0 or 1 can cause arithmetics problems.
912 + * A weight of a cfs_rq is the sum of weights of which entities
913 + * are queued on this cfs_rq, so a weight of a entity should not be
914 + * too large, so as the shares value of a task group.
915 + * (The default weight is 1024 - so there's no practical
916 + * limitation from this.)
918 +#define MIN_SHARES 2
919 +#define MAX_SHARES (1UL << 18)
921 +static int init_task_group_load = INIT_TASK_GROUP_LOAD;
924 +/* Default task group.
925 + * Every task in system belong to this group at bootup.
927 +struct task_group init_task_group;
929 +/* return group to which a task belongs */
930 +static inline struct task_group *task_group(struct task_struct *p)
932 + struct task_group *tg;
934 +#ifdef CONFIG_USER_SCHED
936 +#elif defined(CONFIG_CGROUP_SCHED)
937 + tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
938 + struct task_group, css);
940 + tg = &init_task_group;
945 +/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
946 +static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
948 +#ifdef CONFIG_FAIR_GROUP_SCHED
949 + p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
950 + p->se.parent = task_group(p)->se[cpu];
953 +#ifdef CONFIG_RT_GROUP_SCHED
954 + p->rt.rt_rq = task_group(p)->rt_rq[cpu];
955 + p->rt.parent = task_group(p)->rt_se[cpu];
961 +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
962 +static inline struct task_group *task_group(struct task_struct *p)
967 +#endif /* CONFIG_GROUP_SCHED */
969 +/* CFS-related fields in a runqueue */
971 + struct load_weight load;
972 + unsigned long nr_running;
978 + struct rb_root tasks_timeline;
979 + struct rb_node *rb_leftmost;
981 + struct list_head tasks;
982 + struct list_head *balance_iterator;
985 + * 'curr' points to currently running entity on this cfs_rq.
986 + * It is set to NULL otherwise (i.e when none are currently running).
988 + struct sched_entity *curr, *next;
990 + unsigned long nr_spread_over;
992 +#ifdef CONFIG_FAIR_GROUP_SCHED
993 + struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
996 + * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
997 + * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
998 + * (like users, containers etc.)
1000 + * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
1001 + * list is used during load balance.
1003 + struct list_head leaf_cfs_rq_list;
1004 + struct task_group *tg; /* group that "owns" this runqueue */
1008 + * the part of load.weight contributed by tasks
1010 + unsigned long task_weight;
1013 + * h_load = weight * f(tg)
1015 + * Where f(tg) is the recursive weight fraction assigned to
1018 + unsigned long h_load;
1021 + * this cpu's part of tg->shares
1023 + unsigned long shares;
1026 + * load.weight at the time we set shares
1028 + unsigned long rq_weight;
1033 +/* Real-Time classes' related field in a runqueue: */
1035 + struct rt_prio_array active;
1036 + unsigned long rt_nr_running;
1037 +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
1038 + int highest_prio; /* highest queued rt task prio */
1041 + unsigned long rt_nr_migratory;
1047 + /* Nests inside the rq lock: */
1048 + spinlock_t rt_runtime_lock;
1050 +#ifdef CONFIG_RT_GROUP_SCHED
1051 + unsigned long rt_nr_boosted;
1054 + struct list_head leaf_rt_rq_list;
1055 + struct task_group *tg;
1056 + struct sched_rt_entity *rt_se;
1063 + * We add the notion of a root-domain which will be used to define per-domain
1064 + * variables. Each exclusive cpuset essentially defines an island domain by
1065 + * fully partitioning the member cpus from any other cpuset. Whenever a new
1066 + * exclusive cpuset is created, we also create and attach a new root-domain
1070 +struct root_domain {
1071 + atomic_t refcount;
1076 + * The "RT overload" flag: it gets set if a CPU has more than
1077 + * one runnable RT task.
1079 + cpumask_t rto_mask;
1080 + atomic_t rto_count;
1082 + struct cpupri cpupri;
1087 + * By default the system creates a single root-domain with all cpus as
1088 + * members (mimicking the global state we have today).
1090 +static struct root_domain def_root_domain;
1093 + unsigned long norm_time;
1094 + unsigned long idle_time;
1095 +#ifdef CONFIG_VSERVER_IDLETIME
1098 +#ifdef CONFIG_VSERVER_HARDCPU
1099 + struct list_head hold_queue;
1100 + unsigned long nr_onhold;
1105 + * This is the main, per-CPU runqueue data structure.
1107 + * Locking rule: those places that want to lock multiple runqueues
1108 + * (such as the load balancing or the thread migration code), lock
1109 + * acquire operations must be ordered by ascending &runqueue.
1112 + /* runqueue lock: */
1116 + * nr_running and cpu_load should be in the same cacheline because
1117 + * remote CPUs use both these fields when doing load calculation.
1119 + unsigned long nr_running;
1120 + #define CPU_LOAD_IDX_MAX 5
1121 + unsigned long cpu_load[CPU_LOAD_IDX_MAX];
1122 + unsigned char idle_at_tick;
1123 +#ifdef CONFIG_NO_HZ
1124 + unsigned long last_tick_seen;
1125 + unsigned char in_nohz_recently;
1127 + /* capture load from *all* tasks on this cpu: */
1128 + struct load_weight load;
1129 + unsigned long nr_load_updates;
1132 + struct cfs_rq cfs;
1135 +#ifdef CONFIG_FAIR_GROUP_SCHED
1136 + /* list of leaf cfs_rq on this cpu: */
1137 + struct list_head leaf_cfs_rq_list;
1139 +#ifdef CONFIG_RT_GROUP_SCHED
1140 + struct list_head leaf_rt_rq_list;
1144 + * This is part of a global counter where only the total sum
1145 + * over all CPUs matters. A task can increase this counter on
1146 + * one CPU and if it got migrated afterwards it may decrease
1147 + * it on another CPU. Always updated under the runqueue lock:
1149 + unsigned long nr_uninterruptible;
1151 + struct task_struct *curr, *idle;
1152 + unsigned long next_balance;
1153 + struct mm_struct *prev_mm;
1157 + atomic_t nr_iowait;
1160 + struct root_domain *rd;
1161 + struct sched_domain *sd;
1163 + /* For active balancing */
1164 + int active_balance;
1166 + /* cpu of this runqueue: */
1170 + unsigned long avg_load_per_task;
1172 + struct task_struct *migration_thread;
1173 + struct list_head migration_queue;
1176 +#ifdef CONFIG_SCHED_HRTICK
1178 + int hrtick_csd_pending;
1179 + struct call_single_data hrtick_csd;
1181 + struct hrtimer hrtick_timer;
1184 +#ifdef CONFIG_SCHEDSTATS
1185 + /* latency stats */
1186 + struct sched_info rq_sched_info;
1188 + /* sys_sched_yield() stats */
1189 + unsigned int yld_exp_empty;
1190 + unsigned int yld_act_empty;
1191 + unsigned int yld_both_empty;
1192 + unsigned int yld_count;
1194 + /* schedule() stats */
1195 + unsigned int sched_switch;
1196 + unsigned int sched_count;
1197 + unsigned int sched_goidle;
1199 + /* try_to_wake_up() stats */
1200 + unsigned int ttwu_count;
1201 + unsigned int ttwu_local;
1204 + unsigned int bkl_count;
1208 +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
1210 +static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
1212 + rq->curr->sched_class->check_preempt_curr(rq, p);
1215 +static inline int cpu_of(struct rq *rq)
1225 + * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
1226 + * See detach_destroy_domains: synchronize_sched for details.
1228 + * The domain tree of any CPU may only be accessed from within
1229 + * preempt-disabled sections.
1231 +#define for_each_domain(cpu, __sd) \
1232 + for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
1234 +#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
1235 +#define this_rq() (&__get_cpu_var(runqueues))
1236 +#define task_rq(p) cpu_rq(task_cpu(p))
1237 +#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
1239 +static inline void update_rq_clock(struct rq *rq)
1241 + rq->clock = sched_clock_cpu(cpu_of(rq));
1245 + * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
1247 +#ifdef CONFIG_SCHED_DEBUG
1248 +# define const_debug __read_mostly
1250 +# define const_debug static const
1254 + * runqueue_is_locked
1256 + * Returns true if the current cpu runqueue is locked.
1257 + * This interface allows printk to be called with the runqueue lock
1258 + * held and know whether or not it is OK to wake up the klogd.
1260 +int runqueue_is_locked(void)
1262 + int cpu = get_cpu();
1263 + struct rq *rq = cpu_rq(cpu);
1266 + ret = spin_is_locked(&rq->lock);
1272 + * Debugging: various feature bits
1275 +#define SCHED_FEAT(name, enabled) \
1276 + __SCHED_FEAT_##name ,
1279 +#include "sched_features.h"
1284 +#define SCHED_FEAT(name, enabled) \
1285 + (1UL << __SCHED_FEAT_##name) * enabled |
1287 +const_debug unsigned int sysctl_sched_features =
1288 +#include "sched_features.h"
1293 +#ifdef CONFIG_SCHED_DEBUG
1294 +#define SCHED_FEAT(name, enabled) \
1297 +static __read_mostly char *sched_feat_names[] = {
1298 +#include "sched_features.h"
1304 +static int sched_feat_open(struct inode *inode, struct file *filp)
1306 + filp->private_data = inode->i_private;
1311 +sched_feat_read(struct file *filp, char __user *ubuf,
1312 + size_t cnt, loff_t *ppos)
1319 + for (i = 0; sched_feat_names[i]; i++) {
1320 + len += strlen(sched_feat_names[i]);
1324 + buf = kmalloc(len + 2, GFP_KERNEL);
1328 + for (i = 0; sched_feat_names[i]; i++) {
1329 + if (sysctl_sched_features & (1UL << i))
1330 + r += sprintf(buf + r, "%s ", sched_feat_names[i]);
1332 + r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
1335 + r += sprintf(buf + r, "\n");
1336 + WARN_ON(r >= len + 2);
1338 + r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
1346 +sched_feat_write(struct file *filp, const char __user *ubuf,
1347 + size_t cnt, loff_t *ppos)
1357 + if (copy_from_user(&buf, ubuf, cnt))
1362 + if (strncmp(buf, "NO_", 3) == 0) {
1367 + for (i = 0; sched_feat_names[i]; i++) {
1368 + int len = strlen(sched_feat_names[i]);
1370 + if (strncmp(cmp, sched_feat_names[i], len) == 0) {
1372 + sysctl_sched_features &= ~(1UL << i);
1374 + sysctl_sched_features |= (1UL << i);
1379 + if (!sched_feat_names[i])
1382 + filp->f_pos += cnt;
1387 +static struct file_operations sched_feat_fops = {
1388 + .open = sched_feat_open,
1389 + .read = sched_feat_read,
1390 + .write = sched_feat_write,
1393 +static __init int sched_init_debug(void)
1395 + debugfs_create_file("sched_features", 0644, NULL, NULL,
1396 + &sched_feat_fops);
1400 +late_initcall(sched_init_debug);
1404 +#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
1407 + * Number of tasks to iterate in a single balance run.
1408 + * Limited because this is done with IRQs disabled.
1410 +const_debug unsigned int sysctl_sched_nr_migrate = 32;
1413 + * ratelimit for updating the group shares.
1416 +unsigned int sysctl_sched_shares_ratelimit = 250000;
1419 + * period over which we measure -rt task cpu usage in us.
1422 +unsigned int sysctl_sched_rt_period = 1000000;
1424 +static __read_mostly int scheduler_running;
1427 + * part of the period that we allow rt tasks to run in us.
1430 +int sysctl_sched_rt_runtime = 950000;
1432 +static inline u64 global_rt_period(void)
1434 + return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
1437 +static inline u64 global_rt_runtime(void)
1439 + if (sysctl_sched_rt_runtime < 0)
1440 + return RUNTIME_INF;
1442 + return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
1445 +#ifndef prepare_arch_switch
1446 +# define prepare_arch_switch(next) do { } while (0)
1448 +#ifndef finish_arch_switch
1449 +# define finish_arch_switch(prev) do { } while (0)
1452 +static inline int task_current(struct rq *rq, struct task_struct *p)
1454 + return rq->curr == p;
1457 +#ifndef __ARCH_WANT_UNLOCKED_CTXSW
1458 +static inline int task_running(struct rq *rq, struct task_struct *p)
1460 + return task_current(rq, p);
1463 +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
1467 +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1469 +#ifdef CONFIG_DEBUG_SPINLOCK
1470 + /* this is a valid case when another task releases the spinlock */
1471 + rq->lock.owner = current;
1474 + * If we are tracking spinlock dependencies then we have to
1475 + * fix up the runqueue lock - which gets 'carried over' from
1476 + * prev into current:
1478 + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
1480 + spin_unlock_irq(&rq->lock);
1483 +#else /* __ARCH_WANT_UNLOCKED_CTXSW */
1484 +static inline int task_running(struct rq *rq, struct task_struct *p)
1489 + return task_current(rq, p);
1493 +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
1497 + * We can optimise this out completely for !SMP, because the
1498 + * SMP rebalancing from interrupt is the only thing that cares
1503 +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1504 + spin_unlock_irq(&rq->lock);
1506 + spin_unlock(&rq->lock);
1510 +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1514 + * After ->oncpu is cleared, the task can be moved to a different CPU.
1515 + * We must ensure this doesn't happen until the switch is completely
1521 +#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1522 + local_irq_enable();
1525 +#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
1528 + * __task_rq_lock - lock the runqueue a given task resides on.
1529 + * Must be called interrupts disabled.
1531 +static inline struct rq *__task_rq_lock(struct task_struct *p)
1532 + __acquires(rq->lock)
1535 + struct rq *rq = task_rq(p);
1536 + spin_lock(&rq->lock);
1537 + if (likely(rq == task_rq(p)))
1539 + spin_unlock(&rq->lock);
1544 + * task_rq_lock - lock the runqueue a given task resides on and disable
1545 + * interrupts. Note the ordering: we can safely lookup the task_rq without
1546 + * explicitly disabling preemption.
1548 +static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
1549 + __acquires(rq->lock)
1554 + local_irq_save(*flags);
1556 + spin_lock(&rq->lock);
1557 + if (likely(rq == task_rq(p)))
1559 + spin_unlock_irqrestore(&rq->lock, *flags);
1563 +static void __task_rq_unlock(struct rq *rq)
1564 + __releases(rq->lock)
1566 + spin_unlock(&rq->lock);
1569 +static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
1570 + __releases(rq->lock)
1572 + spin_unlock_irqrestore(&rq->lock, *flags);
1576 + * this_rq_lock - lock this runqueue and disable interrupts.
1578 +static struct rq *this_rq_lock(void)
1579 + __acquires(rq->lock)
1583 + local_irq_disable();
1585 + spin_lock(&rq->lock);
1590 +#ifdef CONFIG_SCHED_HRTICK
1592 + * Use HR-timers to deliver accurate preemption points.
1594 + * Its all a bit involved since we cannot program an hrt while holding the
1595 + * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a
1596 + * reschedule event.
1598 + * When we get rescheduled we reprogram the hrtick_timer outside of the
1603 + * Use hrtick when:
1604 + * - enabled by features
1605 + * - hrtimer is actually high res
1607 +static inline int hrtick_enabled(struct rq *rq)
1609 + if (!sched_feat(HRTICK))
1611 + if (!cpu_active(cpu_of(rq)))
1613 + return hrtimer_is_hres_active(&rq->hrtick_timer);
1616 +static void hrtick_clear(struct rq *rq)
1618 + if (hrtimer_active(&rq->hrtick_timer))
1619 + hrtimer_cancel(&rq->hrtick_timer);
1623 + * High-resolution timer tick.
1624 + * Runs from hardirq context with interrupts disabled.
1626 +static enum hrtimer_restart hrtick(struct hrtimer *timer)
1628 + struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1630 + WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1632 + spin_lock(&rq->lock);
1633 + update_rq_clock(rq);
1634 + rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1635 + spin_unlock(&rq->lock);
1637 + return HRTIMER_NORESTART;
1642 + * called from hardirq (IPI) context
1644 +static void __hrtick_start(void *arg)
1646 + struct rq *rq = arg;
1648 + spin_lock(&rq->lock);
1649 + hrtimer_restart(&rq->hrtick_timer);
1650 + rq->hrtick_csd_pending = 0;
1651 + spin_unlock(&rq->lock);
1655 + * Called to set the hrtick timer state.
1657 + * called with rq->lock held and irqs disabled
1659 +static void hrtick_start(struct rq *rq, u64 delay)
1661 + struct hrtimer *timer = &rq->hrtick_timer;
1662 + ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1664 + timer->expires = time;
1666 + if (rq == this_rq()) {
1667 + hrtimer_restart(timer);
1668 + } else if (!rq->hrtick_csd_pending) {
1669 + __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
1670 + rq->hrtick_csd_pending = 1;
1675 +hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1677 + int cpu = (int)(long)hcpu;
1680 + case CPU_UP_CANCELED:
1681 + case CPU_UP_CANCELED_FROZEN:
1682 + case CPU_DOWN_PREPARE:
1683 + case CPU_DOWN_PREPARE_FROZEN:
1685 + case CPU_DEAD_FROZEN:
1686 + hrtick_clear(cpu_rq(cpu));
1690 + return NOTIFY_DONE;
1693 +static __init void init_hrtick(void)
1695 + hotcpu_notifier(hotplug_hrtick, 0);
1699 + * Called to set the hrtick timer state.
1701 + * called with rq->lock held and irqs disabled
1703 +static void hrtick_start(struct rq *rq, u64 delay)
1705 + hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
1708 +static void init_hrtick(void)
1711 +#endif /* CONFIG_SMP */
1713 +static void init_rq_hrtick(struct rq *rq)
1716 + rq->hrtick_csd_pending = 0;
1718 + rq->hrtick_csd.flags = 0;
1719 + rq->hrtick_csd.func = __hrtick_start;
1720 + rq->hrtick_csd.info = rq;
1723 + hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1724 + rq->hrtick_timer.function = hrtick;
1725 + rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
1728 +static inline void hrtick_clear(struct rq *rq)
1732 +static inline void init_rq_hrtick(struct rq *rq)
1736 +static inline void init_hrtick(void)
1742 + * resched_task - mark a task 'to be rescheduled now'.
1744 + * On UP this means the setting of the need_resched flag, on SMP it
1745 + * might also involve a cross-CPU call to trigger the scheduler on
1750 +#ifndef tsk_is_polling
1751 +#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1754 +static void resched_task(struct task_struct *p)
1758 + assert_spin_locked(&task_rq(p)->lock);
1760 + if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
1763 + set_tsk_thread_flag(p, TIF_NEED_RESCHED);
1765 + cpu = task_cpu(p);
1766 + if (cpu == smp_processor_id())
1769 + /* NEED_RESCHED must be visible before we test polling */
1771 + if (!tsk_is_polling(p))
1772 + smp_send_reschedule(cpu);
1775 +static void resched_cpu(int cpu)
1777 + struct rq *rq = cpu_rq(cpu);
1778 + unsigned long flags;
1780 + if (!spin_trylock_irqsave(&rq->lock, flags))
1782 + resched_task(cpu_curr(cpu));
1783 + spin_unlock_irqrestore(&rq->lock, flags);
1786 +#ifdef CONFIG_NO_HZ
1788 + * When add_timer_on() enqueues a timer into the timer wheel of an
1789 + * idle CPU then this timer might expire before the next timer event
1790 + * which is scheduled to wake up that CPU. In case of a completely
1791 + * idle system the next event might even be infinite time into the
1792 + * future. wake_up_idle_cpu() ensures that the CPU is woken up and
1793 + * leaves the inner idle loop so the newly added timer is taken into
1794 + * account when the CPU goes back to idle and evaluates the timer
1795 + * wheel for the next timer event.
1797 +void wake_up_idle_cpu(int cpu)
1799 + struct rq *rq = cpu_rq(cpu);
1801 + if (cpu == smp_processor_id())
1805 + * This is safe, as this function is called with the timer
1806 + * wheel base lock of (cpu) held. When the CPU is on the way
1807 + * to idle and has not yet set rq->curr to idle then it will
1808 + * be serialized on the timer wheel base lock and take the new
1809 + * timer into account automatically.
1811 + if (rq->curr != rq->idle)
1815 + * We can set TIF_RESCHED on the idle task of the other CPU
1816 + * lockless. The worst case is that the other CPU runs the
1817 + * idle task through an additional NOOP schedule()
1819 + set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
1821 + /* NEED_RESCHED must be visible before we test polling */
1823 + if (!tsk_is_polling(rq->idle))
1824 + smp_send_reschedule(cpu);
1826 +#endif /* CONFIG_NO_HZ */
1828 +#else /* !CONFIG_SMP */
1829 +static void resched_task(struct task_struct *p)
1831 + assert_spin_locked(&task_rq(p)->lock);
1832 + set_tsk_need_resched(p);
1834 +#endif /* CONFIG_SMP */
1836 +#if BITS_PER_LONG == 32
1837 +# define WMULT_CONST (~0UL)
1839 +# define WMULT_CONST (1UL << 32)
1842 +#define WMULT_SHIFT 32
1845 + * Shift right and round:
1847 +#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
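+/* e.g. SRR(x, 32) == (x + (1UL << 31)) >> 32: divide by 2^32, rounding
+ * to nearest instead of truncating */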
1850 + * delta *= weight / lw
1852 +static unsigned long
1853 +calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1854 + struct load_weight *lw)
1858 + if (!lw->inv_weight) {
1859 + if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1860 + lw->inv_weight = 1;
1862 + lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
1866 + tmp = (u64)delta_exec * weight;
1868 + * Check whether we'd overflow the 64-bit multiplication:
1870 + if (unlikely(tmp > WMULT_CONST))
1871 + tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1874 + tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1876 + return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1879 +static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1881 + lw->weight += inc;
1882 + lw->inv_weight = 0;
1885 +static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1887 + lw->weight -= dec;
1888 + lw->inv_weight = 0;
1892 + * To aid in avoiding the subversion of "niceness" due to uneven distribution
1893 + * of tasks with abnormal "nice" values across CPUs the contribution that
1894 + * each task makes to its run queue's load is weighted according to its
1895 + * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1896 + * scaled version of the new time slice allocation that they receive on time
1897 + * slice expiry etc.
1900 +#define WEIGHT_IDLEPRIO 2
1901 +#define WMULT_IDLEPRIO (1 << 31)
1904 + * Nice levels are multiplicative, with a gentle 10% change for every
1905 + * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1906 + * nice 1, it will get ~10% less CPU time than another CPU-bound task
1907 + * that remained on nice 0.
1909 + * The "10% effect" is relative and cumulative: from _any_ nice level,
1910 + * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
1911 + * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1912 + * If a task goes up by ~10% and another task goes down by ~10% then
1913 + * the relative distance between them is ~25%.)
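+/* Worked example from the table below: 1024/820 ~= 1.25, so a nice-0
+ * and a nice-1 CPU hog split the machine roughly 55%/45%. */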
1915 +static const int prio_to_weight[40] = {
1916 + /* -20 */ 88761, 71755, 56483, 46273, 36291,
1917 + /* -15 */ 29154, 23254, 18705, 14949, 11916,
1918 + /* -10 */ 9548, 7620, 6100, 4904, 3906,
1919 + /* -5 */ 3121, 2501, 1991, 1586, 1277,
1920 + /* 0 */ 1024, 820, 655, 526, 423,
1921 + /* 5 */ 335, 272, 215, 172, 137,
1922 + /* 10 */ 110, 87, 70, 56, 45,
1923 + /* 15 */ 36, 29, 23, 18, 15,
1927 + * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1929 + * In cases where the weight does not change often, we can use the
1930 + * precalculated inverse to speed up arithmetics by turning divisions
1931 + * into multiplications:
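+/* e.g. the nice-0 entry: 2^32 / 1024 == 4194304 == prio_to_wmult[20] */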
1933 +static const u32 prio_to_wmult[40] = {
1934 + /* -20 */ 48388, 59856, 76040, 92818, 118348,
1935 + /* -15 */ 147320, 184698, 229616, 287308, 360437,
1936 + /* -10 */ 449829, 563644, 704093, 875809, 1099582,
1937 + /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
1938 + /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
1939 + /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
1940 + /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
1941 + /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1944 +static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1947 + * runqueue iterator, to support SMP load-balancing between different
1948 + * scheduling classes, without having to expose their internal data
1949 + * structures to the load-balancing proper:
1951 +struct rq_iterator {
1953 + struct task_struct *(*start)(void *);
1954 + struct task_struct *(*next)(void *);
1958 +static unsigned long
1959 +balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1960 + unsigned long max_load_move, struct sched_domain *sd,
1961 + enum cpu_idle_type idle, int *all_pinned,
1962 + int *this_best_prio, struct rq_iterator *iterator);
1965 +iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1966 + struct sched_domain *sd, enum cpu_idle_type idle,
1967 + struct rq_iterator *iterator);
1970 +#ifdef CONFIG_CGROUP_CPUACCT
1971 +static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1973 +static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1976 +static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1978 + update_load_add(&rq->load, load);
1981 +static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1983 + update_load_sub(&rq->load, load);
1987 +static unsigned long source_load(int cpu, int type);
1988 +static unsigned long target_load(int cpu, int type);
1989 +static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1991 +static unsigned long cpu_avg_load_per_task(int cpu)
1993 + struct rq *rq = cpu_rq(cpu);
1995 + if (rq->nr_running)
1996 + rq->avg_load_per_task = rq->load.weight / rq->nr_running;
1998 + return rq->avg_load_per_task;
2001 +#ifdef CONFIG_FAIR_GROUP_SCHED
2003 +typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
2006 + * Iterate the full tree, calling @down when first entering a node and @up when
2007 + * leaving it for the final time.
2010 +walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
2012 + struct task_group *parent, *child;
2015 + parent = &root_task_group;
2017 + (*down)(parent, cpu, sd);
2018 + list_for_each_entry_rcu(child, &parent->children, siblings) {
2025 + (*up)(parent, cpu, sd);
2028 + parent = parent->parent;
2031 + rcu_read_unlock();
2034 +static void __set_se_shares(struct sched_entity *se, unsigned long shares);
2037 + * Calculate and set the cpu's group shares.
2040 +__update_group_shares_cpu(struct task_group *tg, int cpu,
2041 + unsigned long sd_shares, unsigned long sd_rq_weight)
2044 + unsigned long shares;
2045 + unsigned long rq_weight;
2050 + rq_weight = tg->cfs_rq[cpu]->load.weight;
2053 + * If there are currently no tasks on the cpu pretend there is one of
2054 + * average load so that when a new task gets to run here it will not
2055 + * get delayed by group starvation.
2059 + rq_weight = NICE_0_LOAD;
2062 + if (unlikely(rq_weight > sd_rq_weight))
2063 + rq_weight = sd_rq_weight;
2066 + * \Sum shares * rq_weight
2067 + * shares = -----------------------
2068 + * \Sum rq_weight
2071 + shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
2074 + * record the actual number of shares, not the boosted amount.
2076 + tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
2077 + tg->cfs_rq[cpu]->rq_weight = rq_weight;
2079 + if (shares < MIN_SHARES)
2080 + shares = MIN_SHARES;
2081 + else if (shares > MAX_SHARES)
2082 + shares = MAX_SHARES;
2084 + __set_se_shares(tg->se[cpu], shares);
2088 + * Re-compute the task group their per cpu shares over the given domain.
2089 + * This needs to be done in a bottom-up fashion because the rq weight of a
2090 + * parent group depends on the shares of its child groups.
2093 +tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
2095 + unsigned long rq_weight = 0;
2096 + unsigned long shares = 0;
2099 + for_each_cpu_mask(i, sd->span) {
2100 + rq_weight += tg->cfs_rq[i]->load.weight;
2101 + shares += tg->cfs_rq[i]->shares;
2104 + if ((!shares && rq_weight) || shares > tg->shares)
2105 + shares = tg->shares;
2107 + if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
2108 + shares = tg->shares;
2111 + rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
2113 + for_each_cpu_mask(i, sd->span) {
2114 + struct rq *rq = cpu_rq(i);
2115 + unsigned long flags;
2117 + spin_lock_irqsave(&rq->lock, flags);
2118 + __update_group_shares_cpu(tg, i, shares, rq_weight);
2119 + spin_unlock_irqrestore(&rq->lock, flags);
2124 + * Compute the cpu's hierarchical load factor for each task group.
2125 + * This needs to be done in a top-down fashion because the load of a child
2126 + * group is a fraction of its parents load.
2129 +tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
2131 + unsigned long load;
2133 + if (!tg->parent) {
2134 + load = cpu_rq(cpu)->load.weight;
2136 + load = tg->parent->cfs_rq[cpu]->h_load;
2137 + load *= tg->cfs_rq[cpu]->shares;
2138 + load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
2141 + tg->cfs_rq[cpu]->h_load = load;
2145 +tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
2149 +static void update_shares(struct sched_domain *sd)
2151 + u64 now = cpu_clock(raw_smp_processor_id());
2152 + s64 elapsed = now - sd->last_update;
2154 + if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
2155 + sd->last_update = now;
2156 + walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
2160 +static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
2162 + spin_unlock(&rq->lock);
2163 + update_shares(sd);
2164 + spin_lock(&rq->lock);
2167 +static void update_h_load(int cpu)
2169 + walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
2174 +static inline void update_shares(struct sched_domain *sd)
2178 +static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
2186 +#ifdef CONFIG_FAIR_GROUP_SCHED
2187 +static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
2190 + cfs_rq->shares = shares;
2195 +#include "sched_stats.h"
2196 +#include "sched_idletask.c"
2197 +#include "sched_fair.c"
2198 +#include "sched_rt.c"
2199 +#ifdef CONFIG_SCHED_DEBUG
2200 +# include "sched_debug.c"
2203 +#define sched_class_highest (&rt_sched_class)
2204 +#define for_each_class(class) \
2205 + for (class = sched_class_highest; class; class = class->next)
2207 +static void inc_nr_running(struct rq *rq)
2212 +static void dec_nr_running(struct rq *rq)
2217 +static void set_load_weight(struct task_struct *p)
2219 + if (task_has_rt_policy(p)) {
2220 + p->se.load.weight = prio_to_weight[0] * 2;
2221 + p->se.load.inv_weight = prio_to_wmult[0] >> 1;
2226 + * SCHED_IDLE tasks get minimal weight:
2228 + if (p->policy == SCHED_IDLE) {
2229 + p->se.load.weight = WEIGHT_IDLEPRIO;
2230 + p->se.load.inv_weight = WMULT_IDLEPRIO;
2234 + p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
2235 + p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
2238 +static void update_avg(u64 *avg, u64 sample)
2240 + s64 diff = sample - *avg;
2241 + *avg += diff >> 3;
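+ /* i.e. avg += (sample - avg) / 8: an exponential moving average with
+ * smoothing factor 1/8 */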
2244 +static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
2246 + // BUG_ON(p->state & TASK_ONHOLD);
2247 + sched_info_queued(p);
2248 + p->sched_class->enqueue_task(rq, p, wakeup);
2252 +static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
2254 + if (sleep && p->se.last_wakeup) {
2255 + update_avg(&p->se.avg_overlap,
2256 + p->se.sum_exec_runtime - p->se.last_wakeup);
2257 + p->se.last_wakeup = 0;
2260 + sched_info_dequeued(p);
2261 + p->sched_class->dequeue_task(rq, p, sleep);
2266 + * __normal_prio - return the priority that is based on the static prio
2268 +static inline int __normal_prio(struct task_struct *p)
2270 + return p->static_prio;
2274 + * Calculate the expected normal priority: i.e. priority
2275 + * without taking RT-inheritance into account. Might be
2276 + * boosted by interactivity modifiers. Changes upon fork,
2277 + * setprio syscalls, and whenever the interactivity
2278 + * estimator recalculates.
2280 +static inline int normal_prio(struct task_struct *p)
2284 + if (task_has_rt_policy(p))
2285 + prio = MAX_RT_PRIO-1 - p->rt_priority;
2287 + prio = __normal_prio(p);
2292 + * Calculate the current priority, i.e. the priority
2293 + * taken into account by the scheduler. This value might
2294 + * be boosted by RT tasks, or might be boosted by
2295 + * interactivity modifiers. Will be RT if the task got
2296 + * RT-boosted. If not then it returns p->normal_prio.
2298 +static int effective_prio(struct task_struct *p)
2300 + p->normal_prio = normal_prio(p);
2302 + * If we are RT tasks or we were boosted to RT priority,
2303 + * keep the priority unchanged. Otherwise, update priority
2304 + * to the normal priority:
2306 + if (!rt_prio(p->prio))
2307 + return p->normal_prio;
2312 + * activate_task - move a task to the runqueue.
2314 +static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
2316 + if (task_contributes_to_load(p))
2317 + rq->nr_uninterruptible--;
2319 + enqueue_task(rq, p, wakeup);
2320 + inc_nr_running(rq);
2324 + * deactivate_task - remove a task from the runqueue.
2326 +static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
2328 + if (task_contributes_to_load(p))
2329 + rq->nr_uninterruptible++;
2331 + dequeue_task(rq, p, sleep);
2332 + dec_nr_running(rq);
2336 + * task_curr - is this task currently executing on a CPU?
2337 + * @p: the task in question.
2339 +inline int task_curr(const struct task_struct *p)
2341 + return cpu_curr(task_cpu(p)) == p;
2344 +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
2346 + set_task_rq(p, cpu);
2349 + * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
2350 + * successfully executed on another CPU. We must ensure that updates of
2351 + * per-task data have been completed by this moment.
2354 + task_thread_info(p)->cpu = cpu;
2358 +static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2359 + const struct sched_class *prev_class,
2360 + int oldprio, int running)
2362 + if (prev_class != p->sched_class) {
2363 + if (prev_class->switched_from)
2364 + prev_class->switched_from(rq, p, running);
2365 + p->sched_class->switched_to(rq, p, running);
2367 + p->sched_class->prio_changed(rq, p, oldprio, running);
2372 +/* Used instead of source_load when we know the type == 0 */
2373 +static unsigned long weighted_cpuload(const int cpu)
2375 + return cpu_rq(cpu)->load.weight;
2379 + * Is this task likely cache-hot:
2382 +task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2387 + * Buddy candidates are cache hot:
2389 + if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next))
2392 + if (p->sched_class != &fair_sched_class)
2395 + if (sysctl_sched_migration_cost == -1)
2397 + if (sysctl_sched_migration_cost == 0)
2400 + delta = now - p->se.exec_start;
2402 + return delta < (s64)sysctl_sched_migration_cost;
2406 +void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2408 + int old_cpu = task_cpu(p);
2409 + struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
2410 + struct cfs_rq *old_cfsrq = task_cfs_rq(p),
2411 + *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
2414 + clock_offset = old_rq->clock - new_rq->clock;
2416 +#ifdef CONFIG_SCHEDSTATS
2417 + if (p->se.wait_start)
2418 + p->se.wait_start -= clock_offset;
2419 + if (p->se.sleep_start)
2420 + p->se.sleep_start -= clock_offset;
2421 + if (p->se.block_start)
2422 + p->se.block_start -= clock_offset;
2423 + if (old_cpu != new_cpu) {
2424 + schedstat_inc(p, se.nr_migrations);
2425 + if (task_hot(p, old_rq->clock, NULL))
2426 + schedstat_inc(p, se.nr_forced2_migrations);
2429 + p->se.vruntime -= old_cfsrq->min_vruntime -
2430 + new_cfsrq->min_vruntime;
2432 + __set_task_cpu(p, new_cpu);
2435 +struct migration_req {
2436 + struct list_head list;
2438 + struct task_struct *task;
2441 + struct completion done;
2444 +#include "sched_mon.h"
2448 + * The task's runqueue lock must be held.
2449 + * Returns true if you have to wait for migration thread.
2452 +migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2454 + struct rq *rq = task_rq(p);
2456 + vxm_migrate_task(p, rq, dest_cpu);
2458 + * If the task is not on a runqueue (and not running), then
2459 + * it is sufficient to simply update the task's cpu field.
2461 + if (!p->se.on_rq && !task_running(rq, p)) {
2462 + set_task_cpu(p, dest_cpu);
2466 + init_completion(&req->done);
2468 + req->dest_cpu = dest_cpu;
2469 + list_add(&req->list, &rq->migration_queue);
2475 + * wait_task_inactive - wait for a thread to unschedule.
2477 + * If @match_state is nonzero, it's the @p->state value just checked and
2478 + * not expected to change. If it changes, i.e. @p might have woken up,
2479 + * then return zero. When we succeed in waiting for @p to be off its CPU,
2480 + * we return a positive number (its total switch count). If a second call
2481 + * a short while later returns the same number, the caller can be sure that
2482 + * @p has remained unscheduled the whole time.
2484 + * The caller must ensure that the task *will* unschedule sometime soon,
2485 + * else this function might spin for a *long* time. This function can't
2486 + * be called with interrupts off, or it may introduce deadlock with
2487 + * smp_call_function() if an IPI is sent by the same process we are
2488 + * waiting to become inactive.
2490 +unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2492 + unsigned long flags;
2493 + int running, on_rq;
2494 + unsigned long ncsw;
2499 + * We do the initial early heuristics without holding
2500 + * any task-queue locks at all. We'll only try to get
2501 + * the runqueue lock when things look like they will work out!
2507 + * If the task is actively running on another CPU
2508 + * still, just relax and busy-wait without holding
2511 + * NOTE! Since we don't hold any locks, it's not
2512 + * even sure that "rq" stays as the right runqueue!
2513 + * But we don't care, since "task_running()" will
2514 + * return false if the runqueue has changed and p
2515 + * is actually now running somewhere else!
2517 + while (task_running(rq, p)) {
2518 + if (match_state && unlikely(p->state != match_state))
2524 + * Ok, time to look more closely! We need the rq
2525 + * lock now, to be *sure*. If we're wrong, we'll
2526 + * just go back and repeat.
2528 + rq = task_rq_lock(p, &flags);
2529 + running = task_running(rq, p);
2530 + on_rq = p->se.on_rq;
2532 + if (!match_state || p->state == match_state) {
2533 + ncsw = p->nivcsw + p->nvcsw;
2534 + if (unlikely(!ncsw))
2537 + task_rq_unlock(rq, &flags);
2540 + * If it changed from the expected state, bail out now.
2542 + if (unlikely(!ncsw))
2546 + * Was it really running after all now that we
2547 + * checked with the proper locks actually held?
2549 + * Oops. Go back and try again..
2551 + if (unlikely(running)) {
2557 + * It's not enough that it's not actively running,
2558 + * it must be off the runqueue _entirely_, and not
2561 + * So if it was still runnable (but just not actively
2562 + * running right now), it's preempted, and we should
2563 + * yield - it could be a while.
2565 + if (unlikely(on_rq)) {
2566 + schedule_timeout_uninterruptible(1);
2571 + * Ahh, all good. It wasn't running, and it wasn't
2572 + * runnable, which means that it will never become
2573 + * running in the future either. We're all done!
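+/*
+ * Editorial sketch (illustration, not part of the original patch): the
+ * switch-count return value lets a caller detect whether @p ran between
+ * two checks. A hypothetical usage pattern:
+ */
+#if 0 /* illustration only */
+	unsigned long ncsw;
+
+	ncsw = wait_task_inactive(p, TASK_TRACED);
+	if (!ncsw)
+		goto state_changed;	/* p left TASK_TRACED, maybe woke */
+	/* ... inspect p while it is off every CPU ... */
+	if (wait_task_inactive(p, TASK_TRACED) != ncsw)
+		goto retry;		/* p was scheduled in the meantime */
+#endif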
2582 + * kick_process - kick a running thread to enter/exit the kernel
2583 + * @p: the to-be-kicked thread
2585 + * Cause a process which is running on another CPU to enter
2586 + * kernel-mode, without any delay. (to get signals handled.)
2588 + * NOTE: this function doesn't have to take the runqueue lock,
2589 + * because all it wants to ensure is that the remote task enters
2590 + * the kernel. If the IPI races and the task has been migrated
2591 + * to another CPU then no harm is done and the purpose has been
2592 + * achieved as well.
2594 +void kick_process(struct task_struct *p)
2598 + preempt_disable();
2599 + cpu = task_cpu(p);
2600 + if ((cpu != smp_processor_id()) && task_curr(p))
2601 + smp_send_reschedule(cpu);
2606 + * Return a low guess at the load of a migration-source cpu weighted
2607 + * according to the scheduling class and "nice" value.
2609 + * We want to under-estimate the load of migration sources, to
2610 + * balance conservatively.
2612 +static unsigned long source_load(int cpu, int type)
2614 + struct rq *rq = cpu_rq(cpu);
2615 + unsigned long total = weighted_cpuload(cpu);
2617 + if (type == 0 || !sched_feat(LB_BIAS))
2620 + return min(rq->cpu_load[type-1], total);
2624 + * Return a high guess at the load of a migration-target cpu weighted
2625 + * according to the scheduling class and "nice" value.
2627 +static unsigned long target_load(int cpu, int type)
2629 + struct rq *rq = cpu_rq(cpu);
2630 + unsigned long total = weighted_cpuload(cpu);
2632 + if (type == 0 || !sched_feat(LB_BIAS))
2635 + return max(rq->cpu_load[type-1], total);
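+/*
+ * Editorial note (illustration, not part of the original patch): the
+ * min()/max() pair biases the balancer conservatively in both directions.
+ * If cpu_load[type-1] is 900 and the instantaneous weight is 1100, a
+ * migration *source* reports min(900, 1100) = 900 (it looks less busy, so
+ * we pull less from it), while a migration *target* reports
+ * max(900, 1100) = 1100 (it looks busier, so we push less to it). Both
+ * biases damp task ping-ponging on transient load spikes.
+ */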
2639 + * find_idlest_group finds and returns the least busy CPU group within the
2642 +static struct sched_group *
2643 +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2645 + struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
2646 + unsigned long min_load = ULONG_MAX, this_load = 0;
2647 + int load_idx = sd->forkexec_idx;
2648 + int imbalance = 100 + (sd->imbalance_pct-100)/2;
2651 + unsigned long load, avg_load;
2655 + /* Skip over this group if it has no CPUs allowed */
2656 + if (!cpus_intersects(group->cpumask, p->cpus_allowed))
2659 + local_group = cpu_isset(this_cpu, group->cpumask);
2661 + /* Tally up the load of all CPUs in the group */
2664 + for_each_cpu_mask_nr(i, group->cpumask) {
2665 + /* Bias balancing toward cpus of our domain */
2667 + load = source_load(i, load_idx);
2669 + load = target_load(i, load_idx);
2674 + /* Adjust by relative CPU power of the group */
2675 + avg_load = sg_div_cpu_power(group,
2676 + avg_load * SCHED_LOAD_SCALE);
2678 + if (local_group) {
2679 + this_load = avg_load;
2681 + } else if (avg_load < min_load) {
2682 + min_load = avg_load;
2685 + } while (group = group->next, group != sd->groups);
2687 + if (!idlest || 100*this_load < imbalance*min_load)
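+/*
+ * Editorial note (illustration, not part of the original patch): with a
+ * typical imbalance_pct of 125, imbalance = 100 + (125 - 100)/2 = 112, so
+ * a remote group is returned only when 100*this_load >= 112*min_load,
+ * i.e. when the idlest remote group carries at most about 89% of the
+ * local load. The hysteresis keeps fork/exec placement from migrating on
+ * marginal gains.
+ */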
2693 + * find_idlest_cpu - find the idlest cpu among the cpus in group.
2696 +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
2699 + unsigned long load, min_load = ULONG_MAX;
2703 + /* Traverse only the allowed CPUs */
2704 + cpus_and(*tmp, group->cpumask, p->cpus_allowed);
2706 + for_each_cpu_mask_nr(i, *tmp) {
2707 + load = weighted_cpuload(i);
2709 + if (load < min_load || (load == min_load && i == this_cpu)) {
2719 + * sched_balance_self: balance the current task (running on cpu) in domains
2720 + * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
2721 + * SD_BALANCE_EXEC.
2723 + * Balance, ie. select the least loaded group.
2725 + * Returns the target CPU number, or the same CPU if no balancing is needed.
2727 + * preempt must be disabled.
2729 +static int sched_balance_self(int cpu, int flag)
2731 + struct task_struct *t = current;
2732 + struct sched_domain *tmp, *sd = NULL;
2734 + for_each_domain(cpu, tmp) {
2736 + * If power savings logic is enabled for a domain, stop there.
2738 + if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2740 + if (tmp->flags & flag)
2745 + update_shares(sd);
2748 + cpumask_t span, tmpmask;
2749 + struct sched_group *group;
2750 + int new_cpu, weight;
2752 + if (!(sd->flags & flag)) {
2758 + group = find_idlest_group(sd, t, cpu);
2764 + new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask);
2765 + if (new_cpu == -1 || new_cpu == cpu) {
2766 + /* Now try balancing at a lower domain level of cpu */
2771 + /* Now try balancing at a lower domain level of new_cpu */
2774 + weight = cpus_weight(span);
2775 + for_each_domain(cpu, tmp) {
2776 + if (weight <= cpus_weight(tmp->span))
2778 + if (tmp->flags & flag)
2781 + /* while loop will break here if sd == NULL */
2787 +#endif /* CONFIG_SMP */
2790 + * try_to_wake_up - wake up a thread
2791 + * @p: the to-be-woken-up thread
2792 + * @state: the mask of task states that can be woken
2793 + * @sync: do a synchronous wakeup?
2795 + * Put it on the run-queue if it's not already there. The "current"
2796 + * thread is always on the run-queue (except when the actual
2797 + * re-schedule is in progress), and as such you're allowed to do
2798 + * the simpler "current->state = TASK_RUNNING" to mark yourself
2799 + * runnable without the overhead of this.
2801 + * returns failure only if the task is already active.
2803 +static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2805 + int cpu, orig_cpu, this_cpu, success = 0;
2806 + unsigned long flags;
2810 + if (!sched_feat(SYNC_WAKEUPS))
2814 + if (sched_feat(LB_WAKEUP_UPDATE)) {
2815 + struct sched_domain *sd;
2817 + this_cpu = raw_smp_processor_id();
2818 + cpu = task_cpu(p);
2820 + for_each_domain(this_cpu, sd) {
2821 + if (cpu_isset(cpu, sd->span)) {
2822 + update_shares(sd);
2830 + rq = task_rq_lock(p, &flags);
2831 + old_state = p->state;
2832 + if (!(old_state & state))
2838 + cpu = task_cpu(p);
2840 + this_cpu = smp_processor_id();
2843 + if (unlikely(task_running(rq, p)))
2844 + goto out_activate;
2846 + cpu = p->sched_class->select_task_rq(p, sync);
2847 + if (cpu != orig_cpu) {
2848 + set_task_cpu(p, cpu);
2849 + task_rq_unlock(rq, &flags);
2850 + /* might preempt at this point */
2851 + rq = task_rq_lock(p, &flags);
2852 + old_state = p->state;
2854 + /* we need to unhold suspended tasks */
2855 + if (old_state & TASK_ONHOLD) {
2856 + vx_unhold_task(p, rq);
2857 + old_state = p->state;
2859 + if (!(old_state & state))
2864 + this_cpu = smp_processor_id();
2865 + cpu = task_cpu(p);
2868 +#ifdef CONFIG_SCHEDSTATS
2869 + schedstat_inc(rq, ttwu_count);
2870 + if (cpu == this_cpu)
2871 + schedstat_inc(rq, ttwu_local);
2873 + struct sched_domain *sd;
2874 + for_each_domain(this_cpu, sd) {
2875 + if (cpu_isset(cpu, sd->span)) {
2876 + schedstat_inc(sd, ttwu_wake_remote);
2881 +#endif /* CONFIG_SCHEDSTATS */
2884 +#endif /* CONFIG_SMP */
2885 + schedstat_inc(p, se.nr_wakeups);
2887 + schedstat_inc(p, se.nr_wakeups_sync);
2888 + if (orig_cpu != cpu)
2889 + schedstat_inc(p, se.nr_wakeups_migrate);
2890 + if (cpu == this_cpu)
2891 + schedstat_inc(p, se.nr_wakeups_local);
2893 + schedstat_inc(p, se.nr_wakeups_remote);
2894 + update_rq_clock(rq);
2895 + activate_task(rq, p, 1);
2899 + trace_mark(kernel_sched_wakeup,
2900 + "pid %d state %ld ## rq %p task %p rq->curr %p",
2901 + p->pid, p->state, rq, p, rq->curr);
2902 + check_preempt_curr(rq, p);
2904 + p->state = TASK_RUNNING;
2906 + if (p->sched_class->task_wake_up)
2907 + p->sched_class->task_wake_up(rq, p);
2910 + current->se.last_wakeup = current->se.sum_exec_runtime;
2912 + task_rq_unlock(rq, &flags);
2917 +int wake_up_process(struct task_struct *p)
2919 + return try_to_wake_up(p, TASK_ALL, 0);
2921 +EXPORT_SYMBOL(wake_up_process);
2923 +int wake_up_state(struct task_struct *p, unsigned int state)
2925 + return try_to_wake_up(p, state, 0);
2929 + * Perform scheduler related setup for a newly forked process p.
2930 + * p is forked by current.
2932 + * __sched_fork() is basic setup used by init_idle() too:
2934 +static void __sched_fork(struct task_struct *p)
2936 + p->se.exec_start = 0;
2937 + p->se.sum_exec_runtime = 0;
2938 + p->se.prev_sum_exec_runtime = 0;
2939 + p->se.last_wakeup = 0;
2940 + p->se.avg_overlap = 0;
2942 +#ifdef CONFIG_SCHEDSTATS
2943 + p->se.wait_start = 0;
2944 + p->se.sum_sleep_runtime = 0;
2945 + p->se.sleep_start = 0;
2946 + p->se.block_start = 0;
2947 + p->se.sleep_max = 0;
2948 + p->se.block_max = 0;
2949 + p->se.exec_max = 0;
2950 + p->se.slice_max = 0;
2951 + p->se.wait_max = 0;
2954 + INIT_LIST_HEAD(&p->rt.run_list);
2956 + INIT_LIST_HEAD(&p->se.group_node);
2958 +#ifdef CONFIG_PREEMPT_NOTIFIERS
2959 + INIT_HLIST_HEAD(&p->preempt_notifiers);
2963 + * We mark the process as running here, but have not actually
2964 + * inserted it onto the runqueue yet. This guarantees that
2965 + * nobody will actually run it, and a signal or other external
2966 + * event cannot wake it up and insert it on the runqueue either.
2968 + p->state = TASK_RUNNING;
2972 + * fork()/clone()-time setup:
2974 +void sched_fork(struct task_struct *p, int clone_flags)
2976 + int cpu = get_cpu();
2981 + cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
2983 + set_task_cpu(p, cpu);
2986 + * Make sure we do not leak PI boosting priority to the child:
2988 + p->prio = current->normal_prio;
2989 + if (!rt_prio(p->prio))
2990 + p->sched_class = &fair_sched_class;
2992 +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2993 + if (likely(sched_info_on()))
2994 + memset(&p->sched_info, 0, sizeof(p->sched_info));
2996 +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
2999 +#ifdef CONFIG_PREEMPT
3000 + /* Want to start with kernel preemption disabled. */
3001 + task_thread_info(p)->preempt_count = 1;
3007 + * wake_up_new_task - wake up a newly created task for the first time.
3009 + * This function will do some initial scheduler statistics housekeeping
3010 + * that must be done for every newly created context, then puts the task
3011 + * on the runqueue and wakes it.
3013 +void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
3015 + unsigned long flags;
3018 + rq = task_rq_lock(p, &flags);
3019 + BUG_ON(p->state != TASK_RUNNING);
3020 + update_rq_clock(rq);
3022 + p->prio = effective_prio(p);
3024 + if (!p->sched_class->task_new || !current->se.on_rq) {
3025 + activate_task(rq, p, 0);
3028 + * Let the scheduling class do new task startup
3029 + * management (if any):
3031 + p->sched_class->task_new(rq, p);
3032 + inc_nr_running(rq);
3034 + trace_mark(kernel_sched_wakeup_new,
3035 + "pid %d state %ld ## rq %p task %p rq->curr %p",
3036 + p->pid, p->state, rq, p, rq->curr);
3037 + check_preempt_curr(rq, p);
3039 + if (p->sched_class->task_wake_up)
3040 + p->sched_class->task_wake_up(rq, p);
3042 + task_rq_unlock(rq, &flags);
3045 +#ifdef CONFIG_PREEMPT_NOTIFIERS
3048 + * preempt_notifier_register - tell me when current is being preempted & rescheduled
3049 + * @notifier: notifier struct to register
3051 +void preempt_notifier_register(struct preempt_notifier *notifier)
3053 + hlist_add_head(&notifier->link, &current->preempt_notifiers);
3055 +EXPORT_SYMBOL_GPL(preempt_notifier_register);
3058 + * preempt_notifier_unregister - no longer interested in preemption notifications
3059 + * @notifier: notifier struct to unregister
3061 + * This is safe to call from within a preemption notifier.
3063 +void preempt_notifier_unregister(struct preempt_notifier *notifier)
3065 + hlist_del(&notifier->link);
3067 +EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
3069 +static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
3071 + struct preempt_notifier *notifier;
3072 + struct hlist_node *node;
3074 + hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
3075 + notifier->ops->sched_in(notifier, raw_smp_processor_id());
3079 +fire_sched_out_preempt_notifiers(struct task_struct *curr,
3080 + struct task_struct *next)
3082 + struct preempt_notifier *notifier;
3083 + struct hlist_node *node;
3085 + hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
3086 + notifier->ops->sched_out(notifier, next);
3089 +#else /* !CONFIG_PREEMPT_NOTIFIERS */
3091 +static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
3096 +fire_sched_out_preempt_notifiers(struct task_struct *curr,
3097 + struct task_struct *next)
3101 +#endif /* CONFIG_PREEMPT_NOTIFIERS */
3104 + * prepare_task_switch - prepare to switch tasks
3105 + * @rq: the runqueue preparing to switch
3106 + * @prev: the current task that is being switched out
3107 + * @next: the task we are going to switch to.
3109 + * This is called with the rq lock held and interrupts off. It must
3110 + * be paired with a subsequent finish_task_switch after the context switch.
3113 + * prepare_task_switch sets up locking and calls architecture specific hooks.
3117 +prepare_task_switch(struct rq *rq, struct task_struct *prev,
3118 + struct task_struct *next)
3120 + fire_sched_out_preempt_notifiers(prev, next);
3121 + prepare_lock_switch(rq, next);
3122 + prepare_arch_switch(next);
3126 + * finish_task_switch - clean up after a task-switch
3127 + * @rq: runqueue associated with task-switch
3128 + * @prev: the thread we just switched away from.
3130 + * finish_task_switch must be called after the context switch, paired
3131 + * with a prepare_task_switch call before the context switch.
3132 + * finish_task_switch will reconcile locking set up by prepare_task_switch,
3133 + * and do any other architecture-specific cleanup actions.
3135 + * Note that we may have delayed dropping an mm in context_switch(). If
3136 + * so, we finish that here outside of the runqueue lock. (Doing it
3137 + * with the lock held can cause deadlocks; see schedule() for details.)
3140 +static void finish_task_switch(struct rq *rq, struct task_struct *prev)
3141 + __releases(rq->lock)
3143 + struct mm_struct *mm = rq->prev_mm;
3146 + rq->prev_mm = NULL;
3149 + * A task struct has one reference for the use as "current".
3150 + * If a task dies, then it sets TASK_DEAD in tsk->state and calls
3151 + * schedule one last time. The schedule call will never return, and
3152 + * the scheduled task must drop that reference.
3153 + * The test for TASK_DEAD must occur while the runqueue locks are
3154 + * still held, otherwise prev could be scheduled on another cpu, die
3155 + * there before we look at prev->state, and then the reference would
3156 + * be dropped twice.
3157 + * Manfred Spraul <manfred@colorfullife.com>
3159 + prev_state = prev->state;
3160 + finish_arch_switch(prev);
3161 + finish_lock_switch(rq, prev);
3163 + if (current->sched_class->post_schedule)
3164 + current->sched_class->post_schedule(rq);
3167 + fire_sched_in_preempt_notifiers(current);
3170 + if (unlikely(prev_state == TASK_DEAD)) {
3172 + * Remove function-return probe instances associated with this
3173 + * task and put them back on the free list.
3175 + kprobe_flush_task(prev);
3176 + put_task_struct(prev);
3181 + * schedule_tail - first thing a freshly forked thread must call.
3182 + * @prev: the thread we just switched away from.
3184 +asmlinkage void schedule_tail(struct task_struct *prev)
3185 + __releases(rq->lock)
3187 + struct rq *rq = this_rq();
3189 + finish_task_switch(rq, prev);
3190 +#ifdef __ARCH_WANT_UNLOCKED_CTXSW
3191 + /* In this case, finish_task_switch does not reenable preemption */
3194 + if (current->set_child_tid)
3195 + put_user(task_pid_vnr(current), current->set_child_tid);
3199 + * context_switch - switch to the new MM and the new
3200 + * thread's register state.
3203 +context_switch(struct rq *rq, struct task_struct *prev,
3204 + struct task_struct *next)
3206 + struct mm_struct *mm, *oldmm;
3208 + prepare_task_switch(rq, prev, next);
3209 + trace_mark(kernel_sched_schedule,
3210 + "prev_pid %d next_pid %d prev_state %ld "
3211 + "## rq %p prev %p next %p",
3212 + prev->pid, next->pid, prev->state,
3215 + oldmm = prev->active_mm;
3217 + * For paravirt, this is coupled with an exit in switch_to to
3218 + * combine the page table reload and the switch backend into
3221 + arch_enter_lazy_cpu_mode();
3223 + if (unlikely(!mm)) {
3224 + next->active_mm = oldmm;
3225 + atomic_inc(&oldmm->mm_count);
3226 + enter_lazy_tlb(oldmm, next);
3228 + switch_mm(oldmm, mm, next);
3230 + if (unlikely(!prev->mm)) {
3231 + prev->active_mm = NULL;
3232 + rq->prev_mm = oldmm;
3235 + * The runqueue lock will be released by the next
3236 + * task (which is an invalid locking op but in the case
3237 + * of the scheduler it's an obvious special-case), so we
3238 + * do an early lockdep release here:
3240 +#ifndef __ARCH_WANT_UNLOCKED_CTXSW
3241 + spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
3244 + /* Here we just switch the register state and the stack. */
3245 + switch_to(prev, next, prev);
3249 + * this_rq must be evaluated again because prev may have moved
3250 + * CPUs since it called schedule(), thus the 'rq' on its stack
3251 + * frame will be invalid.
3253 + finish_task_switch(this_rq(), prev);
3257 + * nr_running, nr_uninterruptible and nr_context_switches:
3259 + * externally visible scheduler statistics: current number of runnable
3260 + * threads, current number of uninterruptible-sleeping threads, total
3261 + * number of context switches performed since bootup.
3263 +unsigned long nr_running(void)
3265 + unsigned long i, sum = 0;
3267 + for_each_online_cpu(i)
3268 + sum += cpu_rq(i)->nr_running;
3273 +unsigned long nr_uninterruptible(void)
3275 + unsigned long i, sum = 0;
3277 + for_each_possible_cpu(i)
3278 + sum += cpu_rq(i)->nr_uninterruptible;
3281 + * Since we read the counters lockless, it might be slightly
3282 + * inaccurate. Do not allow it to go below zero though:
3284 + if (unlikely((long)sum < 0))
3290 +unsigned long long nr_context_switches(void)
3293 + unsigned long long sum = 0;
3295 + for_each_possible_cpu(i)
3296 + sum += cpu_rq(i)->nr_switches;
3301 +unsigned long nr_iowait(void)
3303 + unsigned long i, sum = 0;
3305 + for_each_possible_cpu(i)
3306 + sum += atomic_read(&cpu_rq(i)->nr_iowait);
3311 +unsigned long nr_active(void)
3313 + unsigned long i, running = 0, uninterruptible = 0;
3315 + for_each_online_cpu(i) {
3316 + running += cpu_rq(i)->nr_running;
3317 + uninterruptible += cpu_rq(i)->nr_uninterruptible;
3320 + if (unlikely((long)uninterruptible < 0))
3321 + uninterruptible = 0;
3323 + return running + uninterruptible;
3327 + * Update rq->cpu_load[] statistics. This function is usually called every
3328 + * scheduler tick (TICK_NSEC).
3330 +static void update_cpu_load(struct rq *this_rq)
3332 + unsigned long this_load = this_rq->load.weight;
3335 + this_rq->nr_load_updates++;
3337 + /* Update our load: */
3338 + for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3339 + unsigned long old_load, new_load;
3341 + /* scale is effectively 1 << i now, and >> i divides by scale */
3343 + old_load = this_rq->cpu_load[i];
3344 + new_load = this_load;
3346 + * Round up the averaging division if load is increasing. This
3347 + * prevents us from getting stuck on 9 if the load is 10, for example.
3350 + if (new_load > old_load)
3351 + new_load += scale-1;
3352 + this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
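+/*
+ * Editorial note (illustration, not part of the original patch): with
+ * scale == 2^i the update above is
+ *
+ *	cpu_load[i] = (old * (2^i - 1) + new) / 2^i
+ *
+ * an exponential average whose memory grows with i: index 0 is the
+ * instantaneous load, index 1 mixes in half the new load each tick,
+ * index 2 a quarter, and so on. The scale-1 round-up applies only while
+ * load rises, so a rising load is never under-counted by truncation.
+ */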
3359 + * double_rq_lock - safely lock two runqueues
3361 + * Note this does not disable interrupts like task_rq_lock,
3362 + * you need to do so manually before calling.
3364 +static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3365 + __acquires(rq1->lock)
3366 + __acquires(rq2->lock)
3368 + BUG_ON(!irqs_disabled());
3370 + spin_lock(&rq1->lock);
3371 + __acquire(rq2->lock); /* Fake it out ;) */
3374 + spin_lock(&rq1->lock);
3375 + spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3377 + spin_lock(&rq2->lock);
3378 + spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3381 + update_rq_clock(rq1);
3382 + update_rq_clock(rq2);
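+/*
+ * Editorial sketch (illustration, not part of the original patch):
+ * double_rq_lock() avoids ABBA deadlock by always taking the
+ * lower-addressed runqueue lock first. The same discipline in standalone
+ * C, with pthread mutexes standing in for the rq spinlocks:
+ */
+#if 0 /* illustration only */
+#include <pthread.h>
+
+static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
+{
+	if (a == b) {
+		pthread_mutex_lock(a);		/* one queue: single lock */
+	} else if (a < b) {
+		pthread_mutex_lock(a);		/* lower address first ... */
+		pthread_mutex_lock(b);
+	} else {
+		pthread_mutex_lock(b);		/* ... whatever the call order */
+		pthread_mutex_lock(a);
+	}
+}
+#endif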
3386 + * double_rq_unlock - safely unlock two runqueues
3388 + * Note this does not restore interrupts like task_rq_unlock,
3389 + * you need to do so manually after calling.
3391 +static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3392 + __releases(rq1->lock)
3393 + __releases(rq2->lock)
3395 + spin_unlock(&rq1->lock);
3397 + spin_unlock(&rq2->lock);
3399 + __release(rq2->lock);
3403 + * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
3405 +static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
3406 + __releases(this_rq->lock)
3407 + __acquires(busiest->lock)
3408 + __acquires(this_rq->lock)
3412 + if (unlikely(!irqs_disabled())) {
3413 + /* printk() doesn't work well under rq->lock */
3414 + spin_unlock(&this_rq->lock);
3417 + if (unlikely(!spin_trylock(&busiest->lock))) {
3418 + if (busiest < this_rq) {
3419 + spin_unlock(&this_rq->lock);
3420 + spin_lock(&busiest->lock);
3421 + spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
3424 + spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
3429 +static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
3430 + __releases(busiest->lock)
3432 + spin_unlock(&busiest->lock);
3433 + lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
3437 + * If dest_cpu is allowed for this process, migrate the task to it.
3438 + * This is accomplished by forcing the cpu_allowed mask to only
3439 + * allow dest_cpu, which will force the cpu onto dest_cpu. Then
3440 + * the cpu_allowed mask is restored.
3442 +static void sched_migrate_task(struct task_struct *p, int dest_cpu)
3444 + struct migration_req req;
3445 + unsigned long flags;
3448 + rq = task_rq_lock(p, &flags);
3449 + if (!cpu_isset(dest_cpu, p->cpus_allowed)
3450 + || unlikely(!cpu_active(dest_cpu)))
3453 + /* force the process onto the specified CPU */
3454 + if (migrate_task(p, dest_cpu, &req)) {
3455 + /* Need to wait for migration thread (might exit: take ref). */
3456 + struct task_struct *mt = rq->migration_thread;
3458 + get_task_struct(mt);
3459 + task_rq_unlock(rq, &flags);
3460 + wake_up_process(mt);
3461 + put_task_struct(mt);
3462 + wait_for_completion(&req.done);
3467 + task_rq_unlock(rq, &flags);
3471 + * sched_exec - execve() is a valuable balancing opportunity, because at
3472 + * this point the task has the smallest effective memory and cache footprint.
3474 +void sched_exec(void)
3476 + int new_cpu, this_cpu = get_cpu();
3477 + new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
3479 + if (new_cpu != this_cpu)
3480 + sched_migrate_task(current, new_cpu);
3484 + * pull_task - move a task from a remote runqueue to the local runqueue.
3485 + * Both runqueues must be locked.
3487 +static void pull_task(struct rq *src_rq, struct task_struct *p,
3488 + struct rq *this_rq, int this_cpu)
3490 + deactivate_task(src_rq, p, 0);
3491 + set_task_cpu(p, this_cpu);
3492 + activate_task(this_rq, p, 0);
3494 + * Note that idle threads have a prio of MAX_PRIO, for this test
3495 + * to be always true for them.
3497 + check_preempt_curr(this_rq, p);
3501 + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3504 +int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3505 + struct sched_domain *sd, enum cpu_idle_type idle,
3509 + * We do not migrate tasks that are:
3510 + * 1) running (obviously), or
3511 + * 2) cannot be migrated to this CPU due to cpus_allowed, or
3512 + * 3) are cache-hot on their current CPU.
3514 + if (!cpu_isset(this_cpu, p->cpus_allowed)) {
3515 + schedstat_inc(p, se.nr_failed_migrations_affine);
3520 + if (task_running(rq, p)) {
3521 + schedstat_inc(p, se.nr_failed_migrations_running);
3526 + * Aggressive migration if:
3527 + * 1) task is cache cold, or
3528 + * 2) too many balance attempts have failed.
3531 + if (!task_hot(p, rq->clock, sd) ||
3532 + sd->nr_balance_failed > sd->cache_nice_tries) {
3533 +#ifdef CONFIG_SCHEDSTATS
3534 + if (task_hot(p, rq->clock, sd)) {
3535 + schedstat_inc(sd, lb_hot_gained[idle]);
3536 + schedstat_inc(p, se.nr_forced_migrations);
3542 + if (task_hot(p, rq->clock, sd)) {
3543 + schedstat_inc(p, se.nr_failed_migrations_hot);
3549 +static unsigned long
3550 +balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3551 + unsigned long max_load_move, struct sched_domain *sd,
3552 + enum cpu_idle_type idle, int *all_pinned,
3553 + int *this_best_prio, struct rq_iterator *iterator)
3555 + int loops = 0, pulled = 0, pinned = 0;
3556 + struct task_struct *p;
3557 + long rem_load_move = max_load_move;
3559 + if (max_load_move == 0)
3565 + * Start the load-balancing iterator:
3567 + p = iterator->start(iterator->arg);
3569 + if (!p || loops++ > sysctl_sched_nr_migrate)
3572 + if ((p->se.load.weight >> 1) > rem_load_move ||
3573 + !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3574 + p = iterator->next(iterator->arg);
3578 + pull_task(busiest, p, this_rq, this_cpu);
3580 + rem_load_move -= p->se.load.weight;
3583 + * We only want to steal up to the prescribed amount of weighted load.
3585 + if (rem_load_move > 0) {
3586 + if (p->prio < *this_best_prio)
3587 + *this_best_prio = p->prio;
3588 + p = iterator->next(iterator->arg);
3593 + * Right now, this is one of only two places pull_task() is called,
3594 + * so we can safely collect pull_task() stats here rather than
3595 + * inside pull_task().
3597 + schedstat_add(sd, lb_gained[idle], pulled);
3600 + *all_pinned = pinned;
3602 + return max_load_move - rem_load_move;
3606 + * move_tasks tries to move up to max_load_move weighted load from busiest to
3607 + * this_rq, as part of a balancing operation within domain "sd".
3608 + * Returns 1 if successful and 0 otherwise.
3610 + * Called with both runqueues locked.
3612 +static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3613 + unsigned long max_load_move,
3614 + struct sched_domain *sd, enum cpu_idle_type idle,
3617 + const struct sched_class *class = sched_class_highest;
3618 + unsigned long total_load_moved = 0;
3619 + int this_best_prio = this_rq->curr->prio;
3622 + total_load_moved +=
3623 + class->load_balance(this_rq, this_cpu, busiest,
3624 + max_load_move - total_load_moved,
3625 + sd, idle, all_pinned, &this_best_prio);
3626 + class = class->next;
3628 + if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3631 + } while (class && max_load_move > total_load_moved);
3633 + return total_load_moved > 0;
3637 +iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3638 + struct sched_domain *sd, enum cpu_idle_type idle,
3639 + struct rq_iterator *iterator)
3641 + struct task_struct *p = iterator->start(iterator->arg);
3645 + if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3646 + pull_task(busiest, p, this_rq, this_cpu);
3648 + * Right now, this is only the second place pull_task()
3649 + * is called, so we can safely collect pull_task()
3650 + * stats here rather than inside pull_task().
3652 + schedstat_inc(sd, lb_gained[idle]);
3656 + p = iterator->next(iterator->arg);
3663 + * move_one_task tries to move exactly one task from busiest to this_rq, as
3664 + * part of active balancing operations within "domain".
3665 + * Returns 1 if successful and 0 otherwise.
3667 + * Called with both runqueues locked.
3669 +static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3670 + struct sched_domain *sd, enum cpu_idle_type idle)
3672 + const struct sched_class *class;
3674 + for (class = sched_class_highest; class; class = class->next)
3675 + if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3682 + * find_busiest_group finds and returns the busiest CPU group within the
3683 + * domain. It calculates and returns the amount of weighted load which
3684 + * should be moved to restore balance via the imbalance parameter.
3686 +static struct sched_group *
3687 +find_busiest_group(struct sched_domain *sd, int this_cpu,
3688 + unsigned long *imbalance, enum cpu_idle_type idle,
3689 + int *sd_idle, const cpumask_t *cpus, int *balance)
3691 + struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
3692 + unsigned long max_load, avg_load, total_load, this_load, total_pwr;
3693 + unsigned long max_pull;
3694 + unsigned long busiest_load_per_task, busiest_nr_running;
3695 + unsigned long this_load_per_task, this_nr_running;
3696 + int load_idx, group_imb = 0;
3697 +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3698 + int power_savings_balance = 1;
3699 + unsigned long leader_nr_running = 0, min_load_per_task = 0;
3700 + unsigned long min_nr_running = ULONG_MAX;
3701 + struct sched_group *group_min = NULL, *group_leader = NULL;
3704 + max_load = this_load = total_load = total_pwr = 0;
3705 + busiest_load_per_task = busiest_nr_running = 0;
3706 + this_load_per_task = this_nr_running = 0;
3708 + if (idle == CPU_NOT_IDLE)
3709 + load_idx = sd->busy_idx;
3710 + else if (idle == CPU_NEWLY_IDLE)
3711 + load_idx = sd->newidle_idx;
3713 + load_idx = sd->idle_idx;
3716 + unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
3719 + int __group_imb = 0;
3720 + unsigned int balance_cpu = -1, first_idle_cpu = 0;
3721 + unsigned long sum_nr_running, sum_weighted_load;
3722 + unsigned long sum_avg_load_per_task;
3723 + unsigned long avg_load_per_task;
3725 + local_group = cpu_isset(this_cpu, group->cpumask);
3728 + balance_cpu = first_cpu(group->cpumask);
3730 + /* Tally up the load of all CPUs in the group */
3731 + sum_weighted_load = sum_nr_running = avg_load = 0;
3732 + sum_avg_load_per_task = avg_load_per_task = 0;
3735 + min_cpu_load = ~0UL;
3737 + for_each_cpu_mask_nr(i, group->cpumask) {
3740 + if (!cpu_isset(i, *cpus))
3745 + if (*sd_idle && rq->nr_running)
3748 + /* Bias balancing toward cpus of our domain */
3749 + if (local_group) {
3750 + if (idle_cpu(i) && !first_idle_cpu) {
3751 + first_idle_cpu = 1;
3755 + load = target_load(i, load_idx);
3757 + load = source_load(i, load_idx);
3758 + if (load > max_cpu_load)
3759 + max_cpu_load = load;
3760 + if (min_cpu_load > load)
3761 + min_cpu_load = load;
3765 + sum_nr_running += rq->nr_running;
3766 + sum_weighted_load += weighted_cpuload(i);
3768 + sum_avg_load_per_task += cpu_avg_load_per_task(i);
3772 + * First idle cpu or the first cpu (busiest) in this sched group
3773 + * is eligible for doing load balancing at this and above
3774 + * domains. In the newly idle case, we will allow all the cpus
3775 + * to do the newly idle load balance.
3777 + if (idle != CPU_NEWLY_IDLE && local_group &&
3778 + balance_cpu != this_cpu && balance) {
3783 + total_load += avg_load;
3784 + total_pwr += group->__cpu_power;
3786 + /* Adjust by relative CPU power of the group */
3787 + avg_load = sg_div_cpu_power(group,
3788 + avg_load * SCHED_LOAD_SCALE);
3792 + * Consider the group unbalanced when the imbalance is larger
3793 + * than the average weight of two tasks.
3795 + * APZ: with cgroup the avg task weight can vary wildly and
3796 + * might not be a suitable number - should we keep a
3797 + * normalized nr_running number somewhere that negates
3800 + avg_load_per_task = sg_div_cpu_power(group,
3801 + sum_avg_load_per_task * SCHED_LOAD_SCALE);
3803 + if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3806 + group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
3808 + if (local_group) {
3809 + this_load = avg_load;
3811 + this_nr_running = sum_nr_running;
3812 + this_load_per_task = sum_weighted_load;
3813 + } else if (avg_load > max_load &&
3814 + (sum_nr_running > group_capacity || __group_imb)) {
3815 + max_load = avg_load;
3817 + busiest_nr_running = sum_nr_running;
3818 + busiest_load_per_task = sum_weighted_load;
3819 + group_imb = __group_imb;
3822 +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3824 + * Busy processors will not participate in power savings
3827 + if (idle == CPU_NOT_IDLE ||
3828 + !(sd->flags & SD_POWERSAVINGS_BALANCE))
3832 + * If the local group is idle or completely loaded
3833 + * no need to do power savings balance at this domain
3835 + if (local_group && (this_nr_running >= group_capacity ||
3836 + !this_nr_running))
3837 + power_savings_balance = 0;
3840 + * If a group is already running at full capacity or idle,
3841 + * don't include that group in power savings calculations
3843 + if (!power_savings_balance || sum_nr_running >= group_capacity
3844 + || !sum_nr_running)
3848 + * Calculate the group which has the least non-idle load.
3849 + * This is the group from where we need to pick up the load
3850 + * for saving power
3852 + if ((sum_nr_running < min_nr_running) ||
3853 + (sum_nr_running == min_nr_running &&
3854 + first_cpu(group->cpumask) <
3855 + first_cpu(group_min->cpumask))) {
3856 + group_min = group;
3857 + min_nr_running = sum_nr_running;
3858 + min_load_per_task = sum_weighted_load /
3863 + * Calculate the group which is almost near its
3864 + * capacity but still has some space to pick up some load
3865 + * from other group and save more power
3867 + if (sum_nr_running <= group_capacity - 1) {
3868 + if (sum_nr_running > leader_nr_running ||
3869 + (sum_nr_running == leader_nr_running &&
3870 + first_cpu(group->cpumask) >
3871 + first_cpu(group_leader->cpumask))) {
3872 + group_leader = group;
3873 + leader_nr_running = sum_nr_running;
3878 + group = group->next;
3879 + } while (group != sd->groups);
3881 + if (!busiest || this_load >= max_load || busiest_nr_running == 0)
3882 + goto out_balanced;
3884 + avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
3886 + if (this_load >= avg_load ||
3887 + 100*max_load <= sd->imbalance_pct*this_load)
3888 + goto out_balanced;
3890 + busiest_load_per_task /= busiest_nr_running;
3892 + busiest_load_per_task = min(busiest_load_per_task, avg_load);
3895 + * We're trying to get all the cpus to the average_load, so we don't
3896 + * want to push ourselves above the average load, nor do we wish to
3897 + * reduce the max loaded cpu below the average load, as either of these
3898 + * actions would just result in more rebalancing later, and ping-pong
3899 + * tasks around. Thus we look for the minimum possible imbalance.
3900 + * Negative imbalances (*we* are more loaded than anyone else) will
3901 + * be counted as no imbalance for these purposes -- we can't fix that
3902 + * by pulling tasks to us. Be careful of negative numbers as they'll
3903 + * appear as very large values with unsigned longs.
3905 + if (max_load <= busiest_load_per_task)
3906 + goto out_balanced;
3909 + * In the presence of smp nice balancing, certain scenarios can have
3910 + * max load less than avg load (as we skip the groups at or below
3911 + * its cpu_power, while calculating max_load..)
3913 + if (max_load < avg_load) {
3915 + goto small_imbalance;
3918 + /* Don't want to pull so many tasks that a group would go idle */
3919 + max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
3921 + /* How much load to actually move to equalise the imbalance */
3922 + *imbalance = min(max_pull * busiest->__cpu_power,
3923 + (avg_load - this_load) * this->__cpu_power)
3924 + / SCHED_LOAD_SCALE;
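+/*
+ * Editorial note (illustration, not part of the original patch): a worked
+ * example of the imbalance formula. Suppose avg_load = 1000,
+ * max_load = 1600, this_load = 600, busiest_load_per_task = 500, and both
+ * groups have __cpu_power == SCHED_LOAD_SCALE. Then
+ *
+ *	max_pull   = min(1600 - 1000, 1600 - 500) = 600
+ *	*imbalance = min(600, 1000 - 600)         = 400
+ *
+ * we move at most 400 units of weighted load: enough to pull this_cpu up
+ * toward the average without dragging the busiest group below it.
+ */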
3927 + * if *imbalance is less than the average load per runnable task
3928 + * there is no guarantee that any tasks will be moved so we'll have
3929 + * a think about bumping its value to force at least one task to be moved.
3932 + if (*imbalance < busiest_load_per_task) {
3933 + unsigned long tmp, pwr_now, pwr_move;
3934 + unsigned int imbn;
3937 + pwr_move = pwr_now = 0;
3939 + if (this_nr_running) {
3940 + this_load_per_task /= this_nr_running;
3941 + if (busiest_load_per_task > this_load_per_task)
3944 + this_load_per_task = cpu_avg_load_per_task(this_cpu);
3946 + if (max_load - this_load + 2*busiest_load_per_task >=
3947 + busiest_load_per_task * imbn) {
3948 + *imbalance = busiest_load_per_task;
3953 + * OK, we don't have enough imbalance to justify moving tasks,
3954 + * however we may be able to increase total CPU power used by moving them.
3958 + pwr_now += busiest->__cpu_power *
3959 + min(busiest_load_per_task, max_load);
3960 + pwr_now += this->__cpu_power *
3961 + min(this_load_per_task, this_load);
3962 + pwr_now /= SCHED_LOAD_SCALE;
3964 + /* Amount of load we'd subtract */
3965 + tmp = sg_div_cpu_power(busiest,
3966 + busiest_load_per_task * SCHED_LOAD_SCALE);
3967 + if (max_load > tmp)
3968 + pwr_move += busiest->__cpu_power *
3969 + min(busiest_load_per_task, max_load - tmp);
3971 + /* Amount of load we'd add */
3972 + if (max_load * busiest->__cpu_power <
3973 + busiest_load_per_task * SCHED_LOAD_SCALE)
3974 + tmp = sg_div_cpu_power(this,
3975 + max_load * busiest->__cpu_power);
3977 + tmp = sg_div_cpu_power(this,
3978 + busiest_load_per_task * SCHED_LOAD_SCALE);
3979 + pwr_move += this->__cpu_power *
3980 + min(this_load_per_task, this_load + tmp);
3981 + pwr_move /= SCHED_LOAD_SCALE;
3983 + /* Move if we gain throughput */
3984 + if (pwr_move > pwr_now)
3985 + *imbalance = busiest_load_per_task;
3991 +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3992 + if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3995 + if (this == group_leader && group_leader != group_min) {
3996 + *imbalance = min_load_per_task;
4006 + * find_busiest_queue - find the busiest runqueue among the cpus in group.
4009 +find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
4010 + unsigned long imbalance, const cpumask_t *cpus)
4012 + struct rq *busiest = NULL, *rq;
4013 + unsigned long max_load = 0;
4016 + for_each_cpu_mask_nr(i, group->cpumask) {
4019 + if (!cpu_isset(i, *cpus))
4023 + wl = weighted_cpuload(i);
4025 + if (rq->nr_running == 1 && wl > imbalance)
4028 + if (wl > max_load) {
4038 + * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
4039 + * so long as it is large enough.
4041 +#define MAX_PINNED_INTERVAL 512
4044 + * Check this_cpu to ensure it is balanced within domain. Attempt to move
4045 + * tasks if there is an imbalance.
4047 +static int load_balance(int this_cpu, struct rq *this_rq,
4048 + struct sched_domain *sd, enum cpu_idle_type idle,
4049 + int *balance, cpumask_t *cpus)
4051 + int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
4052 + struct sched_group *group;
4053 + unsigned long imbalance;
4054 + struct rq *busiest;
4055 + unsigned long flags;
4057 + cpus_setall(*cpus);
4060 + * When power savings policy is enabled for the parent domain, idle
4061 + * sibling can pick up load irrespective of busy siblings. In this case,
4062 + * let the state of idle sibling percolate up as CPU_IDLE, instead of
4063 + * portraying it as CPU_NOT_IDLE.
4065 + if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
4066 + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4069 + schedstat_inc(sd, lb_count[idle]);
4072 + update_shares(sd);
4073 + group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
4076 + if (*balance == 0)
4077 + goto out_balanced;
4080 + schedstat_inc(sd, lb_nobusyg[idle]);
4081 + goto out_balanced;
4084 + busiest = find_busiest_queue(group, idle, imbalance, cpus);
4086 + schedstat_inc(sd, lb_nobusyq[idle]);
4087 + goto out_balanced;
4090 + BUG_ON(busiest == this_rq);
4092 + schedstat_add(sd, lb_imbalance[idle], imbalance);
4095 + if (busiest->nr_running > 1) {
4097 + * Attempt to move tasks. If find_busiest_group has found
4098 + * an imbalance but busiest->nr_running <= 1, the group is
4099 + * still unbalanced. ld_moved simply stays zero, so it is
4100 + * correctly treated as an imbalance.
4102 + local_irq_save(flags);
4103 + double_rq_lock(this_rq, busiest);
4104 + ld_moved = move_tasks(this_rq, this_cpu, busiest,
4105 + imbalance, sd, idle, &all_pinned);
4106 + double_rq_unlock(this_rq, busiest);
4107 + local_irq_restore(flags);
4110 + * some other cpu did the load balance for us.
4112 + if (ld_moved && this_cpu != smp_processor_id())
4113 + resched_cpu(this_cpu);
4115 + /* All tasks on this runqueue were pinned by CPU affinity */
4116 + if (unlikely(all_pinned)) {
4117 + cpu_clear(cpu_of(busiest), *cpus);
4118 + if (!cpus_empty(*cpus))
4120 + goto out_balanced;
4125 + schedstat_inc(sd, lb_failed[idle]);
4126 + sd->nr_balance_failed++;
4128 + if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4130 + spin_lock_irqsave(&busiest->lock, flags);
4132 + /* don't kick the migration_thread, if the curr
4133 + * task on busiest cpu can't be moved to this_cpu
4135 + if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
4136 + spin_unlock_irqrestore(&busiest->lock, flags);
4138 + goto out_one_pinned;
4141 + if (!busiest->active_balance) {
4142 + busiest->active_balance = 1;
4143 + busiest->push_cpu = this_cpu;
4144 + active_balance = 1;
4146 + spin_unlock_irqrestore(&busiest->lock, flags);
4147 + if (active_balance)
4148 + wake_up_process(busiest->migration_thread);
4151 + * We've kicked active balancing, reset the failure counter.
4154 + sd->nr_balance_failed = sd->cache_nice_tries+1;
4157 + sd->nr_balance_failed = 0;
4159 + if (likely(!active_balance)) {
4160 + /* We were unbalanced, so reset the balancing interval */
4161 + sd->balance_interval = sd->min_interval;
4164 + * If we've begun active balancing, start to back off. This
4165 + * case may not be covered by the all_pinned logic if there
4166 + * is only 1 task on the busy runqueue (because we don't call move_tasks).
4169 + if (sd->balance_interval < sd->max_interval)
4170 + sd->balance_interval *= 2;
4173 + if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4174 + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4180 + schedstat_inc(sd, lb_balanced[idle]);
4182 + sd->nr_balance_failed = 0;
4185 + /* tune up the balancing interval */
4186 + if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
4187 + (sd->balance_interval < sd->max_interval))
4188 + sd->balance_interval *= 2;
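+/*
+ * Editorial note (illustration, not part of the original patch): the
+ * doubling gives a plain exponential backoff. A domain that keeps
+ * reporting "balanced" is re-probed at balance_interval, then 2x, 4x, ...
+ * until sd->max_interval caps it, or until MAX_PINNED_INTERVAL when every
+ * candidate task was pinned by cpu affinity.
+ */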
4190 + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4191 + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4197 + update_shares(sd);
4202 + * Check this_cpu to ensure it is balanced within domain. Attempt to move
4203 + * tasks if there is an imbalance.
4205 + * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
4206 + * this_rq is locked.
4209 +load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
4212 + struct sched_group *group;
4213 + struct rq *busiest = NULL;
4214 + unsigned long imbalance;
4217 + int all_pinned = 0;
4219 + cpus_setall(*cpus);
4222 + * When power savings policy is enabled for the parent domain, idle
4223 + * sibling can pick up load irrespective of busy siblings. In this case,
4224 + * let the state of idle sibling percolate up as IDLE, instead of
4225 + * portraying it as CPU_NOT_IDLE.
4227 + if (sd->flags & SD_SHARE_CPUPOWER &&
4228 + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4231 + schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
4233 + update_shares_locked(this_rq, sd);
4234 + group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
4235 + &sd_idle, cpus, NULL);
4237 + schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
4238 + goto out_balanced;
4241 + busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
4243 + schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
4244 + goto out_balanced;
4247 + BUG_ON(busiest == this_rq);
4249 + schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
4252 + if (busiest->nr_running > 1) {
4253 + /* Attempt to move tasks */
4254 + double_lock_balance(this_rq, busiest);
4255 + /* this_rq->clock is already updated */
4256 + update_rq_clock(busiest);
4257 + ld_moved = move_tasks(this_rq, this_cpu, busiest,
4258 + imbalance, sd, CPU_NEWLY_IDLE,
4260 + double_unlock_balance(this_rq, busiest);
4262 + if (unlikely(all_pinned)) {
4263 + cpu_clear(cpu_of(busiest), *cpus);
4264 + if (!cpus_empty(*cpus))
4270 + schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
4271 + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4272 + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4275 + sd->nr_balance_failed = 0;
4277 + update_shares_locked(this_rq, sd);
4281 + schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
4282 + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4283 + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4285 + sd->nr_balance_failed = 0;
4291 + * idle_balance is called by schedule() if this_cpu is about to become
4292 + * idle. Attempts to pull tasks from other CPUs.
4294 +static void idle_balance(int this_cpu, struct rq *this_rq)
4296 + struct sched_domain *sd;
4297 + int pulled_task = -1;
4298 + unsigned long next_balance = jiffies + HZ;
4299 + cpumask_t tmpmask;
4301 + for_each_domain(this_cpu, sd) {
4302 + unsigned long interval;
4304 + if (!(sd->flags & SD_LOAD_BALANCE))
4307 + if (sd->flags & SD_BALANCE_NEWIDLE)
4308 + /* If we've pulled tasks over stop searching: */
4309 + pulled_task = load_balance_newidle(this_cpu, this_rq,
4312 + interval = msecs_to_jiffies(sd->balance_interval);
4313 + if (time_after(next_balance, sd->last_balance + interval))
4314 + next_balance = sd->last_balance + interval;
4318 + if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4320 + * We are going idle. next_balance may be set based on
4321 + * a busy processor. So reset next_balance.
4323 + this_rq->next_balance = next_balance;
4328 + * active_load_balance is run by migration threads. It pushes running tasks
4329 + * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
4330 + * running on each physical CPU where possible, and avoids physical /
4331 + * logical imbalances.
4333 + * Called with busiest_rq locked.
4335 +static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4337 + int target_cpu = busiest_rq->push_cpu;
4338 + struct sched_domain *sd;
4339 + struct rq *target_rq;
4341 + /* Is there any task to move? */
4342 + if (busiest_rq->nr_running <= 1)
4345 + target_rq = cpu_rq(target_cpu);
4348 + * This condition is "impossible"; if it occurs
4349 + * we need to fix it. Originally reported by
4350 + * Bjorn Helgaas on a 128-cpu setup.
4352 + BUG_ON(busiest_rq == target_rq);
4354 + /* move a task from busiest_rq to target_rq */
4355 + double_lock_balance(busiest_rq, target_rq);
4356 + update_rq_clock(busiest_rq);
4357 + update_rq_clock(target_rq);
4359 + /* Search for an sd spanning us and the target CPU. */
4360 + for_each_domain(target_cpu, sd) {
4361 + if ((sd->flags & SD_LOAD_BALANCE) &&
4362 + cpu_isset(busiest_cpu, sd->span))
4367 + schedstat_inc(sd, alb_count);
4369 + if (move_one_task(target_rq, target_cpu, busiest_rq,
4371 + schedstat_inc(sd, alb_pushed);
4373 + schedstat_inc(sd, alb_failed);
4375 + double_unlock_balance(busiest_rq, target_rq);
4378 +#ifdef CONFIG_NO_HZ
4380 + atomic_t load_balancer;
4381 + cpumask_t cpu_mask;
4382 +} nohz ____cacheline_aligned = {
4383 + .load_balancer = ATOMIC_INIT(-1),
4384 + .cpu_mask = CPU_MASK_NONE,
4388 + * This routine will try to nominate the ilb (idle load balancing)
4389 + * owner among the cpus whose ticks are stopped. ilb owner will do the idle
4390 + * load balancing on behalf of all those cpus. If all the cpus in the system
4391 + * go into this tickless mode, then there will be no ilb owner (as there is
4392 + * no need for one) and all the cpus will sleep till the next wakeup event arrives.
4395 + * For the ilb owner, tick is not stopped. And this tick will be used
4396 + * for idle load balancing. ilb owner will still be part of nohz.cpu_mask.
4399 + * While stopping the tick, this cpu will become the ilb owner if there
4400 + * is no other owner. And will be the owner till that cpu becomes busy
4401 + * or if all cpus in the system stop their ticks at which point
4402 + * there is no need for ilb owner.
4404 + * When the ilb owner becomes busy, it nominates another owner, during the
4405 + * next busy scheduler_tick()
4407 +int select_nohz_load_balancer(int stop_tick)
4409 + int cpu = smp_processor_id();
4412 + cpu_set(cpu, nohz.cpu_mask);
4413 + cpu_rq(cpu)->in_nohz_recently = 1;
4416 + * If we are going offline and still the leader, give up!
4418 + if (!cpu_active(cpu) &&
4419 + atomic_read(&nohz.load_balancer) == cpu) {
4420 + if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4425 + /* time for ilb owner also to sleep */
4426 + if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
4427 + if (atomic_read(&nohz.load_balancer) == cpu)
4428 + atomic_set(&nohz.load_balancer, -1);
4432 + if (atomic_read(&nohz.load_balancer) == -1) {
4433 + /* make me the ilb owner */
4434 + if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4436 + } else if (atomic_read(&nohz.load_balancer) == cpu)
4439 + if (!cpu_isset(cpu, nohz.cpu_mask))
4442 + cpu_clear(cpu, nohz.cpu_mask);
4444 + if (atomic_read(&nohz.load_balancer) == cpu)
4445 + if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
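+/*
+ * Editorial sketch (illustration, not part of the original patch): the ilb
+ * owner is elected with a single compare-and-swap against -1, so exactly
+ * one of the racing idle cpus wins. The same pattern in standalone C11:
+ */
+#if 0 /* illustration only */
+#include <stdatomic.h>
+
+static atomic_int load_balancer = ATOMIC_VAR_INIT(-1);
+
+/* returns nonzero iff this cpu became the idle-load-balance owner */
+static int try_claim_ilb(int cpu)
+{
+	int expected = -1;
+
+	return atomic_compare_exchange_strong(&load_balancer,
+					      &expected, cpu);
+}
+#endif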
4452 +static DEFINE_SPINLOCK(balancing);
4455 + * It checks each scheduling domain to see if it is due to be balanced,
4456 + * and initiates a balancing operation if so.
4458 + * Balancing parameters are set up in arch_init_sched_domains.
4460 +static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4463 + struct rq *rq = cpu_rq(cpu);
4464 + unsigned long interval;
4465 + struct sched_domain *sd;
4466 + /* Earliest time when we have to do rebalance again */
4467 + unsigned long next_balance = jiffies + 60*HZ;
4468 + int update_next_balance = 0;
4469 + int need_serialize;
4472 + for_each_domain(cpu, sd) {
4473 + if (!(sd->flags & SD_LOAD_BALANCE))
4476 + interval = sd->balance_interval;
4477 + if (idle != CPU_IDLE)
4478 + interval *= sd->busy_factor;
4480 + /* scale ms to jiffies */
4481 + interval = msecs_to_jiffies(interval);
4482 + if (unlikely(!interval))
4484 + if (interval > HZ*NR_CPUS/10)
4485 + interval = HZ*NR_CPUS/10;
4487 + need_serialize = sd->flags & SD_SERIALIZE;
4489 + if (need_serialize) {
4490 + if (!spin_trylock(&balancing))
4494 + if (time_after_eq(jiffies, sd->last_balance + interval)) {
4495 + if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) {
4497 + * We've pulled tasks over so either we're no
4498 + * longer idle, or one of our SMT siblings is
4501 + idle = CPU_NOT_IDLE;
4503 + sd->last_balance = jiffies;
4505 + if (need_serialize)
4506 + spin_unlock(&balancing);
4508 + if (time_after(next_balance, sd->last_balance + interval)) {
4509 + next_balance = sd->last_balance + interval;
4510 + update_next_balance = 1;
4514 + * Stop the load balance at this level. There is another
4515 + * CPU in our sched group which is doing load balancing more actively.
4523 + * next_balance will be updated only when there is a need.
4524 + * When the cpu is attached to a null domain, for example, it will not be updated.
4527 + if (likely(update_next_balance))
4528 + rq->next_balance = next_balance;
4532 + * run_rebalance_domains is triggered when needed from the scheduler tick.
4533 + * In the CONFIG_NO_HZ case, the idle load balance owner will do the
4534 + * rebalancing for all the cpus for whom scheduler ticks are stopped.
4536 +static void run_rebalance_domains(struct softirq_action *h)
4538 + int this_cpu = smp_processor_id();
4539 + struct rq *this_rq = cpu_rq(this_cpu);
4540 + enum cpu_idle_type idle = this_rq->idle_at_tick ?
4541 + CPU_IDLE : CPU_NOT_IDLE;
4543 + rebalance_domains(this_cpu, idle);
4545 +#ifdef CONFIG_NO_HZ
4547 + * If this cpu is the owner for idle load balancing, then do the
4548 + * balancing on behalf of the other idle cpus whose ticks are
4551 + if (this_rq->idle_at_tick &&
4552 + atomic_read(&nohz.load_balancer) == this_cpu) {
4553 + cpumask_t cpus = nohz.cpu_mask;
4557 + cpu_clear(this_cpu, cpus);
4558 + for_each_cpu_mask_nr(balance_cpu, cpus) {
4560 + * If this cpu gets work to do, stop the load balancing
4561 + * work being done for other cpus. Next load
4562 + * balancing owner will pick it up.
4564 + if (need_resched())
4567 + rebalance_domains(balance_cpu, CPU_IDLE);
4569 + rq = cpu_rq(balance_cpu);
4570 + if (time_after(this_rq->next_balance, rq->next_balance))
4571 + this_rq->next_balance = rq->next_balance;
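The loop above works on a copy of nohz.cpu_mask with this_cpu cleared, visiting every remaining cpu. The same iteration over a plain 64-bit mask, using a GCC builtin (for_each_other_cpu is an invented name):

#include <stdint.h>

static void for_each_other_cpu(uint64_t mask, int self, void (*fn)(int cpu))
{
        mask &= ~(1ULL << self);                /* cpu_clear() analogue */
        while (mask) {
                int cpu = __builtin_ctzll(mask);        /* lowest set bit */
                fn(cpu);
                mask &= mask - 1;               /* clear that bit */
        }
}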
4578 + * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4580 + * In case of CONFIG_NO_HZ, this is the place where we nominate a new
4581 + * idle load balancing owner or decide to stop the periodic load balancing,
4582 + * if the whole system is idle.
4584 +static inline void trigger_load_balance(struct rq *rq, int cpu)
4586 +#ifdef CONFIG_NO_HZ
4588 + * If we were in the nohz mode recently and busy at the current
4589 + * scheduler tick, then check if we need to nominate a new idle
4592 + if (rq->in_nohz_recently && !rq->idle_at_tick) {
4593 + rq->in_nohz_recently = 0;
4595 + if (atomic_read(&nohz.load_balancer) == cpu) {
4596 + cpu_clear(cpu, nohz.cpu_mask);
4597 + atomic_set(&nohz.load_balancer, -1);
4600 + if (atomic_read(&nohz.load_balancer) == -1) {
4602 + * simple selection for now: Nominate the
4603 + * first cpu in the nohz list to be the next
4606 + * TBD: Traverse the sched domains and nominate
4607 + * the nearest cpu in the nohz.cpu_mask.
4609 + int ilb = first_cpu(nohz.cpu_mask);
4611 + if (ilb < nr_cpu_ids)
4617 + * If this cpu is idle and doing idle load balancing for all the
4618 + * cpus with ticks stopped, is it time for that to stop?
4620 + if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4621 + cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
4627 + * If this cpu is idle and the idle load balancing is done by
4628 + * someone else, then there is no need to raise the SCHED_SOFTIRQ
4630 + if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4631 + cpu_isset(cpu, nohz.cpu_mask))
4634 + if (time_after_eq(jiffies, rq->next_balance))
4635 + raise_softirq(SCHED_SOFTIRQ);
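The final check uses time_after_eq(), which stays correct when jiffies wraps around. Its essence is one signed subtraction, valid while the two timestamps are within half the counter range of each other (after_eq is an illustrative name):

static inline int after_eq(unsigned long a, unsigned long b)
{
        return (long)(a - b) >= 0;              /* wrap-safe "a >= b" */
}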
4638 +#else /* CONFIG_SMP */
4641 + * on UP we do not need to balance between CPUs:
4643 +static inline void idle_balance(int cpu, struct rq *rq)
4649 +DEFINE_PER_CPU(struct kernel_stat, kstat);
4651 +EXPORT_PER_CPU_SYMBOL(kstat);
4654 + * Return p->sum_exec_runtime plus any more ns on the sched_clock
4655 + * that have not yet been banked in case the task is currently running.
4657 +unsigned long long task_sched_runtime(struct task_struct *p)
4659 + unsigned long flags;
4660 + u64 ns, delta_exec;
4663 + rq = task_rq_lock(p, &flags);
4664 + ns = p->se.sum_exec_runtime;
4665 + if (task_current(rq, p)) {
4666 + update_rq_clock(rq);
4667 + delta_exec = rq->clock - p->se.exec_start;
4668 + if ((s64)delta_exec > 0)
4671 + task_rq_unlock(rq, &flags);
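task_sched_runtime() thus returns banked runtime plus the delta still ticking when the task is currently on a cpu. The pattern reduced to a pure function (names illustrative):

static unsigned long long runtime_ns(unsigned long long banked_ns,
                                     unsigned long long now_ns,
                                     unsigned long long exec_start_ns,
                                     int currently_running)
{
        if (currently_running) {
                long long delta = (long long)(now_ns - exec_start_ns);
                if (delta > 0)                  /* guard against clock skew */
                        banked_ns += (unsigned long long)delta;
        }
        return banked_ns;
}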
4677 + * Account user cpu time to a process.
4678 + * @p: the process that the cpu time gets accounted to
4679 + * @cputime: the cpu time spent in user space since the last update
4681 +void account_user_time(struct task_struct *p, cputime_t cputime)
4683 + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4684 + struct vx_info *vxi = p->vx_info; /* p is _always_ current */
4686 + int nice = (TASK_NICE(p) > 0);
4688 + p->utime = cputime_add(p->utime, cputime);
4689 + vx_account_user(vxi, cputime, nice);
4691 + /* Add user time to cpustat. */
4692 + tmp = cputime_to_cputime64(cputime);
4694 + cpustat->nice = cputime64_add(cpustat->nice, tmp);
4696 + cpustat->user = cputime64_add(cpustat->user, tmp);
4697 + /* Account for user time used */
4698 + acct_update_integrals(p);
4702 + * Account guest cpu time to a process.
4703 + * @p: the process that the cpu time gets accounted to
4704 + * @cputime: the cpu time spent in virtual machine since the last update
4706 +static void account_guest_time(struct task_struct *p, cputime_t cputime)
4709 + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4711 + tmp = cputime_to_cputime64(cputime);
4713 + p->utime = cputime_add(p->utime, cputime);
4714 + p->gtime = cputime_add(p->gtime, cputime);
4716 + cpustat->user = cputime64_add(cpustat->user, tmp);
4717 + cpustat->guest = cputime64_add(cpustat->guest, tmp);
4721 + * Account scaled user cpu time to a process.
4722 + * @p: the process that the cpu time gets accounted to
4723 + * @cputime: the cpu time spent in user space since the last update
4725 +void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
4727 + p->utimescaled = cputime_add(p->utimescaled, cputime);
4731 + * Account system cpu time to a process.
4732 + * @p: the process that the cpu time gets accounted to
4733 + * @hardirq_offset: the offset to subtract from hardirq_count()
4734 + * @cputime: the cpu time spent in kernel space since the last update
4736 +void account_system_time(struct task_struct *p, int hardirq_offset,
4737 + cputime_t cputime)
4739 + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4740 + struct vx_info *vxi = p->vx_info; /* p is _always_ current */
4741 + struct rq *rq = this_rq();
4744 + if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
4745 + account_guest_time(p, cputime);
4749 + p->stime = cputime_add(p->stime, cputime);
4750 + vx_account_system(vxi, cputime, (p == rq->idle));
4752 + /* Add system time to cpustat. */
4753 + tmp = cputime_to_cputime64(cputime);
4754 + if (hardirq_count() - hardirq_offset)
4755 + cpustat->irq = cputime64_add(cpustat->irq, tmp);
4756 + else if (softirq_count())
4757 + cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
4758 + else if (p != rq->idle)
4759 + cpustat->system = cputime64_add(cpustat->system, tmp);
4760 + else if (atomic_read(&rq->nr_iowait) > 0)
4761 + cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
4763 + cpustat->idle = cputime64_add(cpustat->idle, tmp);
4764 + /* Account for system time used */
4765 + acct_update_integrals(p);
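The cascade above charges each tick to exactly one bucket, tested in priority order: hardirq, softirq, plain system time, then iowait or idle. The same decision as a stand-alone classifier (the bucket names are invented):

enum bucket { B_IRQ, B_SOFTIRQ, B_SYSTEM, B_IOWAIT, B_IDLE };

static enum bucket classify(int in_hardirq, int in_softirq,
                            int is_idle_task, int nr_iowait)
{
        if (in_hardirq)
                return B_IRQ;
        if (in_softirq)
                return B_SOFTIRQ;
        if (!is_idle_task)
                return B_SYSTEM;
        return nr_iowait > 0 ? B_IOWAIT : B_IDLE;
}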
4769 + * Account scaled system cpu time to a process.
4770 + * @p: the process that the cpu time gets accounted to
4772 + * @cputime: the cpu time spent in kernel space since the last update
4774 +void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
4776 + p->stimescaled = cputime_add(p->stimescaled, cputime);
4780 + * Account for involuntary wait time.
4781 + * @p: the process from which the cpu time has been stolen
4782 + * @steal: the cpu time spent in involuntary wait
4784 +void account_steal_time(struct task_struct *p, cputime_t steal)
4786 + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4787 + cputime64_t tmp = cputime_to_cputime64(steal);
4788 + struct rq *rq = this_rq();
4790 + if (p == rq->idle) {
4791 + p->stime = cputime_add(p->stime, steal);
4792 + if (atomic_read(&rq->nr_iowait) > 0)
4793 + cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
4795 + cpustat->idle = cputime64_add(cpustat->idle, tmp);
4797 + cpustat->steal = cputime64_add(cpustat->steal, tmp);
4801 + * Use precise platform statistics if available:
4803 +#ifdef CONFIG_VIRT_CPU_ACCOUNTING
4804 +cputime_t task_utime(struct task_struct *p)
4809 +cputime_t task_stime(struct task_struct *p)
4814 +cputime_t task_utime(struct task_struct *p)
4816 + clock_t utime = cputime_to_clock_t(p->utime),
4817 + total = utime + cputime_to_clock_t(p->stime);
4821 + * Use CFS's precise accounting:
4823 + temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
4827 + do_div(temp, total);
4829 + utime = (clock_t)temp;
4831 + p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
4832 + return p->prev_utime;
4835 +cputime_t task_stime(struct task_struct *p)
4840 + * Use CFS's precise accounting. (we subtract utime from
4841 + * the total, to make sure the total observed by userspace
4842 + * grows monotonically - apps rely on that):
4844 + stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
4845 + cputime_to_clock_t(task_utime(p));
4848 + p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
4850 + return p->prev_stime;
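Both helpers clamp against a saved prev_* value so the figure reported to userspace never moves backwards, even when the underlying estimate does. The clamp in isolation (monotonic_report is an invented name):

static unsigned long monotonic_report(unsigned long *prev, unsigned long sample)
{
        if (sample > *prev)
                *prev = sample;
        return *prev;                           /* never smaller than before */
}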
4854 +inline cputime_t task_gtime(struct task_struct *p)
4860 + * This function gets called by the timer code, with HZ frequency.
4861 + * We call it with interrupts disabled.
4863 + * It also gets called by the fork code, when changing the parent's
4866 +void scheduler_tick(void)
4868 + int cpu = smp_processor_id();
4869 + struct rq *rq = cpu_rq(cpu);
4870 + struct task_struct *curr = rq->curr;
4872 + sched_clock_tick();
4874 + spin_lock(&rq->lock);
4875 + update_rq_clock(rq);
4876 + update_cpu_load(rq);
4877 + curr->sched_class->task_tick(rq, curr, 0);
4878 + spin_unlock(&rq->lock);
4881 + rq->idle_at_tick = idle_cpu(cpu);
4882 + trigger_load_balance(rq, cpu);
4886 +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4887 + defined(CONFIG_PREEMPT_TRACER))
4889 +static inline unsigned long get_parent_ip(unsigned long addr)
4891 + if (in_lock_functions(addr)) {
4892 + addr = CALLER_ADDR2;
4893 + if (in_lock_functions(addr))
4894 + addr = CALLER_ADDR3;
4899 +void __kprobes add_preempt_count(int val)
4901 +#ifdef CONFIG_DEBUG_PREEMPT
4905 + if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
4908 + preempt_count() += val;
4909 +#ifdef CONFIG_DEBUG_PREEMPT
4911 + * Spinlock count overflowing soon?
4913 + DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
4914 + PREEMPT_MASK - 10);
4916 + if (preempt_count() == val)
4917 + trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4919 +EXPORT_SYMBOL(add_preempt_count);
4921 +void __kprobes sub_preempt_count(int val)
4923 +#ifdef CONFIG_DEBUG_PREEMPT
4927 + if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
4930 + * Is the spinlock portion underflowing?
4932 + if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4933 + !(preempt_count() & PREEMPT_MASK)))
4937 + if (preempt_count() == val)
4938 + trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4939 + preempt_count() -= val;
4941 +EXPORT_SYMBOL(sub_preempt_count);
4946 + * Print scheduling while atomic bug:
4948 +static noinline void __schedule_bug(struct task_struct *prev)
4950 + struct pt_regs *regs = get_irq_regs();
4952 + printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
4953 + prev->comm, prev->pid, preempt_count());
4955 + debug_show_held_locks(prev);
4957 + if (irqs_disabled())
4958 + print_irqtrace_events(prev);
4967 + * Various schedule()-time debugging checks and statistics:
4969 +static inline void schedule_debug(struct task_struct *prev)
4972 + * Test if we are atomic. Since do_exit() needs to call into
4973 + * schedule() atomically, we ignore that path for now.
4974 + * Otherwise, whine if we are scheduling when we should not be.
4976 + if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
4977 + __schedule_bug(prev);
4979 + profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4981 + schedstat_inc(this_rq(), sched_count);
4982 +#ifdef CONFIG_SCHEDSTATS
4983 + if (unlikely(prev->lock_depth >= 0)) {
4984 + schedstat_inc(this_rq(), bkl_count);
4985 + schedstat_inc(prev, sched_info.bkl_count);
4991 + * Pick up the highest-prio task:
4993 +static inline struct task_struct *
4994 +pick_next_task(struct rq *rq, struct task_struct *prev)
4996 + const struct sched_class *class;
4997 + struct task_struct *p;
5000 + * Optimization: we know that if all tasks are in
5001 + * the fair class we can call that function directly:
5003 + if (likely(rq->nr_running == rq->cfs.nr_running)) {
5004 + p = fair_sched_class.pick_next_task(rq);
5009 + class = sched_class_highest;
5011 + p = class->pick_next_task(rq);
5015 + * Will never be NULL as the idle class always
5016 + * returns a non-NULL p:
5018 + class = class->next;
5022 +void (*rec_event)(void *, unsigned int) = NULL;
5023 +EXPORT_SYMBOL(rec_event);
5024 +#ifdef CONFIG_CHOPSTIX
5026 +struct event_spec {
5028 + unsigned long dcookie;
5029 + unsigned int count;
5030 + unsigned int reason;
5033 +/* To support safe calling from asm */
5034 +asmlinkage void rec_event_asm(struct event *event_signature_in, unsigned int count) {
5035 + struct pt_regs *regs;
5036 + struct event_spec *es = event_signature_in->event_data;
5037 + regs = task_pt_regs(current);
5038 + event_signature_in->task = current;
5040 + event_signature_in->count = 1;
5041 + (*rec_event)(event_signature_in, count);
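Since rec_event is an exported, NULL-initialized function pointer, a loadable module is presumably what installs it. A hypothetical consumer could look like the sketch below; my_collector and the chopstix_consumer_* names are invented, and synchronization against probes already running through the old pointer is elided:

#include <linux/module.h>
#include <linux/init.h>

extern void (*rec_event)(void *, unsigned int);

static void my_collector(void *event, unsigned int count)
{
        /* aggregate the event before handing it to user space */
}

static int __init chopstix_consumer_init(void)
{
        rec_event = my_collector;               /* publish the probe */
        return 0;
}

static void __exit chopstix_consumer_exit(void)
{
        rec_event = NULL;                       /* probe sites check for NULL */
}

module_init(chopstix_consumer_init);
module_exit(chopstix_consumer_exit);
MODULE_LICENSE("GPL");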
5046 + * schedule() is the main scheduler function.
5048 +asmlinkage void __sched schedule(void)
5050 + struct task_struct *prev, *next;
5051 + unsigned long *switch_count;
5056 + preempt_disable();
5057 + cpu = smp_processor_id();
5059 + rcu_qsctr_inc(cpu);
5061 + switch_count = &prev->nivcsw;
5063 + release_kernel_lock(prev);
5064 +need_resched_nonpreemptible:
5066 + schedule_debug(prev);
5068 + if (sched_feat(HRTICK))
5072 + * Do the rq-clock update outside the rq lock:
5074 + local_irq_disable();
5075 + update_rq_clock(rq);
5076 + spin_lock(&rq->lock);
5077 + clear_tsk_need_resched(prev);
5079 + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
5080 + if (unlikely(signal_pending_state(prev->state, prev)))
5081 + prev->state = TASK_RUNNING;
5083 + deactivate_task(rq, prev, 1);
5084 + switch_count = &prev->nvcsw;
5088 + if (prev->sched_class->pre_schedule)
5089 + prev->sched_class->pre_schedule(rq, prev);
5092 + if (unlikely(!rq->nr_running))
5093 + idle_balance(cpu, rq);
5095 + prev->sched_class->put_prev_task(rq, prev);
5096 + next = pick_next_task(rq, prev);
5098 + if (likely(prev != next)) {
5099 + sched_info_switch(prev, next);
5101 + rq->nr_switches++;
5105 + context_switch(rq, prev, next); /* unlocks the rq */
5107 + * the context switch might have flipped the stack from under
5108 + * us, hence refresh the local variables.
5110 + cpu = smp_processor_id();
5113 + spin_unlock_irq(&rq->lock);
5115 + if (unlikely(reacquire_kernel_lock(current) < 0))
5116 + goto need_resched_nonpreemptible;
5118 + preempt_enable_no_resched();
5119 + if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
5120 + goto need_resched;
5122 +EXPORT_SYMBOL(schedule);
5124 +#ifdef CONFIG_PREEMPT
5126 + * this is the entry point to schedule() from in-kernel preemption
5127 + * off of preempt_enable. Kernel preemptions off return from interrupt
5128 + * occur there and call schedule directly.
5130 +asmlinkage void __sched preempt_schedule(void)
5132 + struct thread_info *ti = current_thread_info();
5135 + * If there is a non-zero preempt_count or interrupts are disabled,
5136 + * we do not want to preempt the current task. Just return.
5138 + if (likely(ti->preempt_count || irqs_disabled()))
5142 + add_preempt_count(PREEMPT_ACTIVE);
5144 + sub_preempt_count(PREEMPT_ACTIVE);
5147 + * Check again in case we missed a preemption opportunity
5148 + * between schedule and now.
5151 + } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
5153 +EXPORT_SYMBOL(preempt_schedule);
5156 + * this is the entry point to schedule() from kernel preemption
5157 + * off of irq context.
5158 + * Note that this is called and returns with irqs disabled. This will
5159 + * protect us against recursive calling from irq.
5161 +asmlinkage void __sched preempt_schedule_irq(void)
5163 + struct thread_info *ti = current_thread_info();
5165 + /* Catch callers which need to be fixed */
5166 + BUG_ON(ti->preempt_count || !irqs_disabled());
5169 + add_preempt_count(PREEMPT_ACTIVE);
5170 + local_irq_enable();
5172 + local_irq_disable();
5173 + sub_preempt_count(PREEMPT_ACTIVE);
5176 + * Check again in case we missed a preemption opportunity
5177 + * between schedule and now.
5180 + } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
5183 +#endif /* CONFIG_PREEMPT */
5185 +int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
5188 + return try_to_wake_up(curr->private, mode, sync);
5190 +EXPORT_SYMBOL(default_wake_function);
5193 + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
5194 + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
5195 + * number) then we wake all the non-exclusive tasks and one exclusive task.
5197 + * There are circumstances in which we can try to wake a task which has already
5198 + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
5199 + * zero in this (rare) case, and we handle it by continuing to scan the queue.
5201 +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5202 + int nr_exclusive, int sync, void *key)
5204 + wait_queue_t *curr, *next;
5206 + list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
5207 + unsigned flags = curr->flags;
5209 + if (curr->func(curr, mode, sync, key) &&
5210 + (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
5216 + * __wake_up - wake up threads blocked on a waitqueue.
5217 + * @q: the waitqueue
5218 + * @mode: which threads
5219 + * @nr_exclusive: how many wake-one or wake-many threads to wake up
5220 + * @key: is directly passed to the wakeup function
5222 +void __wake_up(wait_queue_head_t *q, unsigned int mode,
5223 + int nr_exclusive, void *key)
5225 + unsigned long flags;
5227 + spin_lock_irqsave(&q->lock, flags);
5228 + __wake_up_common(q, mode, nr_exclusive, 0, key);
5229 + spin_unlock_irqrestore(&q->lock, flags);
5231 +EXPORT_SYMBOL(__wake_up);
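In __wake_up_common() above, nr_exclusive == 0 wakes everything: the !--nr_exclusive test can only terminate the scan once a positive budget of exclusive waiters has been spent. The same bookkeeping on a toy singly linked list (struct waiter and wake_common are invented):

struct waiter {
        int exclusive;                          /* WQ_FLAG_EXCLUSIVE analogue */
        int (*wake)(struct waiter *w);          /* nonzero if actually woken */
        struct waiter *next;
};

static void wake_common(struct waiter *head, int nr_exclusive)
{
        struct waiter *w;

        for (w = head; w; w = w->next)
                if (w->wake(w) && w->exclusive && !--nr_exclusive)
                        break;
}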
5234 + * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
5236 +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
5238 + __wake_up_common(q, mode, 1, 0, NULL);
5242 + * __wake_up_sync - wake up threads blocked on a waitqueue.
5243 + * @q: the waitqueue
5244 + * @mode: which threads
5245 + * @nr_exclusive: how many wake-one or wake-many threads to wake up
5247 + * The sync wakeup differs in that the waker knows that it will schedule
5248 + * away soon, so while the target thread will be woken up, it will not
5249 + * be migrated to another CPU - ie. the two threads are 'synchronized'
5250 + * with each other. This can prevent needless bouncing between CPUs.
5252 + * On UP it can prevent extra preemption.
5255 +__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
5257 + unsigned long flags;
5263 + if (unlikely(!nr_exclusive))
5266 + spin_lock_irqsave(&q->lock, flags);
5267 + __wake_up_common(q, mode, nr_exclusive, sync, NULL);
5268 + spin_unlock_irqrestore(&q->lock, flags);
5270 +EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
5272 +void complete(struct completion *x)
5274 + unsigned long flags;
5276 + spin_lock_irqsave(&x->wait.lock, flags);
5278 + __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
5279 + spin_unlock_irqrestore(&x->wait.lock, flags);
5281 +EXPORT_SYMBOL(complete);
5283 +void complete_all(struct completion *x)
5285 + unsigned long flags;
5287 + spin_lock_irqsave(&x->wait.lock, flags);
5288 + x->done += UINT_MAX/2;
5289 + __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
5290 + spin_unlock_irqrestore(&x->wait.lock, flags);
5292 +EXPORT_SYMBOL(complete_all);
5294 +static inline long __sched
5295 +do_wait_for_common(struct completion *x, long timeout, int state)
5298 + DECLARE_WAITQUEUE(wait, current);
5300 + wait.flags |= WQ_FLAG_EXCLUSIVE;
5301 + __add_wait_queue_tail(&x->wait, &wait);
5303 + if ((state == TASK_INTERRUPTIBLE &&
5304 + signal_pending(current)) ||
5305 + (state == TASK_KILLABLE &&
5306 + fatal_signal_pending(current))) {
5307 + timeout = -ERESTARTSYS;
5310 + __set_current_state(state);
5311 + spin_unlock_irq(&x->wait.lock);
5312 + timeout = schedule_timeout(timeout);
5313 + spin_lock_irq(&x->wait.lock);
5314 + } while (!x->done && timeout);
5315 + __remove_wait_queue(&x->wait, &wait);
5320 + return timeout ?: 1;
5323 +static long __sched
5324 +wait_for_common(struct completion *x, long timeout, int state)
5328 + spin_lock_irq(&x->wait.lock);
5329 + timeout = do_wait_for_common(x, timeout, state);
5330 + spin_unlock_irq(&x->wait.lock);
5334 +void __sched wait_for_completion(struct completion *x)
5336 + wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
5338 +EXPORT_SYMBOL(wait_for_completion);
5340 +unsigned long __sched
5341 +wait_for_completion_timeout(struct completion *x, unsigned long timeout)
5343 + return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
5345 +EXPORT_SYMBOL(wait_for_completion_timeout);
5347 +int __sched wait_for_completion_interruptible(struct completion *x)
5349 + long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
5350 + if (t == -ERESTARTSYS)
5354 +EXPORT_SYMBOL(wait_for_completion_interruptible);
5356 +unsigned long __sched
5357 +wait_for_completion_interruptible_timeout(struct completion *x,
5358 + unsigned long timeout)
5360 + return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
5362 +EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
5364 +int __sched wait_for_completion_killable(struct completion *x)
5366 + long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
5367 + if (t == -ERESTARTSYS)
5371 +EXPORT_SYMBOL(wait_for_completion_killable);
5374 + * try_wait_for_completion - try to decrement a completion without blocking
5375 + * @x: completion structure
5377 + * Returns: 0 if a decrement cannot be done without blocking
5378 + * 1 if a decrement succeeded.
5380 + * If a completion is being used as a counting completion,
5381 + * attempt to decrement the counter without blocking. This
5382 + * enables us to avoid waiting if the resource the completion
5383 + * is protecting is not available.
5385 +bool try_wait_for_completion(struct completion *x)
5389 + spin_lock_irq(&x->wait.lock);
5394 + spin_unlock_irq(&x->wait.lock);
5397 +EXPORT_SYMBOL(try_wait_for_completion);
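try_wait_for_completion() treats x->done as a counter: decrement when positive, report failure instead of blocking when zero. The core of that, minus the x->wait.lock the real function holds (toy_completion is an invented type):

struct toy_completion {
        unsigned int done;
};

static int toy_try_wait(struct toy_completion *x)
{
        if (!x->done)
                return 0;                       /* would have to block */
        x->done--;
        return 1;
}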
5400 + * completion_done - Test to see if a completion has any waiters
5401 + * @x: completion structure
5403 + * Returns: 0 if there are waiters (wait_for_completion() in progress)
5404 + * 1 if there are no waiters.
5407 +bool completion_done(struct completion *x)
5411 + spin_lock_irq(&x->wait.lock);
5414 + spin_unlock_irq(&x->wait.lock);
5417 +EXPORT_SYMBOL(completion_done);
5419 +static long __sched
5420 +sleep_on_common(wait_queue_head_t *q, int state, long timeout)
5422 + unsigned long flags;
5423 + wait_queue_t wait;
5425 + init_waitqueue_entry(&wait, current);
5427 + __set_current_state(state);
5429 + spin_lock_irqsave(&q->lock, flags);
5430 + __add_wait_queue(q, &wait);
5431 + spin_unlock(&q->lock);
5432 + timeout = schedule_timeout(timeout);
5433 + spin_lock_irq(&q->lock);
5434 + __remove_wait_queue(q, &wait);
5435 + spin_unlock_irqrestore(&q->lock, flags);
5440 +void __sched interruptible_sleep_on(wait_queue_head_t *q)
5442 + sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
5444 +EXPORT_SYMBOL(interruptible_sleep_on);
5447 +interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
5449 + return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
5451 +EXPORT_SYMBOL(interruptible_sleep_on_timeout);
5453 +void __sched sleep_on(wait_queue_head_t *q)
5455 + sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
5457 +EXPORT_SYMBOL(sleep_on);
5459 +long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
5461 + return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
5463 +EXPORT_SYMBOL(sleep_on_timeout);
5465 +#ifdef CONFIG_RT_MUTEXES
5468 + * rt_mutex_setprio - set the current priority of a task
5470 + * @prio: prio value (kernel-internal form)
5472 + * This function changes the 'effective' priority of a task. It does
5473 + * not touch ->normal_prio like __setscheduler().
5475 + * Used by the rt_mutex code to implement priority inheritance logic.
5477 +void rt_mutex_setprio(struct task_struct *p, int prio)
5479 + unsigned long flags;
5480 + int oldprio, on_rq, running;
5482 + const struct sched_class *prev_class = p->sched_class;
5484 + BUG_ON(prio < 0 || prio > MAX_PRIO);
5486 + rq = task_rq_lock(p, &flags);
5487 + update_rq_clock(rq);
5489 + oldprio = p->prio;
5490 + on_rq = p->se.on_rq;
5491 + running = task_current(rq, p);
5493 + dequeue_task(rq, p, 0);
5495 + p->sched_class->put_prev_task(rq, p);
5497 + if (rt_prio(prio))
5498 + p->sched_class = &rt_sched_class;
5500 + p->sched_class = &fair_sched_class;
5505 + p->sched_class->set_curr_task(rq);
5507 + enqueue_task(rq, p, 0);
5509 + check_class_changed(rq, p, prev_class, oldprio, running);
5511 + task_rq_unlock(rq, &flags);
5516 +void set_user_nice(struct task_struct *p, long nice)
5518 + int old_prio, delta, on_rq;
5519 + unsigned long flags;
5522 + if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
5525 + * We have to be careful, if called from sys_setpriority(),
5526 + * the task might be in the middle of scheduling on another CPU.
5528 + rq = task_rq_lock(p, &flags);
5529 + update_rq_clock(rq);
5531 + * The RT priorities are set via sched_setscheduler(), but we still
5532 + * allow the 'normal' nice value to be set - but as expected
5533 + * it won't have any effect on scheduling while the task is
5534 + * SCHED_FIFO/SCHED_RR:
5536 + if (task_has_rt_policy(p)) {
5537 + p->static_prio = NICE_TO_PRIO(nice);
5540 + on_rq = p->se.on_rq;
5542 + dequeue_task(rq, p, 0);
5544 + p->static_prio = NICE_TO_PRIO(nice);
5545 + set_load_weight(p);
5546 + old_prio = p->prio;
5547 + p->prio = effective_prio(p);
5548 + delta = p->prio - old_prio;
5551 + enqueue_task(rq, p, 0);
5553 + * If the task increased its priority or is running and
5554 + * lowered its priority, then reschedule its CPU:
5556 + if (delta < 0 || (delta > 0 && task_running(rq, p)))
5557 + resched_task(rq->curr);
5560 + task_rq_unlock(rq, &flags);
5562 +EXPORT_SYMBOL(set_user_nice);
5565 + * can_nice - check if a task can reduce its nice value
5567 + * @nice: nice value
5569 +int can_nice(const struct task_struct *p, const int nice)
5571 + /* convert nice value [19,-20] to rlimit style value [1,40] */
5572 + int nice_rlim = 20 - nice;
5574 + return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
5575 + capable(CAP_SYS_NICE));
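The conversion in the comment is the whole trick: nice 19 maps to rlimit 1 and nice -20 to rlimit 40, so "may this task lower itself to that nice value" becomes a plain <= against RLIMIT_NICE. Spelled out (nice_to_rlimit is an illustrative helper, not this kernel's API):

static int nice_to_rlimit(int nice)
{
        return 20 - nice;                       /* 19 -> 1, -20 -> 40 */
}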
5578 +#ifdef __ARCH_WANT_SYS_NICE
5581 + * sys_nice - change the priority of the current process.
5582 + * @increment: priority increment
5584 + * sys_setpriority is a more generic, but much slower function that
5585 + * does similar things.
5587 +SYSCALL_DEFINE1(nice, int, increment)
5589 + long nice, retval;
5592 + * Setpriority might change our priority at the same moment.
5593 + * We don't have to worry. Conceptually one call occurs first
5594 + * and we have a single winner.
5596 + if (increment < -40)
5598 + if (increment > 40)
5601 + nice = PRIO_TO_NICE(current->static_prio) + increment;
5607 + if (increment < 0 && !can_nice(current, nice))
5608 + return vx_flags(VXF_IGNEG_NICE, 0) ? 0 : -EPERM;
5610 + retval = security_task_setnice(current, nice);
5614 + set_user_nice(current, nice);
5621 + * task_prio - return the priority value of a given task.
5622 + * @p: the task in question.
5624 + * This is the priority value as seen by users in /proc.
5625 + * RT tasks are offset by -200. Normal tasks are centered
5626 + * around 0, value goes from -16 to +15.
5628 +int task_prio(const struct task_struct *p)
5630 + return p->prio - MAX_RT_PRIO;
5634 + * task_nice - return the nice value of a given task.
5635 + * @p: the task in question.
5637 +int task_nice(const struct task_struct *p)
5639 + return TASK_NICE(p);
5641 +EXPORT_SYMBOL(task_nice);
5644 + * idle_cpu - is a given cpu idle currently?
5645 + * @cpu: the processor in question.
5647 +int idle_cpu(int cpu)
5649 + return cpu_curr(cpu) == cpu_rq(cpu)->idle;
5653 + * idle_task - return the idle task for a given cpu.
5654 + * @cpu: the processor in question.
5656 +struct task_struct *idle_task(int cpu)
5658 + return cpu_rq(cpu)->idle;
5662 + * find_process_by_pid - find a process with a matching PID value.
5663 + * @pid: the pid in question.
5665 +static struct task_struct *find_process_by_pid(pid_t pid)
5667 + return pid ? find_task_by_vpid(pid) : current;
5670 +/* Actually do priority change: must hold rq lock. */
5672 +__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
5674 + BUG_ON(p->se.on_rq);
5676 + p->policy = policy;
5677 + switch (p->policy) {
5678 + case SCHED_NORMAL:
5681 + p->sched_class = &fair_sched_class;
5685 + p->sched_class = &rt_sched_class;
5689 + p->rt_priority = prio;
5690 + p->normal_prio = normal_prio(p);
5691 + /* we are holding p->pi_lock already */
5692 + p->prio = rt_mutex_getprio(p);
5693 + set_load_weight(p);
5696 +static int __sched_setscheduler(struct task_struct *p, int policy,
5697 + struct sched_param *param, bool user)
5699 + int retval, oldprio, oldpolicy = -1, on_rq, running;
5700 + unsigned long flags;
5701 + const struct sched_class *prev_class = p->sched_class;
5704 + /* may grab non-irq protected spin_locks */
5705 + BUG_ON(in_interrupt());
5707 + /* double check policy once rq lock held */
5709 + policy = oldpolicy = p->policy;
5710 + else if (policy != SCHED_FIFO && policy != SCHED_RR &&
5711 + policy != SCHED_NORMAL && policy != SCHED_BATCH &&
5712 + policy != SCHED_IDLE)
5715 + * Valid priorities for SCHED_FIFO and SCHED_RR are
5716 + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
5717 + * SCHED_BATCH and SCHED_IDLE is 0.
5719 + if (param->sched_priority < 0 ||
5720 + (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
5721 + (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
5723 + if (rt_policy(policy) != (param->sched_priority != 0))
5727 + * Allow unprivileged RT tasks to decrease priority:
5729 + if (user && !capable(CAP_SYS_NICE)) {
5730 + if (rt_policy(policy)) {
5731 + unsigned long rlim_rtprio;
5733 + if (!lock_task_sighand(p, &flags))
5735 + rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
5736 + unlock_task_sighand(p, &flags);
5738 + /* can't set/change the rt policy */
5739 + if (policy != p->policy && !rlim_rtprio)
5742 + /* can't increase priority */
5743 + if (param->sched_priority > p->rt_priority &&
5744 + param->sched_priority > rlim_rtprio)
5748 + * Like positive nice levels, don't allow tasks to
5749 + * move out of SCHED_IDLE either:
5751 + if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
5754 + /* can't change other user's priorities */
5755 + if ((current->euid != p->euid) &&
5756 + (current->euid != p->uid))
5761 +#ifdef CONFIG_RT_GROUP_SCHED
5763 + * Do not allow realtime tasks into groups that have no runtime
5766 + if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
5770 + retval = security_task_setscheduler(p, policy, param);
5776 + * make sure no PI-waiters arrive (or leave) while we are
5777 + * changing the priority of the task:
5779 + spin_lock_irqsave(&p->pi_lock, flags);
5781 + * To be able to change p->policy safely, the appropriate
5782 + * runqueue lock must be held.
5784 + rq = __task_rq_lock(p);
5785 + /* recheck policy now with rq lock held */
5786 + if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
5787 + policy = oldpolicy = -1;
5788 + __task_rq_unlock(rq);
5789 + spin_unlock_irqrestore(&p->pi_lock, flags);
5792 + update_rq_clock(rq);
5793 + on_rq = p->se.on_rq;
5794 + running = task_current(rq, p);
5796 + deactivate_task(rq, p, 0);
5798 + p->sched_class->put_prev_task(rq, p);
5800 + oldprio = p->prio;
5801 + __setscheduler(rq, p, policy, param->sched_priority);
5804 + p->sched_class->set_curr_task(rq);
5806 + activate_task(rq, p, 0);
5808 + check_class_changed(rq, p, prev_class, oldprio, running);
5810 + __task_rq_unlock(rq);
5811 + spin_unlock_irqrestore(&p->pi_lock, flags);
5813 + rt_mutex_adjust_pi(p);
5819 + * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
5820 + * @p: the task in question.
5821 + * @policy: new policy.
5822 + * @param: structure containing the new RT priority.
5824 + * NOTE that the task may already be dead.
5826 +int sched_setscheduler(struct task_struct *p, int policy,
5827 + struct sched_param *param)
5829 + return __sched_setscheduler(p, policy, param, true);
5831 +EXPORT_SYMBOL_GPL(sched_setscheduler);
5834 + * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
5835 + * @p: the task in question.
5836 + * @policy: new policy.
5837 + * @param: structure containing the new RT priority.
5839 + * Just like sched_setscheduler, only don't bother checking if the
5840 + * current context has permission. For example, this is needed in
5841 + * stop_machine(): we create temporary high priority worker threads,
5842 + * but our caller might not have that capability.
5844 +int sched_setscheduler_nocheck(struct task_struct *p, int policy,
5845 + struct sched_param *param)
5847 + return __sched_setscheduler(p, policy, param, false);
5851 +do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
5853 + struct sched_param lparam;
5854 + struct task_struct *p;
5857 + if (!param || pid < 0)
5859 + if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
5864 + p = find_process_by_pid(pid);
5866 + retval = sched_setscheduler(p, policy, &lparam);
5867 + rcu_read_unlock();
5873 + * sys_sched_setscheduler - set/change the scheduler policy and RT priority
5874 + * @pid: the pid in question.
5875 + * @policy: new policy.
5876 + * @param: structure containing the new RT priority.
5878 +SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
5879 + struct sched_param __user *, param)
5881 + /* negative values for policy are not valid */
5885 + return do_sched_setscheduler(pid, policy, param);
5889 + * sys_sched_setparam - set/change the RT priority of a thread
5890 + * @pid: the pid in question.
5891 + * @param: structure containing the new RT priority.
5893 +SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
5895 + return do_sched_setscheduler(pid, -1, param);
5899 + * sys_sched_getscheduler - get the policy (scheduling class) of a thread
5900 + * @pid: the pid in question.
5902 +SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
5904 + struct task_struct *p;
5911 + read_lock(&tasklist_lock);
5912 + p = find_process_by_pid(pid);
5914 + retval = security_task_getscheduler(p);
5916 + retval = p->policy;
5918 + read_unlock(&tasklist_lock);
5923 + * sys_sched_getparam - get the RT priority of a thread
5924 + * @pid: the pid in question.
5925 + * @param: structure containing the RT priority.
5927 +SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
5929 + struct sched_param lp;
5930 + struct task_struct *p;
5933 + if (!param || pid < 0)
5936 + read_lock(&tasklist_lock);
5937 + p = find_process_by_pid(pid);
5942 + retval = security_task_getscheduler(p);
5946 + lp.sched_priority = p->rt_priority;
5947 + read_unlock(&tasklist_lock);
5950 + * This one might sleep, we cannot do it with a spinlock held ...
5952 + retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
5957 + read_unlock(&tasklist_lock);
5961 +long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
5963 + cpumask_t cpus_allowed;
5964 + cpumask_t new_mask = *in_mask;
5965 + struct task_struct *p;
5968 + get_online_cpus();
5969 + read_lock(&tasklist_lock);
5971 + p = find_process_by_pid(pid);
5973 + read_unlock(&tasklist_lock);
5974 + put_online_cpus();
5979 + * It is not safe to call set_cpus_allowed with the
5980 + * tasklist_lock held. We will bump the task_struct's
5981 + * usage count and then drop tasklist_lock.
5983 + get_task_struct(p);
5984 + read_unlock(&tasklist_lock);
5988 + if ((current->euid != p->euid) && (current->euid != p->uid) &&
5989 + !capable(CAP_SYS_NICE))
5992 + retval = security_task_setscheduler(p, 0, NULL);
5996 + cpuset_cpus_allowed(p, &cpus_allowed);
5997 + cpus_and(new_mask, new_mask, cpus_allowed);
5999 + retval = set_cpus_allowed_ptr(p, &new_mask);
6002 + cpuset_cpus_allowed(p, &cpus_allowed);
6003 + if (!cpus_subset(new_mask, cpus_allowed)) {
6005 + * We must have raced with a concurrent cpuset
6006 + * update. Just reset the cpus_allowed to the
6007 + * cpuset's cpus_allowed
6009 + new_mask = cpus_allowed;
6014 + put_task_struct(p);
6015 + put_online_cpus();
6019 +static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
6020 + cpumask_t *new_mask)
6022 + if (len < sizeof(cpumask_t)) {
6023 + memset(new_mask, 0, sizeof(cpumask_t));
6024 + } else if (len > sizeof(cpumask_t)) {
6025 + len = sizeof(cpumask_t);
6027 + return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
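get_user_cpu_mask() zero-fills when the user buffer is shorter than cpumask_t and truncates when it is longer, so the kernel-side mask is always fully defined. The same clamp-and-copy with memcpy standing in for copy_from_user (copy_mask is an invented name):

#include <string.h>

static int copy_mask(void *dst, size_t dst_len,
                     const void *src, size_t src_len)
{
        if (src_len < dst_len)
                memset(dst, 0, dst_len);        /* zero-extend short input */
        else
                src_len = dst_len;              /* truncate long input */
        memcpy(dst, src, src_len);
        return 0;
}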
6031 + * sys_sched_setaffinity - set the cpu affinity of a process
6032 + * @pid: pid of the process
6033 + * @len: length in bytes of the bitmask pointed to by user_mask_ptr
6034 + * @user_mask_ptr: user-space pointer to the new cpu mask
6036 +SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
6037 + unsigned long __user *, user_mask_ptr)
6039 + cpumask_t new_mask;
6042 + retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
6046 + return sched_setaffinity(pid, &new_mask);
6049 +long sched_getaffinity(pid_t pid, cpumask_t *mask)
6051 + struct task_struct *p;
6054 + get_online_cpus();
6055 + read_lock(&tasklist_lock);
6058 + p = find_process_by_pid(pid);
6062 + retval = security_task_getscheduler(p);
6066 + cpus_and(*mask, p->cpus_allowed, cpu_online_map);
6069 + read_unlock(&tasklist_lock);
6070 + put_online_cpus();
6076 + * sys_sched_getaffinity - get the cpu affinity of a process
6077 + * @pid: pid of the process
6078 + * @len: length in bytes of the bitmask pointed to by user_mask_ptr
6079 + * @user_mask_ptr: user-space pointer to hold the current cpu mask
6081 +SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
6082 + unsigned long __user *, user_mask_ptr)
6087 + if (len < sizeof(cpumask_t))
6090 + ret = sched_getaffinity(pid, &mask);
6094 + if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
6097 + return sizeof(cpumask_t);
6101 + * sys_sched_yield - yield the current processor to other threads.
6103 + * This function yields the current CPU to other tasks. If there are no
6104 + * other threads running on this CPU then this function will return.
6106 +SYSCALL_DEFINE0(sched_yield)
6108 + struct rq *rq = this_rq_lock();
6110 + schedstat_inc(rq, yld_count);
6111 + current->sched_class->yield_task(rq);
6114 + * Since we are going to call schedule() anyway, there's
6115 + * no need to preempt or enable interrupts:
6117 + __release(rq->lock);
6118 + spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
6119 + _raw_spin_unlock(&rq->lock);
6120 + preempt_enable_no_resched();
6127 +static void __cond_resched(void)
6129 +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
6130 + __might_sleep(__FILE__, __LINE__);
6133 + * The BKS might be reacquired before we have dropped
6134 + * PREEMPT_ACTIVE, which could trigger a second
6135 + * cond_resched() call.
6138 + add_preempt_count(PREEMPT_ACTIVE);
6140 + sub_preempt_count(PREEMPT_ACTIVE);
6141 + } while (need_resched());
6144 +int __sched _cond_resched(void)
6146 + if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
6147 + system_state == SYSTEM_RUNNING) {
6153 +EXPORT_SYMBOL(_cond_resched);
6156 + * cond_resched_lock() - if a reschedule is pending, drop the given lock,
6157 + * call schedule, and on return reacquire the lock.
6159 + * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
6160 + * operations here to prevent schedule() from being called twice (once via
6161 + * spin_unlock(), once by hand).
6163 +int cond_resched_lock(spinlock_t *lock)
6165 + int resched = need_resched() && system_state == SYSTEM_RUNNING;
6168 + if (spin_needbreak(lock) || resched) {
6169 + spin_unlock(lock);
6170 + if (resched && need_resched())
6179 +EXPORT_SYMBOL(cond_resched_lock);
6181 +int __sched cond_resched_softirq(void)
6183 + BUG_ON(!in_softirq());
6185 + if (need_resched() && system_state == SYSTEM_RUNNING) {
6186 + local_bh_enable();
6188 + local_bh_disable();
6193 +EXPORT_SYMBOL(cond_resched_softirq);
6196 + * yield - yield the current processor to other threads.
6198 + * This is a shortcut for kernel-space yielding - it marks the
6199 + * thread runnable and calls sys_sched_yield().
6201 +void __sched yield(void)
6203 + set_current_state(TASK_RUNNING);
6204 + sys_sched_yield();
6206 +EXPORT_SYMBOL(yield);
6209 + * This task is about to go to sleep on IO. Increment rq->nr_iowait so
6210 + * that process accounting knows that this is a task in IO wait state.
6212 + * But don't do that if it is a deliberate, throttling IO wait (this task
6213 + * has set its backing_dev_info: the queue against which it should throttle)
6215 +void __sched io_schedule(void)
6217 + struct rq *rq = &__raw_get_cpu_var(runqueues);
6219 + delayacct_blkio_start();
6220 + atomic_inc(&rq->nr_iowait);
6222 + atomic_dec(&rq->nr_iowait);
6223 + delayacct_blkio_end();
6225 +EXPORT_SYMBOL(io_schedule);
6227 +long __sched io_schedule_timeout(long timeout)
6229 + struct rq *rq = &__raw_get_cpu_var(runqueues);
6232 + delayacct_blkio_start();
6233 + atomic_inc(&rq->nr_iowait);
6234 + ret = schedule_timeout(timeout);
6235 + atomic_dec(&rq->nr_iowait);
6236 + delayacct_blkio_end();
6241 + * sys_sched_get_priority_max - return maximum RT priority.
6242 + * @policy: scheduling class.
6244 + * this syscall returns the maximum rt_priority that can be used
6245 + * by a given scheduling class.
6247 +SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
6249 + int ret = -EINVAL;
6254 + ret = MAX_USER_RT_PRIO-1;
6256 + case SCHED_NORMAL:
6266 + * sys_sched_get_priority_min - return minimum RT priority.
6267 + * @policy: scheduling class.
6269 + * this syscall returns the minimum rt_priority that can be used
6270 + * by a given scheduling class.
6272 +SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
6274 + int ret = -EINVAL;
6281 + case SCHED_NORMAL:
6290 + * sys_sched_rr_get_interval - return the default timeslice of a process.
6291 + * @pid: pid of the process.
6292 + * @interval: userspace pointer to the timeslice value.
6294 + * this syscall writes the default timeslice value of a given process
6295 + * into the user-space timespec buffer. A value of '0' means infinity.
6297 +SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6298 + struct timespec __user *, interval)
6300 + struct task_struct *p;
6301 + unsigned int time_slice;
6303 + struct timespec t;
6309 + read_lock(&tasklist_lock);
6310 + p = find_process_by_pid(pid);
6314 + retval = security_task_getscheduler(p);
6319 + * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER
6320 + * tasks that are on an otherwise idle runqueue:
6323 + if (p->policy == SCHED_RR) {
6324 + time_slice = DEF_TIMESLICE;
6325 + } else if (p->policy != SCHED_FIFO) {
6326 + struct sched_entity *se = &p->se;
6327 + unsigned long flags;
6330 + rq = task_rq_lock(p, &flags);
6331 + if (rq->cfs.load.weight)
6332 + time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
6333 + task_rq_unlock(rq, &flags);
6335 + read_unlock(&tasklist_lock);
6336 + jiffies_to_timespec(time_slice, &t);
6337 + retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
6341 + read_unlock(&tasklist_lock);
6345 +static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
6347 +void sched_show_task(struct task_struct *p)
6349 + unsigned long free = 0;
6352 + state = p->state ? __ffs(p->state) + 1 : 0;
6353 + printk(KERN_INFO "%-13.13s %c", p->comm,
6354 + state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
6355 +#if BITS_PER_LONG == 32
6356 + if (state == TASK_RUNNING)
6357 + printk(KERN_CONT " running ");
6359 + printk(KERN_CONT " %08lx ", thread_saved_pc(p));
6361 + if (state == TASK_RUNNING)
6362 + printk(KERN_CONT " running task ");
6364 + printk(KERN_CONT " %016lx ", thread_saved_pc(p));
6366 +#ifdef CONFIG_DEBUG_STACK_USAGE
6368 + unsigned long *n = end_of_stack(p);
6371 + free = (unsigned long)n - (unsigned long)end_of_stack(p);
6374 + printk(KERN_CONT "%5lu %5d %6d\n", free,
6375 + task_pid_nr(p), task_pid_nr(p->real_parent));
6377 + show_stack(p, NULL);
6380 +void show_state_filter(unsigned long state_filter)
6382 + struct task_struct *g, *p;
6384 +#if BITS_PER_LONG == 32
6386 + " task PC stack pid father\n");
6389 + " task PC stack pid father\n");
6391 + read_lock(&tasklist_lock);
6392 + do_each_thread(g, p) {
6394 + * reset the NMI-timeout, listing all files on a slow
6395 + * console might take a lot of time:
6397 + touch_nmi_watchdog();
6398 + if (!state_filter || (p->state & state_filter))
6399 + sched_show_task(p);
6400 + } while_each_thread(g, p);
6402 + touch_all_softlockup_watchdogs();
6404 +#ifdef CONFIG_SCHED_DEBUG
6405 + sysrq_sched_debug_show();
6407 + read_unlock(&tasklist_lock);
6409 + * Only show locks if all tasks are dumped:
6411 + if (state_filter == -1)
6412 + debug_show_all_locks();
6415 +void __cpuinit init_idle_bootup_task(struct task_struct *idle)
6417 + idle->sched_class = &idle_sched_class;
6421 + * init_idle - set up an idle thread for a given CPU
6422 + * @idle: task in question
6423 + * @cpu: cpu the idle task belongs to
6425 + * NOTE: this function does not set the idle thread's NEED_RESCHED
6426 + * flag, to make booting more robust.
6428 +void __cpuinit init_idle(struct task_struct *idle, int cpu)
6430 + struct rq *rq = cpu_rq(cpu);
6431 + unsigned long flags;
6433 + __sched_fork(idle);
6434 + idle->se.exec_start = sched_clock();
6436 + idle->prio = idle->normal_prio = MAX_PRIO;
6437 + idle->cpus_allowed = cpumask_of_cpu(cpu);
6438 + __set_task_cpu(idle, cpu);
6440 + spin_lock_irqsave(&rq->lock, flags);
6441 + rq->curr = rq->idle = idle;
6442 +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
6445 + spin_unlock_irqrestore(&rq->lock, flags);
6447 + /* Set the preempt count _outside_ the spinlocks! */
6448 +#if defined(CONFIG_PREEMPT)
6449 + task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
6451 + task_thread_info(idle)->preempt_count = 0;
6454 + * The idle tasks have their own, simple scheduling class:
6456 + idle->sched_class = &idle_sched_class;
6460 + * In a system that switches off the HZ timer nohz_cpu_mask
6461 + * indicates which cpus entered this state. This is used
6462 + * in the rcu update to wait only for active cpus. For systems
6463 + * which do not switch off the HZ timer, nohz_cpu_mask should
6464 + * always be CPU_MASK_NONE.
6466 +cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
6469 + * Increase the granularity value when there are more CPUs,
6470 + * because with more CPUs the 'effective latency' as visible
6471 + * to users decreases. But the relationship is not linear,
6472 + * so pick a second-best guess by going with the log2 of the
6475 + * This idea comes from the SD scheduler of Con Kolivas:
6477 +static inline void sched_init_granularity(void)
6479 + unsigned int factor = 1 + ilog2(num_online_cpus());
6480 + const unsigned long limit = 200000000;
6482 + sysctl_sched_min_granularity *= factor;
6483 + if (sysctl_sched_min_granularity > limit)
6484 + sysctl_sched_min_granularity = limit;
6486 + sysctl_sched_latency *= factor;
6487 + if (sysctl_sched_latency > limit)
6488 + sysctl_sched_latency = limit;
6490 + sysctl_sched_wakeup_granularity *= factor;
6492 + sysctl_sched_shares_ratelimit *= factor;
6497 + * This is how migration works:
6499 + * 1) we queue a struct migration_req structure in the source CPU's
6500 + * runqueue and wake up that CPU's migration thread.
6501 + * 2) we down() the locked semaphore => thread blocks.
6502 + * 3) migration thread wakes up (implicitly it forces the migrated
6503 + * thread off the CPU)
6504 + * 4) it gets the migration request and checks whether the migrated
6505 + * task is still in the wrong runqueue.
6506 + * 5) if it's in the wrong runqueue then the migration thread removes
6507 + * it and puts it into the right queue.
6508 + * 6) migration thread up()s the semaphore.
6509 + * 7) we wake up and the migration is done.
6513 + * Change a given task's CPU affinity. Migrate the thread to a
6514 + * proper CPU and schedule it away if the CPU it's executing on
6515 + * is removed from the allowed bitmask.
6517 + * NOTE: the caller must have a valid reference to the task, the
6518 + * task must not exit() & deallocate itself prematurely. The
6519 + * call is not atomic; no spinlocks may be held.
6521 +int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
6523 + struct migration_req req;
6524 + unsigned long flags;
6528 + rq = task_rq_lock(p, &flags);
6529 + if (!cpus_intersects(*new_mask, cpu_online_map)) {
6534 + if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
6535 + !cpus_equal(p->cpus_allowed, *new_mask))) {
6540 + if (p->sched_class->set_cpus_allowed)
6541 + p->sched_class->set_cpus_allowed(p, new_mask);
6543 + p->cpus_allowed = *new_mask;
6544 + p->rt.nr_cpus_allowed = cpus_weight(*new_mask);
6547 + /* Can the task run on the task's current CPU? If so, we're done */
6548 + if (cpu_isset(task_cpu(p), *new_mask))
6551 + if (migrate_task(p, any_online_cpu(*new_mask), &req)) {
6552 + /* Need help from migration thread: drop lock and wait. */
6553 + task_rq_unlock(rq, &flags);
6554 + wake_up_process(rq->migration_thread);
6555 + wait_for_completion(&req.done);
6556 + tlb_migrate_finish(p->mm);
6560 + task_rq_unlock(rq, &flags);
6564 +EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
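The wait at the end of set_cpus_allowed_ptr() is the queue-kick-wait hand-off from the 7-step recipe above: the caller sleeps on a completion embedded in the request until the migration thread finishes. A pthreads rendering of that hand-off, all names illustrative:

#include <pthread.h>

struct req {
        int done;
        pthread_mutex_t lock;
        pthread_cond_t cv;
};

static void wait_for_req(struct req *r)         /* caller side */
{
        pthread_mutex_lock(&r->lock);
        while (!r->done)
                pthread_cond_wait(&r->cv, &r->lock);
        pthread_mutex_unlock(&r->lock);
}

static void complete_req(struct req *r)         /* migration-thread side */
{
        pthread_mutex_lock(&r->lock);
        r->done = 1;
        pthread_cond_signal(&r->cv);
        pthread_mutex_unlock(&r->lock);
}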
6567 + * Move (not current) task off this cpu, onto dest cpu. We're doing
6568 + * this because either it can't run here any more (set_cpus_allowed()
6569 + * away from this CPU, or CPU going down), or because we're
6570 + * attempting to rebalance this task on exec (sched_exec).
6572 + * So we race with normal scheduler movements, but that's OK, as long
6573 + * as the task is no longer on this CPU.
6575 + * Returns non-zero if task was successfully migrated.
6577 +static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
6579 + struct rq *rq_dest, *rq_src;
6580 + int ret = 0, on_rq;
6582 + if (unlikely(!cpu_active(dest_cpu)))
6585 + rq_src = cpu_rq(src_cpu);
6586 + rq_dest = cpu_rq(dest_cpu);
6588 + double_rq_lock(rq_src, rq_dest);
6589 + /* Already moved. */
6590 + if (task_cpu(p) != src_cpu)
6592 + /* Affinity changed (again). */
6593 + if (!cpu_isset(dest_cpu, p->cpus_allowed))
6596 + on_rq = p->se.on_rq;
6598 + deactivate_task(rq_src, p, 0);
6600 + set_task_cpu(p, dest_cpu);
6602 + activate_task(rq_dest, p, 0);
6603 + check_preempt_curr(rq_dest, p);
6608 + double_rq_unlock(rq_src, rq_dest);
6613 + * migration_thread - this is a highprio system thread that performs
6614 + * thread migration by bumping the thread off its CPU, then 'pushing' it onto
6615 + * another runqueue.
6617 +static int migration_thread(void *data)
6619 + int cpu = (long)data;
6623 + BUG_ON(rq->migration_thread != current);
6625 + set_current_state(TASK_INTERRUPTIBLE);
6626 + while (!kthread_should_stop()) {
6627 + struct migration_req *req;
6628 + struct list_head *head;
6630 + spin_lock_irq(&rq->lock);
6632 + if (cpu_is_offline(cpu)) {
6633 + spin_unlock_irq(&rq->lock);
6637 + if (rq->active_balance) {
6638 + active_load_balance(rq, cpu);
6639 + rq->active_balance = 0;
6642 + head = &rq->migration_queue;
6644 + if (list_empty(head)) {
6645 + spin_unlock_irq(&rq->lock);
6647 + set_current_state(TASK_INTERRUPTIBLE);
6650 + req = list_entry(head->next, struct migration_req, list);
6651 + list_del_init(head->next);
6653 + spin_unlock(&rq->lock);
6654 + __migrate_task(req->task, cpu, req->dest_cpu);
6655 + local_irq_enable();
6657 + complete(&req->done);
6659 + __set_current_state(TASK_RUNNING);
6663 + /* Wait for kthread_stop */
6664 + set_current_state(TASK_INTERRUPTIBLE);
6665 + while (!kthread_should_stop()) {
6667 + set_current_state(TASK_INTERRUPTIBLE);
6669 + __set_current_state(TASK_RUNNING);
6673 +#ifdef CONFIG_HOTPLUG_CPU
6675 +static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
6679 + local_irq_disable();
6680 + ret = __migrate_task(p, src_cpu, dest_cpu);
6681 + local_irq_enable();
6686 + * Figure out where task on dead CPU should go, use force if necessary.
6687 + * NOTE: interrupts should be disabled by the caller
6689 +static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
6691 + unsigned long flags;
6697 + /* On same node? */
6698 + mask = node_to_cpumask(cpu_to_node(dead_cpu));
6699 + cpus_and(mask, mask, p->cpus_allowed);
6700 + dest_cpu = any_online_cpu(mask);
6702 + /* On any allowed CPU? */
6703 + if (dest_cpu >= nr_cpu_ids)
6704 + dest_cpu = any_online_cpu(p->cpus_allowed);
6706 + /* No more Mr. Nice Guy. */
6707 + if (dest_cpu >= nr_cpu_ids) {
6708 + cpumask_t cpus_allowed;
6710 + cpuset_cpus_allowed_locked(p, &cpus_allowed);
6712 + * Try to stay on the same cpuset, where the
6713 + * current cpuset may be a subset of all cpus.
6714 + * The cpuset_cpus_allowed_locked() variant of
6715 + * cpuset_cpus_allowed() will not block. It must be
6716 + * called within calls to cpuset_lock/cpuset_unlock.
6718 + rq = task_rq_lock(p, &flags);
6719 + p->cpus_allowed = cpus_allowed;
6720 + dest_cpu = any_online_cpu(p->cpus_allowed);
6721 + task_rq_unlock(rq, &flags);
6724 + * Don't tell them about moving exiting tasks or
6725 + * kernel threads (both mm NULL), since they never
6728 + if (p->mm && printk_ratelimit()) {
6729 + printk(KERN_INFO "process %d (%s) no "
6730 + "longer affine to cpu%d\n",
6731 + task_pid_nr(p), p->comm, dead_cpu);
6734 + } while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
6738 + * While a dead CPU has no uninterruptible tasks queued at this point,
6739 + * it might still have a nonzero ->nr_uninterruptible counter, because
6740 + * for performance reasons the counter is not strictly tracking tasks to
6741 + * their home CPUs. So we just add the counter to another CPU's counter,
6742 + * to keep the global sum constant after CPU-down:
6744 +static void migrate_nr_uninterruptible(struct rq *rq_src)
6746 + struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR));
6747 + unsigned long flags;
6749 + local_irq_save(flags);
6750 + double_rq_lock(rq_src, rq_dest);
6751 + rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
6752 + rq_src->nr_uninterruptible = 0;
6753 + double_rq_unlock(rq_src, rq_dest);
6754 + local_irq_restore(flags);
6757 +/* Run through task list and migrate tasks from the dead cpu. */
6758 +static void migrate_live_tasks(int src_cpu)
6760 + struct task_struct *p, *t;
6762 + read_lock(&tasklist_lock);
6764 + do_each_thread(t, p) {
6768 + if (task_cpu(p) == src_cpu)
6769 + move_task_off_dead_cpu(src_cpu, p);
6770 + } while_each_thread(t, p);
6772 + read_unlock(&tasklist_lock);
6776 + * Schedules idle task to be the next runnable task on current CPU.
6777 + * It does so by boosting its priority to the highest possible.
6778 + * Used by CPU offline code.
6780 +void sched_idle_next(void)
6782 + int this_cpu = smp_processor_id();
6783 + struct rq *rq = cpu_rq(this_cpu);
6784 + struct task_struct *p = rq->idle;
6785 + unsigned long flags;
6787 + /* cpu has to be offline */
6788 + BUG_ON(cpu_online(this_cpu));
6791 + * Strictly not necessary since the rest of the CPUs are stopped by now
6792 + * and interrupts disabled on the current cpu.
6794 + spin_lock_irqsave(&rq->lock, flags);
6796 + __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
6798 + update_rq_clock(rq);
6799 + activate_task(rq, p, 0);
6801 + spin_unlock_irqrestore(&rq->lock, flags);
6805 + * Ensures that the idle task is using init_mm right before its cpu goes
6808 +void idle_task_exit(void)
6810 + struct mm_struct *mm = current->active_mm;
6812 + BUG_ON(cpu_online(smp_processor_id()));
6814 + if (mm != &init_mm)
6815 + switch_mm(mm, &init_mm, current);
6819 +/* called under rq->lock with disabled interrupts */
6820 +static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
6822 + struct rq *rq = cpu_rq(dead_cpu);
6824 + /* Must be exiting, otherwise would be on tasklist. */
6825 + BUG_ON(!p->exit_state);
6827 + /* Cannot have done final schedule yet: would have vanished. */
6828 + BUG_ON(p->state == TASK_DEAD);
6830 + get_task_struct(p);
6833 + * Drop lock around migration; if someone else moves it,
6834 + * that's OK. No task can be added to this CPU, so iteration is
6837 + spin_unlock_irq(&rq->lock);
6838 + move_task_off_dead_cpu(dead_cpu, p);
6839 + spin_lock_irq(&rq->lock);
6841 + put_task_struct(p);
6844 +/* release_task() removes task from tasklist, so we won't find dead tasks. */
6845 +static void migrate_dead_tasks(unsigned int dead_cpu)
6847 + struct rq *rq = cpu_rq(dead_cpu);
6848 + struct task_struct *next;
6851 + if (!rq->nr_running)
6853 + update_rq_clock(rq);
6854 + next = pick_next_task(rq, rq->curr);
6857 + next->sched_class->put_prev_task(rq, next);
6858 + migrate_dead(dead_cpu, next);
6862 +#endif /* CONFIG_HOTPLUG_CPU */
6864 +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
6866 +static struct ctl_table sd_ctl_dir[] = {
6868 + .procname = "sched_domain",
6874 +static struct ctl_table sd_ctl_root[] = {
6876 + .ctl_name = CTL_KERN,
6877 + .procname = "kernel",
6879 + .child = sd_ctl_dir,
6884 +static struct ctl_table *sd_alloc_ctl_entry(int n)
6886 + struct ctl_table *entry =
6887 + kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
6892 +static void sd_free_ctl_entry(struct ctl_table **tablep)
6894 + struct ctl_table *entry;
6897 + * In the intermediate directories, both the child directory and
6898 + * procname are dynamically allocated (their allocation may have
6899 + * failed), but the mode will always be set. In the lowest directory
6900 + * the names are static strings and all entries have proc handlers.
6902 + for (entry = *tablep; entry->mode; entry++) {
6904 + sd_free_ctl_entry(&entry->child);
6905 + if (entry->proc_handler == NULL)
6906 + kfree(entry->procname);
6914 +set_table_entry(struct ctl_table *entry,
6915 + const char *procname, void *data, int maxlen,
6916 + mode_t mode, proc_handler *proc_handler)
6918 + entry->procname = procname;
6919 + entry->data = data;
6920 + entry->maxlen = maxlen;
6921 + entry->mode = mode;
6922 + entry->proc_handler = proc_handler;
6925 +static struct ctl_table *
6926 +sd_alloc_ctl_domain_table(struct sched_domain *sd)
6928 + struct ctl_table *table = sd_alloc_ctl_entry(12);
6930 + if (table == NULL)
6933 + set_table_entry(&table[0], "min_interval", &sd->min_interval,
6934 + sizeof(long), 0644, proc_doulongvec_minmax);
6935 + set_table_entry(&table[1], "max_interval", &sd->max_interval,
6936 + sizeof(long), 0644, proc_doulongvec_minmax);
6937 + set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
6938 + sizeof(int), 0644, proc_dointvec_minmax);
6939 + set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
6940 + sizeof(int), 0644, proc_dointvec_minmax);
6941 + set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
6942 + sizeof(int), 0644, proc_dointvec_minmax);
6943 + set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
6944 + sizeof(int), 0644, proc_dointvec_minmax);
6945 + set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
6946 + sizeof(int), 0644, proc_dointvec_minmax);
6947 + set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
6948 + sizeof(int), 0644, proc_dointvec_minmax);
6949 + set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
6950 + sizeof(int), 0644, proc_dointvec_minmax);
6951 + set_table_entry(&table[9], "cache_nice_tries",
6952 + &sd->cache_nice_tries,
6953 + sizeof(int), 0644, proc_dointvec_minmax);
6954 + set_table_entry(&table[10], "flags", &sd->flags,
6955 + sizeof(int), 0644, proc_dointvec_minmax);
6956 + /* &table[11] is terminator */
6961 +static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
6963 + struct ctl_table *entry, *table;
6964 + struct sched_domain *sd;
6965 + int domain_num = 0, i;
6968 + for_each_domain(cpu, sd)
6970 + entry = table = sd_alloc_ctl_entry(domain_num + 1);
6971 + if (table == NULL)
6975 + for_each_domain(cpu, sd) {
6976 + snprintf(buf, 32, "domain%d", i);
6977 + entry->procname = kstrdup(buf, GFP_KERNEL);
6978 + entry->mode = 0555;
6979 + entry->child = sd_alloc_ctl_domain_table(sd);
6986 +static struct ctl_table_header *sd_sysctl_header;
6987 +static void register_sched_domain_sysctl(void)
6989 + int i, cpu_num = num_online_cpus();
6990 + struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
6993 + WARN_ON(sd_ctl_dir[0].child);
6994 + sd_ctl_dir[0].child = entry;
6996 + if (entry == NULL)
6999 + for_each_online_cpu(i) {
7000 + snprintf(buf, 32, "cpu%d", i);
7001 + entry->procname = kstrdup(buf, GFP_KERNEL);
7002 + entry->mode = 0555;
7003 + entry->child = sd_alloc_ctl_cpu_table(i);
7007 + WARN_ON(sd_sysctl_header);
7008 + sd_sysctl_header = register_sysctl_table(sd_ctl_root);
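+/*
+ * The table built above surfaces as a per-cpu, per-domain tree under
+ * /proc/sys/kernel/sched_domain/. For example (cpu/domain numbers vary
+ * by machine):
+ *
+ *	/proc/sys/kernel/sched_domain/cpu0/domain0/min_interval
+ *	/proc/sys/kernel/sched_domain/cpu0/domain0/imbalance_pct
+ *
+ * Each leaf entry is created with mode 0644, so it can be tuned at
+ * runtime through procfs.
+ */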
7011 +/* may be called multiple times per register */
7012 +static void unregister_sched_domain_sysctl(void)
7014 + if (sd_sysctl_header)
7015 + unregister_sysctl_table(sd_sysctl_header);
7016 + sd_sysctl_header = NULL;
7017 + if (sd_ctl_dir[0].child)
7018 + sd_free_ctl_entry(&sd_ctl_dir[0].child);
7021 +static void register_sched_domain_sysctl(void)
7024 +static void unregister_sched_domain_sysctl(void)
7029 +static void set_rq_online(struct rq *rq)
7031 + if (!rq->online) {
7032 + const struct sched_class *class;
7034 + cpu_set(rq->cpu, rq->rd->online);
7037 + for_each_class(class) {
7038 + if (class->rq_online)
7039 + class->rq_online(rq);
7044 +static void set_rq_offline(struct rq *rq)
7047 + const struct sched_class *class;
7049 + for_each_class(class) {
7050 + if (class->rq_offline)
7051 + class->rq_offline(rq);
7054 + cpu_clear(rq->cpu, rq->rd->online);
7060 + * migration_call - callback that gets triggered when a CPU is added.
7061 + * Here we can start up the necessary migration thread for the new CPU.
7063 +static int __cpuinit
7064 +migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7066 + struct task_struct *p;
7067 + int cpu = (long)hcpu;
7068 + unsigned long flags;
7073 + case CPU_UP_PREPARE:
7074 + case CPU_UP_PREPARE_FROZEN:
7075 + p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
7077 + return NOTIFY_BAD;
7078 + kthread_bind(p, cpu);
7079 + /* Must be high prio: stop_machine expects to yield to it. */
7080 + rq = task_rq_lock(p, &flags);
7081 + __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
7082 + task_rq_unlock(rq, &flags);
7083 + cpu_rq(cpu)->migration_thread = p;
7087 + case CPU_ONLINE_FROZEN:
7088 + /* Strictly unnecessary, as first user will wake it. */
7089 + wake_up_process(cpu_rq(cpu)->migration_thread);
7091 + /* Update our root-domain */
7093 + spin_lock_irqsave(&rq->lock, flags);
7095 + BUG_ON(!cpu_isset(cpu, rq->rd->span));
7097 + set_rq_online(rq);
7099 + spin_unlock_irqrestore(&rq->lock, flags);
7102 +#ifdef CONFIG_HOTPLUG_CPU
7103 + case CPU_UP_CANCELED:
7104 + case CPU_UP_CANCELED_FROZEN:
7105 + if (!cpu_rq(cpu)->migration_thread)
7107 + /* Unbind it from offline cpu so it can run. Fall thru. */
7108 + kthread_bind(cpu_rq(cpu)->migration_thread,
7109 + any_online_cpu(cpu_online_map));
7110 + kthread_stop(cpu_rq(cpu)->migration_thread);
7111 + cpu_rq(cpu)->migration_thread = NULL;
7115 + case CPU_DEAD_FROZEN:
7116 + cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
7117 + migrate_live_tasks(cpu);
7119 + kthread_stop(rq->migration_thread);
7120 + rq->migration_thread = NULL;
7121 + /* Idle task back to normal (off runqueue, low prio) */
7122 + spin_lock_irq(&rq->lock);
7123 + update_rq_clock(rq);
7124 + deactivate_task(rq, rq->idle, 0);
7125 + rq->idle->static_prio = MAX_PRIO;
7126 + __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
7127 + rq->idle->sched_class = &idle_sched_class;
7128 + migrate_dead_tasks(cpu);
7129 + spin_unlock_irq(&rq->lock);
7131 + migrate_nr_uninterruptible(rq);
7132 + BUG_ON(rq->nr_running != 0);
7135 + * No need to migrate the tasks: it was best-effort if
7136 + * they didn't take sched_hotcpu_mutex. Just wake up
7139 + spin_lock_irq(&rq->lock);
7140 + while (!list_empty(&rq->migration_queue)) {
7141 + struct migration_req *req;
7143 + req = list_entry(rq->migration_queue.next,
7144 + struct migration_req, list);
7145 + list_del_init(&req->list);
7146 + spin_unlock_irq(&rq->lock);
7147 + complete(&req->done);
7148 + spin_lock_irq(&rq->lock);
7150 + spin_unlock_irq(&rq->lock);
7154 + case CPU_DYING_FROZEN:
7155 + /* Update our root-domain */
7157 + spin_lock_irqsave(&rq->lock, flags);
7159 + BUG_ON(!cpu_isset(cpu, rq->rd->span));
7160 + set_rq_offline(rq);
7162 + spin_unlock_irqrestore(&rq->lock, flags);
7169 +/* Register at highest priority so that task migration (migrate_all_tasks)
7170 + * happens before everything else.
7172 +static struct notifier_block __cpuinitdata migration_notifier = {
7173 + .notifier_call = migration_call,
7177 +static int __init migration_init(void)
7179 + void *cpu = (void *)(long)smp_processor_id();
7182 + /* Start one for the boot CPU: */
7183 + err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
7184 + BUG_ON(err == NOTIFY_BAD);
7185 + migration_call(&migration_notifier, CPU_ONLINE, cpu);
7186 + register_cpu_notifier(&migration_notifier);
7190 +early_initcall(migration_init);
7195 +#ifdef CONFIG_SCHED_DEBUG
7197 +static inline const char *sd_level_to_string(enum sched_domain_level lvl)
7202 + case SD_LV_SIBLING:
7210 + case SD_LV_ALLNODES:
7211 + return "ALLNODES";
7219 +static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7220 + cpumask_t *groupmask)
7222 + struct sched_group *group = sd->groups;
7225 + cpulist_scnprintf(str, sizeof(str), sd->span);
7226 + cpus_clear(*groupmask);
7228 + printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
7230 + if (!(sd->flags & SD_LOAD_BALANCE)) {
7231 + printk("does not load-balance\n");
7233 + printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
7238 + printk(KERN_CONT "span %s level %s\n",
7239 + str, sd_level_to_string(sd->level));
7241 + if (!cpu_isset(cpu, sd->span)) {
7242 + printk(KERN_ERR "ERROR: domain->span does not contain "
7245 + if (!cpu_isset(cpu, group->cpumask)) {
7246 + printk(KERN_ERR "ERROR: domain->groups does not contain"
7250 + printk(KERN_DEBUG "%*s groups:", level + 1, "");
7254 + printk(KERN_ERR "ERROR: group is NULL\n");
7258 + if (!group->__cpu_power) {
7259 + printk(KERN_CONT "\n");
7260 + printk(KERN_ERR "ERROR: domain->cpu_power not "
7265 + if (!cpus_weight(group->cpumask)) {
7266 + printk(KERN_CONT "\n");
7267 + printk(KERN_ERR "ERROR: empty group\n");
7271 + if (cpus_intersects(*groupmask, group->cpumask)) {
7272 + printk(KERN_CONT "\n");
7273 + printk(KERN_ERR "ERROR: repeated CPUs\n");
7277 + cpus_or(*groupmask, *groupmask, group->cpumask);
7279 + cpulist_scnprintf(str, sizeof(str), group->cpumask);
7280 + printk(KERN_CONT " %s", str);
7282 + group = group->next;
7283 + } while (group != sd->groups);
7284 + printk(KERN_CONT "\n");
7286 + if (!cpus_equal(sd->span, *groupmask))
7287 + printk(KERN_ERR "ERROR: groups don't span domain->span\n");
7289 + if (sd->parent && !cpus_subset(*groupmask, sd->parent->span))
7290 + printk(KERN_ERR "ERROR: parent span is not a superset "
7291 + "of domain->span\n");
7295 +static void sched_domain_debug(struct sched_domain *sd, int cpu)
7297 + cpumask_t *groupmask;
7301 + printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
7305 + printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
7307 + groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
7309 + printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
7314 + if (sched_domain_debug_one(sd, cpu, level, groupmask))
7323 +#else /* !CONFIG_SCHED_DEBUG */
7324 +# define sched_domain_debug(sd, cpu) do { } while (0)
7325 +#endif /* CONFIG_SCHED_DEBUG */
7327 +static int sd_degenerate(struct sched_domain *sd)
7329 + if (cpus_weight(sd->span) == 1)
7332 + /* Following flags need at least 2 groups */
7333 + if (sd->flags & (SD_LOAD_BALANCE |
7334 + SD_BALANCE_NEWIDLE |
7337 + SD_SHARE_CPUPOWER |
7338 + SD_SHARE_PKG_RESOURCES)) {
7339 + if (sd->groups != sd->groups->next)
7343 + /* Following flags don't use groups */
7344 + if (sd->flags & (SD_WAKE_IDLE |
7353 +sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
7355 + unsigned long cflags = sd->flags, pflags = parent->flags;
7357 + if (sd_degenerate(parent))
7360 + if (!cpus_equal(sd->span, parent->span))
7363 + /* Does parent contain flags not in child? */
7364 + /* WAKE_BALANCE is a subset of WAKE_AFFINE */
7365 + if (cflags & SD_WAKE_AFFINE)
7366 + pflags &= ~SD_WAKE_BALANCE;
7367 + /* Flags needing groups don't count if only 1 group in parent */
7368 + if (parent->groups == parent->groups->next) {
7369 + pflags &= ~(SD_LOAD_BALANCE |
7370 + SD_BALANCE_NEWIDLE |
7373 + SD_SHARE_CPUPOWER |
7374 + SD_SHARE_PKG_RESOURCES);
7376 + if (~cflags & pflags)
7382 +static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7384 + unsigned long flags;
7386 + spin_lock_irqsave(&rq->lock, flags);
7389 + struct root_domain *old_rd = rq->rd;
7391 + if (cpu_isset(rq->cpu, old_rd->online))
7392 + set_rq_offline(rq);
7394 + cpu_clear(rq->cpu, old_rd->span);
7396 + if (atomic_dec_and_test(&old_rd->refcount))
7400 + atomic_inc(&rd->refcount);
7403 + cpu_set(rq->cpu, rd->span);
7404 + if (cpu_isset(rq->cpu, cpu_online_map))
7405 + set_rq_online(rq);
7407 + spin_unlock_irqrestore(&rq->lock, flags);
7410 +static void init_rootdomain(struct root_domain *rd)
7412 + memset(rd, 0, sizeof(*rd));
7414 + cpus_clear(rd->span);
7415 + cpus_clear(rd->online);
7417 + cpupri_init(&rd->cpupri);
7420 +static void init_defrootdomain(void)
7422 + init_rootdomain(&def_root_domain);
7423 + atomic_set(&def_root_domain.refcount, 1);
7426 +static struct root_domain *alloc_rootdomain(void)
7428 + struct root_domain *rd;
7430 + rd = kmalloc(sizeof(*rd), GFP_KERNEL);
7434 + init_rootdomain(rd);
7440 + * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
7441 + * hold the hotplug lock.
7444 +cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
7446 + struct rq *rq = cpu_rq(cpu);
7447 + struct sched_domain *tmp;
7449 + /* Remove the sched domains which do not contribute to scheduling. */
7450 + for (tmp = sd; tmp; ) {
7451 + struct sched_domain *parent = tmp->parent;
7455 + if (sd_parent_degenerate(tmp, parent)) {
7456 + tmp->parent = parent->parent;
7457 + if (parent->parent)
7458 + parent->parent->child = tmp;
7460 + tmp = tmp->parent;
7463 + if (sd && sd_degenerate(sd)) {
7469 + sched_domain_debug(sd, cpu);
7471 + rq_attach_root(rq, rd);
7472 + rcu_assign_pointer(rq->sd, sd);
7475 +/* cpus with isolated domains */
7476 +static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
7478 +/* Setup the mask of cpus configured for isolated domains */
7479 +static int __init isolated_cpu_setup(char *str)
7481 + static int __initdata ints[NR_CPUS];
7484 + str = get_options(str, ARRAY_SIZE(ints), ints);
7485 + cpus_clear(cpu_isolated_map);
7486 + for (i = 1; i <= ints[0]; i++)
7487 + if (ints[i] < NR_CPUS)
7488 + cpu_set(ints[i], cpu_isolated_map);
7492 +__setup("isolcpus=", isolated_cpu_setup);
7495 + * init_sched_build_groups takes the cpumask we wish to span, and a pointer
7496 + * to a function which identifies what group (along with the sched group) a
7497 + * CPU belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
7498 + * (because we keep track of the groups covered with a cpumask_t).
7500 + * init_sched_build_groups will build a circular linked list of the groups
7501 + * covered by the given span, and will set each group's ->cpumask correctly,
7502 + * and ->cpu_power to 0.
7505 +init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
7506 + int (*group_fn)(int cpu, const cpumask_t *cpu_map,
7507 + struct sched_group **sg,
7508 + cpumask_t *tmpmask),
7509 + cpumask_t *covered, cpumask_t *tmpmask)
7511 + struct sched_group *first = NULL, *last = NULL;
7514 + cpus_clear(*covered);
7516 + for_each_cpu_mask_nr(i, *span) {
7517 + struct sched_group *sg;
7518 + int group = group_fn(i, cpu_map, &sg, tmpmask);
7521 + if (cpu_isset(i, *covered))
7524 + cpus_clear(sg->cpumask);
7525 + sg->__cpu_power = 0;
7527 + for_each_cpu_mask_nr(j, *span) {
7528 + if (group_fn(j, cpu_map, NULL, tmpmask) != group)
7531 + cpu_set(j, *covered);
7532 + cpu_set(j, sg->cpumask);
7540 + last->next = first;
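+/*
+ * Minimal group_fn sketch (illustrative; it mirrors cpu_to_cpu_group()
+ * below): map every CPU to its own singleton group and return the CPU
+ * number as the group index:
+ *
+ *	static int one_cpu_group_fn(int cpu, const cpumask_t *cpu_map,
+ *				    struct sched_group **sg, cpumask_t *unused)
+ *	{
+ *		if (sg)
+ *			*sg = &per_cpu(sched_group_cpus, cpu);
+ *		return cpu;
+ *	}
+ */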
7543 +#define SD_NODES_PER_DOMAIN 16
7548 + * find_next_best_node - find the next node to include in a sched_domain
7549 + * @node: node whose sched_domain we're building
7550 + * @used_nodes: nodes already in the sched_domain
7552 + * Find the next node to include in a given scheduling domain. Simply
7553 + * finds the closest node not already in the @used_nodes map.
7555 + * Should use nodemask_t.
7557 +static int find_next_best_node(int node, nodemask_t *used_nodes)
7559 + int i, n, val, min_val, best_node = 0;
7561 + min_val = INT_MAX;
7563 + for (i = 0; i < nr_node_ids; i++) {
7564 + /* Start at @node */
7565 + n = (node + i) % nr_node_ids;
7567 + if (!nr_cpus_node(n))
7570 + /* Skip already used nodes */
7571 + if (node_isset(n, *used_nodes))
7574 + /* Simple min distance search */
7575 + val = node_distance(node, n);
7577 + if (val < min_val) {
7583 + node_set(best_node, *used_nodes);
7588 + * sched_domain_node_span - get a cpumask for a node's sched_domain
7589 + * @node: node whose cpumask we're constructing
7590 + * @span: resulting cpumask
7592 + * Given a node, construct a good cpumask for its sched_domain to span. It
7593 + * should be one that prevents unnecessary balancing, but also spreads tasks
7596 +static void sched_domain_node_span(int node, cpumask_t *span)
7598 + nodemask_t used_nodes;
7599 + node_to_cpumask_ptr(nodemask, node);
7602 + cpus_clear(*span);
7603 + nodes_clear(used_nodes);
7605 + cpus_or(*span, *span, *nodemask);
7606 + node_set(node, used_nodes);
7608 + for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
7609 + int next_node = find_next_best_node(node, &used_nodes);
7611 + node_to_cpumask_ptr_next(nodemask, next_node);
7612 + cpus_or(*span, *span, *nodemask);
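+/*
+ * Worked example (hypothetical distances): with nodes 0..3 where
+ * node_distance(0,1) = 10, node_distance(0,2) = 20 and
+ * node_distance(0,3) = 30, node 0's span grows in the order 0, 1, 2, 3,
+ * always pulling in the closest node not yet used.
+ */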
7615 +#endif /* CONFIG_NUMA */
7617 +int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
7620 + * SMT sched-domains:
7622 +#ifdef CONFIG_SCHED_SMT
7623 +static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
7624 +static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
7627 +cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7628 + cpumask_t *unused)
7631 + *sg = &per_cpu(sched_group_cpus, cpu);
7634 +#endif /* CONFIG_SCHED_SMT */
7637 + * multi-core sched-domains:
7639 +#ifdef CONFIG_SCHED_MC
7640 +static DEFINE_PER_CPU(struct sched_domain, core_domains);
7641 +static DEFINE_PER_CPU(struct sched_group, sched_group_core);
7642 +#endif /* CONFIG_SCHED_MC */
7644 +#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
7646 +cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7651 + *mask = per_cpu(cpu_sibling_map, cpu);
7652 + cpus_and(*mask, *mask, *cpu_map);
7653 + group = first_cpu(*mask);
7655 + *sg = &per_cpu(sched_group_core, group);
7658 +#elif defined(CONFIG_SCHED_MC)
7660 +cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7661 + cpumask_t *unused)
7664 + *sg = &per_cpu(sched_group_core, cpu);
7669 +static DEFINE_PER_CPU(struct sched_domain, phys_domains);
7670 +static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
7673 +cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7677 +#ifdef CONFIG_SCHED_MC
7678 + *mask = cpu_coregroup_map(cpu);
7679 + cpus_and(*mask, *mask, *cpu_map);
7680 + group = first_cpu(*mask);
7681 +#elif defined(CONFIG_SCHED_SMT)
7682 + *mask = per_cpu(cpu_sibling_map, cpu);
7683 + cpus_and(*mask, *mask, *cpu_map);
7684 + group = first_cpu(*mask);
7689 + *sg = &per_cpu(sched_group_phys, group);
7695 + * init_sched_build_groups() can't handle what we want to do with node
7696 + * groups, so roll our own. Now each node has its own list of groups, which
7697 + * gets dynamically allocated.
7699 +static DEFINE_PER_CPU(struct sched_domain, node_domains);
7700 +static struct sched_group ***sched_group_nodes_bycpu;
7702 +static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
7703 +static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
7705 +static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
7706 + struct sched_group **sg, cpumask_t *nodemask)
7710 + *nodemask = node_to_cpumask(cpu_to_node(cpu));
7711 + cpus_and(*nodemask, *nodemask, *cpu_map);
7712 + group = first_cpu(*nodemask);
7715 + *sg = &per_cpu(sched_group_allnodes, group);
7719 +static void init_numa_sched_groups_power(struct sched_group *group_head)
7721 + struct sched_group *sg = group_head;
7727 + for_each_cpu_mask_nr(j, sg->cpumask) {
7728 + struct sched_domain *sd;
7730 + sd = &per_cpu(phys_domains, j);
7731 + if (j != first_cpu(sd->groups->cpumask)) {
7733 + * Only add "power" once for each
7734 + * physical package.
7739 + sg_inc_cpu_power(sg, sd->groups->__cpu_power);
7742 + } while (sg != group_head);
7744 +#endif /* CONFIG_NUMA */
7747 +/* Free memory allocated for various sched_group structures */
7748 +static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
7752 + for_each_cpu_mask_nr(cpu, *cpu_map) {
7753 + struct sched_group **sched_group_nodes
7754 + = sched_group_nodes_bycpu[cpu];
7756 + if (!sched_group_nodes)
7759 + for (i = 0; i < nr_node_ids; i++) {
7760 + struct sched_group *oldsg, *sg = sched_group_nodes[i];
7762 + *nodemask = node_to_cpumask(i);
7763 + cpus_and(*nodemask, *nodemask, *cpu_map);
7764 + if (cpus_empty(*nodemask))
7774 + if (oldsg != sched_group_nodes[i])
7777 + kfree(sched_group_nodes);
7778 + sched_group_nodes_bycpu[cpu] = NULL;
7781 +#else /* !CONFIG_NUMA */
7782 +static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
7785 +#endif /* CONFIG_NUMA */
7788 + * Initialize sched groups cpu_power.
7790 + * cpu_power indicates the capacity of sched group, which is used while
7791 + * distributing the load between different sched groups in a sched domain.
7792 + * Typically, cpu_power for all the groups in a sched domain will be the same
7793 + * unless there are asymmetries in the topology. If there are asymmetries, the
7794 + * group having more cpu_power will pick up more load compared to the group having
7797 + * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
7798 + * the maximum number of tasks a group can handle in the presence of other idle
7799 + * or lightly loaded groups in the same sched domain.
7801 +static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7803 + struct sched_domain *child;
7804 + struct sched_group *group;
7806 + WARN_ON(!sd || !sd->groups);
7808 + if (cpu != first_cpu(sd->groups->cpumask))
7811 + child = sd->child;
7813 + sd->groups->__cpu_power = 0;
7816 + * For perf policy, if the groups in child domain share resources
7817 + * (for example cores sharing some portions of the cache hierarchy
7818 + * or SMT), then set this domain groups cpu_power such that each group
7819 + * can handle only one task, when there are other idle groups in the
7820 + * same sched domain.
7822 + if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
7824 + (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
7825 + sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
7830 + * add cpu_power of each child group to this groups cpu_power
7832 + group = child->groups;
7834 + sg_inc_cpu_power(sd->groups, group->__cpu_power);
7835 + group = group->next;
7836 + } while (group != child->groups);
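+/*
+ * Worked example: if the child (core-level) domain has two groups, each
+ * with __cpu_power = SCHED_LOAD_SCALE (1024), the summation above gives
+ * this physical group __cpu_power = 2048, i.e. it is expected to carry
+ * twice the load of a single core.
+ */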
7840 + * Initializers for schedule domains
7841 + * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
7844 +#define SD_INIT(sd, type) sd_init_##type(sd)
7845 +#define SD_INIT_FUNC(type) \
7846 +static noinline void sd_init_##type(struct sched_domain *sd) \
7848 + memset(sd, 0, sizeof(*sd)); \
7849 + *sd = SD_##type##_INIT; \
7850 + sd->level = SD_LV_##type; \
7855 + SD_INIT_FUNC(ALLNODES)
7856 + SD_INIT_FUNC(NODE)
7858 +#ifdef CONFIG_SCHED_SMT
7859 + SD_INIT_FUNC(SIBLING)
7861 +#ifdef CONFIG_SCHED_MC
7866 + * To minimize stack usage, kmalloc room for cpumasks and share the
7867 + * space as the usage in build_sched_domains() dictates. Used only
7868 + * if the amount of space is significant.
7871 + cpumask_t tmpmask; /* make this one first */
7873 + cpumask_t nodemask;
7874 + cpumask_t this_sibling_map;
7875 + cpumask_t this_core_map;
7877 + cpumask_t send_covered;
7880 + cpumask_t domainspan;
7881 + cpumask_t covered;
7882 + cpumask_t notcovered;
7887 +#define SCHED_CPUMASK_ALLOC 1
7888 +#define SCHED_CPUMASK_FREE(v) kfree(v)
7889 +#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v
7891 +#define SCHED_CPUMASK_ALLOC 0
7892 +#define SCHED_CPUMASK_FREE(v)
7893 +#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v
7896 +#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \
7897 + ((unsigned long)(a) + offsetof(struct allmasks, v))
7899 +static int default_relax_domain_level = -1;
7901 +static int __init setup_relax_domain_level(char *str)
7903 + unsigned long val;
7905 + val = simple_strtoul(str, NULL, 0);
7906 + if (val < SD_LV_MAX)
7907 + default_relax_domain_level = val;
7911 +__setup("relax_domain_level=", setup_relax_domain_level);
7913 +static void set_domain_attribute(struct sched_domain *sd,
7914 + struct sched_domain_attr *attr)
7918 + if (!attr || attr->relax_domain_level < 0) {
7919 + if (default_relax_domain_level < 0)
7922 + request = default_relax_domain_level;
7924 + request = attr->relax_domain_level;
7925 + if (request < sd->level) {
7926 + /* turn off idle balance on this domain */
7927 + sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE);
7929 + /* turn on idle balance on this domain */
7930 + sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE);
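+/*
+ * Usage sketch (illustrative): booting with
+ *
+ *	relax_domain_level=2
+ *
+ * enables SD_WAKE_IDLE_FAR/SD_BALANCE_NEWIDLE for domains at level <= 2
+ * and clears SD_WAKE_IDLE/SD_BALANCE_NEWIDLE above that, per the
+ * request < sd->level test above.
+ */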
7935 + * Build sched domains for a given set of cpus and attach the sched domains
7936 + * to the individual cpus
7938 +static int __build_sched_domains(const cpumask_t *cpu_map,
7939 + struct sched_domain_attr *attr)
7942 + struct root_domain *rd;
7943 + SCHED_CPUMASK_DECLARE(allmasks);
7944 + cpumask_t *tmpmask;
7946 + struct sched_group **sched_group_nodes = NULL;
7947 + int sd_allnodes = 0;
7950 + * Allocate the per-node list of sched groups
7952 + sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
7954 + if (!sched_group_nodes) {
7955 + printk(KERN_WARNING "Can not alloc sched group node list\n");
7960 + rd = alloc_rootdomain();
7962 + printk(KERN_WARNING "Cannot alloc root domain\n");
7964 + kfree(sched_group_nodes);
7969 +#if SCHED_CPUMASK_ALLOC
7970 + /* get space for all scratch cpumask variables */
7971 + allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
7973 + printk(KERN_WARNING "Cannot alloc cpumask array\n");
7976 + kfree(sched_group_nodes);
7981 + tmpmask = (cpumask_t *)allmasks;
7985 + sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
7989 + * Set up domains for cpus specified by the cpu_map.
7991 + for_each_cpu_mask_nr(i, *cpu_map) {
7992 + struct sched_domain *sd = NULL, *p;
7993 + SCHED_CPUMASK_VAR(nodemask, allmasks);
7995 + *nodemask = node_to_cpumask(cpu_to_node(i));
7996 + cpus_and(*nodemask, *nodemask, *cpu_map);
7999 + if (cpus_weight(*cpu_map) >
8000 + SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) {
8001 + sd = &per_cpu(allnodes_domains, i);
8002 + SD_INIT(sd, ALLNODES);
8003 + set_domain_attribute(sd, attr);
8004 + sd->span = *cpu_map;
8005 + cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
8011 + sd = &per_cpu(node_domains, i);
8012 + SD_INIT(sd, NODE);
8013 + set_domain_attribute(sd, attr);
8014 + sched_domain_node_span(cpu_to_node(i), &sd->span);
8018 + cpus_and(sd->span, sd->span, *cpu_map);
8022 + sd = &per_cpu(phys_domains, i);
8024 + set_domain_attribute(sd, attr);
8025 + sd->span = *nodemask;
8029 + cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask);
8031 +#ifdef CONFIG_SCHED_MC
8033 + sd = &per_cpu(core_domains, i);
8035 + set_domain_attribute(sd, attr);
8036 + sd->span = cpu_coregroup_map(i);
8037 + cpus_and(sd->span, sd->span, *cpu_map);
8040 + cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
8043 +#ifdef CONFIG_SCHED_SMT
8045 + sd = &per_cpu(cpu_domains, i);
8046 + SD_INIT(sd, SIBLING);
8047 + set_domain_attribute(sd, attr);
8048 + sd->span = per_cpu(cpu_sibling_map, i);
8049 + cpus_and(sd->span, sd->span, *cpu_map);
8052 + cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
8056 +#ifdef CONFIG_SCHED_SMT
8057 + /* Set up CPU (sibling) groups */
8058 + for_each_cpu_mask_nr(i, *cpu_map) {
8059 + SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
8060 + SCHED_CPUMASK_VAR(send_covered, allmasks);
8062 + *this_sibling_map = per_cpu(cpu_sibling_map, i);
8063 + cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
8064 + if (i != first_cpu(*this_sibling_map))
8067 + init_sched_build_groups(this_sibling_map, cpu_map,
8068 + &cpu_to_cpu_group,
8069 + send_covered, tmpmask);
8073 +#ifdef CONFIG_SCHED_MC
8074 + /* Set up multi-core groups */
8075 + for_each_cpu_mask_nr(i, *cpu_map) {
8076 + SCHED_CPUMASK_VAR(this_core_map, allmasks);
8077 + SCHED_CPUMASK_VAR(send_covered, allmasks);
8079 + *this_core_map = cpu_coregroup_map(i);
8080 + cpus_and(*this_core_map, *this_core_map, *cpu_map);
8081 + if (i != first_cpu(*this_core_map))
8084 + init_sched_build_groups(this_core_map, cpu_map,
8085 + &cpu_to_core_group,
8086 + send_covered, tmpmask);
8090 + /* Set up physical groups */
8091 + for (i = 0; i < nr_node_ids; i++) {
8092 + SCHED_CPUMASK_VAR(nodemask, allmasks);
8093 + SCHED_CPUMASK_VAR(send_covered, allmasks);
8095 + *nodemask = node_to_cpumask(i);
8096 + cpus_and(*nodemask, *nodemask, *cpu_map);
8097 + if (cpus_empty(*nodemask))
8100 + init_sched_build_groups(nodemask, cpu_map,
8101 + &cpu_to_phys_group,
8102 + send_covered, tmpmask);
8106 + /* Set up node groups */
8107 + if (sd_allnodes) {
8108 + SCHED_CPUMASK_VAR(send_covered, allmasks);
8110 + init_sched_build_groups(cpu_map, cpu_map,
8111 + &cpu_to_allnodes_group,
8112 + send_covered, tmpmask);
8115 + for (i = 0; i < nr_node_ids; i++) {
8116 + /* Set up node groups */
8117 + struct sched_group *sg, *prev;
8118 + SCHED_CPUMASK_VAR(nodemask, allmasks);
8119 + SCHED_CPUMASK_VAR(domainspan, allmasks);
8120 + SCHED_CPUMASK_VAR(covered, allmasks);
8123 + *nodemask = node_to_cpumask(i);
8124 + cpus_clear(*covered);
8126 + cpus_and(*nodemask, *nodemask, *cpu_map);
8127 + if (cpus_empty(*nodemask)) {
8128 + sched_group_nodes[i] = NULL;
8132 + sched_domain_node_span(i, domainspan);
8133 + cpus_and(*domainspan, *domainspan, *cpu_map);
8135 + sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
8137 + printk(KERN_WARNING "Can not alloc domain group for "
8141 + sched_group_nodes[i] = sg;
8142 + for_each_cpu_mask_nr(j, *nodemask) {
8143 + struct sched_domain *sd;
8145 + sd = &per_cpu(node_domains, j);
8148 + sg->__cpu_power = 0;
8149 + sg->cpumask = *nodemask;
8151 + cpus_or(*covered, *covered, *nodemask);
8154 + for (j = 0; j < nr_node_ids; j++) {
8155 + SCHED_CPUMASK_VAR(notcovered, allmasks);
8156 + int n = (i + j) % nr_node_ids;
8157 + node_to_cpumask_ptr(pnodemask, n);
8159 + cpus_complement(*notcovered, *covered);
8160 + cpus_and(*tmpmask, *notcovered, *cpu_map);
8161 + cpus_and(*tmpmask, *tmpmask, *domainspan);
8162 + if (cpus_empty(*tmpmask))
8165 + cpus_and(*tmpmask, *tmpmask, *pnodemask);
8166 + if (cpus_empty(*tmpmask))
8169 + sg = kmalloc_node(sizeof(struct sched_group),
8172 + printk(KERN_WARNING
8173 + "Can not alloc domain group for node %d\n", j);
8176 + sg->__cpu_power = 0;
8177 + sg->cpumask = *tmpmask;
8178 + sg->next = prev->next;
8179 + cpus_or(*covered, *covered, *tmpmask);
8186 + /* Calculate CPU power for physical packages and nodes */
8187 +#ifdef CONFIG_SCHED_SMT
8188 + for_each_cpu_mask_nr(i, *cpu_map) {
8189 + struct sched_domain *sd = &per_cpu(cpu_domains, i);
8191 + init_sched_groups_power(i, sd);
8194 +#ifdef CONFIG_SCHED_MC
8195 + for_each_cpu_mask_nr(i, *cpu_map) {
8196 + struct sched_domain *sd = &per_cpu(core_domains, i);
8198 + init_sched_groups_power(i, sd);
8202 + for_each_cpu_mask_nr(i, *cpu_map) {
8203 + struct sched_domain *sd = &per_cpu(phys_domains, i);
8205 + init_sched_groups_power(i, sd);
8209 + for (i = 0; i < nr_node_ids; i++)
8210 + init_numa_sched_groups_power(sched_group_nodes[i]);
8212 + if (sd_allnodes) {
8213 + struct sched_group *sg;
8215 + cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg,
8217 + init_numa_sched_groups_power(sg);
8221 + /* Attach the domains */
8222 + for_each_cpu_mask_nr(i, *cpu_map) {
8223 + struct sched_domain *sd;
8224 +#ifdef CONFIG_SCHED_SMT
8225 + sd = &per_cpu(cpu_domains, i);
8226 +#elif defined(CONFIG_SCHED_MC)
8227 + sd = &per_cpu(core_domains, i);
8229 + sd = &per_cpu(phys_domains, i);
8231 + cpu_attach_domain(sd, rd, i);
8234 + SCHED_CPUMASK_FREE((void *)allmasks);
8239 + free_sched_groups(cpu_map, tmpmask);
8240 + SCHED_CPUMASK_FREE((void *)allmasks);
8245 +static int build_sched_domains(const cpumask_t *cpu_map)
8247 + return __build_sched_domains(cpu_map, NULL);
8250 +static cpumask_t *doms_cur; /* current sched domains */
8251 +static int ndoms_cur; /* number of sched domains in 'doms_cur' */
8252 +static struct sched_domain_attr *dattr_cur;
8253 + /* attributes of custom domains in 'doms_cur' */
8256 + * Special case: If a kmalloc of a doms_cur partition (array of
8257 + * cpumask_t) fails, then fallback to a single sched domain,
8258 + * as determined by the single cpumask_t fallback_doms.
8260 +static cpumask_t fallback_doms;
8262 +void __attribute__((weak)) arch_update_cpu_topology(void)
8267 + * Set up scheduler domains and groups. Callers must hold the hotplug lock.
8268 + * For now this just excludes isolated cpus, but could be used to
8269 + * exclude other special cases in the future.
8271 +static int arch_init_sched_domains(const cpumask_t *cpu_map)
8275 + arch_update_cpu_topology();
8277 + doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
8279 + doms_cur = &fallback_doms;
8280 + cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
8282 + err = build_sched_domains(doms_cur);
8283 + register_sched_domain_sysctl();
8288 +static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
8289 + cpumask_t *tmpmask)
8291 + free_sched_groups(cpu_map, tmpmask);
8295 + * Detach sched domains from a group of cpus specified in cpu_map
8296 + * These cpus will now be attached to the NULL domain
8298 +static void detach_destroy_domains(const cpumask_t *cpu_map)
8300 + cpumask_t tmpmask;
8303 + unregister_sched_domain_sysctl();
8305 + for_each_cpu_mask_nr(i, *cpu_map)
8306 + cpu_attach_domain(NULL, &def_root_domain, i);
8307 + synchronize_sched();
8308 + arch_destroy_sched_domains(cpu_map, &tmpmask);
8311 +/* handle null as "default" */
8312 +static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
8313 + struct sched_domain_attr *new, int idx_new)
8315 + struct sched_domain_attr tmp;
8321 + tmp = SD_ATTR_INIT;
8322 + return !memcmp(cur ? (cur + idx_cur) : &tmp,
8323 + new ? (new + idx_new) : &tmp,
8324 + sizeof(struct sched_domain_attr));
8328 + * Partition sched domains as specified by the 'ndoms_new'
8329 + * cpumasks in the array doms_new[] of cpumasks. This compares
8330 + * doms_new[] to the current sched domain partitioning, doms_cur[].
8331 + * It destroys each deleted domain and builds each new domain.
8333 + * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
8334 + * The masks must not intersect (overlap). We should set up one
8335 + * sched domain for each mask. CPUs not in any of the cpumasks will
8336 + * not be load balanced. If the same cpumask appears both in the
8337 + * current 'doms_cur' domains and in the new 'doms_new', we can leave
8340 + * The passed-in 'doms_new' should be kmalloc'd. This routine takes
8341 + * ownership of it and will kfree it when done with it. If the caller
8342 + * failed the kmalloc call, then it can pass in doms_new == NULL &&
8343 + * ndoms_new == 1, and partition_sched_domains() will fall back to
8344 + * the single partition 'fallback_doms'; it also forces the domains
8347 + * If doms_new == NULL it will be replaced with cpu_online_map.
8348 + * ndoms_new == 0 is a special case for destroying existing domains,
8349 + * and it will not create the default domain.
8351 + * Call with hotplug lock held
8353 +void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
8354 + struct sched_domain_attr *dattr_new)
8358 + mutex_lock(&sched_domains_mutex);
8360 + /* always unregister in case we don't destroy any domains */
8361 + unregister_sched_domain_sysctl();
8363 + n = doms_new ? ndoms_new : 0;
8365 + /* Destroy deleted domains */
8366 + for (i = 0; i < ndoms_cur; i++) {
8367 + for (j = 0; j < n; j++) {
8368 + if (cpus_equal(doms_cur[i], doms_new[j])
8369 + && dattrs_equal(dattr_cur, i, dattr_new, j))
8372 + /* no match - a current sched domain not in new doms_new[] */
8373 + detach_destroy_domains(doms_cur + i);
8378 + if (doms_new == NULL) {
8380 + doms_new = &fallback_doms;
8381 + cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
8385 + /* Build new domains */
8386 + for (i = 0; i < ndoms_new; i++) {
8387 + for (j = 0; j < ndoms_cur; j++) {
8388 + if (cpus_equal(doms_new[i], doms_cur[j])
8389 + && dattrs_equal(dattr_new, i, dattr_cur, j))
8392 + /* no match - add a new doms_new */
8393 + __build_sched_domains(doms_new + i,
8394 + dattr_new ? dattr_new + i : NULL);
8399 + /* Remember the new sched domains */
8400 + if (doms_cur != &fallback_doms)
8402 + kfree(dattr_cur); /* kfree(NULL) is safe */
8403 + doms_cur = doms_new;
8404 + dattr_cur = dattr_new;
8405 + ndoms_cur = ndoms_new;
8407 + register_sched_domain_sysctl();
8409 + mutex_unlock(&sched_domains_mutex);
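+/*
+ * Usage sketch (illustrative; mask_a and mask_b are hypothetical,
+ * non-overlapping cpumasks prepared by the caller):
+ *
+ *	cpumask_t *doms = kmalloc(2 * sizeof(cpumask_t), GFP_KERNEL);
+ *	doms[0] = mask_a;
+ *	doms[1] = mask_b;
+ *	get_online_cpus();
+ *	partition_sched_domains(2, doms, NULL);
+ *	put_online_cpus();
+ *
+ * Ownership of 'doms' passes to the scheduler, which kfree()s it when a
+ * later call replaces the partitioning.
+ */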
8412 +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
8413 +int arch_reinit_sched_domains(void)
8415 + get_online_cpus();
8417 + /* Destroy domains first to force the rebuild */
8418 + partition_sched_domains(0, NULL, NULL);
8420 + rebuild_sched_domains();
8421 + put_online_cpus();
8426 +static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
8430 + if (buf[0] != '0' && buf[0] != '1')
8434 + sched_smt_power_savings = (buf[0] == '1');
8436 + sched_mc_power_savings = (buf[0] == '1');
8438 + ret = arch_reinit_sched_domains();
8440 + return ret ? ret : count;
8443 +#ifdef CONFIG_SCHED_MC
8444 +static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
8447 + return sprintf(page, "%u\n", sched_mc_power_savings);
8449 +static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
8450 + const char *buf, size_t count)
8452 + return sched_power_savings_store(buf, count, 0);
8454 +static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
8455 + sched_mc_power_savings_show,
8456 + sched_mc_power_savings_store);
8459 +#ifdef CONFIG_SCHED_SMT
8460 +static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
8463 + return sprintf(page, "%u\n", sched_smt_power_savings);
8465 +static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
8466 + const char *buf, size_t count)
8468 + return sched_power_savings_store(buf, count, 1);
8470 +static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
8471 + sched_smt_power_savings_show,
8472 + sched_smt_power_savings_store);
8475 +int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
8479 +#ifdef CONFIG_SCHED_SMT
8480 + if (smt_capable())
8481 + err = sysfs_create_file(&cls->kset.kobj,
8482 + &attr_sched_smt_power_savings.attr);
8484 +#ifdef CONFIG_SCHED_MC
8485 + if (!err && mc_capable())
8486 + err = sysfs_create_file(&cls->kset.kobj,
8487 + &attr_sched_mc_power_savings.attr);
8491 +#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
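+/*
+ * Usage sketch (illustrative): on an mc_capable() system,
+ *
+ *	echo 1 > /sys/devices/system/cpu/sched_mc_power_savings
+ *
+ * stores '1' and triggers arch_reinit_sched_domains(); any value other
+ * than '0' or '1' is rejected by sched_power_savings_store().
+ */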
8493 +#ifndef CONFIG_CPUSETS
8495 + * Add online and remove offline CPUs from the scheduler domains.
8496 + * When cpusets are enabled they take over this function.
8498 +static int update_sched_domains(struct notifier_block *nfb,
8499 + unsigned long action, void *hcpu)
8503 + case CPU_ONLINE_FROZEN:
8505 + case CPU_DEAD_FROZEN:
8506 + partition_sched_domains(1, NULL, NULL);
8510 + return NOTIFY_DONE;
8515 +static int update_runtime(struct notifier_block *nfb,
8516 + unsigned long action, void *hcpu)
8518 + int cpu = (int)(long)hcpu;
8521 + case CPU_DOWN_PREPARE:
8522 + case CPU_DOWN_PREPARE_FROZEN:
8523 + disable_runtime(cpu_rq(cpu));
8526 + case CPU_DOWN_FAILED:
8527 + case CPU_DOWN_FAILED_FROZEN:
8529 + case CPU_ONLINE_FROZEN:
8530 + enable_runtime(cpu_rq(cpu));
8534 + return NOTIFY_DONE;
8538 +void __init sched_init_smp(void)
8540 + cpumask_t non_isolated_cpus;
8542 +#if defined(CONFIG_NUMA)
8543 + sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
8545 + BUG_ON(sched_group_nodes_bycpu == NULL);
8547 + get_online_cpus();
8548 + mutex_lock(&sched_domains_mutex);
8549 + arch_init_sched_domains(&cpu_online_map);
8550 + cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
8551 + if (cpus_empty(non_isolated_cpus))
8552 + cpu_set(smp_processor_id(), non_isolated_cpus);
8553 + mutex_unlock(&sched_domains_mutex);
8554 + put_online_cpus();
8556 +#ifndef CONFIG_CPUSETS
8557 + /* XXX: Theoretical race here - CPU may be hotplugged now */
8558 + hotcpu_notifier(update_sched_domains, 0);
8561 + /* RT runtime code needs to handle some hotplug events */
8562 + hotcpu_notifier(update_runtime, 0);
8566 + /* Move init over to a non-isolated CPU */
8567 + if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
8569 + sched_init_granularity();
8572 +void __init sched_init_smp(void)
8574 + sched_init_granularity();
8576 +#endif /* CONFIG_SMP */
8578 +int in_sched_functions(unsigned long addr)
8580 + return in_lock_functions(addr) ||
8581 + (addr >= (unsigned long)__sched_text_start
8582 + && addr < (unsigned long)__sched_text_end);
8585 +static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
8587 + cfs_rq->tasks_timeline = RB_ROOT;
8588 + INIT_LIST_HEAD(&cfs_rq->tasks);
8589 +#ifdef CONFIG_FAIR_GROUP_SCHED
8592 + cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8595 +static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8597 + struct rt_prio_array *array;
8600 + array = &rt_rq->active;
8601 + for (i = 0; i < MAX_RT_PRIO; i++) {
8602 + INIT_LIST_HEAD(array->queue + i);
8603 + __clear_bit(i, array->bitmap);
8605 + /* delimiter for bitsearch: */
8606 + __set_bit(MAX_RT_PRIO, array->bitmap);
8608 +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
8609 + rt_rq->highest_prio = MAX_RT_PRIO;
8612 + rt_rq->rt_nr_migratory = 0;
8613 + rt_rq->overloaded = 0;
8616 + rt_rq->rt_time = 0;
8617 + rt_rq->rt_throttled = 0;
8618 + rt_rq->rt_runtime = 0;
8619 + spin_lock_init(&rt_rq->rt_runtime_lock);
8621 +#ifdef CONFIG_RT_GROUP_SCHED
8622 + rt_rq->rt_nr_boosted = 0;
8627 +#ifdef CONFIG_FAIR_GROUP_SCHED
8628 +static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8629 + struct sched_entity *se, int cpu, int add,
8630 + struct sched_entity *parent)
8632 + struct rq *rq = cpu_rq(cpu);
8633 + tg->cfs_rq[cpu] = cfs_rq;
8634 + init_cfs_rq(cfs_rq, rq);
8637 + list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
8640 + /* se could be NULL for init_task_group */
8645 + se->cfs_rq = &rq->cfs;
8647 + se->cfs_rq = parent->my_q;
8649 + se->my_q = cfs_rq;
8650 + se->load.weight = tg->shares;
8651 + se->load.inv_weight = 0;
8652 + se->parent = parent;
8656 +#ifdef CONFIG_RT_GROUP_SCHED
8657 +static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8658 + struct sched_rt_entity *rt_se, int cpu, int add,
8659 + struct sched_rt_entity *parent)
8661 + struct rq *rq = cpu_rq(cpu);
8663 + tg->rt_rq[cpu] = rt_rq;
8664 + init_rt_rq(rt_rq, rq);
8666 + rt_rq->rt_se = rt_se;
8667 + rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8669 + list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
8671 + tg->rt_se[cpu] = rt_se;
8676 + rt_se->rt_rq = &rq->rt;
8678 + rt_se->rt_rq = parent->my_q;
8680 + rt_se->my_q = rt_rq;
8681 + rt_se->parent = parent;
8682 + INIT_LIST_HEAD(&rt_se->run_list);
8686 +void __init sched_init(void)
8689 + unsigned long alloc_size = 0, ptr;
8691 +#ifdef CONFIG_FAIR_GROUP_SCHED
8692 + alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8694 +#ifdef CONFIG_RT_GROUP_SCHED
8695 + alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8697 +#ifdef CONFIG_USER_SCHED
8701 + * As sched_init() is called before page_alloc is set up,
8702 + * we use alloc_bootmem().
8705 + ptr = (unsigned long)alloc_bootmem(alloc_size);
8707 +#ifdef CONFIG_FAIR_GROUP_SCHED
8708 + init_task_group.se = (struct sched_entity **)ptr;
8709 + ptr += nr_cpu_ids * sizeof(void **);
8711 + init_task_group.cfs_rq = (struct cfs_rq **)ptr;
8712 + ptr += nr_cpu_ids * sizeof(void **);
8714 +#ifdef CONFIG_USER_SCHED
8715 + root_task_group.se = (struct sched_entity **)ptr;
8716 + ptr += nr_cpu_ids * sizeof(void **);
8718 + root_task_group.cfs_rq = (struct cfs_rq **)ptr;
8719 + ptr += nr_cpu_ids * sizeof(void **);
8720 +#endif /* CONFIG_USER_SCHED */
8721 +#endif /* CONFIG_FAIR_GROUP_SCHED */
8722 +#ifdef CONFIG_RT_GROUP_SCHED
8723 + init_task_group.rt_se = (struct sched_rt_entity **)ptr;
8724 + ptr += nr_cpu_ids * sizeof(void **);
8726 + init_task_group.rt_rq = (struct rt_rq **)ptr;
8727 + ptr += nr_cpu_ids * sizeof(void **);
8729 +#ifdef CONFIG_USER_SCHED
8730 + root_task_group.rt_se = (struct sched_rt_entity **)ptr;
8731 + ptr += nr_cpu_ids * sizeof(void **);
8733 + root_task_group.rt_rq = (struct rt_rq **)ptr;
8734 + ptr += nr_cpu_ids * sizeof(void **);
8735 +#endif /* CONFIG_USER_SCHED */
8736 +#endif /* CONFIG_RT_GROUP_SCHED */
8740 + init_defrootdomain();
8743 + init_rt_bandwidth(&def_rt_bandwidth,
8744 + global_rt_period(), global_rt_runtime());
8746 +#ifdef CONFIG_RT_GROUP_SCHED
8747 + init_rt_bandwidth(&init_task_group.rt_bandwidth,
8748 + global_rt_period(), global_rt_runtime());
8749 +#ifdef CONFIG_USER_SCHED
8750 + init_rt_bandwidth(&root_task_group.rt_bandwidth,
8751 + global_rt_period(), RUNTIME_INF);
8752 +#endif /* CONFIG_USER_SCHED */
8753 +#endif /* CONFIG_RT_GROUP_SCHED */
8755 +#ifdef CONFIG_GROUP_SCHED
8756 + list_add(&init_task_group.list, &task_groups);
8757 + INIT_LIST_HEAD(&init_task_group.children);
8759 +#ifdef CONFIG_USER_SCHED
8760 + INIT_LIST_HEAD(&root_task_group.children);
8761 + init_task_group.parent = &root_task_group;
8762 + list_add(&init_task_group.siblings, &root_task_group.children);
8763 +#endif /* CONFIG_USER_SCHED */
8764 +#endif /* CONFIG_GROUP_SCHED */
8766 + for_each_possible_cpu(i) {
8770 + spin_lock_init(&rq->lock);
8771 + rq->nr_running = 0;
8772 + init_cfs_rq(&rq->cfs, rq);
8773 + init_rt_rq(&rq->rt, rq);
8774 +#ifdef CONFIG_FAIR_GROUP_SCHED
8775 + init_task_group.shares = init_task_group_load;
8776 + INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8777 +#ifdef CONFIG_CGROUP_SCHED
8779 + * How much cpu bandwidth does init_task_group get?
8781 + * In case of task-groups formed through the cgroup filesystem, it
8782 + * gets 100% of the cpu resources in the system. This overall
8783 + * system cpu resource is divided among the tasks of
8784 + * init_task_group and its child task-groups in a fair manner,
8785 + * based on each entity's (task or task-group's) weight
8786 + * (se->load.weight).
8788 + * In other words, if init_task_group has 10 tasks (each of weight
8789 + * 1024) and two child groups A0 and A1 (of weight 1024 each),
8790 + * then A0's share of the cpu resource is:
8792 + * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 1/12 = 8.33%
8794 + * We achieve this by letting init_task_group's tasks sit
8795 + * directly in rq->cfs (i.e. init_task_group->se[] = NULL).
8797 + init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
8798 +#elif defined CONFIG_USER_SCHED
8799 + root_task_group.shares = NICE_0_LOAD;
8800 + init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
8802 + * In case of task-groups formed through the user id of tasks,
8803 + * init_task_group represents tasks belonging to the root user.
8804 + * Hence it forms a sibling of all subsequent groups formed.
8805 + * In this case, init_task_group gets only a fraction of the overall
8806 + * system cpu resource, based on the weight assigned to the root
8807 + * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
8808 + * by letting tasks of init_task_group sit in a separate cfs_rq
8809 + * (init_cfs_rq) and having one entity represent this group of
8810 + * tasks in rq->cfs (i.e. init_task_group->se[] != NULL).
8812 + init_tg_cfs_entry(&init_task_group,
8813 + &per_cpu(init_cfs_rq, i),
8814 + &per_cpu(init_sched_entity, i), i, 1,
8815 + root_task_group.se[i]);
8818 +#endif /* CONFIG_FAIR_GROUP_SCHED */
8820 + rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
8821 +#ifdef CONFIG_RT_GROUP_SCHED
8822 + INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
8823 +#ifdef CONFIG_CGROUP_SCHED
8824 + init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
8825 +#elif defined CONFIG_USER_SCHED
8826 + init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
8827 + init_tg_rt_entry(&init_task_group,
8828 + &per_cpu(init_rt_rq, i),
8829 + &per_cpu(init_sched_rt_entity, i), i, 1,
8830 + root_task_group.rt_se[i]);
8834 + for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
8835 + rq->cpu_load[j] = 0;
8839 + rq->active_balance = 0;
8840 + rq->next_balance = jiffies;
8844 + rq->migration_thread = NULL;
8845 + INIT_LIST_HEAD(&rq->migration_queue);
8846 + rq_attach_root(rq, &def_root_domain);
8848 + init_rq_hrtick(rq);
8849 + atomic_set(&rq->nr_iowait, 0);
8852 + set_load_weight(&init_task);
8854 +#ifdef CONFIG_PREEMPT_NOTIFIERS
8855 + INIT_HLIST_HEAD(&init_task.preempt_notifiers);
8859 + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8862 +#ifdef CONFIG_RT_MUTEXES
8863 + plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
8867 + * The boot idle thread does lazy MMU switching as well:
8869 + atomic_inc(&init_mm.mm_count);
8870 + enter_lazy_tlb(&init_mm, current);
8873 + * Make us the idle thread. Technically, schedule() should not be
8874 + * called from this thread; however, somewhere below it might be,
8875 + * but because we are the idle thread, we just pick up running again
8876 + * when this runqueue becomes "idle".
8878 + init_idle(current, smp_processor_id());
8880 + * During early bootup we pretend to be a normal task:
8882 + current->sched_class = &fair_sched_class;
8884 + scheduler_running = 1;
8887 +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
8888 +void __might_sleep(char *file, int line)
8891 + static unsigned long prev_jiffy; /* ratelimiting */
8893 + if ((in_atomic() || irqs_disabled()) &&
8894 + system_state == SYSTEM_RUNNING && !oops_in_progress) {
8895 + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8897 + prev_jiffy = jiffies;
8898 + printk(KERN_ERR "BUG: sleeping function called from invalid"
8899 + " context at %s:%d\n", file, line);
8900 + printk("in_atomic():%d, irqs_disabled():%d\n",
8901 + in_atomic(), irqs_disabled());
8902 + debug_show_held_locks(current);
8903 + if (irqs_disabled())
8904 + print_irqtrace_events(current);
8909 +EXPORT_SYMBOL(__might_sleep);
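+/*
+ * Callers normally reach this through the might_sleep() macro, which
+ * expands (roughly) to
+ *
+ *	__might_sleep(__FILE__, __LINE__);
+ *
+ * so the report above pinpoints the offending call site.
+ */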
8912 +#ifdef CONFIG_MAGIC_SYSRQ
8913 +static void normalize_task(struct rq *rq, struct task_struct *p)
8917 + update_rq_clock(rq);
8918 + on_rq = p->se.on_rq;
8920 + deactivate_task(rq, p, 0);
8921 + __setscheduler(rq, p, SCHED_NORMAL, 0);
8923 + activate_task(rq, p, 0);
8924 + resched_task(rq->curr);
8928 +void normalize_rt_tasks(void)
8930 + struct task_struct *g, *p;
8931 + unsigned long flags;
8934 + read_lock_irqsave(&tasklist_lock, flags);
8935 + do_each_thread(g, p) {
8937 + * Only normalize user tasks:
8942 + p->se.exec_start = 0;
8943 +#ifdef CONFIG_SCHEDSTATS
8944 + p->se.wait_start = 0;
8945 + p->se.sleep_start = 0;
8946 + p->se.block_start = 0;
8949 + if (!rt_task(p)) {
8951 + * Renice negative nice level userspace
8952 + * tasks back to 0:
8954 + if (TASK_NICE(p) < 0 && p->mm)
8955 + set_user_nice(p, 0);
8959 + spin_lock(&p->pi_lock);
8960 + rq = __task_rq_lock(p);
8962 + normalize_task(rq, p);
8964 + __task_rq_unlock(rq);
8965 + spin_unlock(&p->pi_lock);
8966 + } while_each_thread(g, p);
8968 + read_unlock_irqrestore(&tasklist_lock, flags);
8971 +#endif /* CONFIG_MAGIC_SYSRQ */
8975 + * These functions are only useful for the IA64 MCA handling.
8977 + * They can only be called when the whole system has been
8978 + * stopped - every CPU needs to be quiescent, and no scheduling
8979 + * activity can take place. Using them for anything else would
8980 + * be a serious bug, and as a result, they aren't even visible
8981 + * under any other configuration.
8985 + * curr_task - return the current task for a given cpu.
8986 + * @cpu: the processor in question.
8988 + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
8990 +struct task_struct *curr_task(int cpu)
8992 + return cpu_curr(cpu);
8996 + * set_curr_task - set the current task for a given cpu.
8997 + * @cpu: the processor in question.
8998 + * @p: the task pointer to set.
9000 + * Description: This function must only be used when non-maskable interrupts
9001 + * are serviced on a separate stack. It allows the architecture to switch the
9002 + * notion of the current task on a cpu in a non-blocking manner. This function
9003 + * must be called with all CPUs synchronized and interrupts disabled; the
9004 + * caller must save the original value of the current task (see
9005 + * curr_task() above) and restore that value before re-enabling interrupts and
9006 + * re-starting the system.
9008 + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
9010 +void set_curr_task(int cpu, struct task_struct *p)
9012 + cpu_curr(cpu) = p;
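+/*
+ * Usage sketch (illustrative; mca_task is a hypothetical task prepared
+ * by the IA64 MCA code), following the save/restore contract above:
+ *
+ *	struct task_struct *orig = curr_task(cpu);
+ *
+ *	set_curr_task(cpu, mca_task);
+ *	... handle the machine check ...
+ *	set_curr_task(cpu, orig);
+ */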
9017 +#ifdef CONFIG_FAIR_GROUP_SCHED
9018 +static void free_fair_sched_group(struct task_group *tg)
9022 + for_each_possible_cpu(i) {
9024 + kfree(tg->cfs_rq[i]);
9029 + kfree(tg->cfs_rq);
9034 +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9036 + struct cfs_rq *cfs_rq;
9037 + struct sched_entity *se, *parent_se;
9041 + tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
9044 + tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
9048 + tg->shares = NICE_0_LOAD;
9050 + for_each_possible_cpu(i) {
9053 + cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
9054 + GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
9058 + se = kmalloc_node(sizeof(struct sched_entity),
9059 + GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
9063 + parent_se = parent ? parent->se[i] : NULL;
9064 + init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
9073 +static inline void register_fair_sched_group(struct task_group *tg, int cpu)
9075 + list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
9076 + &cpu_rq(cpu)->leaf_cfs_rq_list);
9079 +static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
9081 + list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
9083 +#else /* !CONFIG_FAIR_GROUP_SCHED */
9084 +static inline void free_fair_sched_group(struct task_group *tg)
9089 +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9094 +static inline void register_fair_sched_group(struct task_group *tg, int cpu)
9098 +static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
9101 +#endif /* CONFIG_FAIR_GROUP_SCHED */
9103 +#ifdef CONFIG_RT_GROUP_SCHED
9104 +static void free_rt_sched_group(struct task_group *tg)
9108 + destroy_rt_bandwidth(&tg->rt_bandwidth);
9110 + for_each_possible_cpu(i) {
9112 + kfree(tg->rt_rq[i]);
9114 + kfree(tg->rt_se[i]);
9122 +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
9124 + struct rt_rq *rt_rq;
9125 + struct sched_rt_entity *rt_se, *parent_se;
9129 + tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
9132 + tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
9136 + init_rt_bandwidth(&tg->rt_bandwidth,
9137 + ktime_to_ns(def_rt_bandwidth.rt_period), 0);
9139 + for_each_possible_cpu(i) {
9142 + rt_rq = kmalloc_node(sizeof(struct rt_rq),
9143 + GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
9147 + rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
9148 + GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
9152 + parent_se = parent ? parent->rt_se[i] : NULL;
9153 + init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
9162 +static inline void register_rt_sched_group(struct task_group *tg, int cpu)
9164 + list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
9165 + &cpu_rq(cpu)->leaf_rt_rq_list);
9168 +static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
9170 + list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
9172 +#else /* !CONFIG_RT_GROUP_SCHED */
9173 +static inline void free_rt_sched_group(struct task_group *tg)
9178 +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
9183 +static inline void register_rt_sched_group(struct task_group *tg, int cpu)
9187 +static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
9190 +#endif /* CONFIG_RT_GROUP_SCHED */
9192 +#ifdef CONFIG_GROUP_SCHED
9193 +static void free_sched_group(struct task_group *tg)
9195 + free_fair_sched_group(tg);
9196 + free_rt_sched_group(tg);
9200 +/* allocate runqueue etc for a new task group */
9201 +struct task_group *sched_create_group(struct task_group *parent)
9203 + struct task_group *tg;
9204 + unsigned long flags;
9207 + tg = kzalloc(sizeof(*tg), GFP_KERNEL);
9209 + return ERR_PTR(-ENOMEM);
9211 + if (!alloc_fair_sched_group(tg, parent))
9214 + if (!alloc_rt_sched_group(tg, parent))
9217 + spin_lock_irqsave(&task_group_lock, flags);
9218 + for_each_possible_cpu(i) {
9219 + register_fair_sched_group(tg, i);
9220 + register_rt_sched_group(tg, i);
9222 + list_add_rcu(&tg->list, &task_groups);
9224 + WARN_ON(!parent); /* root should already exist */
9226 + tg->parent = parent;
9227 + INIT_LIST_HEAD(&tg->children);
9228 + list_add_rcu(&tg->siblings, &parent->children);
9229 + spin_unlock_irqrestore(&task_group_lock, flags);
9234 + free_sched_group(tg);
9235 + return ERR_PTR(-ENOMEM);
9238 +/* rcu callback to free various structures associated with a task group */
9239 +static void free_sched_group_rcu(struct rcu_head *rhp)
9241 + /* now it should be safe to free those cfs_rqs */
9242 + free_sched_group(container_of(rhp, struct task_group, rcu));
9245 +/* Destroy runqueue etc associated with a task group */
9246 +void sched_destroy_group(struct task_group *tg)
9248 + unsigned long flags;
9251 + spin_lock_irqsave(&task_group_lock, flags);
9252 + for_each_possible_cpu(i) {
9253 + unregister_fair_sched_group(tg, i);
9254 + unregister_rt_sched_group(tg, i);
9256 + list_del_rcu(&tg->list);
9257 + list_del_rcu(&tg->siblings);
9258 + spin_unlock_irqrestore(&task_group_lock, flags);
9260 + /* wait for possible concurrent references to cfs_rqs to complete */
9261 + call_rcu(&tg->rcu, free_sched_group_rcu);
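sched_destroy_group() unlinks the group under task_group_lock but defers the actual kfree() until after an RCU grace period, since lock-free readers may still be walking the leaf lists. The same unpublish-then-call_rcu idiom, reduced to its essentials (struct foo and both helpers are illustrative, not from the patch):

    #include <linux/list.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct foo {
            struct list_head list;
            struct rcu_head rcu;
    };

    static void foo_free_rcu(struct rcu_head *rhp)
    {
            /* runs after a grace period: no reader can still see us */
            kfree(container_of(rhp, struct foo, rcu));
    }

    static void foo_destroy(struct foo *f)
    {
            list_del_rcu(&f->list);          /* unpublish under the writer lock */
            call_rcu(&f->rcu, foo_free_rcu); /* defer the free past all readers */
    }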
9264 +/* Change a task's runqueue when it moves between groups.
9265 + * The caller of this function should have put the task in its new group
9266 + * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
9267 + * reflect its new group.
9269 +void sched_move_task(struct task_struct *tsk)
9271 + int on_rq, running;
9272 + unsigned long flags;
9275 + rq = task_rq_lock(tsk, &flags);
9277 + update_rq_clock(rq);
9279 + running = task_current(rq, tsk);
9280 + on_rq = tsk->se.on_rq;
9283 + dequeue_task(rq, tsk, 0);
9284 + if (unlikely(running))
9285 + tsk->sched_class->put_prev_task(rq, tsk);
9287 + set_task_rq(tsk, task_cpu(tsk));
9289 +#ifdef CONFIG_FAIR_GROUP_SCHED
9290 + if (tsk->sched_class->moved_group)
9291 + tsk->sched_class->moved_group(tsk);
9294 + if (unlikely(running))
9295 + tsk->sched_class->set_curr_task(rq);
9297 + enqueue_task(rq, tsk, 0);
9299 + task_rq_unlock(rq, &flags);
9301 +#endif /* CONFIG_GROUP_SCHED */
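Several guard lines of sched_move_task() are missing from this listing; schematically, the full 2.6.27 sequence captures the task's state first, tears it down, rebinds the group pointers, then rebuilds in reverse order (condensed restatement, not a drop-in replacement):

    rq = task_rq_lock(tsk, &flags);
    update_rq_clock(rq);
    running = task_current(rq, tsk);         /* is tsk the one on this cpu?  */
    on_rq = tsk->se.on_rq;                   /* remember queue state         */
    if (on_rq)
            dequeue_task(rq, tsk, 0);
    if (unlikely(running))
            tsk->sched_class->put_prev_task(rq, tsk);
    set_task_rq(tsk, task_cpu(tsk));         /* rebind se.cfs_rq / se.parent */
    if (unlikely(running))
            tsk->sched_class->set_curr_task(rq);
    if (on_rq)
            enqueue_task(rq, tsk, 0);
    task_rq_unlock(rq, &flags);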
9303 +#ifdef CONFIG_FAIR_GROUP_SCHED
9304 +static void __set_se_shares(struct sched_entity *se, unsigned long shares)
9306 + struct cfs_rq *cfs_rq = se->cfs_rq;
9309 + on_rq = se->on_rq;
9311 + dequeue_entity(cfs_rq, se, 0);
9313 + se->load.weight = shares;
9314 + se->load.inv_weight = 0;
9317 + enqueue_entity(cfs_rq, se, 0);
9320 +static void set_se_shares(struct sched_entity *se, unsigned long shares)
9322 + struct cfs_rq *cfs_rq = se->cfs_rq;
9323 + struct rq *rq = cfs_rq->rq;
9324 + unsigned long flags;
9326 + spin_lock_irqsave(&rq->lock, flags);
9327 + __set_se_shares(se, shares);
9328 + spin_unlock_irqrestore(&rq->lock, flags);
9331 +static DEFINE_MUTEX(shares_mutex);
9333 +int sched_group_set_shares(struct task_group *tg, unsigned long shares)
9336 + unsigned long flags;
9339 + * We can't change the weight of the root cgroup.
9344 + if (shares < MIN_SHARES)
9345 + shares = MIN_SHARES;
9346 + else if (shares > MAX_SHARES)
9347 + shares = MAX_SHARES;
9349 + mutex_lock(&shares_mutex);
9350 + if (tg->shares == shares)
9353 + spin_lock_irqsave(&task_group_lock, flags);
9354 + for_each_possible_cpu(i)
9355 + unregister_fair_sched_group(tg, i);
9356 + list_del_rcu(&tg->siblings);
9357 + spin_unlock_irqrestore(&task_group_lock, flags);
9359 + /* wait for any ongoing reference to this group to finish */
9360 + synchronize_sched();
9363 + * Now we are free to modify the group's share on each cpu
9364 + * w/o tripping rebalance_share or load_balance_fair.
9366 + tg->shares = shares;
9367 + for_each_possible_cpu(i) {
9369 + * force a rebalance
9371 + cfs_rq_set_shares(tg->cfs_rq[i], 0);
9372 + set_se_shares(tg->se[i], shares);
9376 + * Enable load balance activity on this group, by inserting it back on
9377 + * each cpu's rq->leaf_cfs_rq_list.
9379 + spin_lock_irqsave(&task_group_lock, flags);
9380 + for_each_possible_cpu(i)
9381 + register_fair_sched_group(tg, i);
9382 + list_add_rcu(&tg->siblings, &tg->parent->children);
9383 + spin_unlock_irqrestore(&task_group_lock, flags);
9385 + mutex_unlock(&shares_mutex);
9389 +unsigned long sched_group_shares(struct task_group *tg)
9391 + return tg->shares;
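sched_group_set_shares() clamps the request to [MIN_SHARES, MAX_SHARES] and temporarily unregisters the group from every runqueue's leaf list, so the load balancer never observes a half-updated weight. A hedged sketch of an in-kernel caller (the default group weight in this tree is 1024):

    /* illustrative only: double the default weight of a group */
    int err = sched_group_set_shares(tg, 2048);

    if (!err)
            printk(KERN_INFO "shares now %lu\n", sched_group_shares(tg));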
9395 +#ifdef CONFIG_RT_GROUP_SCHED
9397 + * Ensure that the real time constraints are schedulable.
9399 +static DEFINE_MUTEX(rt_constraints_mutex);
9401 +static unsigned long to_ratio(u64 period, u64 runtime)
9403 + if (runtime == RUNTIME_INF)
9404 + return 1ULL << 16;
9406 + return div64_u64(runtime << 16, period);
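to_ratio() converts a bandwidth reservation into a Q16 fixed-point utilization, so 1.0 is 1 << 16 = 65536, and RUNTIME_INF is deliberately treated as full utilization. A worked example using the tree's default RT limits:

    /* worked example (illustrative): default 0.95 s runtime per 1 s period */
    u64 period  = 1000000ULL * NSEC_PER_USEC;    /* 1 s in nanoseconds    */
    u64 runtime =  950000ULL * NSEC_PER_USEC;    /* 0.95 s in nanoseconds */
    unsigned long r = to_ratio(period, runtime); /* (runtime << 16) / period
                                                    = 62259, i.e. 95% in Q16 */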
9409 +#ifdef CONFIG_CGROUP_SCHED
9410 +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
9412 + struct task_group *tgi, *parent = tg->parent;
9413 + unsigned long total = 0;
9416 + if (global_rt_period() < period)
9419 + return to_ratio(period, runtime) <
9420 + to_ratio(global_rt_period(), global_rt_runtime());
9423 + if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
9427 + list_for_each_entry_rcu(tgi, &parent->children, siblings) {
9431 + total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
9432 + tgi->rt_bandwidth.rt_runtime);
9434 + rcu_read_unlock();
9436 + return total + to_ratio(period, runtime) <=
9437 + to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
9438 + parent->rt_bandwidth.rt_runtime);
9440 +#elif defined CONFIG_USER_SCHED
9441 +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
9443 + struct task_group *tgi;
9444 + unsigned long total = 0;
9445 + unsigned long global_ratio =
9446 + to_ratio(global_rt_period(), global_rt_runtime());
9449 + list_for_each_entry_rcu(tgi, &task_groups, list) {
9453 + total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
9454 + tgi->rt_bandwidth.rt_runtime);
9456 + rcu_read_unlock();
9458 + return total + to_ratio(period, runtime) < global_ratio;
9462 +/* Must be called with tasklist_lock held */
9463 +static inline int tg_has_rt_tasks(struct task_group *tg)
9465 + struct task_struct *g, *p;
9466 + do_each_thread(g, p) {
9467 + if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
9469 + } while_each_thread(g, p);
9473 +static int tg_set_bandwidth(struct task_group *tg,
9474 + u64 rt_period, u64 rt_runtime)
9478 + mutex_lock(&rt_constraints_mutex);
9479 + read_lock(&tasklist_lock);
9480 + if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
9484 + if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
9489 + spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
9490 + tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
9491 + tg->rt_bandwidth.rt_runtime = rt_runtime;
9493 + for_each_possible_cpu(i) {
9494 + struct rt_rq *rt_rq = tg->rt_rq[i];
9496 + spin_lock(&rt_rq->rt_runtime_lock);
9497 + rt_rq->rt_runtime = rt_runtime;
9498 + spin_unlock(&rt_rq->rt_runtime_lock);
9500 + spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
9502 + read_unlock(&tasklist_lock);
9503 + mutex_unlock(&rt_constraints_mutex);
9508 +int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
9510 + u64 rt_runtime, rt_period;
9512 + rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
9513 + rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
9514 + if (rt_runtime_us < 0)
9515 + rt_runtime = RUNTIME_INF;
9517 + return tg_set_bandwidth(tg, rt_period, rt_runtime);
9520 +long sched_group_rt_runtime(struct task_group *tg)
9522 + u64 rt_runtime_us;
9524 + if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
9527 + rt_runtime_us = tg->rt_bandwidth.rt_runtime;
9528 + do_div(rt_runtime_us, NSEC_PER_USEC);
9529 + return rt_runtime_us;
9532 +int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
9534 + u64 rt_runtime, rt_period;
9536 + rt_period = (u64)rt_period_us * NSEC_PER_USEC;
9537 + rt_runtime = tg->rt_bandwidth.rt_runtime;
9539 + if (rt_period == 0)
9542 + return tg_set_bandwidth(tg, rt_period, rt_runtime);
9545 +long sched_group_rt_period(struct task_group *tg)
9549 + rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
9550 + do_div(rt_period_us, NSEC_PER_USEC);
9551 + return rt_period_us;
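These four helpers are the kernel ends of cpu.rt_period_us and cpu.rt_runtime_us: the setters convert microseconds to nanoseconds (a negative runtime mapping to RUNTIME_INF) before funnelling into tg_set_bandwidth(), and the getters convert back with do_div(). A hypothetical in-kernel caller reserving a 100 ms budget per second for a group:

    /* illustrative caller; values are in microseconds */
    if (sched_group_set_rt_period(tg, 1000000))  /* 1 s period        */
            return -EINVAL;
    if (sched_group_set_rt_runtime(tg, 100000))  /* 100 ms RT budget  */
            return -EINVAL;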
9554 +static int sched_rt_global_constraints(void)
9556 + struct task_group *tg = &root_task_group;
9557 + u64 rt_runtime, rt_period;
9560 + if (sysctl_sched_rt_period <= 0)
9563 + rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
9564 + rt_runtime = tg->rt_bandwidth.rt_runtime;
9566 + mutex_lock(&rt_constraints_mutex);
9567 + if (!__rt_schedulable(tg, rt_period, rt_runtime))
9569 + mutex_unlock(&rt_constraints_mutex);
9573 +#else /* !CONFIG_RT_GROUP_SCHED */
9574 +static int sched_rt_global_constraints(void)
9576 + unsigned long flags;
9579 + if (sysctl_sched_rt_period <= 0)
9582 + spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
9583 + for_each_possible_cpu(i) {
9584 + struct rt_rq *rt_rq = &cpu_rq(i)->rt;
9586 + spin_lock(&rt_rq->rt_runtime_lock);
9587 + rt_rq->rt_runtime = global_rt_runtime();
9588 + spin_unlock(&rt_rq->rt_runtime_lock);
9590 + spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
9594 +#endif /* CONFIG_RT_GROUP_SCHED */
9596 +int sched_rt_handler(struct ctl_table *table, int write,
9597 + struct file *filp, void __user *buffer, size_t *lenp,
9601 + int old_period, old_runtime;
9602 + static DEFINE_MUTEX(mutex);
9604 + mutex_lock(&mutex);
9605 + old_period = sysctl_sched_rt_period;
9606 + old_runtime = sysctl_sched_rt_runtime;
9608 + ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
9610 + if (!ret && write) {
9611 + ret = sched_rt_global_constraints();
9613 + sysctl_sched_rt_period = old_period;
9614 + sysctl_sched_rt_runtime = old_runtime;
9616 + def_rt_bandwidth.rt_runtime = global_rt_runtime();
9617 + def_rt_bandwidth.rt_period =
9618 + ns_to_ktime(global_rt_period());
9621 + mutex_unlock(&mutex);
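sched_rt_handler() serializes writers with a local mutex, lets proc_dointvec() update the two integers, and rolls them back if the new combination fails sched_rt_global_constraints(). From user space the same path is exercised through /proc; a minimal sketch (error handling trimmed):

    /* user-space sketch of driving the handler above */
    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/proc/sys/kernel/sched_rt_runtime_us", "w");

            if (!f)
                    return 1;
            fprintf(f, "%d\n", 950000);  /* the usual default: 0.95 s per 1 s */
            return fclose(f) ? 1 : 0;
    }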
9626 +#ifdef CONFIG_CGROUP_SCHED
9628 +/* return corresponding task_group object of a cgroup */
9629 +static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
9631 + return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
9632 + struct task_group, css);
9635 +static struct cgroup_subsys_state *
9636 +cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
9638 + struct task_group *tg, *parent;
9640 + if (!cgrp->parent) {
9641 + /* This is early initialization for the top cgroup */
9642 + init_task_group.css.cgroup = cgrp;
9643 + return &init_task_group.css;
9646 + parent = cgroup_tg(cgrp->parent);
9647 + tg = sched_create_group(parent);
9649 + return ERR_PTR(-ENOMEM);
9651 + /* Bind the cgroup to task_group object we just created */
9652 + tg->css.cgroup = cgrp;
9658 +cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9660 + struct task_group *tg = cgroup_tg(cgrp);
9662 + sched_destroy_group(tg);
9666 +cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9667 + struct task_struct *tsk)
9669 +#ifdef CONFIG_RT_GROUP_SCHED
9670 + /* Don't accept realtime tasks when there is no way for them to run */
9671 + if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
9674 + /* We don't support RT-tasks being in separate groups */
9675 + if (tsk->sched_class != &fair_sched_class)
9683 +cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9684 + struct cgroup *old_cont, struct task_struct *tsk)
9686 + sched_move_task(tsk);
9689 +#ifdef CONFIG_FAIR_GROUP_SCHED
9690 +static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
9693 + return sched_group_set_shares(cgroup_tg(cgrp), shareval);
9696 +static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
9698 + struct task_group *tg = cgroup_tg(cgrp);
9700 + return (u64) tg->shares;
9702 +#endif /* CONFIG_FAIR_GROUP_SCHED */
9704 +#ifdef CONFIG_RT_GROUP_SCHED
9705 +static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
9708 + return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
9711 +static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
9713 + return sched_group_rt_runtime(cgroup_tg(cgrp));
9716 +static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
9719 + return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
9722 +static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
9724 + return sched_group_rt_period(cgroup_tg(cgrp));
9726 +#endif /* CONFIG_RT_GROUP_SCHED */
9728 +static struct cftype cpu_files[] = {
9729 +#ifdef CONFIG_FAIR_GROUP_SCHED
9732 + .read_u64 = cpu_shares_read_u64,
9733 + .write_u64 = cpu_shares_write_u64,
9736 +#ifdef CONFIG_RT_GROUP_SCHED
9738 + .name = "rt_runtime_us",
9739 + .read_s64 = cpu_rt_runtime_read,
9740 + .write_s64 = cpu_rt_runtime_write,
9743 + .name = "rt_period_us",
9744 + .read_u64 = cpu_rt_period_read_uint,
9745 + .write_u64 = cpu_rt_period_write_uint,
9750 +static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
9752 + return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
9755 +struct cgroup_subsys cpu_cgroup_subsys = {
9757 + .create = cpu_cgroup_create,
9758 + .destroy = cpu_cgroup_destroy,
9759 + .can_attach = cpu_cgroup_can_attach,
9760 + .attach = cpu_cgroup_attach,
9761 + .populate = cpu_cgroup_populate,
9762 + .subsys_id = cpu_cgroup_subsys_id,
9766 +#endif /* CONFIG_CGROUP_SCHED */
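Once cpu_cgroup_subsys is registered, every directory created in a mounted cpu hierarchy goes through cpu_cgroup_create() and is populated from cpu_files[]. A hedged user-space sketch, assuming the cpu controller is mounted at /cgroup (the mount point and group name are illustrative):

    #include <errno.h>
    #include <stdio.h>
    #include <sys/stat.h>
    #include <sys/types.h>

    int main(void)
    {
            FILE *f;

            /* mkdir in the mounted hierarchy ends up in cpu_cgroup_create() */
            if (mkdir("/cgroup/demo", 0755) && errno != EEXIST)
                    return 1;
            f = fopen("/cgroup/demo/cpu.shares", "w");
            if (!f)
                    return 1;
            fprintf(f, "%d\n", 512);     /* handled by cpu_shares_write_u64() */
            return fclose(f) ? 1 : 0;
    }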
9768 +#ifdef CONFIG_CGROUP_CPUACCT
9771 + * CPU accounting code for task groups.
9773 + * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
9774 + * (balbir@in.ibm.com).
9777 +/* track cpu usage of a group of tasks */
9779 + struct cgroup_subsys_state css;
9780 + /* cpuusage holds a pointer to a u64-type object on every cpu */
9784 +struct cgroup_subsys cpuacct_subsys;
9786 +/* return cpu accounting group corresponding to this container */
9787 +static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
9789 + return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
9790 + struct cpuacct, css);
9793 +/* return cpu accounting group to which this task belongs */
9794 +static inline struct cpuacct *task_ca(struct task_struct *tsk)
9796 + return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
9797 + struct cpuacct, css);
9800 +/* create a new cpu accounting group */
9801 +static struct cgroup_subsys_state *cpuacct_create(
9802 + struct cgroup_subsys *ss, struct cgroup *cgrp)
9804 + struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
9807 + return ERR_PTR(-ENOMEM);
9809 + ca->cpuusage = alloc_percpu(u64);
9810 + if (!ca->cpuusage) {
9812 + return ERR_PTR(-ENOMEM);
9818 +/* destroy an existing cpu accounting group */
9820 +cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9822 + struct cpuacct *ca = cgroup_ca(cgrp);
9824 + free_percpu(ca->cpuusage);
9828 +/* return total cpu usage (in nanoseconds) of a group */
9829 +static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
9831 + struct cpuacct *ca = cgroup_ca(cgrp);
9832 + u64 totalcpuusage = 0;
9835 + for_each_possible_cpu(i) {
9836 + u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
9839 + * Take rq->lock to make 64-bit addition safe on 32-bit
9842 + spin_lock_irq(&cpu_rq(i)->lock);
9843 + totalcpuusage += *cpuusage;
9844 + spin_unlock_irq(&cpu_rq(i)->lock);
9847 + return totalcpuusage;
9850 +static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
9853 + struct cpuacct *ca = cgroup_ca(cgrp);
9862 + for_each_possible_cpu(i) {
9863 + u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
9865 + spin_lock_irq(&cpu_rq(i)->lock);
9867 + spin_unlock_irq(&cpu_rq(i)->lock);
9873 +static struct cftype files[] = {
9876 + .read_u64 = cpuusage_read,
9877 + .write_u64 = cpuusage_write,
9881 +static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9883 + return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
9887 + * charge this task's execution time to its accounting group.
9889 + * called with rq->lock held.
9891 +static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9893 + struct cpuacct *ca;
9895 + if (!cpuacct_subsys.active)
9898 + ca = task_ca(tsk);
9900 + u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
9902 + *cpuusage += cputime;
9906 +struct cgroup_subsys cpuacct_subsys = {
9907 + .name = "cpuacct",
9908 + .create = cpuacct_create,
9909 + .destroy = cpuacct_destroy,
9910 + .populate = cpuacct_populate,
9911 + .subsys_id = cpuacct_subsys_id,
9913 +#endif /* CONFIG_CGROUP_CPUACCT */
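cpuacct is the same subsystem skeleton wrapped around a single per-cpu u64, summed under each rq->lock in cpuusage_read() to avoid torn 64-bit reads on 32-bit machines. The per-cpu counter idiom it relies on, reduced to a sketch (demo_* names are illustrative; percpu_ptr() is the 2.6.27 accessor used above):

    #include <linux/types.h>
    #include <linux/percpu.h>
    #include <linux/cpumask.h>
    #include <linux/errno.h>

    static u64 *usage;

    static int demo_init(void)
    {
            usage = alloc_percpu(u64);       /* one u64 per possible cpu */
            return usage ? 0 : -ENOMEM;
    }

    static void demo_charge(int cpu, u64 delta)
    {
            *percpu_ptr(usage, cpu) += delta;   /* caller serializes per cpu */
    }

    static u64 demo_total(void)
    {
            u64 sum = 0;
            int i;

            for_each_possible_cpu(i)
                    sum += *percpu_ptr(usage, i);
            return sum;
    }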
9915 +#ifdef CONFIG_CHOPSTIX
9916 +void (*rec_event)(void *, unsigned int);
9917 +EXPORT_SYMBOL(rec_event);
9919 +struct event_spec {
9921 + unsigned long dcookie;
9922 + unsigned int count;
9923 + unsigned int reason;
9926 +/* To support safe calling from asm */
9927 +asmlinkage void rec_event_asm(struct event *event_signature_in, unsigned int count) {
9928 + struct pt_regs *regs;
9929 + struct event_spec *es = event_signature_in->event_data;
9930 + regs = task_pt_regs(current);
9931 + event_signature_in->task = current;
9932 + es->pc = regs->ip;
9933 + event_signature_in->count = 1;
9934 + (*rec_event)(event_signature_in, count);
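rec_event is intentionally just an exported, NULL-initialized function pointer: every hook in this patch tests it before calling, so the instrumentation costs one branch when the Chopstix module is absent. A hedged sketch of the module side that would arm it (handler body and names are illustrative, not from the patch):

    #include <linux/module.h>
    #include <linux/init.h>

    extern void (*rec_event)(void *, unsigned int);

    static void chopstix_handler(void *event, unsigned int count)
    {
            /* summarize into lossy data structures for later export */
    }

    static int __init chopstix_demo_init(void)
    {
            rec_event = chopstix_handler;   /* arms every hook in this patch */
            return 0;
    }

    static void __exit chopstix_demo_exit(void)
    {
            rec_event = NULL;               /* disarm before the text unloads */
    }

    module_init(chopstix_demo_init);
    module_exit(chopstix_demo_exit);
    MODULE_LICENSE("GPL");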
9937 diff -Nurb linux-2.6.27-590/kernel/sched.c.rej linux-2.6.27-591/kernel/sched.c.rej
9938 --- linux-2.6.27-590/kernel/sched.c.rej 1969-12-31 19:00:00.000000000 -0500
9939 +++ linux-2.6.27-591/kernel/sched.c.rej 2010-01-29 16:30:22.000000000 -0500
9943 + #include <linux/nmi.h>
9944 + #include <linux/init.h>
9945 + #include <asm/uaccess.h>
9946 + #include <linux/highmem.h>
9947 + #include <linux/smp_lock.h>
9948 + #include <asm/mmu_context.h>
9950 + #include <linux/nmi.h>
9951 + #include <linux/init.h>
9952 + #include <asm/uaccess.h>
9953 ++ #include <linux/arrays.h>
9954 + #include <linux/highmem.h>
9955 + #include <linux/smp_lock.h>
9956 + #include <asm/mmu_context.h>
9962 + spin_lock(&rq->lock);
9963 + if (unlikely(rq != task_rq(p))) {
9964 + spin_unlock(&rq->lock);
9970 + spin_lock(&rq->lock);
9971 + if (unlikely(rq != task_rq(p))) {
9972 + spin_unlock(&rq->lock);
9975 + * event cannot wake it up and insert it on the runqueue either.
9977 + p->state = TASK_RUNNING;
9980 + * Make sure we do not leak PI boosting priority to the child:
9982 + * event cannot wake it up and insert it on the runqueue either.
9984 + p->state = TASK_RUNNING;
9985 ++ #ifdef CONFIG_CHOPSTIX
9986 ++ /* The jiffy of last interruption */
9987 ++ if (p->state & TASK_UNINTERRUPTIBLE) {
9988 ++ p->last_interrupted = jiffies;
9991 ++ if (p->state & TASK_INTERRUPTIBLE) {
9992 ++ p->last_interrupted = INTERRUPTIBLE;
9995 ++ p->last_interrupted = RUNNING;
9997 ++ /* The jiffy of last execution */
9998 ++ p->last_ran_j = jiffies;
10002 + * Make sure we do not leak PI boosting priority to the child:
10004 +*** 3628,3633 ****
10008 + static inline int interactive_sleep(enum sleep_type sleep_type)
10010 + return (sleep_type == SLEEP_INTERACTIVE ||
10011 +--- 3648,3654 ----
10016 + static inline int interactive_sleep(enum sleep_type sleep_type)
10018 + return (sleep_type == SLEEP_INTERACTIVE ||
10020 +*** 3637,3652 ****
10022 + * schedule() is the main scheduler function.
10024 + asmlinkage void __sched schedule(void)
10026 + struct task_struct *prev, *next;
10027 + struct prio_array *array;
10028 + struct list_head *queue;
10029 + unsigned long long now;
10030 +- unsigned long run_time;
10031 + int cpu, idx, new_prio;
10032 + long *switch_count;
10036 + * Test if we are atomic. Since do_exit() needs to call into
10037 +--- 3658,3685 ----
10039 + * schedule() is the main scheduler function.
10042 ++ #ifdef CONFIG_CHOPSTIX
10043 ++ extern void (*rec_event)(void *,unsigned int);
10044 ++ struct event_spec {
10045 ++ unsigned long pc;
10046 ++ unsigned long dcookie;
10047 ++ unsigned int count;
10048 ++ unsigned int reason;
10052 + asmlinkage void __sched schedule(void)
10054 + struct task_struct *prev, *next;
10055 + struct prio_array *array;
10056 + struct list_head *queue;
10057 + unsigned long long now;
10058 ++ unsigned long run_time, diff;
10059 + int cpu, idx, new_prio;
10060 + long *switch_count;
10062 ++ int sampling_reason;
10065 + * Test if we are atomic. Since do_exit() needs to call into
10067 +*** 3700,3705 ****
10068 + switch_count = &prev->nivcsw;
10069 + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
10070 + switch_count = &prev->nvcsw;
10071 + if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
10072 + unlikely(signal_pending(prev))))
10073 + prev->state = TASK_RUNNING;
10074 +--- 3733,3739 ----
10075 + switch_count = &prev->nivcsw;
10076 + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
10077 + switch_count = &prev->nvcsw;
10079 + if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
10080 + unlikely(signal_pending(prev))))
10081 + prev->state = TASK_RUNNING;
10083 +*** 3709,3714 ****
10084 + vx_uninterruptible_inc(prev);
10086 + deactivate_task(prev, rq);
10090 +--- 3743,3759 ----
10091 + vx_uninterruptible_inc(prev);
10093 + deactivate_task(prev, rq);
10094 ++ #ifdef CONFIG_CHOPSTIX
10095 ++ /* An uninterruptible process just yielded. Record the current jiffy */
10096 ++ if (prev->state & TASK_UNINTERRUPTIBLE) {
10097 ++ prev->last_interrupted = jiffies;
10099 ++ /* An interruptible process just yielded, or it got preempted.
10100 ++ * Mark it as interruptible */
10101 ++ else if (prev->state & TASK_INTERRUPTIBLE) {
10102 ++ prev->last_interrupted = INTERRUPTIBLE;
10109 +*** 3785,3790 ****
10110 + prev->sleep_avg = 0;
10111 + prev->timestamp = prev->last_ran = now;
10113 + sched_info_switch(prev, next);
10114 + if (likely(prev != next)) {
10115 + next->timestamp = next->last_ran = now;
10116 +--- 3830,3869 ----
10117 + prev->sleep_avg = 0;
10118 + prev->timestamp = prev->last_ran = now;
10120 ++ #ifdef CONFIG_CHOPSTIX
10121 ++ /* Run only when the Chopstix module has installed a handler */
10122 ++ if (rec_event) {
10123 ++ prev->last_ran_j = jiffies;
10124 ++ if (next->last_interrupted != INTERRUPTIBLE) {
10125 ++ if (next->last_interrupted != RUNNING) {
10126 ++ diff = (jiffies - next->last_interrupted);
10127 ++ sampling_reason = 0; /* BLOCKING */
10130 ++ diff = jiffies - next->last_ran_j;
10131 ++ sampling_reason = 1; /* PREEMPTION */
10134 ++ if (diff >= HZ/10) {
10135 ++ struct event event;
10136 ++ struct event_spec espec;
10137 ++ struct pt_regs *regs;
10138 ++ regs = task_pt_regs(current);
10140 ++ espec.reason = sampling_reason;
10141 ++ event.event_data = &espec;
10142 ++ event.task = next;
10143 ++ espec.pc = regs->eip;
10144 ++ event.event_type = 2;
10145 ++ /* index into the event array currently set up */
10146 ++ /* make sure the counters are loaded in the order we want them to show up */
10147 ++ (*rec_event)(&event, diff);
10150 ++ /* next has been elected to run */
10151 ++ next->last_interrupted = 0;
10154 + sched_info_switch(prev, next);
10155 + if (likely(prev != next)) {
10156 + next->timestamp = next->last_ran = now;
10158 +*** 5737,5742 ****
10159 + jiffies_to_timespec(p->policy == SCHED_FIFO ?
10160 + 0 : task_timeslice(p), &t);
10161 + read_unlock(&tasklist_lock);
10162 + retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
10165 +--- 5817,5823 ----
10166 + jiffies_to_timespec(p->policy == SCHED_FIFO ?
10167 + 0 : task_timeslice(p), &t);
10168 + read_unlock(&tasklist_lock);
10170 + retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
10174 +*** 7980,7982 ****
10178 +--- 8061,8080 ----
10183 ++ #ifdef CONFIG_CHOPSTIX
10184 ++ void (*rec_event)(void *,unsigned int) = NULL;
10186 ++ /* To support safe calling from asm */
10187 ++ asmlinkage void rec_event_asm (struct event *event_signature_in, unsigned int count) {
10188 ++ struct pt_regs *regs;
10189 ++ struct event_spec *es = event_signature_in->event_data;
10190 ++ regs = task_pt_regs(current);
10191 ++ event_signature_in->task = current;
10192 ++ es->pc = regs->eip;
10193 ++ event_signature_in->count = 1;
10194 ++ (*rec_event)(event_signature_in, count);
10196 ++ EXPORT_SYMBOL(rec_event);
10197 ++ EXPORT_SYMBOL(in_sched_functions);
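The rejected schedule() hunk shows the sampling policy most clearly: when the incoming task has been off the CPU for at least HZ/10 jiffies (100 ms), an event of type 2 is recorded, tagged 0 for blocking (it carried a wakeup timestamp) or 1 for preemption (it was runnable all along). Schematically, with record_event() as a hypothetical stand-in for the struct-filling shown above:

    /* schematic restatement; record_event() is not a real helper */
    if (next->last_interrupted != INTERRUPTIBLE) {
            unsigned long diff;
            int reason;

            if (next->last_interrupted != RUNNING) {
                    diff = jiffies - next->last_interrupted; /* time blocked  */
                    reason = 0;                              /* BLOCKING      */
            } else {
                    diff = jiffies - next->last_ran_j;       /* time runnable */
                    reason = 1;                              /* PREEMPTION    */
            }
            if (diff >= HZ / 10)                             /* >= 100 ms     */
                    record_event(next, reason, diff);
    }
    next->last_interrupted = 0;                              /* elected to run */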
10199 diff -Nurb linux-2.6.27-590/mm/memory.c linux-2.6.27-591/mm/memory.c
10200 --- linux-2.6.27-590/mm/memory.c 2010-01-29 16:29:48.000000000 -0500
10201 +++ linux-2.6.27-591/mm/memory.c 2010-01-31 22:21:18.000000000 -0500
10204 #include <linux/swapops.h>
10205 #include <linux/elf.h>
10206 +#include <linux/arrays.h>
10208 #include "internal.h"
10210 @@ -2690,6 +2691,15 @@
10214 +extern void (*rec_event)(void *,unsigned int);
10215 +struct event_spec {
10216 + unsigned long pc;
10217 + unsigned long dcookie;
10219 + unsigned char reason;
10224 * By the time we get here, we already hold the mm semaphore
10226 @@ -2719,6 +2729,24 @@
10228 return VM_FAULT_OOM;
10230 +#ifdef CONFIG_CHOPSTIX
10232 + struct event event;
10233 + struct event_spec espec;
10234 + struct pt_regs *regs;
10236 + regs = task_pt_regs(current);
10237 + pc = regs->ip & (unsigned int) ~4095;
10239 + espec.reason = 0; /* alloc */
10240 + event.event_data = &espec;
10241 + event.task = current;
10243 + event.event_type = 5;
10244 + (*rec_event)(&event, 1);
10248 return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
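The fault hook tags each handle_mm_fault() invocation as event_type 5 and masks the faulting instruction pointer down to its 4 KiB page, so user-space aggregation buckets by code page rather than by exact PC. The mask arithmetic, worked through on an illustrative address:

    /* worked example of the masking above, assuming 4 KiB pages */
    unsigned int pc = 0xc01234ab & (unsigned int) ~4095; /* ~4095 == 0xfffff000 */
    /* pc == 0xc0123000: the base of the page containing the faulting ip */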
10251 diff -Nurb linux-2.6.27-590/mm/slab.c linux-2.6.27-591/mm/slab.c
10252 --- linux-2.6.27-590/mm/slab.c 2010-01-29 16:29:48.000000000 -0500
10253 +++ linux-2.6.27-591/mm/slab.c 2010-01-29 16:30:22.000000000 -0500
10254 @@ -110,6 +110,7 @@
10255 #include <linux/fault-inject.h>
10256 #include <linux/rtmutex.h>
10257 #include <linux/reciprocal_div.h>
10258 +#include <linux/arrays.h>
10259 #include <linux/debugobjects.h>
10261 #include <asm/cacheflush.h>
10262 @@ -248,6 +249,14 @@
10266 +extern void (*rec_event)(void *,unsigned int);
10267 +struct event_spec {
10268 + unsigned long pc;
10269 + unsigned long dcookie;
10271 + unsigned char reason;
10275 * struct array_cache
10277 @@ -3469,6 +3478,19 @@
10278 local_irq_restore(save_flags);
10279 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
10281 +#ifdef CONFIG_CHOPSTIX
10282 + if (rec_event && objp) {
10283 + struct event event;
10284 + struct event_spec espec;
10286 + espec.reason = 0; /* alloc */
10287 + event.event_data = &espec;
10288 + event.task = current;
10290 + event.event_type = 5;
10291 + (*rec_event)(&event, cachep->buffer_size);
10295 if (unlikely((flags & __GFP_ZERO) && objp))
10296 memset(objp, 0, obj_size(cachep));
10297 @@ -3578,12 +3600,26 @@
10298 * Release an obj back to its cache. If the obj has a constructed state, it must
10299 * be in this state _before_ it is released. Called with disabled ints.
10301 -static inline void __cache_free(struct kmem_cache *cachep, void *objp)
10302 +static inline void __cache_free(struct kmem_cache *cachep, void *objp, void *caller)
10304 struct array_cache *ac = cpu_cache_get(cachep);
10307 - objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
10308 + objp = cache_free_debugcheck(cachep, objp, caller);
10309 + #ifdef CONFIG_CHOPSTIX
10310 + if (rec_event && objp) {
10311 + struct event event;
10312 + struct event_spec espec;
10314 + espec.reason = 1; /* free */
10315 + event.event_data = &espec;
10316 + event.task = current;
10318 + event.event_type = 4;
10319 + (*rec_event)(&event, cachep->buffer_size);
10323 vx_slab_free(cachep);
10326 @@ -3714,6 +3750,7 @@
10329 struct kmem_cache *cachep;
10332 /* If you want to save a few bytes .text space: replace
10334 @@ -3741,10 +3778,17 @@
10335 EXPORT_SYMBOL(__kmalloc_track_caller);
10338 +#ifdef CONFIG_CHOPSTIX
10339 +void *__kmalloc(size_t size, gfp_t flags)
10341 + return __do_kmalloc(size, flags, __builtin_return_address(0));
10344 void *__kmalloc(size_t size, gfp_t flags)
10346 return __do_kmalloc(size, flags, NULL);
10349 EXPORT_SYMBOL(__kmalloc);
10352 @@ -3764,7 +3808,7 @@
10353 debug_check_no_locks_freed(objp, obj_size(cachep));
10354 if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
10355 debug_check_no_obj_freed(objp, obj_size(cachep));
10356 - __cache_free(cachep, objp);
10357 + __cache_free(cachep, objp, __builtin_return_address(0));
10358 local_irq_restore(flags);
10360 EXPORT_SYMBOL(kmem_cache_free);
10361 @@ -3790,7 +3834,7 @@
10362 c = virt_to_cache(objp);
10363 debug_check_no_locks_freed(objp, obj_size(c));
10364 debug_check_no_obj_freed(objp, obj_size(c));
10365 - __cache_free(c, (void *)objp);
10366 + __cache_free(c, (void *)objp, __builtin_return_address(0));
10367 local_irq_restore(flags);
10369 EXPORT_SYMBOL(kfree);
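Taken together, the slab hooks emit paired samples: event_type 5 on allocation and event_type 4 on free, each weighted by cachep->buffer_size, with the new caller argument threaded through __cache_free() so the real call site from __builtin_return_address(0) survives into the debug checks. A hedged sketch of a rec_event consumer pairing the two (struct event is from linux/arrays.h per this patch; the account_* helpers are hypothetical stand-ins):

    /* illustrative consumer pairing the slab events (types 4 and 5) */
    static void account_alloc(struct task_struct *t, unsigned int bytes) { }
    static void account_free(struct task_struct *t, unsigned int bytes) { }

    static void slab_handler(void *ev, unsigned int bytes)
    {
            struct event *e = ev;

            if (e->event_type == 5)         /* cache_alloc hook fired  */
                    account_alloc(e->task, bytes);
            else if (e->event_type == 4)    /* __cache_free hook fired */
                    account_free(e->task, bytes);
    }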