1 diff -Nurb linux-2.6.27-590/arch/Kconfig linux-2.6.27-591/arch/Kconfig
2 --- linux-2.6.27-590/arch/Kconfig 2010-01-29 16:29:46.000000000 -0500
3 +++ linux-2.6.27-591/arch/Kconfig 2010-01-29 16:30:22.000000000 -0500
9 + bool "Chopstix (PlanetLab)"
10 + depends on MODULES && OPROFILE
12 + Chopstix allows you to monitor various kernel events by summarizing
13 + them in lossy data structures and transferring those summaries
14 + to user space. If in doubt, say "N".
22 depends on KALLSYMS && MODULES
23 diff -Nurb linux-2.6.27-590/arch/x86/kernel/asm-offsets_32.c linux-2.6.27-591/arch/x86/kernel/asm-offsets_32.c
24 --- linux-2.6.27-590/arch/x86/kernel/asm-offsets_32.c 2008-10-09 18:13:53.000000000 -0400
25 +++ linux-2.6.27-591/arch/x86/kernel/asm-offsets_32.c 2010-01-29 16:45:48.000000000 -0500
27 #include <linux/signal.h>
28 #include <linux/personality.h>
29 #include <linux/suspend.h>
30 +#include <linux/arrays.h>
31 #include <linux/kbuild.h>
32 #include <asm/ucontext.h>
35 #include <linux/lguest.h>
36 #include "../../../drivers/lguest/lg.h"
39 +#define STACKOFFSET(sym, str, mem) \
40 + DEFINE(sym, offsetof(struct str, mem)-sizeof(struct str));
42 /* workaround for a warning with -Wmissing-prototypes */
47 + unsigned long dcookie;
49 + unsigned int number;
54 OFFSET(IA32_SIGCONTEXT_ax, sigcontext, ax);
56 OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
59 + STACKOFFSET(TASK_thread, task_struct, thread);
60 + STACKOFFSET(THREAD_esp, thread_struct, sp);
61 + STACKOFFSET(EVENT_event_data, event, event_data);
62 + STACKOFFSET(EVENT_task, event, task);
63 + STACKOFFSET(EVENT_event_type, event, event_type);
64 + STACKOFFSET(SPEC_number, event_spec, number);
65 + DEFINE(EVENT_SIZE, sizeof(struct event));
66 + DEFINE(SPEC_SIZE, sizeof(struct event_spec));
67 + DEFINE(SPEC_EVENT_SIZE, sizeof(struct event_spec)+sizeof(struct event));
69 OFFSET(TI_task, thread_info, task);
70 OFFSET(TI_exec_domain, thread_info, exec_domain);
71 OFFSET(TI_flags, thread_info, flags);
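The STACKOFFSET macro above differs from the stock OFFSET macro in one respect: it subtracts the size of the structure, so the generated constant is a negative displacement. That is what lets the entry_32.S probe further down address the members of a struct event / struct event_spec carved out directly below the frame pointer. A small worked illustration (the concrete sizes are invented for the example; only the formula comes from the macro):

    /* STACKOFFSET(sym, str, mem) expands to
     *     DEFINE(sym, offsetof(struct str, mem) - sizeof(struct str));
     *
     * Suppose, purely for illustration, sizeof(struct event) == 32 and
     * offsetof(struct event, event_data) == 8.  Then
     *     EVENT_event_data = 8 - 32 = -24
     * so with a struct event sitting immediately below %ebp,
     *     movl %eax, EVENT_event_data(%ebp)
     * writes to %ebp - 24, i.e. the event_data member of that on-stack event.
     */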
72 diff -Nurb linux-2.6.27-590/arch/x86/kernel/entry_32.S linux-2.6.27-591/arch/x86/kernel/entry_32.S
73 --- linux-2.6.27-590/arch/x86/kernel/entry_32.S 2008-10-09 18:13:53.000000000 -0400
74 +++ linux-2.6.27-591/arch/x86/kernel/entry_32.S 2010-01-29 16:30:22.000000000 -0500
76 cmpl $(nr_syscalls), %eax
79 + /* Chopstix syscall probe */
80 + /* Saves and clobbers: eax, ecx, ebp */
85 + subl $SPEC_EVENT_SIZE, %esp
86 + movl rec_event, %ecx
89 + # struct event is first, just below %ebp
90 + movl %eax, (SPEC_number-EVENT_SIZE)(%ebp)
91 + leal -SPEC_EVENT_SIZE(%ebp), %eax
92 + movl %eax, EVENT_event_data(%ebp)
93 + movl $6, EVENT_event_type(%ebp)
94 + movl rec_event, %edx
96 + leal -EVENT_SIZE(%ebp), %eax
100 + addl $SPEC_EVENT_SIZE, %esp
106 call *sys_call_table(,%eax,4)
107 movl %eax,PT_EAX(%esp) # store the return value
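Read back into C, the syscall probe above amounts to roughly the following. This is a sketch only: the real sequence runs on the raw syscall entry path without a normal C frame (hence the hand-written assembly), and the elided lines include the NULL test of rec_event and the indirect call itself.

    struct event_spec espec;   /* both structs live in the SPEC_EVENT_SIZE area */
    struct event event;        /* reserved on the stack by the probe            */

    espec.number = syscall_nr;     /* hypothetical name for the value in %eax */
    event.event_data = &espec;
    event.event_type = 6;          /* the syscall probe's slot in the event array */
    (*rec_event)(&event, 1);       /* a count of 1 is assumed, as in the oprofile probe */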
109 diff -Nurb linux-2.6.27-590/arch/x86/mm/fault.c linux-2.6.27-591/arch/x86/mm/fault.c
110 --- linux-2.6.27-590/arch/x86/mm/fault.c 2010-01-29 16:29:46.000000000 -0500
111 +++ linux-2.6.27-591/arch/x86/mm/fault.c 2010-01-29 16:30:22.000000000 -0500
117 +extern void (*rec_event)(void *,unsigned int);
120 + unsigned long dcookie;
122 + unsigned char reason;
127 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
128 diff -Nurb linux-2.6.27-590/drivers/oprofile/cpu_buffer.c linux-2.6.27-591/drivers/oprofile/cpu_buffer.c
129 --- linux-2.6.27-590/drivers/oprofile/cpu_buffer.c 2008-10-09 18:13:53.000000000 -0400
130 +++ linux-2.6.27-591/drivers/oprofile/cpu_buffer.c 2010-01-29 16:30:22.000000000 -0500
132 #include <linux/oprofile.h>
133 #include <linux/vmalloc.h>
134 #include <linux/errno.h>
135 +#include <linux/arrays.h>
137 #include "event_buffer.h"
138 #include "cpu_buffer.h"
143 +#ifdef CONFIG_CHOPSTIX
147 + unsigned long dcookie;
151 +extern void (*rec_event)(void *,unsigned int);
155 add_sample(struct oprofile_cpu_buffer * cpu_buf,
156 unsigned long pc, unsigned long event)
159 entry->event = event;
160 increment_head(cpu_buf);
167 int is_kernel = !user_mode(regs);
168 unsigned long pc = profile_pc(regs);
171 +#ifdef CONFIG_CHOPSTIX
174 + struct event_spec espec;
175 + esig.task = current;
178 + esig.event_data = &espec;
179 + esig.event_type = event; /* index in the event array currently set up */
180 + /* make sure the counters are loaded in the order we want them to show up */
181 + (*rec_event)(&esig, 1);
184 oprofile_add_ext_sample(pc, regs, event, is_kernel);
187 + oprofile_add_ext_sample(pc, regs, event, is_kernel);
193 void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event)
194 diff -Nurb linux-2.6.27-590/fs/bio.c linux-2.6.27-591/fs/bio.c
195 --- linux-2.6.27-590/fs/bio.c 2008-10-09 18:13:53.000000000 -0400
196 +++ linux-2.6.27-591/fs/bio.c 2010-01-29 16:30:22.000000000 -0500
198 #include <linux/workqueue.h>
199 #include <linux/blktrace_api.h>
200 #include <scsi/sg.h> /* for struct sg_iovec */
201 +#include <linux/arrays.h>
203 static struct kmem_cache *bio_slab __read_mostly;
211 * fs_bio_set is the bio_set containing bio and iovec memory pools used by
212 * IO code that does not need private memory pools.
213 @@ -1171,6 +1173,14 @@
219 + unsigned long dcookie;
221 + unsigned char reason;
224 +extern void (*rec_event)(void *,unsigned int);
226 * bio_endio - end I/O on a bio
228 @@ -1192,6 +1202,24 @@
229 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
232 +#ifdef CONFIG_CHOPSTIX
234 + struct event event;
235 + struct event_spec espec;
238 + espec.reason = 1; /* response */
240 + eip = bio->bi_end_io;
241 + event.event_data = &espec;
243 + event.event_type = 3;
244 + /* index in the event array currently set up */
245 + /* make sure the counters are loaded in the order we want them to show up */
246 + (*rec_event)(&event, bytes_done);
251 bio->bi_end_io(bio, error);
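Unlike the oprofile probe earlier in this patch, which passes a count of 1, the bio completion probe hands bytes_done to rec_event, so whichever handler is installed can weight each sample by the amount of I/O completed. From the handler's side (an illustrative summary, not code from the patch):

    /* handler-side view of this probe:
     *     event_type == 3   ->  bio completion (the constant stored above)
     *     event_data        ->  struct event_spec with espec.reason == 1 ("response")
     *     count             ->  bytes completed for this bio
     * Summing 'count' across calls gives completed I/O volume; counting the
     * calls themselves gives the completion rate.
     */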
253 diff -Nurb linux-2.6.27-590/fs/exec.c linux-2.6.27-591/fs/exec.c
254 --- linux-2.6.27-590/fs/exec.c 2010-01-29 16:29:48.000000000 -0500
255 +++ linux-2.6.27-591/fs/exec.c 2010-01-29 16:45:48.000000000 -0500
257 #include <linux/fdtable.h>
258 #include <linux/mm.h>
259 #include <linux/stat.h>
260 +#include <linux/dcookies.h>
261 #include <linux/fcntl.h>
262 #include <linux/smp_lock.h>
263 #include <linux/swap.h>
268 + #ifdef CONFIG_CHOPSTIX
269 + unsigned long cookie;
270 + extern void (*rec_event)(void *, unsigned int);
271 + if (rec_event && !nd.dentry->d_cookie)
272 + get_dcookie(nd.dentry, nd.mnt, &cookie);
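The dcookie taken here serves the same purpose as in oprofile: rather than copying a pathname on the exec hot path, the kernel records a cookie that pins the executable's dentry/vfsmount pair, and a user-space consumer can later turn that cookie back into the full path via the lookup_dcookie syscall. In outline (illustrative, not part of the patch):

    /* exec-time probe (kernel side):
     *     get_dcookie(...)  ->  'cookie', a token identifying the binary's
     *                           dentry/vfsmount pair (cheap: one unsigned long)
     *
     * post-processing (user space):
     *     lookup_dcookie(cookie, buf, len)  ->  full pathname of the binary
     */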
278 diff -Nurb linux-2.6.27-590/include/linux/arrays.h linux-2.6.27-591/include/linux/arrays.h
279 --- linux-2.6.27-590/include/linux/arrays.h 1969-12-31 19:00:00.000000000 -0500
280 +++ linux-2.6.27-591/include/linux/arrays.h 2010-01-29 16:30:22.000000000 -0500
282 +#ifndef __ARRAYS_H__
283 +#define __ARRAYS_H__
284 +#include <linux/list.h>
286 +#define SAMPLING_METHOD_DEFAULT 0
287 +#define SAMPLING_METHOD_LOG 1
289 +/* Every probe has an array handler */
291 +/* XXX - Optimize this structure */
293 +extern void (*rec_event)(void *,unsigned int);
294 +struct array_handler {
295 + struct list_head link;
296 + unsigned int (*hash_func)(void *);
297 + unsigned int (*sampling_func)(void *,int,void *);
298 + unsigned short size;
299 + unsigned int threshold;
300 + unsigned char **expcount;
301 + unsigned int sampling_method;
302 + unsigned int **arrays;
303 + unsigned int arraysize;
304 + unsigned int num_samples[2];
305 + void **epoch_samples; /* size-sized lists of samples */
306 + unsigned int (*serialize)(void *, void *);
307 + unsigned char code[5];
311 + struct list_head link;
313 + unsigned int count;
314 + unsigned int event_type;
315 + struct task_struct *task;
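Pulling the fragments of this header together, the record that every probe hands to rec_event looks roughly like the following. The line elided above between link and count most plausibly declares the event_data pointer that both the probes and the asm-offsets entries use, so treat the exact field order as approximate:

    struct event {
        struct list_head link;
        void *event_data;           /* points at the probe's struct event_spec */
        unsigned int count;
        unsigned int event_type;    /* index into the configured event array */
        struct task_struct *task;
    };

Each probe site additionally declares its own struct event_spec (the repeated dcookie/number/reason fields visible in the earlier hunks) and points event_data at it.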
318 diff -Nurb linux-2.6.27-590/include/linux/sched.h.rej linux-2.6.27-591/include/linux/sched.h.rej
319 --- linux-2.6.27-590/include/linux/sched.h.rej 1969-12-31 19:00:00.000000000 -0500
320 +++ linux-2.6.27-591/include/linux/sched.h.rej 2010-01-29 16:30:22.000000000 -0500
325 + unsigned long sleep_avg;
326 + unsigned long long timestamp, last_ran;
327 + unsigned long long sched_time; /* sched_clock time spent running */
328 + enum sleep_type sleep_type;
332 + unsigned long sleep_avg;
333 + unsigned long long timestamp, last_ran;
334 ++ #ifdef CONFIG_CHOPSTIX
335 ++ unsigned long last_interrupted, last_ran_j;
338 + unsigned long long sched_time; /* sched_clock time spent running */
339 + enum sleep_type sleep_type;
341 diff -Nurb linux-2.6.27-590/kernel/sched.c linux-2.6.27-591/kernel/sched.c
342 --- linux-2.6.27-590/kernel/sched.c 2010-01-29 16:29:48.000000000 -0500
343 +++ linux-2.6.27-591/kernel/sched.c 2010-01-29 17:30:42.000000000 -0500
345 * 1998-11-19 Implemented schedule_timeout() and related stuff
346 * by Andrea Arcangeli
347 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
348 * hybrid priority-list and round-robin design with
350 * an array-switch method of distributing timeslices
351 * and per-CPU runqueues. Cleanups and useful suggestions
352 * by Davide Libenzi, preemptible kernel bits by Robert Love.
355 #include "sched_cpupri.h"
357 +#define INTERRUPTIBLE -1
361 * Convert user-nice values [ -20 ... 0 ... 19 ]
362 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
363 @@ -4428,6 +4431,11 @@
367 +#ifdef CONFIG_CHOPSTIX
368 +void (*rec_event)(void *,unsigned int) = NULL;
369 +EXPORT_SYMBOL(rec_event);
373 * schedule() is the main scheduler function.
375 @@ -5369,6 +5377,7 @@
377 read_unlock(&tasklist_lock);
381 if ((current->euid != p->euid) && (current->euid != p->uid) &&
382 !capable(CAP_SYS_NICE))
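The rec_event pointer defined and exported a few hunks above is the entire registration interface: there is no register/unregister API, the out-of-tree Chopstix module simply assigns its handler to the pointer, and every probe site tests it for NULL first. A minimal sketch of such a module follows; the handler body and the teardown caveat are assumptions, only the pointer itself comes from this patch.

    #include <linux/module.h>
    #include <linux/arrays.h>

    static void chopstix_handler(void *data, unsigned int count)
    {
        struct event *ev = data;

        /* a real handler would fold 'ev' (and its ev->event_data payload)
         * into lossy summary structures, weighted by 'count' */
        pr_debug("chopstix: event_type %u count %u\n", ev->event_type, count);
    }

    static int __init chopstix_init(void)
    {
        rec_event = chopstix_handler;
        return 0;
    }

    static void __exit chopstix_exit(void)
    {
        rec_event = NULL;
        /* a real module must also wait until no probe can still be executing
         * the handler before unloading; the patch does not show how */
    }

    module_init(chopstix_init);
    module_exit(chopstix_exit);
    MODULE_LICENSE("GPL");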
383 diff -Nurb linux-2.6.27-590/kernel/sched.c.orig linux-2.6.27-591/kernel/sched.c.orig
384 --- linux-2.6.27-590/kernel/sched.c.orig 1969-12-31 19:00:00.000000000 -0500
385 +++ linux-2.6.27-591/kernel/sched.c.orig 2010-01-29 16:30:22.000000000 -0500
390 + * Kernel scheduler and related syscalls
392 + * Copyright (C) 1991-2002 Linus Torvalds
394 + * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
395 + * make semaphores SMP safe
396 + * 1998-11-19 Implemented schedule_timeout() and related stuff
397 + * by Andrea Arcangeli
398 + * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
399 + * hybrid priority-list and round-robin design with
400 + * an array-switch method of distributing timeslices
401 + * and per-CPU runqueues. Cleanups and useful suggestions
402 + * by Davide Libenzi, preemptible kernel bits by Robert Love.
403 + * 2003-09-03 Interactivity tuning by Con Kolivas.
404 + * 2004-04-02 Scheduler domains code by Nick Piggin
405 + * 2007-04-15 Work begun on replacing all interactivity tuning with a
406 + * fair scheduling design by Con Kolivas.
407 + * 2007-05-05 Load balancing (smp-nice) and other improvements
408 + * by Peter Williams
409 + * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
410 + * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
411 + * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
412 + * Thomas Gleixner, Mike Kravetz
415 +#include <linux/mm.h>
416 +#include <linux/module.h>
417 +#include <linux/nmi.h>
418 +#include <linux/init.h>
419 +#include <linux/uaccess.h>
420 +#include <linux/highmem.h>
421 +#include <linux/smp_lock.h>
422 +#include <asm/mmu_context.h>
423 +#include <linux/interrupt.h>
424 +#include <linux/capability.h>
425 +#include <linux/completion.h>
426 +#include <linux/kernel_stat.h>
427 +#include <linux/debug_locks.h>
428 +#include <linux/security.h>
429 +#include <linux/notifier.h>
430 +#include <linux/profile.h>
431 +#include <linux/freezer.h>
432 +#include <linux/vmalloc.h>
433 +#include <linux/blkdev.h>
434 +#include <linux/delay.h>
435 +#include <linux/pid_namespace.h>
436 +#include <linux/smp.h>
437 +#include <linux/threads.h>
438 +#include <linux/timer.h>
439 +#include <linux/rcupdate.h>
440 +#include <linux/cpu.h>
441 +#include <linux/cpuset.h>
442 +#include <linux/percpu.h>
443 +#include <linux/kthread.h>
444 +#include <linux/seq_file.h>
445 +#include <linux/sysctl.h>
446 +#include <linux/syscalls.h>
447 +#include <linux/times.h>
448 +#include <linux/tsacct_kern.h>
449 +#include <linux/kprobes.h>
450 +#include <linux/delayacct.h>
451 +#include <linux/reciprocal_div.h>
452 +#include <linux/unistd.h>
453 +#include <linux/pagemap.h>
454 +#include <linux/hrtimer.h>
455 +#include <linux/tick.h>
456 +#include <linux/bootmem.h>
457 +#include <linux/debugfs.h>
458 +#include <linux/ctype.h>
459 +#include <linux/ftrace.h>
460 +#include <linux/vs_sched.h>
461 +#include <linux/vs_cvirt.h>
463 +#include <asm/tlb.h>
464 +#include <asm/irq_regs.h>
466 +#include "sched_cpupri.h"
468 +#define INTERRUPTIBLE -1
472 + * Convert user-nice values [ -20 ... 0 ... 19 ]
473 + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
476 +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
477 +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
478 +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
481 + * 'User priority' is the nice value converted to something we
482 + * can work with better when scaling various scheduler parameters,
483 + * it's a [ 0 ... 39 ] range.
485 +#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
486 +#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
487 +#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
490 + * Helpers for converting nanosecond timing to jiffy resolution
492 +#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
494 +#define NICE_0_LOAD SCHED_LOAD_SCALE
495 +#define NICE_0_SHIFT SCHED_LOAD_SHIFT
498 + * These are the 'tuning knobs' of the scheduler:
500 + * default timeslice is 100 msecs (used only for SCHED_RR tasks).
501 + * Timeslices get refilled after they expire.
503 +#define DEF_TIMESLICE (100 * HZ / 1000)
506 + * single value that denotes runtime == period, ie unlimited time.
508 +#define RUNTIME_INF ((u64)~0ULL)
512 + * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
513 + * Since cpu_power is a 'constant', we can use a reciprocal divide.
515 +static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
517 + return reciprocal_divide(load, sg->reciprocal_cpu_power);
521 + * Each time a sched group cpu_power is changed,
522 + * we must compute its reciprocal value
524 +static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
526 + sg->__cpu_power += val;
527 + sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
531 +static inline int rt_policy(int policy)
533 + if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
538 +static inline int task_has_rt_policy(struct task_struct *p)
540 + return rt_policy(p->policy);
544 + * This is the priority-queue data structure of the RT scheduling class:
546 +struct rt_prio_array {
547 + DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
548 + struct list_head queue[MAX_RT_PRIO];
551 +struct rt_bandwidth {
552 + /* nests inside the rq lock: */
553 + spinlock_t rt_runtime_lock;
556 + struct hrtimer rt_period_timer;
559 +static struct rt_bandwidth def_rt_bandwidth;
561 +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
563 +static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
565 + struct rt_bandwidth *rt_b =
566 + container_of(timer, struct rt_bandwidth, rt_period_timer);
572 + now = hrtimer_cb_get_time(timer);
573 + overrun = hrtimer_forward(timer, now, rt_b->rt_period);
578 + idle = do_sched_rt_period_timer(rt_b, overrun);
581 + return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
585 +void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
587 + rt_b->rt_period = ns_to_ktime(period);
588 + rt_b->rt_runtime = runtime;
590 + spin_lock_init(&rt_b->rt_runtime_lock);
592 + hrtimer_init(&rt_b->rt_period_timer,
593 + CLOCK_MONOTONIC, HRTIMER_MODE_REL);
594 + rt_b->rt_period_timer.function = sched_rt_period_timer;
595 + rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
598 +static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
602 + if (rt_b->rt_runtime == RUNTIME_INF)
605 + if (hrtimer_active(&rt_b->rt_period_timer))
608 + spin_lock(&rt_b->rt_runtime_lock);
610 + if (hrtimer_active(&rt_b->rt_period_timer))
613 + now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
614 + hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
615 + hrtimer_start(&rt_b->rt_period_timer,
616 + rt_b->rt_period_timer.expires,
619 + spin_unlock(&rt_b->rt_runtime_lock);
622 +#ifdef CONFIG_RT_GROUP_SCHED
623 +static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
625 + hrtimer_cancel(&rt_b->rt_period_timer);
630 + * sched_domains_mutex serializes calls to arch_init_sched_domains,
631 + * detach_destroy_domains and partition_sched_domains.
633 +static DEFINE_MUTEX(sched_domains_mutex);
635 +#ifdef CONFIG_GROUP_SCHED
637 +#include <linux/cgroup.h>
641 +static LIST_HEAD(task_groups);
643 +/* task group related information */
645 +#ifdef CONFIG_CGROUP_SCHED
646 + struct cgroup_subsys_state css;
649 +#ifdef CONFIG_FAIR_GROUP_SCHED
650 + /* schedulable entities of this group on each cpu */
651 + struct sched_entity **se;
652 + /* runqueue "owned" by this group on each cpu */
653 + struct cfs_rq **cfs_rq;
654 + unsigned long shares;
657 +#ifdef CONFIG_RT_GROUP_SCHED
658 + struct sched_rt_entity **rt_se;
659 + struct rt_rq **rt_rq;
661 + struct rt_bandwidth rt_bandwidth;
664 + struct rcu_head rcu;
665 + struct list_head list;
667 + struct task_group *parent;
668 + struct list_head siblings;
669 + struct list_head children;
672 +#ifdef CONFIG_USER_SCHED
676 + * Every UID task group (including init_task_group aka UID-0) will
677 + * be a child to this group.
679 +struct task_group root_task_group;
681 +#ifdef CONFIG_FAIR_GROUP_SCHED
682 +/* Default task group's sched entity on each cpu */
683 +static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
684 +/* Default task group's cfs_rq on each cpu */
685 +static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
686 +#endif /* CONFIG_FAIR_GROUP_SCHED */
688 +#ifdef CONFIG_RT_GROUP_SCHED
689 +static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
690 +static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
691 +#endif /* CONFIG_RT_GROUP_SCHED */
692 +#else /* !CONFIG_FAIR_GROUP_SCHED */
693 +#define root_task_group init_task_group
694 +#endif /* CONFIG_FAIR_GROUP_SCHED */
696 +/* task_group_lock serializes add/remove of task groups and also changes to
697 + * a task group's cpu shares.
699 +static DEFINE_SPINLOCK(task_group_lock);
701 +#ifdef CONFIG_FAIR_GROUP_SCHED
702 +#ifdef CONFIG_USER_SCHED
703 +# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
704 +#else /* !CONFIG_USER_SCHED */
705 +# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
706 +#endif /* CONFIG_USER_SCHED */
709 + * A weight of 0 or 1 can cause arithmetics problems.
710 + * A weight of a cfs_rq is the sum of weights of which entities
711 + * are queued on this cfs_rq, so a weight of a entity should not be
712 + * too large, so as the shares value of a task group.
713 + * (The default weight is 1024 - so there's no practical
714 + * limitation from this.)
716 +#define MIN_SHARES 2
717 +#define MAX_SHARES (1UL << 18)
719 +static int init_task_group_load = INIT_TASK_GROUP_LOAD;
722 +/* Default task group.
723 + * Every task in system belong to this group at bootup.
725 +struct task_group init_task_group;
727 +/* return group to which a task belongs */
728 +static inline struct task_group *task_group(struct task_struct *p)
730 + struct task_group *tg;
732 +#ifdef CONFIG_USER_SCHED
734 +#elif defined(CONFIG_CGROUP_SCHED)
735 + tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
736 + struct task_group, css);
738 + tg = &init_task_group;
743 +/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
744 +static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
746 +#ifdef CONFIG_FAIR_GROUP_SCHED
747 + p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
748 + p->se.parent = task_group(p)->se[cpu];
751 +#ifdef CONFIG_RT_GROUP_SCHED
752 + p->rt.rt_rq = task_group(p)->rt_rq[cpu];
753 + p->rt.parent = task_group(p)->rt_se[cpu];
759 +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
760 +static inline struct task_group *task_group(struct task_struct *p)
765 +#endif /* CONFIG_GROUP_SCHED */
767 +/* CFS-related fields in a runqueue */
769 + struct load_weight load;
770 + unsigned long nr_running;
776 + struct rb_root tasks_timeline;
777 + struct rb_node *rb_leftmost;
779 + struct list_head tasks;
780 + struct list_head *balance_iterator;
783 + * 'curr' points to currently running entity on this cfs_rq.
784 + * It is set to NULL otherwise (i.e when none are currently running).
786 + struct sched_entity *curr, *next;
788 + unsigned long nr_spread_over;
790 +#ifdef CONFIG_FAIR_GROUP_SCHED
791 + struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
794 + * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
795 + * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
796 + * (like users, containers etc.)
798 + * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
799 + * list is used during load balance.
801 + struct list_head leaf_cfs_rq_list;
802 + struct task_group *tg; /* group that "owns" this runqueue */
806 + * the part of load.weight contributed by tasks
808 + unsigned long task_weight;
811 + * h_load = weight * f(tg)
813 + * Where f(tg) is the recursive weight fraction assigned to
816 + unsigned long h_load;
819 + * this cpu's part of tg->shares
821 + unsigned long shares;
824 + * load.weight at the time we set shares
826 + unsigned long rq_weight;
831 +/* Real-Time classes' related field in a runqueue: */
833 + struct rt_prio_array active;
834 + unsigned long rt_nr_running;
835 +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
836 + int highest_prio; /* highest queued rt task prio */
839 + unsigned long rt_nr_migratory;
845 + /* Nests inside the rq lock: */
846 + spinlock_t rt_runtime_lock;
848 +#ifdef CONFIG_RT_GROUP_SCHED
849 + unsigned long rt_nr_boosted;
852 + struct list_head leaf_rt_rq_list;
853 + struct task_group *tg;
854 + struct sched_rt_entity *rt_se;
861 + * We add the notion of a root-domain which will be used to define per-domain
862 + * variables. Each exclusive cpuset essentially defines an island domain by
863 + * fully partitioning the member cpus from any other cpuset. Whenever a new
864 + * exclusive cpuset is created, we also create and attach a new root-domain
868 +struct root_domain {
874 + * The "RT overload" flag: it gets set if a CPU has more than
875 + * one runnable RT task.
877 + cpumask_t rto_mask;
878 + atomic_t rto_count;
880 + struct cpupri cpupri;
885 + * By default the system creates a single root-domain with all cpus as
886 + * members (mimicking the global state we have today).
888 +static struct root_domain def_root_domain;
891 + unsigned long norm_time;
892 + unsigned long idle_time;
893 +#ifdef CONFIG_VSERVER_IDLETIME
896 +#ifdef CONFIG_VSERVER_HARDCPU
897 + struct list_head hold_queue;
898 + unsigned long nr_onhold;
903 + * This is the main, per-CPU runqueue data structure.
905 + * Locking rule: those places that want to lock multiple runqueues
906 + * (such as the load balancing or the thread migration code), lock
907 + * acquire operations must be ordered by ascending &runqueue.
910 + /* runqueue lock: */
914 + * nr_running and cpu_load should be in the same cacheline because
915 + * remote CPUs use both these fields when doing load calculation.
917 + unsigned long nr_running;
918 + #define CPU_LOAD_IDX_MAX 5
919 + unsigned long cpu_load[CPU_LOAD_IDX_MAX];
920 + unsigned char idle_at_tick;
922 + unsigned long last_tick_seen;
923 + unsigned char in_nohz_recently;
925 + /* capture load from *all* tasks on this cpu: */
926 + struct load_weight load;
927 + unsigned long nr_load_updates;
933 +#ifdef CONFIG_FAIR_GROUP_SCHED
934 + /* list of leaf cfs_rq on this cpu: */
935 + struct list_head leaf_cfs_rq_list;
937 +#ifdef CONFIG_RT_GROUP_SCHED
938 + struct list_head leaf_rt_rq_list;
942 + * This is part of a global counter where only the total sum
943 + * over all CPUs matters. A task can increase this counter on
944 + * one CPU and if it got migrated afterwards it may decrease
945 + * it on another CPU. Always updated under the runqueue lock:
947 + unsigned long nr_uninterruptible;
949 + struct task_struct *curr, *idle;
950 + unsigned long next_balance;
951 + struct mm_struct *prev_mm;
955 + atomic_t nr_iowait;
958 + struct root_domain *rd;
959 + struct sched_domain *sd;
961 + /* For active balancing */
962 + int active_balance;
964 + /* cpu of this runqueue: */
968 + unsigned long avg_load_per_task;
970 + struct task_struct *migration_thread;
971 + struct list_head migration_queue;
974 +#ifdef CONFIG_SCHED_HRTICK
976 + int hrtick_csd_pending;
977 + struct call_single_data hrtick_csd;
979 + struct hrtimer hrtick_timer;
982 +#ifdef CONFIG_SCHEDSTATS
983 + /* latency stats */
984 + struct sched_info rq_sched_info;
986 + /* sys_sched_yield() stats */
987 + unsigned int yld_exp_empty;
988 + unsigned int yld_act_empty;
989 + unsigned int yld_both_empty;
990 + unsigned int yld_count;
992 + /* schedule() stats */
993 + unsigned int sched_switch;
994 + unsigned int sched_count;
995 + unsigned int sched_goidle;
997 + /* try_to_wake_up() stats */
998 + unsigned int ttwu_count;
999 + unsigned int ttwu_local;
1002 + unsigned int bkl_count;
1006 +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
1008 +static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
1010 + rq->curr->sched_class->check_preempt_curr(rq, p);
1013 +static inline int cpu_of(struct rq *rq)
1023 + * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
1024 + * See detach_destroy_domains: synchronize_sched for details.
1026 + * The domain tree of any CPU may only be accessed from within
1027 + * preempt-disabled sections.
1029 +#define for_each_domain(cpu, __sd) \
1030 + for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
1032 +#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
1033 +#define this_rq() (&__get_cpu_var(runqueues))
1034 +#define task_rq(p) cpu_rq(task_cpu(p))
1035 +#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
1037 +static inline void update_rq_clock(struct rq *rq)
1039 + rq->clock = sched_clock_cpu(cpu_of(rq));
1043 + * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
1045 +#ifdef CONFIG_SCHED_DEBUG
1046 +# define const_debug __read_mostly
1048 +# define const_debug static const
1052 + * runqueue_is_locked
1054 + * Returns true if the current cpu runqueue is locked.
1055 + * This interface allows printk to be called with the runqueue lock
1056 + * held and know whether or not it is OK to wake up the klogd.
1058 +int runqueue_is_locked(void)
1060 + int cpu = get_cpu();
1061 + struct rq *rq = cpu_rq(cpu);
1064 + ret = spin_is_locked(&rq->lock);
1070 + * Debugging: various feature bits
1073 +#define SCHED_FEAT(name, enabled) \
1074 + __SCHED_FEAT_##name ,
1077 +#include "sched_features.h"
1082 +#define SCHED_FEAT(name, enabled) \
1083 + (1UL << __SCHED_FEAT_##name) * enabled |
1085 +const_debug unsigned int sysctl_sched_features =
1086 +#include "sched_features.h"
1091 +#ifdef CONFIG_SCHED_DEBUG
1092 +#define SCHED_FEAT(name, enabled) \
1095 +static __read_mostly char *sched_feat_names[] = {
1096 +#include "sched_features.h"
1102 +static int sched_feat_open(struct inode *inode, struct file *filp)
1104 + filp->private_data = inode->i_private;
1109 +sched_feat_read(struct file *filp, char __user *ubuf,
1110 + size_t cnt, loff_t *ppos)
1117 + for (i = 0; sched_feat_names[i]; i++) {
1118 + len += strlen(sched_feat_names[i]);
1122 + buf = kmalloc(len + 2, GFP_KERNEL);
1126 + for (i = 0; sched_feat_names[i]; i++) {
1127 + if (sysctl_sched_features & (1UL << i))
1128 + r += sprintf(buf + r, "%s ", sched_feat_names[i]);
1130 + r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
1133 + r += sprintf(buf + r, "\n");
1134 + WARN_ON(r >= len + 2);
1136 + r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
1144 +sched_feat_write(struct file *filp, const char __user *ubuf,
1145 + size_t cnt, loff_t *ppos)
1155 + if (copy_from_user(&buf, ubuf, cnt))
1160 + if (strncmp(buf, "NO_", 3) == 0) {
1165 + for (i = 0; sched_feat_names[i]; i++) {
1166 + int len = strlen(sched_feat_names[i]);
1168 + if (strncmp(cmp, sched_feat_names[i], len) == 0) {
1170 + sysctl_sched_features &= ~(1UL << i);
1172 + sysctl_sched_features |= (1UL << i);
1177 + if (!sched_feat_names[i])
1180 + filp->f_pos += cnt;
1185 +static struct file_operations sched_feat_fops = {
1186 + .open = sched_feat_open,
1187 + .read = sched_feat_read,
1188 + .write = sched_feat_write,
1191 +static __init int sched_init_debug(void)
1193 + debugfs_create_file("sched_features", 0644, NULL, NULL,
1194 + &sched_feat_fops);
1198 +late_initcall(sched_init_debug);
1202 +#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
1205 + * Number of tasks to iterate in a single balance run.
1206 + * Limited because this is done with IRQs disabled.
1208 +const_debug unsigned int sysctl_sched_nr_migrate = 32;
1211 + * ratelimit for updating the group shares.
1214 +unsigned int sysctl_sched_shares_ratelimit = 250000;
1217 + * period over which we measure -rt task cpu usage in us.
1220 +unsigned int sysctl_sched_rt_period = 1000000;
1222 +static __read_mostly int scheduler_running;
1225 + * part of the period that we allow rt tasks to run in us.
1228 +int sysctl_sched_rt_runtime = 950000;
1230 +static inline u64 global_rt_period(void)
1232 + return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
1235 +static inline u64 global_rt_runtime(void)
1237 + if (sysctl_sched_rt_runtime < 0)
1238 + return RUNTIME_INF;
1240 + return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
1243 +#ifndef prepare_arch_switch
1244 +# define prepare_arch_switch(next) do { } while (0)
1246 +#ifndef finish_arch_switch
1247 +# define finish_arch_switch(prev) do { } while (0)
1250 +static inline int task_current(struct rq *rq, struct task_struct *p)
1252 + return rq->curr == p;
1255 +#ifndef __ARCH_WANT_UNLOCKED_CTXSW
1256 +static inline int task_running(struct rq *rq, struct task_struct *p)
1258 + return task_current(rq, p);
1261 +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
1265 +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1267 +#ifdef CONFIG_DEBUG_SPINLOCK
1268 + /* this is a valid case when another task releases the spinlock */
1269 + rq->lock.owner = current;
1272 + * If we are tracking spinlock dependencies then we have to
1273 + * fix up the runqueue lock - which gets 'carried over' from
1274 + * prev into current:
1276 + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
1278 + spin_unlock_irq(&rq->lock);
1281 +#else /* __ARCH_WANT_UNLOCKED_CTXSW */
1282 +static inline int task_running(struct rq *rq, struct task_struct *p)
1287 + return task_current(rq, p);
1291 +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
1295 + * We can optimise this out completely for !SMP, because the
1296 + * SMP rebalancing from interrupt is the only thing that cares
1301 +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1302 + spin_unlock_irq(&rq->lock);
1304 + spin_unlock(&rq->lock);
1308 +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1312 + * After ->oncpu is cleared, the task can be moved to a different CPU.
1313 + * We must ensure this doesn't happen until the switch is completely
1319 +#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1320 + local_irq_enable();
1323 +#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
1326 + * __task_rq_lock - lock the runqueue a given task resides on.
1327 + * Must be called interrupts disabled.
1329 +static inline struct rq *__task_rq_lock(struct task_struct *p)
1330 + __acquires(rq->lock)
1333 + struct rq *rq = task_rq(p);
1334 + spin_lock(&rq->lock);
1335 + if (likely(rq == task_rq(p)))
1337 + spin_unlock(&rq->lock);
1342 + * task_rq_lock - lock the runqueue a given task resides on and disable
1343 + * interrupts. Note the ordering: we can safely lookup the task_rq without
1344 + * explicitly disabling preemption.
1346 +static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
1347 + __acquires(rq->lock)
1352 + local_irq_save(*flags);
1354 + spin_lock(&rq->lock);
1355 + if (likely(rq == task_rq(p)))
1357 + spin_unlock_irqrestore(&rq->lock, *flags);
1361 +static void __task_rq_unlock(struct rq *rq)
1362 + __releases(rq->lock)
1364 + spin_unlock(&rq->lock);
1367 +static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
1368 + __releases(rq->lock)
1370 + spin_unlock_irqrestore(&rq->lock, *flags);
1374 + * this_rq_lock - lock this runqueue and disable interrupts.
1376 +static struct rq *this_rq_lock(void)
1377 + __acquires(rq->lock)
1381 + local_irq_disable();
1383 + spin_lock(&rq->lock);
1388 +#ifdef CONFIG_SCHED_HRTICK
1390 + * Use HR-timers to deliver accurate preemption points.
1392 + * Its all a bit involved since we cannot program an hrt while holding the
1393 + * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a
1394 + * reschedule event.
1396 + * When we get rescheduled we reprogram the hrtick_timer outside of the
1401 + * Use hrtick when:
1402 + * - enabled by features
1403 + * - hrtimer is actually high res
1405 +static inline int hrtick_enabled(struct rq *rq)
1407 + if (!sched_feat(HRTICK))
1409 + if (!cpu_active(cpu_of(rq)))
1411 + return hrtimer_is_hres_active(&rq->hrtick_timer);
1414 +static void hrtick_clear(struct rq *rq)
1416 + if (hrtimer_active(&rq->hrtick_timer))
1417 + hrtimer_cancel(&rq->hrtick_timer);
1421 + * High-resolution timer tick.
1422 + * Runs from hardirq context with interrupts disabled.
1424 +static enum hrtimer_restart hrtick(struct hrtimer *timer)
1426 + struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1428 + WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1430 + spin_lock(&rq->lock);
1431 + update_rq_clock(rq);
1432 + rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1433 + spin_unlock(&rq->lock);
1435 + return HRTIMER_NORESTART;
1440 + * called from hardirq (IPI) context
1442 +static void __hrtick_start(void *arg)
1444 + struct rq *rq = arg;
1446 + spin_lock(&rq->lock);
1447 + hrtimer_restart(&rq->hrtick_timer);
1448 + rq->hrtick_csd_pending = 0;
1449 + spin_unlock(&rq->lock);
1453 + * Called to set the hrtick timer state.
1455 + * called with rq->lock held and irqs disabled
1457 +static void hrtick_start(struct rq *rq, u64 delay)
1459 + struct hrtimer *timer = &rq->hrtick_timer;
1460 + ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1462 + timer->expires = time;
1464 + if (rq == this_rq()) {
1465 + hrtimer_restart(timer);
1466 + } else if (!rq->hrtick_csd_pending) {
1467 + __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
1468 + rq->hrtick_csd_pending = 1;
1473 +hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1475 + int cpu = (int)(long)hcpu;
1478 + case CPU_UP_CANCELED:
1479 + case CPU_UP_CANCELED_FROZEN:
1480 + case CPU_DOWN_PREPARE:
1481 + case CPU_DOWN_PREPARE_FROZEN:
1483 + case CPU_DEAD_FROZEN:
1484 + hrtick_clear(cpu_rq(cpu));
1488 + return NOTIFY_DONE;
1491 +static __init void init_hrtick(void)
1493 + hotcpu_notifier(hotplug_hrtick, 0);
1497 + * Called to set the hrtick timer state.
1499 + * called with rq->lock held and irqs disabled
1501 +static void hrtick_start(struct rq *rq, u64 delay)
1503 + hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
1506 +static void init_hrtick(void)
1509 +#endif /* CONFIG_SMP */
1511 +static void init_rq_hrtick(struct rq *rq)
1514 + rq->hrtick_csd_pending = 0;
1516 + rq->hrtick_csd.flags = 0;
1517 + rq->hrtick_csd.func = __hrtick_start;
1518 + rq->hrtick_csd.info = rq;
1521 + hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1522 + rq->hrtick_timer.function = hrtick;
1523 + rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
1526 +static inline void hrtick_clear(struct rq *rq)
1530 +static inline void init_rq_hrtick(struct rq *rq)
1534 +static inline void init_hrtick(void)
1540 + * resched_task - mark a task 'to be rescheduled now'.
1542 + * On UP this means the setting of the need_resched flag, on SMP it
1543 + * might also involve a cross-CPU call to trigger the scheduler on
1548 +#ifndef tsk_is_polling
1549 +#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1552 +static void resched_task(struct task_struct *p)
1556 + assert_spin_locked(&task_rq(p)->lock);
1558 + if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
1561 + set_tsk_thread_flag(p, TIF_NEED_RESCHED);
1563 + cpu = task_cpu(p);
1564 + if (cpu == smp_processor_id())
1567 + /* NEED_RESCHED must be visible before we test polling */
1569 + if (!tsk_is_polling(p))
1570 + smp_send_reschedule(cpu);
1573 +static void resched_cpu(int cpu)
1575 + struct rq *rq = cpu_rq(cpu);
1576 + unsigned long flags;
1578 + if (!spin_trylock_irqsave(&rq->lock, flags))
1580 + resched_task(cpu_curr(cpu));
1581 + spin_unlock_irqrestore(&rq->lock, flags);
1584 +#ifdef CONFIG_NO_HZ
1586 + * When add_timer_on() enqueues a timer into the timer wheel of an
1587 + * idle CPU then this timer might expire before the next timer event
1588 + * which is scheduled to wake up that CPU. In case of a completely
1589 + * idle system the next event might even be infinite time into the
1590 + * future. wake_up_idle_cpu() ensures that the CPU is woken up and
1591 + * leaves the inner idle loop so the newly added timer is taken into
1592 + * account when the CPU goes back to idle and evaluates the timer
1593 + * wheel for the next timer event.
1595 +void wake_up_idle_cpu(int cpu)
1597 + struct rq *rq = cpu_rq(cpu);
1599 + if (cpu == smp_processor_id())
1603 + * This is safe, as this function is called with the timer
1604 + * wheel base lock of (cpu) held. When the CPU is on the way
1605 + * to idle and has not yet set rq->curr to idle then it will
1606 + * be serialized on the timer wheel base lock and take the new
1607 + * timer into account automatically.
1609 + if (rq->curr != rq->idle)
1613 + * We can set TIF_RESCHED on the idle task of the other CPU
1614 + * lockless. The worst case is that the other CPU runs the
1615 + * idle task through an additional NOOP schedule()
1617 + set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
1619 + /* NEED_RESCHED must be visible before we test polling */
1621 + if (!tsk_is_polling(rq->idle))
1622 + smp_send_reschedule(cpu);
1624 +#endif /* CONFIG_NO_HZ */
1626 +#else /* !CONFIG_SMP */
1627 +static void resched_task(struct task_struct *p)
1629 + assert_spin_locked(&task_rq(p)->lock);
1630 + set_tsk_need_resched(p);
1632 +#endif /* CONFIG_SMP */
1634 +#if BITS_PER_LONG == 32
1635 +# define WMULT_CONST (~0UL)
1637 +# define WMULT_CONST (1UL << 32)
1640 +#define WMULT_SHIFT 32
1643 + * Shift right and round:
1645 +#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1648 + * delta *= weight / lw
1650 +static unsigned long
1651 +calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1652 + struct load_weight *lw)
1656 + if (!lw->inv_weight) {
1657 + if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1658 + lw->inv_weight = 1;
1660 + lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
1664 + tmp = (u64)delta_exec * weight;
1666 + * Check whether we'd overflow the 64-bit multiplication:
1668 + if (unlikely(tmp > WMULT_CONST))
1669 + tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1672 + tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1674 + return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1677 +static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1679 + lw->weight += inc;
1680 + lw->inv_weight = 0;
1683 +static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1685 + lw->weight -= dec;
1686 + lw->inv_weight = 0;
1690 + * To aid in avoiding the subversion of "niceness" due to uneven distribution
1691 + * of tasks with abnormal "nice" values across CPUs the contribution that
1692 + * each task makes to its run queue's load is weighted according to its
1693 + * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1694 + * scaled version of the new time slice allocation that they receive on time
1695 + * slice expiry etc.
1698 +#define WEIGHT_IDLEPRIO 2
1699 +#define WMULT_IDLEPRIO (1 << 31)
1702 + * Nice levels are multiplicative, with a gentle 10% change for every
1703 + * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1704 + * nice 1, it will get ~10% less CPU time than another CPU-bound task
1705 + * that remained on nice 0.
1707 + * The "10% effect" is relative and cumulative: from _any_ nice level,
1708 + * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
1709 + * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1710 + * If a task goes up by ~10% and another task goes down by ~10% then
1711 + * the relative distance between them is ~25%.)
1713 +static const int prio_to_weight[40] = {
1714 + /* -20 */ 88761, 71755, 56483, 46273, 36291,
1715 + /* -15 */ 29154, 23254, 18705, 14949, 11916,
1716 + /* -10 */ 9548, 7620, 6100, 4904, 3906,
1717 + /* -5 */ 3121, 2501, 1991, 1586, 1277,
1718 + /* 0 */ 1024, 820, 655, 526, 423,
1719 + /* 5 */ 335, 272, 215, 172, 137,
1720 + /* 10 */ 110, 87, 70, 56, 45,
1721 + /* 15 */ 36, 29, 23, 18, 15,
1725 + * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1727 + * In cases where the weight does not change often, we can use the
1728 + * precalculated inverse to speed up arithmetics by turning divisions
1729 + * into multiplications:
1731 +static const u32 prio_to_wmult[40] = {
1732 + /* -20 */ 48388, 59856, 76040, 92818, 118348,
1733 + /* -15 */ 147320, 184698, 229616, 287308, 360437,
1734 + /* -10 */ 449829, 563644, 704093, 875809, 1099582,
1735 + /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
1736 + /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
1737 + /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
1738 + /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
1739 + /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
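The two tables above encode the '10% per nice level' rule and its precomputed reciprocals; a quick numeric check against the values already shown:

    /* prio_to_weight[20] (nice  0) = 1024
     * prio_to_weight[21] (nice +1) =  820  ~= 1024 / 1.25  (one step ~= 25% relative)
     *
     * prio_to_wmult[20] = 4194304 = 2^32 / 1024
     * which is what lets calc_delta_mine() evaluate
     *     delta_exec * weight / lw->weight
     * as  (delta_exec * weight * lw->inv_weight) >> 32
     * i.e. with multiplications and a shift in place of a division.
     */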
1742 +static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1745 + * runqueue iterator, to support SMP load-balancing between different
1746 + * scheduling classes, without having to expose their internal data
1747 + * structures to the load-balancing proper:
1749 +struct rq_iterator {
1751 + struct task_struct *(*start)(void *);
1752 + struct task_struct *(*next)(void *);
1756 +static unsigned long
1757 +balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1758 + unsigned long max_load_move, struct sched_domain *sd,
1759 + enum cpu_idle_type idle, int *all_pinned,
1760 + int *this_best_prio, struct rq_iterator *iterator);
1763 +iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1764 + struct sched_domain *sd, enum cpu_idle_type idle,
1765 + struct rq_iterator *iterator);
1768 +#ifdef CONFIG_CGROUP_CPUACCT
1769 +static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1771 +static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1774 +static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1776 + update_load_add(&rq->load, load);
1779 +static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1781 + update_load_sub(&rq->load, load);
1785 +static unsigned long source_load(int cpu, int type);
1786 +static unsigned long target_load(int cpu, int type);
1787 +static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1789 +static unsigned long cpu_avg_load_per_task(int cpu)
1791 + struct rq *rq = cpu_rq(cpu);
1793 + if (rq->nr_running)
1794 + rq->avg_load_per_task = rq->load.weight / rq->nr_running;
1796 + return rq->avg_load_per_task;
1799 +#ifdef CONFIG_FAIR_GROUP_SCHED
1801 +typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
1804 + * Iterate the full tree, calling @down when first entering a node and @up when
1805 + * leaving it for the final time.
1808 +walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
1810 + struct task_group *parent, *child;
1813 + parent = &root_task_group;
1815 + (*down)(parent, cpu, sd);
1816 + list_for_each_entry_rcu(child, &parent->children, siblings) {
1823 + (*up)(parent, cpu, sd);
1826 + parent = parent->parent;
1829 + rcu_read_unlock();
1832 +static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1835 + * Calculate and set the cpu's group shares.
1838 +__update_group_shares_cpu(struct task_group *tg, int cpu,
1839 + unsigned long sd_shares, unsigned long sd_rq_weight)
1842 + unsigned long shares;
1843 + unsigned long rq_weight;
1848 + rq_weight = tg->cfs_rq[cpu]->load.weight;
1851 + * If there are currently no tasks on the cpu pretend there is one of
1852 + * average load so that when a new task gets to run here it will not
1853 + * get delayed by group starvation.
1857 + rq_weight = NICE_0_LOAD;
1860 + if (unlikely(rq_weight > sd_rq_weight))
1861 + rq_weight = sd_rq_weight;
1864 + * \Sum shares * rq_weight
1865 + * shares = -----------------------
1869 + shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
1872 + * record the actual number of shares, not the boosted amount.
1874 + tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1875 + tg->cfs_rq[cpu]->rq_weight = rq_weight;
1877 + if (shares < MIN_SHARES)
1878 + shares = MIN_SHARES;
1879 + else if (shares > MAX_SHARES)
1880 + shares = MAX_SHARES;
1882 + __set_se_shares(tg->se[cpu], shares);
1886 + * Re-compute the task group their per cpu shares over the given domain.
1887 + * This needs to be done in a bottom-up fashion because the rq weight of a
1888 + * parent group depends on the shares of its child groups.
1891 +tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1893 + unsigned long rq_weight = 0;
1894 + unsigned long shares = 0;
1897 + for_each_cpu_mask(i, sd->span) {
1898 + rq_weight += tg->cfs_rq[i]->load.weight;
1899 + shares += tg->cfs_rq[i]->shares;
1902 + if ((!shares && rq_weight) || shares > tg->shares)
1903 + shares = tg->shares;
1905 + if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1906 + shares = tg->shares;
1909 + rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
1911 + for_each_cpu_mask(i, sd->span) {
1912 + struct rq *rq = cpu_rq(i);
1913 + unsigned long flags;
1915 + spin_lock_irqsave(&rq->lock, flags);
1916 + __update_group_shares_cpu(tg, i, shares, rq_weight);
1917 + spin_unlock_irqrestore(&rq->lock, flags);
1922 + * Compute the cpu's hierarchical load factor for each task group.
1923 + * This needs to be done in a top-down fashion because the load of a child
1924 + * group is a fraction of its parents load.
1927 +tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
1929 + unsigned long load;
1931 + if (!tg->parent) {
1932 + load = cpu_rq(cpu)->load.weight;
1934 + load = tg->parent->cfs_rq[cpu]->h_load;
1935 + load *= tg->cfs_rq[cpu]->shares;
1936 + load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1939 + tg->cfs_rq[cpu]->h_load = load;
1943 +tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
1947 +static void update_shares(struct sched_domain *sd)
1949 + u64 now = cpu_clock(raw_smp_processor_id());
1950 + s64 elapsed = now - sd->last_update;
1952 + if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1953 + sd->last_update = now;
1954 + walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
1958 +static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1960 + spin_unlock(&rq->lock);
1961 + update_shares(sd);
1962 + spin_lock(&rq->lock);
1965 +static void update_h_load(int cpu)
1967 + walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
1972 +static inline void update_shares(struct sched_domain *sd)
1976 +static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1984 +#ifdef CONFIG_FAIR_GROUP_SCHED
1985 +static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1988 + cfs_rq->shares = shares;
1993 +#include "sched_stats.h"
1994 +#include "sched_idletask.c"
1995 +#include "sched_fair.c"
1996 +#include "sched_rt.c"
1997 +#ifdef CONFIG_SCHED_DEBUG
1998 +# include "sched_debug.c"
2001 +#define sched_class_highest (&rt_sched_class)
2002 +#define for_each_class(class) \
2003 + for (class = sched_class_highest; class; class = class->next)
2005 +static void inc_nr_running(struct rq *rq)
2010 +static void dec_nr_running(struct rq *rq)
2015 +static void set_load_weight(struct task_struct *p)
2017 + if (task_has_rt_policy(p)) {
2018 + p->se.load.weight = prio_to_weight[0] * 2;
2019 + p->se.load.inv_weight = prio_to_wmult[0] >> 1;
2024 + * SCHED_IDLE tasks get minimal weight:
2026 + if (p->policy == SCHED_IDLE) {
2027 + p->se.load.weight = WEIGHT_IDLEPRIO;
2028 + p->se.load.inv_weight = WMULT_IDLEPRIO;
2032 + p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
2033 + p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
2036 +static void update_avg(u64 *avg, u64 sample)
2038 + s64 diff = sample - *avg;
2039 + *avg += diff >> 3;
2042 +static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
2044 + // BUG_ON(p->state & TASK_ONHOLD);
2045 + sched_info_queued(p);
2046 + p->sched_class->enqueue_task(rq, p, wakeup);
2050 +static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
2052 + if (sleep && p->se.last_wakeup) {
2053 + update_avg(&p->se.avg_overlap,
2054 + p->se.sum_exec_runtime - p->se.last_wakeup);
2055 + p->se.last_wakeup = 0;
2058 + sched_info_dequeued(p);
2059 + p->sched_class->dequeue_task(rq, p, sleep);
2064 + * __normal_prio - return the priority that is based on the static prio
2066 +static inline int __normal_prio(struct task_struct *p)
2068 + return p->static_prio;
2072 + * Calculate the expected normal priority: i.e. priority
2073 + * without taking RT-inheritance into account. Might be
2074 + * boosted by interactivity modifiers. Changes upon fork,
2075 + * setprio syscalls, and whenever the interactivity
2076 + * estimator recalculates.
2078 +static inline int normal_prio(struct task_struct *p)
2082 + if (task_has_rt_policy(p))
2083 + prio = MAX_RT_PRIO-1 - p->rt_priority;
2085 + prio = __normal_prio(p);
2090 + * Calculate the current priority, i.e. the priority
2091 + * taken into account by the scheduler. This value might
2092 + * be boosted by RT tasks, or might be boosted by
2093 + * interactivity modifiers. Will be RT if the task got
2094 + * RT-boosted. If not then it returns p->normal_prio.
2096 +static int effective_prio(struct task_struct *p)
2098 + p->normal_prio = normal_prio(p);
2100 + * If we are RT tasks or we were boosted to RT priority,
2101 + * keep the priority unchanged. Otherwise, update priority
2102 + * to the normal priority:
2104 + if (!rt_prio(p->prio))
2105 + return p->normal_prio;
2110 + * activate_task - move a task to the runqueue.
2112 +static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
2114 + if (task_contributes_to_load(p))
2115 + rq->nr_uninterruptible--;
2117 + enqueue_task(rq, p, wakeup);
2118 + inc_nr_running(rq);
2122 + * deactivate_task - remove a task from the runqueue.
2124 +static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
2126 + if (task_contributes_to_load(p))
2127 + rq->nr_uninterruptible++;
2129 + dequeue_task(rq, p, sleep);
2130 + dec_nr_running(rq);
2134 + * task_curr - is this task currently executing on a CPU?
2135 + * @p: the task in question.
2137 +inline int task_curr(const struct task_struct *p)
2139 + return cpu_curr(task_cpu(p)) == p;
2142 +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
2144 + set_task_rq(p, cpu);
2147 + * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
2148 + * successfuly executed on another CPU. We must ensure that updates of
2149 + * per-task data have been completed by this moment.
2152 + task_thread_info(p)->cpu = cpu;
2156 +static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2157 + const struct sched_class *prev_class,
2158 + int oldprio, int running)
2160 + if (prev_class != p->sched_class) {
2161 + if (prev_class->switched_from)
2162 + prev_class->switched_from(rq, p, running);
2163 + p->sched_class->switched_to(rq, p, running);
2165 + p->sched_class->prio_changed(rq, p, oldprio, running);
2170 +/* Used instead of source_load when we know the type == 0 */
2171 +static unsigned long weighted_cpuload(const int cpu)
2173 + return cpu_rq(cpu)->load.weight;
2177 + * Is this task likely cache-hot:
2180 +task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2185 + * Buddy candidates are cache hot:
2187 + if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next))
2190 + if (p->sched_class != &fair_sched_class)
2193 + if (sysctl_sched_migration_cost == -1)
2195 + if (sysctl_sched_migration_cost == 0)
2198 + delta = now - p->se.exec_start;
2200 + return delta < (s64)sysctl_sched_migration_cost;
2204 +void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2206 + int old_cpu = task_cpu(p);
2207 + struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
2208 + struct cfs_rq *old_cfsrq = task_cfs_rq(p),
2209 + *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
2212 + clock_offset = old_rq->clock - new_rq->clock;
2214 +#ifdef CONFIG_SCHEDSTATS
2215 + if (p->se.wait_start)
2216 + p->se.wait_start -= clock_offset;
2217 + if (p->se.sleep_start)
2218 + p->se.sleep_start -= clock_offset;
2219 + if (p->se.block_start)
2220 + p->se.block_start -= clock_offset;
2221 + if (old_cpu != new_cpu) {
2222 + schedstat_inc(p, se.nr_migrations);
2223 + if (task_hot(p, old_rq->clock, NULL))
2224 + schedstat_inc(p, se.nr_forced2_migrations);
2227 + p->se.vruntime -= old_cfsrq->min_vruntime -
2228 + new_cfsrq->min_vruntime;
2230 + __set_task_cpu(p, new_cpu);
2233 +struct migration_req {
2234 + struct list_head list;
2236 + struct task_struct *task;
2239 + struct completion done;
2242 +#include "sched_mon.h"
2246 + * The task's runqueue lock must be held.
2247 + * Returns true if you have to wait for migration thread.
2250 +migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2252 + struct rq *rq = task_rq(p);
2254 + vxm_migrate_task(p, rq, dest_cpu);
2256 + * If the task is not on a runqueue (and not running), then
2257 + * it is sufficient to simply update the task's cpu field.
2259 + if (!p->se.on_rq && !task_running(rq, p)) {
2260 + set_task_cpu(p, dest_cpu);
2264 + init_completion(&req->done);
2266 + req->dest_cpu = dest_cpu;
2267 + list_add(&req->list, &rq->migration_queue);
2273 + * wait_task_inactive - wait for a thread to unschedule.
2275 + * If @match_state is nonzero, it's the @p->state value just checked and
2276 + * not expected to change. If it changes, i.e. @p might have woken up,
2277 + * then return zero. When we succeed in waiting for @p to be off its CPU,
2278 + * we return a positive number (its total switch count). If a second call
2279 + * a short while later returns the same number, the caller can be sure that
2280 + * @p has remained unscheduled the whole time.
2282 + * The caller must ensure that the task *will* unschedule sometime soon,
2283 + * else this function might spin for a *long* time. This function can't
2284 + * be called with interrupts off, or it may introduce deadlock with
2285 + * smp_call_function() if an IPI is sent by the same process we are
2286 + * waiting to become inactive.
2288 +unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2290 + unsigned long flags;
2291 + int running, on_rq;
2292 + unsigned long ncsw;
2297 + * We do the initial early heuristics without holding
2298 + * any task-queue locks at all. We'll only try to get
2299 + * the runqueue lock when things look like they will
2305 + * If the task is actively running on another CPU
2306 + * still, just relax and busy-wait without holding
2309 + * NOTE! Since we don't hold any locks, it's not
2310 + * even sure that "rq" stays as the right runqueue!
2311 + * But we don't care, since "task_running()" will
2312 + * return false if the runqueue has changed and p
2313 + * is actually now running somewhere else!
2315 + while (task_running(rq, p)) {
2316 + if (match_state && unlikely(p->state != match_state))
2322 + * Ok, time to look more closely! We need the rq
2323 + * lock now, to be *sure*. If we're wrong, we'll
2324 + * just go back and repeat.
2326 + rq = task_rq_lock(p, &flags);
2327 + running = task_running(rq, p);
2328 + on_rq = p->se.on_rq;
2330 + if (!match_state || p->state == match_state) {
2331 + ncsw = p->nivcsw + p->nvcsw;
2332 + if (unlikely(!ncsw))
2335 + task_rq_unlock(rq, &flags);
2338 + * If it changed from the expected state, bail out now.
2340 + if (unlikely(!ncsw))
2344 + * Was it really running after all now that we
2345 + * checked with the proper locks actually held?
2347 + * Oops. Go back and try again..
2349 + if (unlikely(running)) {
2355 + * It's not enough that it's not actively running,
2356 + * it must be off the runqueue _entirely_, and not
2359 + * So if it was still runnable (but just not actively
2360 + * running right now), it's preempted, and we should
2361 + * yield - it could be a while.
2363 + if (unlikely(on_rq)) {
2364 + schedule_timeout_uninterruptible(1);
2369 + * Ahh, all good. It wasn't running, and it wasn't
2370 + * runnable, which means that it will never become
2371 + * running in the future either. We're all done!
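As a rough illustration of the switch-count contract documented above, a hypothetical caller (not part of this patch; the helper name below is made up) could confirm that a task stayed off its CPU across some operation like this:

	static int stayed_off_cpu(struct task_struct *p, long state)
	{
		unsigned long ncsw, ncsw2;

		ncsw = wait_task_inactive(p, state);
		if (!ncsw)
			return 0;	/* p->state changed under us: give up */

		/* ... act on p while it is known to be off its CPU ... */

		ncsw2 = wait_task_inactive(p, state);
		/* same nonzero count both times => p never ran in between */
		return ncsw2 == ncsw;
	}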
2380 + * kick_process - kick a running thread to enter/exit the kernel
2381 + * @p: the to-be-kicked thread
2383 + * Cause a process which is running on another CPU to enter
2384 + * kernel-mode, without any delay. (to get signals handled.)
2386 + * NOTE: this function doesn't have to take the runqueue lock,
2387 + * because all it wants to ensure is that the remote task enters
2388 + * the kernel. If the IPI races and the task has been migrated
2389 + * to another CPU then no harm is done and the purpose has been
2390 + * achieved as well.
2392 +void kick_process(struct task_struct *p)
2396 + preempt_disable();
2397 + cpu = task_cpu(p);
2398 + if ((cpu != smp_processor_id()) && task_curr(p))
2399 + smp_send_reschedule(cpu);
2404 + * Return a low guess at the load of a migration-source cpu weighted
2405 + * according to the scheduling class and "nice" value.
2407 + * We want to under-estimate the load of migration sources, to
2408 + * balance conservatively.
2410 +static unsigned long source_load(int cpu, int type)
2412 + struct rq *rq = cpu_rq(cpu);
2413 + unsigned long total = weighted_cpuload(cpu);
2415 + if (type == 0 || !sched_feat(LB_BIAS))
2418 + return min(rq->cpu_load[type-1], total);
2422 + * Return a high guess at the load of a migration-target cpu weighted
2423 + * according to the scheduling class and "nice" value.
2425 +static unsigned long target_load(int cpu, int type)
2427 + struct rq *rq = cpu_rq(cpu);
2428 + unsigned long total = weighted_cpuload(cpu);
2430 + if (type == 0 || !sched_feat(LB_BIAS))
2433 + return max(rq->cpu_load[type-1], total);
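A small worked example of the bias above (illustrative numbers): if cpu_load[type-1] is 2048 while the instantaneous weighted_cpuload() is 3072, the same CPU is reported as min(2048, 3072) = 2048 when considered as a migration source but as max(2048, 3072) = 3072 when considered as a target. Under-estimating sources and over-estimating targets means load is only moved when the imbalance survives both pessimistic guesses.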
2437 + * find_idlest_group finds and returns the least busy CPU group within the
2440 +static struct sched_group *
2441 +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2443 + struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
2444 + unsigned long min_load = ULONG_MAX, this_load = 0;
2445 + int load_idx = sd->forkexec_idx;
2446 + int imbalance = 100 + (sd->imbalance_pct-100)/2;
2449 + unsigned long load, avg_load;
2453 + /* Skip over this group if it has no CPUs allowed */
2454 + if (!cpus_intersects(group->cpumask, p->cpus_allowed))
2457 + local_group = cpu_isset(this_cpu, group->cpumask);
2459 + /* Tally up the load of all CPUs in the group */
2462 + for_each_cpu_mask_nr(i, group->cpumask) {
2463 + /* Bias balancing toward cpus of our domain */
2465 + load = source_load(i, load_idx);
2467 + load = target_load(i, load_idx);
2472 + /* Adjust by relative CPU power of the group */
2473 + avg_load = sg_div_cpu_power(group,
2474 + avg_load * SCHED_LOAD_SCALE);
2476 + if (local_group) {
2477 + this_load = avg_load;
2479 + } else if (avg_load < min_load) {
2480 + min_load = avg_load;
2483 + } while (group = group->next, group != sd->groups);
2485 + if (!idlest || 100*this_load < imbalance*min_load)
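To make the threshold above concrete (illustrative numbers): with sd->imbalance_pct = 125, imbalance = 100 + (125 - 100)/2 = 112, so find_idlest_group() returns NULL (i.e. stays with the local group) whenever 100*this_load < 112*min_load; a remote group is only chosen once the local group carries roughly 12% more load than the idlest candidate.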
2491 + * find_idlest_cpu - find the idlest cpu among the cpus in group.
2494 +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
2497 + unsigned long load, min_load = ULONG_MAX;
2501 + /* Traverse only the allowed CPUs */
2502 + cpus_and(*tmp, group->cpumask, p->cpus_allowed);
2504 + for_each_cpu_mask_nr(i, *tmp) {
2505 + load = weighted_cpuload(i);
2507 + if (load < min_load || (load == min_load && i == this_cpu)) {
2517 + * sched_balance_self: balance the current task (running on cpu) in domains
2518 + * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
2519 + * SD_BALANCE_EXEC.
2521 + * Balance, ie. select the least loaded group.
2523 + * Returns the target CPU number, or the same CPU if no balancing is needed.
2525 + * preempt must be disabled.
2527 +static int sched_balance_self(int cpu, int flag)
2529 + struct task_struct *t = current;
2530 + struct sched_domain *tmp, *sd = NULL;
2532 + for_each_domain(cpu, tmp) {
2534 + * If power savings logic is enabled for a domain, stop there.
2536 + if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2538 + if (tmp->flags & flag)
2543 + update_shares(sd);
2546 + cpumask_t span, tmpmask;
2547 + struct sched_group *group;
2548 + int new_cpu, weight;
2550 + if (!(sd->flags & flag)) {
2556 + group = find_idlest_group(sd, t, cpu);
2562 + new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask);
2563 + if (new_cpu == -1 || new_cpu == cpu) {
2564 + /* Now try balancing at a lower domain level of cpu */
2569 + /* Now try balancing at a lower domain level of new_cpu */
2572 + weight = cpus_weight(span);
2573 + for_each_domain(cpu, tmp) {
2574 + if (weight <= cpus_weight(tmp->span))
2576 + if (tmp->flags & flag)
2579 + /* while loop will break here if sd == NULL */
2585 +#endif /* CONFIG_SMP */
2588 + * try_to_wake_up - wake up a thread
2589 + * @p: the to-be-woken-up thread
2590 + * @state: the mask of task states that can be woken
2591 + * @sync: do a synchronous wakeup?
2593 + * Put it on the run-queue if it's not already there. The "current"
2594 + * thread is always on the run-queue (except when the actual
2595 + * re-schedule is in progress), and as such you're allowed to do
2596 + * the simpler "current->state = TASK_RUNNING" to mark yourself
2597 + * runnable without the overhead of this.
2599 + * returns failure only if the task is already active.
2601 +static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2603 + int cpu, orig_cpu, this_cpu, success = 0;
2604 + unsigned long flags;
2608 + if (!sched_feat(SYNC_WAKEUPS))
2612 + if (sched_feat(LB_WAKEUP_UPDATE)) {
2613 + struct sched_domain *sd;
2615 + this_cpu = raw_smp_processor_id();
2616 + cpu = task_cpu(p);
2618 + for_each_domain(this_cpu, sd) {
2619 + if (cpu_isset(cpu, sd->span)) {
2620 + update_shares(sd);
2628 + rq = task_rq_lock(p, &flags);
2629 + old_state = p->state;
2630 + if (!(old_state & state))
2636 + cpu = task_cpu(p);
2638 + this_cpu = smp_processor_id();
2641 + if (unlikely(task_running(rq, p)))
2642 + goto out_activate;
2644 + cpu = p->sched_class->select_task_rq(p, sync);
2645 + if (cpu != orig_cpu) {
2646 + set_task_cpu(p, cpu);
2647 + task_rq_unlock(rq, &flags);
2648 + /* might preempt at this point */
2649 + rq = task_rq_lock(p, &flags);
2650 + old_state = p->state;
2652 + /* we need to unhold suspended tasks
2653 + if (old_state & TASK_ONHOLD) {
2654 + vx_unhold_task(p, rq);
2655 + old_state = p->state;
2657 + if (!(old_state & state))
2662 + this_cpu = smp_processor_id();
2663 + cpu = task_cpu(p);
2666 +#ifdef CONFIG_SCHEDSTATS
2667 + schedstat_inc(rq, ttwu_count);
2668 + if (cpu == this_cpu)
2669 + schedstat_inc(rq, ttwu_local);
2671 + struct sched_domain *sd;
2672 + for_each_domain(this_cpu, sd) {
2673 + if (cpu_isset(cpu, sd->span)) {
2674 + schedstat_inc(sd, ttwu_wake_remote);
2679 +#endif /* CONFIG_SCHEDSTATS */
2682 +#endif /* CONFIG_SMP */
2683 + schedstat_inc(p, se.nr_wakeups);
2685 + schedstat_inc(p, se.nr_wakeups_sync);
2686 + if (orig_cpu != cpu)
2687 + schedstat_inc(p, se.nr_wakeups_migrate);
2688 + if (cpu == this_cpu)
2689 + schedstat_inc(p, se.nr_wakeups_local);
2691 + schedstat_inc(p, se.nr_wakeups_remote);
2692 + update_rq_clock(rq);
2693 + activate_task(rq, p, 1);
2697 + trace_mark(kernel_sched_wakeup,
2698 + "pid %d state %ld ## rq %p task %p rq->curr %p",
2699 + p->pid, p->state, rq, p, rq->curr);
2700 + check_preempt_curr(rq, p);
2702 + p->state = TASK_RUNNING;
2704 + if (p->sched_class->task_wake_up)
2705 + p->sched_class->task_wake_up(rq, p);
2708 + current->se.last_wakeup = current->se.sum_exec_runtime;
2710 + task_rq_unlock(rq, &flags);
2715 +int wake_up_process(struct task_struct *p)
2717 + return try_to_wake_up(p, TASK_ALL, 0);
2719 +EXPORT_SYMBOL(wake_up_process);
2721 +int wake_up_state(struct task_struct *p, unsigned int state)
2723 + return try_to_wake_up(p, state, 0);
2727 + * Perform scheduler related setup for a newly forked process p.
2728 + * p is forked by current.
2730 + * __sched_fork() is basic setup used by init_idle() too:
2732 +static void __sched_fork(struct task_struct *p)
2734 + p->se.exec_start = 0;
2735 + p->se.sum_exec_runtime = 0;
2736 + p->se.prev_sum_exec_runtime = 0;
2737 + p->se.last_wakeup = 0;
2738 + p->se.avg_overlap = 0;
2740 +#ifdef CONFIG_SCHEDSTATS
2741 + p->se.wait_start = 0;
2742 + p->se.sum_sleep_runtime = 0;
2743 + p->se.sleep_start = 0;
2744 + p->se.block_start = 0;
2745 + p->se.sleep_max = 0;
2746 + p->se.block_max = 0;
2747 + p->se.exec_max = 0;
2748 + p->se.slice_max = 0;
2749 + p->se.wait_max = 0;
2752 + INIT_LIST_HEAD(&p->rt.run_list);
2754 + INIT_LIST_HEAD(&p->se.group_node);
2756 +#ifdef CONFIG_PREEMPT_NOTIFIERS
2757 + INIT_HLIST_HEAD(&p->preempt_notifiers);
2761 + * We mark the process as running here, but have not actually
2762 + * inserted it onto the runqueue yet. This guarantees that
2763 + * nobody will actually run it, and a signal or other external
2764 + * event cannot wake it up and insert it on the runqueue either.
2766 + p->state = TASK_RUNNING;
2770 + * fork()/clone()-time setup:
2772 +void sched_fork(struct task_struct *p, int clone_flags)
2774 + int cpu = get_cpu();
2779 + cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
2781 + set_task_cpu(p, cpu);
2784 + * Make sure we do not leak PI boosting priority to the child:
2786 + p->prio = current->normal_prio;
2787 + if (!rt_prio(p->prio))
2788 + p->sched_class = &fair_sched_class;
2790 +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2791 + if (likely(sched_info_on()))
2792 + memset(&p->sched_info, 0, sizeof(p->sched_info));
2794 +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
2797 +#ifdef CONFIG_PREEMPT
2798 + /* Want to start with kernel preemption disabled. */
2799 + task_thread_info(p)->preempt_count = 1;
2805 + * wake_up_new_task - wake up a newly created task for the first time.
2807 + * This function will do some initial scheduler statistics housekeeping
2808 + * that must be done for every newly created context, then puts the task
2809 + * on the runqueue and wakes it.
2811 +void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2813 + unsigned long flags;
2816 + rq = task_rq_lock(p, &flags);
2817 + BUG_ON(p->state != TASK_RUNNING);
2818 + update_rq_clock(rq);
2820 + p->prio = effective_prio(p);
2822 + if (!p->sched_class->task_new || !current->se.on_rq) {
2823 + activate_task(rq, p, 0);
2826 + * Let the scheduling class do new task startup
2827 + * management (if any):
2829 + p->sched_class->task_new(rq, p);
2830 + inc_nr_running(rq);
2832 + trace_mark(kernel_sched_wakeup_new,
2833 + "pid %d state %ld ## rq %p task %p rq->curr %p",
2834 + p->pid, p->state, rq, p, rq->curr);
2835 + check_preempt_curr(rq, p);
2837 + if (p->sched_class->task_wake_up)
2838 + p->sched_class->task_wake_up(rq, p);
2840 + task_rq_unlock(rq, &flags);
2843 +#ifdef CONFIG_PREEMPT_NOTIFIERS
2846 + * preempt_notifier_register - tell me when current is being preempted & rescheduled
2847 + * @notifier: notifier struct to register
2849 +void preempt_notifier_register(struct preempt_notifier *notifier)
2851 + hlist_add_head(&notifier->link, &current->preempt_notifiers);
2853 +EXPORT_SYMBOL_GPL(preempt_notifier_register);
2856 + * preempt_notifier_unregister - no longer interested in preemption notifications
2857 + * @notifier: notifier struct to unregister
2859 + * This is safe to call from within a preemption notifier.
2861 +void preempt_notifier_unregister(struct preempt_notifier *notifier)
2863 + hlist_del(&notifier->link);
2865 +EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2867 +static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2869 + struct preempt_notifier *notifier;
2870 + struct hlist_node *node;
2872 + hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2873 + notifier->ops->sched_in(notifier, raw_smp_processor_id());
2877 +fire_sched_out_preempt_notifiers(struct task_struct *curr,
2878 + struct task_struct *next)
2880 + struct preempt_notifier *notifier;
2881 + struct hlist_node *node;
2883 + hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2884 + notifier->ops->sched_out(notifier, next);
2887 +#else /* !CONFIG_PREEMPT_NOTIFIERS */
2889 +static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2894 +fire_sched_out_preempt_notifiers(struct task_struct *curr,
2895 + struct task_struct *next)
2899 +#endif /* CONFIG_PREEMPT_NOTIFIERS */
2902 + * prepare_task_switch - prepare to switch tasks
2903 + * @rq: the runqueue preparing to switch
2904 + * @prev: the current task that is being switched out
2905 + * @next: the task we are going to switch to.
2907 + * This is called with the rq lock held and interrupts off. It must
2908 + * be paired with a subsequent finish_task_switch after the context
2911 + * prepare_task_switch sets up locking and calls architecture specific
2915 +prepare_task_switch(struct rq *rq, struct task_struct *prev,
2916 + struct task_struct *next)
2918 + fire_sched_out_preempt_notifiers(prev, next);
2919 + prepare_lock_switch(rq, next);
2920 + prepare_arch_switch(next);
2924 + * finish_task_switch - clean up after a task-switch
2925 + * @rq: runqueue associated with task-switch
2926 + * @prev: the thread we just switched away from.
2928 + * finish_task_switch must be called after the context switch, paired
2929 + * with a prepare_task_switch call before the context switch.
2930 + * finish_task_switch will reconcile locking set up by prepare_task_switch,
2931 + * and do any other architecture-specific cleanup actions.
2933 + * Note that we may have delayed dropping an mm in context_switch(). If
2934 + * so, we finish that here outside of the runqueue lock. (Doing it
2935 + * with the lock held can cause deadlocks; see schedule() for
2938 +static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2939 + __releases(rq->lock)
2941 + struct mm_struct *mm = rq->prev_mm;
2944 + rq->prev_mm = NULL;
2947 + * A task struct has one reference for its use as "current".
2948 + * If a task dies, then it sets TASK_DEAD in tsk->state and calls
2949 + * schedule one last time. The schedule call will never return, and
2950 + * the scheduled task must drop that reference.
2951 + * The test for TASK_DEAD must occur while the runqueue locks are
2952 + * still held, otherwise prev could be scheduled on another cpu, die
2953 + * there before we look at prev->state, and then the reference would
2954 + * be dropped twice.
2955 + * Manfred Spraul <manfred@colorfullife.com>
2957 + prev_state = prev->state;
2958 + finish_arch_switch(prev);
2959 + finish_lock_switch(rq, prev);
2961 + if (current->sched_class->post_schedule)
2962 + current->sched_class->post_schedule(rq);
2965 + fire_sched_in_preempt_notifiers(current);
2968 + if (unlikely(prev_state == TASK_DEAD)) {
2970 + * Remove function-return probe instances associated with this
2971 + * task and put them back on the free list.
2973 + kprobe_flush_task(prev);
2974 + put_task_struct(prev);
2979 + * schedule_tail - first thing a freshly forked thread must call.
2980 + * @prev: the thread we just switched away from.
2982 +asmlinkage void schedule_tail(struct task_struct *prev)
2983 + __releases(rq->lock)
2985 + struct rq *rq = this_rq();
2987 + finish_task_switch(rq, prev);
2988 +#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2989 + /* In this case, finish_task_switch does not reenable preemption */
2992 + if (current->set_child_tid)
2993 + put_user(task_pid_vnr(current), current->set_child_tid);
2997 + * context_switch - switch to the new MM and the new
2998 + * thread's register state.
3001 +context_switch(struct rq *rq, struct task_struct *prev,
3002 + struct task_struct *next)
3004 + struct mm_struct *mm, *oldmm;
3006 + prepare_task_switch(rq, prev, next);
3007 + trace_mark(kernel_sched_schedule,
3008 + "prev_pid %d next_pid %d prev_state %ld "
3009 + "## rq %p prev %p next %p",
3010 + prev->pid, next->pid, prev->state,
3013 + oldmm = prev->active_mm;
3015 + * For paravirt, this is coupled with an exit in switch_to to
3016 + * combine the page table reload and the switch backend into
3019 + arch_enter_lazy_cpu_mode();
3021 + if (unlikely(!mm)) {
3022 + next->active_mm = oldmm;
3023 + atomic_inc(&oldmm->mm_count);
3024 + enter_lazy_tlb(oldmm, next);
3026 + switch_mm(oldmm, mm, next);
3028 + if (unlikely(!prev->mm)) {
3029 + prev->active_mm = NULL;
3030 + rq->prev_mm = oldmm;
3033 + * The runqueue lock will be released by the next
3034 + * task (which is an invalid locking op but in the case
3035 + * of the scheduler it's an obvious special-case), so we
3036 + * do an early lockdep release here:
3038 +#ifndef __ARCH_WANT_UNLOCKED_CTXSW
3039 + spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
3042 + /* Here we just switch the register state and the stack. */
3043 + switch_to(prev, next, prev);
3047 + * this_rq must be evaluated again because prev may have moved
3048 + * CPUs since it called schedule(), thus the 'rq' on its stack
3049 + * frame will be invalid.
3051 + finish_task_switch(this_rq(), prev);
3055 + * nr_running, nr_uninterruptible and nr_context_switches:
3057 + * externally visible scheduler statistics: current number of runnable
3058 + * threads, current number of uninterruptible-sleeping threads, total
3059 + * number of context switches performed since bootup.
3061 +unsigned long nr_running(void)
3063 + unsigned long i, sum = 0;
3065 + for_each_online_cpu(i)
3066 + sum += cpu_rq(i)->nr_running;
3071 +unsigned long nr_uninterruptible(void)
3073 + unsigned long i, sum = 0;
3075 + for_each_possible_cpu(i)
3076 + sum += cpu_rq(i)->nr_uninterruptible;
3079 + * Since we read the counters lockless, it might be slightly
3080 + * inaccurate. Do not allow it to go below zero though:
3082 + if (unlikely((long)sum < 0))
3088 +unsigned long long nr_context_switches(void)
3091 + unsigned long long sum = 0;
3093 + for_each_possible_cpu(i)
3094 + sum += cpu_rq(i)->nr_switches;
3099 +unsigned long nr_iowait(void)
3101 + unsigned long i, sum = 0;
3103 + for_each_possible_cpu(i)
3104 + sum += atomic_read(&cpu_rq(i)->nr_iowait);
3109 +unsigned long nr_active(void)
3111 + unsigned long i, running = 0, uninterruptible = 0;
3113 + for_each_online_cpu(i) {
3114 + running += cpu_rq(i)->nr_running;
3115 + uninterruptible += cpu_rq(i)->nr_uninterruptible;
3118 + if (unlikely((long)uninterruptible < 0))
3119 + uninterruptible = 0;
3121 + return running + uninterruptible;
3125 + * Update rq->cpu_load[] statistics. This function is usually called every
3126 + * scheduler tick (TICK_NSEC).
3128 +static void update_cpu_load(struct rq *this_rq)
3130 + unsigned long this_load = this_rq->load.weight;
3133 + this_rq->nr_load_updates++;
3135 + /* Update our load: */
3136 + for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3137 + unsigned long old_load, new_load;
3139 + /* scale is effectively 1 << i now, and >> i divides by scale */
3141 + old_load = this_rq->cpu_load[i];
3142 + new_load = this_load;
3144 + * Round up the averaging division if load is increasing. This
3145 + * prevents us from getting stuck on 9 if the load is 10, for
3148 + if (new_load > old_load)
3149 + new_load += scale-1;
3150 + this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
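The decay above can be checked by hand (illustrative numbers). For i = 2, scale = 4 and the update is cpu_load[2] = (old*3 + new) >> 2. With old = 8 and a rising load of new = 16 (rounded up to 19 by the scale-1 adjustment), the result is (24 + 19) >> 2 = 10, creeping toward 16; with old = 16 and new = 8 the result is (48 + 8) >> 2 = 14, decaying slowly. Higher indices therefore track progressively longer-term averages of the runqueue load.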
3157 + * double_rq_lock - safely lock two runqueues
3159 + * Note this does not disable interrupts like task_rq_lock,
3160 + * you need to do so manually before calling.
3162 +static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3163 + __acquires(rq1->lock)
3164 + __acquires(rq2->lock)
3166 + BUG_ON(!irqs_disabled());
3168 + spin_lock(&rq1->lock);
3169 + __acquire(rq2->lock); /* Fake it out ;) */
3172 + spin_lock(&rq1->lock);
3173 + spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3175 + spin_lock(&rq2->lock);
3176 + spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3179 + update_rq_clock(rq1);
3180 + update_rq_clock(rq2);
3184 + * double_rq_unlock - safely unlock two runqueues
3186 + * Note this does not restore interrupts like task_rq_unlock,
3187 + * you need to do so manually after calling.
3189 +static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3190 + __releases(rq1->lock)
3191 + __releases(rq2->lock)
3193 + spin_unlock(&rq1->lock);
3195 + spin_unlock(&rq2->lock);
3197 + __release(rq2->lock);
3201 + * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
3203 +static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
3204 + __releases(this_rq->lock)
3205 + __acquires(busiest->lock)
3206 + __acquires(this_rq->lock)
3210 + if (unlikely(!irqs_disabled())) {
3211 + /* printk() doesn't work well under rq->lock */
3212 + spin_unlock(&this_rq->lock);
3215 + if (unlikely(!spin_trylock(&busiest->lock))) {
3216 + if (busiest < this_rq) {
3217 + spin_unlock(&this_rq->lock);
3218 + spin_lock(&busiest->lock);
3219 + spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
3222 + spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
3227 +static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
3228 + __releases(busiest->lock)
3230 + spin_unlock(&busiest->lock);
3231 + lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
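A concrete deadlock these address-ordered helpers avoid (illustrative scenario): CPU 0 balances from rq B into rq A while CPU 1 balances from rq A into rq B. Without a fixed order, CPU 0 could hold A's lock waiting for B while CPU 1 holds B's lock waiting for A. Because double_rq_lock() and double_lock_balance() always take the lower-addressed runqueue lock first (using SINGLE_DEPTH_NESTING for the second lock so lockdep accepts it), one CPU wins and the other simply spins. Note that double_lock_balance() may drop this_rq->lock to respect that order, so callers should not assume runqueue state is unchanged across the call.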
3235 + * If dest_cpu is allowed for this process, migrate the task to it.
3236 + * This is accomplished by forcing the cpu_allowed mask to only
3237 + * allow dest_cpu, which will force the cpu onto dest_cpu. Then
3238 + * the cpu_allowed mask is restored.
3240 +static void sched_migrate_task(struct task_struct *p, int dest_cpu)
3242 + struct migration_req req;
3243 + unsigned long flags;
3246 + rq = task_rq_lock(p, &flags);
3247 + if (!cpu_isset(dest_cpu, p->cpus_allowed)
3248 + || unlikely(!cpu_active(dest_cpu)))
3251 + /* force the process onto the specified CPU */
3252 + if (migrate_task(p, dest_cpu, &req)) {
3253 + /* Need to wait for migration thread (might exit: take ref). */
3254 + struct task_struct *mt = rq->migration_thread;
3256 + get_task_struct(mt);
3257 + task_rq_unlock(rq, &flags);
3258 + wake_up_process(mt);
3259 + put_task_struct(mt);
3260 + wait_for_completion(&req.done);
3265 + task_rq_unlock(rq, &flags);
3269 + * sched_exec - execve() is a valuable balancing opportunity, because at
3270 + * this point the task has the smallest effective memory and cache footprint.
3272 +void sched_exec(void)
3274 + int new_cpu, this_cpu = get_cpu();
3275 + new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
3277 + if (new_cpu != this_cpu)
3278 + sched_migrate_task(current, new_cpu);
3282 + * pull_task - move a task from a remote runqueue to the local runqueue.
3283 + * Both runqueues must be locked.
3285 +static void pull_task(struct rq *src_rq, struct task_struct *p,
3286 + struct rq *this_rq, int this_cpu)
3288 + deactivate_task(src_rq, p, 0);
3289 + set_task_cpu(p, this_cpu);
3290 + activate_task(this_rq, p, 0);
3292 + * Note that idle threads have a prio of MAX_PRIO, so this test
3293 + * will always be true for them.
3295 + check_preempt_curr(this_rq, p);
3299 + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3302 +int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3303 + struct sched_domain *sd, enum cpu_idle_type idle,
3307 + * We do not migrate tasks that are:
3308 + * 1) running (obviously), or
3309 + * 2) cannot be migrated to this CPU due to cpus_allowed, or
3310 + * 3) are cache-hot on their current CPU.
3312 + if (!cpu_isset(this_cpu, p->cpus_allowed)) {
3313 + schedstat_inc(p, se.nr_failed_migrations_affine);
3318 + if (task_running(rq, p)) {
3319 + schedstat_inc(p, se.nr_failed_migrations_running);
3324 + * Aggressive migration if:
3325 + * 1) task is cache cold, or
3326 + * 2) too many balance attempts have failed.
3329 + if (!task_hot(p, rq->clock, sd) ||
3330 + sd->nr_balance_failed > sd->cache_nice_tries) {
3331 +#ifdef CONFIG_SCHEDSTATS
3332 + if (task_hot(p, rq->clock, sd)) {
3333 + schedstat_inc(sd, lb_hot_gained[idle]);
3334 + schedstat_inc(p, se.nr_forced_migrations);
3340 + if (task_hot(p, rq->clock, sd)) {
3341 + schedstat_inc(p, se.nr_failed_migrations_hot);
3347 +static unsigned long
3348 +balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3349 + unsigned long max_load_move, struct sched_domain *sd,
3350 + enum cpu_idle_type idle, int *all_pinned,
3351 + int *this_best_prio, struct rq_iterator *iterator)
3353 + int loops = 0, pulled = 0, pinned = 0;
3354 + struct task_struct *p;
3355 + long rem_load_move = max_load_move;
3357 + if (max_load_move == 0)
3363 + * Start the load-balancing iterator:
3365 + p = iterator->start(iterator->arg);
3367 + if (!p || loops++ > sysctl_sched_nr_migrate)
3370 + if ((p->se.load.weight >> 1) > rem_load_move ||
3371 + !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3372 + p = iterator->next(iterator->arg);
3376 + pull_task(busiest, p, this_rq, this_cpu);
3378 + rem_load_move -= p->se.load.weight;
3381 + * We only want to steal up to the prescribed amount of weighted load.
3383 + if (rem_load_move > 0) {
3384 + if (p->prio < *this_best_prio)
3385 + *this_best_prio = p->prio;
3386 + p = iterator->next(iterator->arg);
3391 + * Right now, this is one of only two places pull_task() is called,
3392 + * so we can safely collect pull_task() stats here rather than
3393 + * inside pull_task().
3395 + schedstat_add(sd, lb_gained[idle], pulled);
3398 + *all_pinned = pinned;
3400 + return max_load_move - rem_load_move;
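The weight test above caps how far a single task can overshoot the budget (illustrative numbers): with rem_load_move = 1024, a nice-0 task of weight 1024 (weight >> 1 = 512) is still eligible, while a task of weight 3072 (weight >> 1 = 1536 > 1024) is skipped, so one heavy task cannot blow far past the requested max_load_move; sysctl_sched_nr_migrate additionally bounds how many tasks are examined per call.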
3404 + * move_tasks tries to move up to max_load_move weighted load from busiest to
3405 + * this_rq, as part of a balancing operation within domain "sd".
3406 + * Returns 1 if successful and 0 otherwise.
3408 + * Called with both runqueues locked.
3410 +static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3411 + unsigned long max_load_move,
3412 + struct sched_domain *sd, enum cpu_idle_type idle,
3415 + const struct sched_class *class = sched_class_highest;
3416 + unsigned long total_load_moved = 0;
3417 + int this_best_prio = this_rq->curr->prio;
3420 + total_load_moved +=
3421 + class->load_balance(this_rq, this_cpu, busiest,
3422 + max_load_move - total_load_moved,
3423 + sd, idle, all_pinned, &this_best_prio);
3424 + class = class->next;
3426 + if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3429 + } while (class && max_load_move > total_load_moved);
3431 + return total_load_moved > 0;
3435 +iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3436 + struct sched_domain *sd, enum cpu_idle_type idle,
3437 + struct rq_iterator *iterator)
3439 + struct task_struct *p = iterator->start(iterator->arg);
3443 + if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3444 + pull_task(busiest, p, this_rq, this_cpu);
3446 + * Right now, this is only the second place pull_task()
3447 + * is called, so we can safely collect pull_task()
3448 + * stats here rather than inside pull_task().
3450 + schedstat_inc(sd, lb_gained[idle]);
3454 + p = iterator->next(iterator->arg);
3461 + * move_one_task tries to move exactly one task from busiest to this_rq, as
3462 + * part of active balancing operations within "domain".
3463 + * Returns 1 if successful and 0 otherwise.
3465 + * Called with both runqueues locked.
3467 +static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3468 + struct sched_domain *sd, enum cpu_idle_type idle)
3470 + const struct sched_class *class;
3472 + for (class = sched_class_highest; class; class = class->next)
3473 + if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3480 + * find_busiest_group finds and returns the busiest CPU group within the
3481 + * domain. It calculates and returns the amount of weighted load which
3482 + * should be moved to restore balance via the imbalance parameter.
3484 +static struct sched_group *
3485 +find_busiest_group(struct sched_domain *sd, int this_cpu,
3486 + unsigned long *imbalance, enum cpu_idle_type idle,
3487 + int *sd_idle, const cpumask_t *cpus, int *balance)
3489 + struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
3490 + unsigned long max_load, avg_load, total_load, this_load, total_pwr;
3491 + unsigned long max_pull;
3492 + unsigned long busiest_load_per_task, busiest_nr_running;
3493 + unsigned long this_load_per_task, this_nr_running;
3494 + int load_idx, group_imb = 0;
3495 +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3496 + int power_savings_balance = 1;
3497 + unsigned long leader_nr_running = 0, min_load_per_task = 0;
3498 + unsigned long min_nr_running = ULONG_MAX;
3499 + struct sched_group *group_min = NULL, *group_leader = NULL;
3502 + max_load = this_load = total_load = total_pwr = 0;
3503 + busiest_load_per_task = busiest_nr_running = 0;
3504 + this_load_per_task = this_nr_running = 0;
3506 + if (idle == CPU_NOT_IDLE)
3507 + load_idx = sd->busy_idx;
3508 + else if (idle == CPU_NEWLY_IDLE)
3509 + load_idx = sd->newidle_idx;
3511 + load_idx = sd->idle_idx;
3514 + unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
3517 + int __group_imb = 0;
3518 + unsigned int balance_cpu = -1, first_idle_cpu = 0;
3519 + unsigned long sum_nr_running, sum_weighted_load;
3520 + unsigned long sum_avg_load_per_task;
3521 + unsigned long avg_load_per_task;
3523 + local_group = cpu_isset(this_cpu, group->cpumask);
3526 + balance_cpu = first_cpu(group->cpumask);
3528 + /* Tally up the load of all CPUs in the group */
3529 + sum_weighted_load = sum_nr_running = avg_load = 0;
3530 + sum_avg_load_per_task = avg_load_per_task = 0;
3533 + min_cpu_load = ~0UL;
3535 + for_each_cpu_mask_nr(i, group->cpumask) {
3538 + if (!cpu_isset(i, *cpus))
3543 + if (*sd_idle && rq->nr_running)
3546 + /* Bias balancing toward cpus of our domain */
3547 + if (local_group) {
3548 + if (idle_cpu(i) && !first_idle_cpu) {
3549 + first_idle_cpu = 1;
3553 + load = target_load(i, load_idx);
3555 + load = source_load(i, load_idx);
3556 + if (load > max_cpu_load)
3557 + max_cpu_load = load;
3558 + if (min_cpu_load > load)
3559 + min_cpu_load = load;
3563 + sum_nr_running += rq->nr_running;
3564 + sum_weighted_load += weighted_cpuload(i);
3566 + sum_avg_load_per_task += cpu_avg_load_per_task(i);
3570 + * The first idle cpu or the first cpu (busiest) in this sched group
3571 + * is eligible for doing load balancing at this and higher
3572 + * domains. In the newly idle case, we allow all the cpus
3573 + * to do the newly idle load balance.
3575 + if (idle != CPU_NEWLY_IDLE && local_group &&
3576 + balance_cpu != this_cpu && balance) {
3581 + total_load += avg_load;
3582 + total_pwr += group->__cpu_power;
3584 + /* Adjust by relative CPU power of the group */
3585 + avg_load = sg_div_cpu_power(group,
3586 + avg_load * SCHED_LOAD_SCALE);
3590 + * Consider the group unbalanced when the imbalance is larger
3591 + * than the average weight of two tasks.
3593 + * APZ: with cgroup the avg task weight can vary wildly and
3594 + * might not be a suitable number - should we keep a
3595 + * normalized nr_running number somewhere that negates
3598 + avg_load_per_task = sg_div_cpu_power(group,
3599 + sum_avg_load_per_task * SCHED_LOAD_SCALE);
3601 + if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3604 + group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
3606 + if (local_group) {
3607 + this_load = avg_load;
3609 + this_nr_running = sum_nr_running;
3610 + this_load_per_task = sum_weighted_load;
3611 + } else if (avg_load > max_load &&
3612 + (sum_nr_running > group_capacity || __group_imb)) {
3613 + max_load = avg_load;
3615 + busiest_nr_running = sum_nr_running;
3616 + busiest_load_per_task = sum_weighted_load;
3617 + group_imb = __group_imb;
3620 +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3622 + * Busy processors will not participate in power savings
3625 + if (idle == CPU_NOT_IDLE ||
3626 + !(sd->flags & SD_POWERSAVINGS_BALANCE))
3630 + * If the local group is idle or completely loaded
3631 + * no need to do power savings balance at this domain
3633 + if (local_group && (this_nr_running >= group_capacity ||
3634 + !this_nr_running))
3635 + power_savings_balance = 0;
3638 + * If a group is already running at full capacity or idle,
3639 + * don't include that group in power savings calculations
3641 + if (!power_savings_balance || sum_nr_running >= group_capacity
3642 + || !sum_nr_running)
3646 + * Calculate the group which has the least non-idle load.
3647 + * This is the group from where we need to pick up the load
3648 + * for saving power
3650 + if ((sum_nr_running < min_nr_running) ||
3651 + (sum_nr_running == min_nr_running &&
3652 + first_cpu(group->cpumask) <
3653 + first_cpu(group_min->cpumask))) {
3654 + group_min = group;
3655 + min_nr_running = sum_nr_running;
3656 + min_load_per_task = sum_weighted_load /
3661 + * Calculate the group which is nearly at its
3662 + * capacity but still has some room to pick up some load
3663 + * from other groups and save more power
3665 + if (sum_nr_running <= group_capacity - 1) {
3666 + if (sum_nr_running > leader_nr_running ||
3667 + (sum_nr_running == leader_nr_running &&
3668 + first_cpu(group->cpumask) >
3669 + first_cpu(group_leader->cpumask))) {
3670 + group_leader = group;
3671 + leader_nr_running = sum_nr_running;
3676 + group = group->next;
3677 + } while (group != sd->groups);
3679 + if (!busiest || this_load >= max_load || busiest_nr_running == 0)
3680 + goto out_balanced;
3682 + avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
3684 + if (this_load >= avg_load ||
3685 + 100*max_load <= sd->imbalance_pct*this_load)
3686 + goto out_balanced;
3688 + busiest_load_per_task /= busiest_nr_running;
3690 + busiest_load_per_task = min(busiest_load_per_task, avg_load);
3693 + * We're trying to get all the cpus to the average_load, so we don't
3694 + * want to push ourselves above the average load, nor do we wish to
3695 + * reduce the max loaded cpu below the average load, as either of these
3696 + * actions would just result in more rebalancing later, and ping-pong
3697 + * tasks around. Thus we look for the minimum possible imbalance.
3698 + * Negative imbalances (*we* are more loaded than anyone else) will
3699 + * be counted as no imbalance for these purposes -- we can't fix that
3700 + * by pulling tasks to us. Be careful of negative numbers as they'll
3701 + * appear as very large values with unsigned longs.
3703 + if (max_load <= busiest_load_per_task)
3704 + goto out_balanced;
3707 + * In the presence of smp nice balancing, certain scenarios can have
3708 + * max load less than avg load (as we skip the groups at or below
3709 + * its cpu_power while calculating max_load).
3711 + if (max_load < avg_load) {
3713 + goto small_imbalance;
3716 + /* Don't want to pull so many tasks that a group would go idle */
3717 + max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
3719 + /* How much load to actually move to equalise the imbalance */
3720 + *imbalance = min(max_pull * busiest->__cpu_power,
3721 + (avg_load - this_load) * this->__cpu_power)
3722 + / SCHED_LOAD_SCALE;
3725 + * if *imbalance is less than the average load per runnable task
3726 + * there is no guarantee that any tasks will be moved, so we'll have
3727 + * a think about bumping its value to force at least one task to be
3730 + if (*imbalance < busiest_load_per_task) {
3731 + unsigned long tmp, pwr_now, pwr_move;
3732 + unsigned int imbn;
3735 + pwr_move = pwr_now = 0;
3737 + if (this_nr_running) {
3738 + this_load_per_task /= this_nr_running;
3739 + if (busiest_load_per_task > this_load_per_task)
3742 + this_load_per_task = cpu_avg_load_per_task(this_cpu);
3744 + if (max_load - this_load + 2*busiest_load_per_task >=
3745 + busiest_load_per_task * imbn) {
3746 + *imbalance = busiest_load_per_task;
3751 + * OK, we don't have enough imbalance to justify moving tasks,
3752 + * however we may be able to increase total CPU power used by
3756 + pwr_now += busiest->__cpu_power *
3757 + min(busiest_load_per_task, max_load);
3758 + pwr_now += this->__cpu_power *
3759 + min(this_load_per_task, this_load);
3760 + pwr_now /= SCHED_LOAD_SCALE;
3762 + /* Amount of load we'd subtract */
3763 + tmp = sg_div_cpu_power(busiest,
3764 + busiest_load_per_task * SCHED_LOAD_SCALE);
3765 + if (max_load > tmp)
3766 + pwr_move += busiest->__cpu_power *
3767 + min(busiest_load_per_task, max_load - tmp);
3769 + /* Amount of load we'd add */
3770 + if (max_load * busiest->__cpu_power <
3771 + busiest_load_per_task * SCHED_LOAD_SCALE)
3772 + tmp = sg_div_cpu_power(this,
3773 + max_load * busiest->__cpu_power);
3775 + tmp = sg_div_cpu_power(this,
3776 + busiest_load_per_task * SCHED_LOAD_SCALE);
3777 + pwr_move += this->__cpu_power *
3778 + min(this_load_per_task, this_load + tmp);
3779 + pwr_move /= SCHED_LOAD_SCALE;
3781 + /* Move if we gain throughput */
3782 + if (pwr_move > pwr_now)
3783 + *imbalance = busiest_load_per_task;
3789 +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3790 + if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3793 + if (this == group_leader && group_leader != group_min) {
3794 + *imbalance = min_load_per_task;
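Plugging illustrative numbers into the imbalance computation above (both groups at the default __cpu_power of SCHED_LOAD_SCALE, so the power factors cancel): with avg_load = 1500, this_load = 1000, max_load = 2200 and busiest_load_per_task = 900, max_pull = min(2200 - 1500, 2200 - 900) = 700 and *imbalance = min(700, 1500 - 1000) = 500. In other words, we never try to pull more than would raise this group above the average, nor more than would drop the busiest group below it, which is exactly the ping-pong avoidance described in the comment.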
3804 + * find_busiest_queue - find the busiest runqueue among the cpus in group.
3807 +find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3808 + unsigned long imbalance, const cpumask_t *cpus)
3810 + struct rq *busiest = NULL, *rq;
3811 + unsigned long max_load = 0;
3814 + for_each_cpu_mask_nr(i, group->cpumask) {
3817 + if (!cpu_isset(i, *cpus))
3821 + wl = weighted_cpuload(i);
3823 + if (rq->nr_running == 1 && wl > imbalance)
3826 + if (wl > max_load) {
3836 + * Max backoff if we encounter pinned tasks. A fairly arbitrary value, but
3837 + * any value works so long as it is large enough.
3839 +#define MAX_PINNED_INTERVAL 512
3842 + * Check this_cpu to ensure it is balanced within domain. Attempt to move
3843 + * tasks if there is an imbalance.
3845 +static int load_balance(int this_cpu, struct rq *this_rq,
3846 + struct sched_domain *sd, enum cpu_idle_type idle,
3847 + int *balance, cpumask_t *cpus)
3849 + int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
3850 + struct sched_group *group;
3851 + unsigned long imbalance;
3852 + struct rq *busiest;
3853 + unsigned long flags;
3855 + cpus_setall(*cpus);
3858 + * When power savings policy is enabled for the parent domain, idle
3859 + * sibling can pick up load irrespective of busy siblings. In this case,
3860 + * let the state of idle sibling percolate up as CPU_IDLE, instead of
3861 + * portraying it as CPU_NOT_IDLE.
3863 + if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
3864 + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3867 + schedstat_inc(sd, lb_count[idle]);
3870 + update_shares(sd);
3871 + group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3874 + if (*balance == 0)
3875 + goto out_balanced;
3878 + schedstat_inc(sd, lb_nobusyg[idle]);
3879 + goto out_balanced;
3882 + busiest = find_busiest_queue(group, idle, imbalance, cpus);
3884 + schedstat_inc(sd, lb_nobusyq[idle]);
3885 + goto out_balanced;
3888 + BUG_ON(busiest == this_rq);
3890 + schedstat_add(sd, lb_imbalance[idle], imbalance);
3893 + if (busiest->nr_running > 1) {
3895 + * Attempt to move tasks. If find_busiest_group has found
3896 + * an imbalance but busiest->nr_running <= 1, the group is
3897 + * still unbalanced. ld_moved simply stays zero, so it is
3898 + * correctly treated as an imbalance.
3900 + local_irq_save(flags);
3901 + double_rq_lock(this_rq, busiest);
3902 + ld_moved = move_tasks(this_rq, this_cpu, busiest,
3903 + imbalance, sd, idle, &all_pinned);
3904 + double_rq_unlock(this_rq, busiest);
3905 + local_irq_restore(flags);
3908 + * some other cpu did the load balance for us.
3910 + if (ld_moved && this_cpu != smp_processor_id())
3911 + resched_cpu(this_cpu);
3913 + /* All tasks on this runqueue were pinned by CPU affinity */
3914 + if (unlikely(all_pinned)) {
3915 + cpu_clear(cpu_of(busiest), *cpus);
3916 + if (!cpus_empty(*cpus))
3918 + goto out_balanced;
3923 + schedstat_inc(sd, lb_failed[idle]);
3924 + sd->nr_balance_failed++;
3926 + if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
3928 + spin_lock_irqsave(&busiest->lock, flags);
3930 + /* don't kick the migration_thread, if the curr
3931 + * task on busiest cpu can't be moved to this_cpu
3933 + if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
3934 + spin_unlock_irqrestore(&busiest->lock, flags);
3936 + goto out_one_pinned;
3939 + if (!busiest->active_balance) {
3940 + busiest->active_balance = 1;
3941 + busiest->push_cpu = this_cpu;
3942 + active_balance = 1;
3944 + spin_unlock_irqrestore(&busiest->lock, flags);
3945 + if (active_balance)
3946 + wake_up_process(busiest->migration_thread);
3949 + * We've kicked active balancing, reset the failure
3952 + sd->nr_balance_failed = sd->cache_nice_tries+1;
3955 + sd->nr_balance_failed = 0;
3957 + if (likely(!active_balance)) {
3958 + /* We were unbalanced, so reset the balancing interval */
3959 + sd->balance_interval = sd->min_interval;
3962 + * If we've begun active balancing, start to back off. This
3963 + * case may not be covered by the all_pinned logic if there
3964 + * is only 1 task on the busy runqueue (because we don't call
3967 + if (sd->balance_interval < sd->max_interval)
3968 + sd->balance_interval *= 2;
3971 + if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3972 + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3978 + schedstat_inc(sd, lb_balanced[idle]);
3980 + sd->nr_balance_failed = 0;
3983 + /* tune up the balancing interval */
3984 + if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
3985 + (sd->balance_interval < sd->max_interval))
3986 + sd->balance_interval *= 2;
3988 + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3989 + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3995 + update_shares(sd);
4000 + * Check this_cpu to ensure it is balanced within domain. Attempt to move
4001 + * tasks if there is an imbalance.
4003 + * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
4004 + * this_rq is locked.
4007 +load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
4010 + struct sched_group *group;
4011 + struct rq *busiest = NULL;
4012 + unsigned long imbalance;
4015 + int all_pinned = 0;
4017 + cpus_setall(*cpus);
4020 + * When power savings policy is enabled for the parent domain, idle
4021 + * sibling can pick up load irrespective of busy siblings. In this case,
4022 + * let the state of idle sibling percolate up as IDLE, instead of
4023 + * portraying it as CPU_NOT_IDLE.
4025 + if (sd->flags & SD_SHARE_CPUPOWER &&
4026 + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4029 + schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
4031 + update_shares_locked(this_rq, sd);
4032 + group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
4033 + &sd_idle, cpus, NULL);
4035 + schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
4036 + goto out_balanced;
4039 + busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
4041 + schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
4042 + goto out_balanced;
4045 + BUG_ON(busiest == this_rq);
4047 + schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
4050 + if (busiest->nr_running > 1) {
4051 + /* Attempt to move tasks */
4052 + double_lock_balance(this_rq, busiest);
4053 + /* this_rq->clock is already updated */
4054 + update_rq_clock(busiest);
4055 + ld_moved = move_tasks(this_rq, this_cpu, busiest,
4056 + imbalance, sd, CPU_NEWLY_IDLE,
4058 + double_unlock_balance(this_rq, busiest);
4060 + if (unlikely(all_pinned)) {
4061 + cpu_clear(cpu_of(busiest), *cpus);
4062 + if (!cpus_empty(*cpus))
4068 + schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
4069 + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4070 + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4073 + sd->nr_balance_failed = 0;
4075 + update_shares_locked(this_rq, sd);
4079 + schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
4080 + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4081 + !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4083 + sd->nr_balance_failed = 0;
4089 + * idle_balance is called by schedule() if this_cpu is about to become
4090 + * idle. Attempts to pull tasks from other CPUs.
4092 +static void idle_balance(int this_cpu, struct rq *this_rq)
4094 + struct sched_domain *sd;
4095 + int pulled_task = -1;
4096 + unsigned long next_balance = jiffies + HZ;
4097 + cpumask_t tmpmask;
4099 + for_each_domain(this_cpu, sd) {
4100 + unsigned long interval;
4102 + if (!(sd->flags & SD_LOAD_BALANCE))
4105 + if (sd->flags & SD_BALANCE_NEWIDLE)
4106 + /* If we've pulled tasks over stop searching: */
4107 + pulled_task = load_balance_newidle(this_cpu, this_rq,
4110 + interval = msecs_to_jiffies(sd->balance_interval);
4111 + if (time_after(next_balance, sd->last_balance + interval))
4112 + next_balance = sd->last_balance + interval;
4116 + if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4118 + * We are going idle. next_balance may be set based on
4119 + * a busy processor. So reset next_balance.
4121 + this_rq->next_balance = next_balance;
4126 + * active_load_balance is run by migration threads. It pushes running tasks
4127 + * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
4128 + * running on each physical CPU where possible, and avoids physical /
4129 + * logical imbalances.
4131 + * Called with busiest_rq locked.
4133 +static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4135 + int target_cpu = busiest_rq->push_cpu;
4136 + struct sched_domain *sd;
4137 + struct rq *target_rq;
4139 + /* Is there any task to move? */
4140 + if (busiest_rq->nr_running <= 1)
4143 + target_rq = cpu_rq(target_cpu);
4146 + * This condition is "impossible", if it occurs
4147 + * we need to fix it. Originally reported by
4148 + * Bjorn Helgaas on a 128-cpu setup.
4150 + BUG_ON(busiest_rq == target_rq);
4152 + /* move a task from busiest_rq to target_rq */
4153 + double_lock_balance(busiest_rq, target_rq);
4154 + update_rq_clock(busiest_rq);
4155 + update_rq_clock(target_rq);
4157 + /* Search for an sd spanning us and the target CPU. */
4158 + for_each_domain(target_cpu, sd) {
4159 + if ((sd->flags & SD_LOAD_BALANCE) &&
4160 + cpu_isset(busiest_cpu, sd->span))
4165 + schedstat_inc(sd, alb_count);
4167 + if (move_one_task(target_rq, target_cpu, busiest_rq,
4169 + schedstat_inc(sd, alb_pushed);
4171 + schedstat_inc(sd, alb_failed);
4173 + double_unlock_balance(busiest_rq, target_rq);
4176 +#ifdef CONFIG_NO_HZ
4178 + atomic_t load_balancer;
4179 + cpumask_t cpu_mask;
4180 +} nohz ____cacheline_aligned = {
4181 + .load_balancer = ATOMIC_INIT(-1),
4182 + .cpu_mask = CPU_MASK_NONE,
4186 + * This routine will try to nominate the ilb (idle load balancing)
4187 + * owner among the cpus whose ticks are stopped. ilb owner will do the idle
4188 + * load balancing on behalf of all those cpus. If all the cpus in the system
4189 + * go into this tickless mode, then there will be no ilb owner (as there is
4190 + * no need for one) and all the cpus will sleep till the next wakeup event
4193 + * For the ilb owner, the tick is not stopped, and this tick will be used
4194 + * for idle load balancing. The ilb owner will still be part of
4197 + * While stopping the tick, this cpu will become the ilb owner if there
4198 + * is no other owner, and will remain the owner till it becomes busy
4199 + * or till all cpus in the system stop their ticks, at which point
4200 + * there is no need for an ilb owner.
4202 + * When the ilb owner becomes busy, it nominates another owner, during the
4203 + * next busy scheduler_tick()
4205 +int select_nohz_load_balancer(int stop_tick)
4207 + int cpu = smp_processor_id();
4210 + cpu_set(cpu, nohz.cpu_mask);
4211 + cpu_rq(cpu)->in_nohz_recently = 1;
4214 + * If we are going offline and still the leader, give up!
4216 + if (!cpu_active(cpu) &&
4217 + atomic_read(&nohz.load_balancer) == cpu) {
4218 + if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4223 + /* time for ilb owner also to sleep */
4224 + if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
4225 + if (atomic_read(&nohz.load_balancer) == cpu)
4226 + atomic_set(&nohz.load_balancer, -1);
4230 + if (atomic_read(&nohz.load_balancer) == -1) {
4231 + /* make me the ilb owner */
4232 + if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4234 + } else if (atomic_read(&nohz.load_balancer) == cpu)
4237 + if (!cpu_isset(cpu, nohz.cpu_mask))
4240 + cpu_clear(cpu, nohz.cpu_mask);
4242 + if (atomic_read(&nohz.load_balancer) == cpu)
4243 + if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4250 +static DEFINE_SPINLOCK(balancing);
4253 + * It checks each scheduling domain to see if it is due to be balanced,
4254 + * and initiates a balancing operation if so.
4256 + * Balancing parameters are set up in arch_init_sched_domains.
4258 +static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4261 + struct rq *rq = cpu_rq(cpu);
4262 + unsigned long interval;
4263 + struct sched_domain *sd;
4264 + /* Earliest time when we have to do rebalance again */
4265 + unsigned long next_balance = jiffies + 60*HZ;
4266 + int update_next_balance = 0;
4267 + int need_serialize;
4270 + for_each_domain(cpu, sd) {
4271 + if (!(sd->flags & SD_LOAD_BALANCE))
4274 + interval = sd->balance_interval;
4275 + if (idle != CPU_IDLE)
4276 + interval *= sd->busy_factor;
4278 + /* scale ms to jiffies */
4279 + interval = msecs_to_jiffies(interval);
4280 + if (unlikely(!interval))
4282 + if (interval > HZ*NR_CPUS/10)
4283 + interval = HZ*NR_CPUS/10;
4285 + need_serialize = sd->flags & SD_SERIALIZE;
4287 + if (need_serialize) {
4288 + if (!spin_trylock(&balancing))
4292 + if (time_after_eq(jiffies, sd->last_balance + interval)) {
4293 + if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) {
4295 + * We've pulled tasks over so either we're no
4296 + * longer idle, or one of our SMT siblings is
4299 + idle = CPU_NOT_IDLE;
4301 + sd->last_balance = jiffies;
4303 + if (need_serialize)
4304 + spin_unlock(&balancing);
4306 + if (time_after(next_balance, sd->last_balance + interval)) {
4307 + next_balance = sd->last_balance + interval;
4308 + update_next_balance = 1;
4312 + * Stop the load balance at this level. There is another
4313 + * CPU in our sched group which is doing load balancing more
4321 + * next_balance will be updated only when there is a need.
4322 + * When the cpu is attached to null domain for ex, it will not be
4325 + if (likely(update_next_balance))
4326 + rq->next_balance = next_balance;
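The interval handling above can be made concrete (illustrative values): for a domain with balance_interval = 64 (ms) and busy_factor = 32, a CPU that was busy at the tick rebalances that domain at most every 64 * 32 = 2048 ms, while an idle CPU keeps the 64 ms period; msecs_to_jiffies() converts the result to jiffies and the HZ*NR_CPUS/10 clamp bounds the period on very large machines. rq->next_balance then records the earliest of the per-domain deadlines, which is what trigger_load_balance() later compares against jiffies.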
4330 + * run_rebalance_domains is triggered when needed from the scheduler tick.
4331 + * In CONFIG_NO_HZ case, the idle load balance owner will do the
4332 + * rebalancing for all the cpus for whom scheduler ticks are stopped.
4334 +static void run_rebalance_domains(struct softirq_action *h)
4336 + int this_cpu = smp_processor_id();
4337 + struct rq *this_rq = cpu_rq(this_cpu);
4338 + enum cpu_idle_type idle = this_rq->idle_at_tick ?
4339 + CPU_IDLE : CPU_NOT_IDLE;
4341 + rebalance_domains(this_cpu, idle);
4343 +#ifdef CONFIG_NO_HZ
4345 + * If this cpu is the owner for idle load balancing, then do the
4346 + * balancing on behalf of the other idle cpus whose ticks are
4349 + if (this_rq->idle_at_tick &&
4350 + atomic_read(&nohz.load_balancer) == this_cpu) {
4351 + cpumask_t cpus = nohz.cpu_mask;
4355 + cpu_clear(this_cpu, cpus);
4356 + for_each_cpu_mask_nr(balance_cpu, cpus) {
4358 + * If this cpu gets work to do, stop the load balancing
4359 + * work being done for other cpus. Next load
4360 + * balancing owner will pick it up.
4362 + if (need_resched())
4365 + rebalance_domains(balance_cpu, CPU_IDLE);
4367 + rq = cpu_rq(balance_cpu);
4368 + if (time_after(this_rq->next_balance, rq->next_balance))
4369 + this_rq->next_balance = rq->next_balance;
4376 + * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4378 + * In case of CONFIG_NO_HZ, this is the place where we nominate a new
4379 + * idle load balancing owner or decide to stop the periodic load balancing,
4380 + * if the whole system is idle.
4382 +static inline void trigger_load_balance(struct rq *rq, int cpu)
4384 +#ifdef CONFIG_NO_HZ
4386 + * If we were in the nohz mode recently and busy at the current
4387 + * scheduler tick, then check if we need to nominate new idle
4390 + if (rq->in_nohz_recently && !rq->idle_at_tick) {
4391 + rq->in_nohz_recently = 0;
4393 + if (atomic_read(&nohz.load_balancer) == cpu) {
4394 + cpu_clear(cpu, nohz.cpu_mask);
4395 + atomic_set(&nohz.load_balancer, -1);
4398 + if (atomic_read(&nohz.load_balancer) == -1) {
4400 + * simple selection for now: Nominate the
4401 + * first cpu in the nohz list to be the next
4404 + * TBD: Traverse the sched domains and nominate
4405 + * the nearest cpu in the nohz.cpu_mask.
4407 + int ilb = first_cpu(nohz.cpu_mask);
4409 + if (ilb < nr_cpu_ids)
4415 + * If this cpu is idle and doing idle load balancing for all the
4416 + * cpus with ticks stopped, is it time for that to stop?
4418 + if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4419 + cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
4425 + * If this cpu is idle and the idle load balancing is done by
4426 + * someone else, then there is no need to raise the SCHED_SOFTIRQ
4428 + if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4429 + cpu_isset(cpu, nohz.cpu_mask))
4432 + if (time_after_eq(jiffies, rq->next_balance))
4433 + raise_softirq(SCHED_SOFTIRQ);
4436 +#else /* CONFIG_SMP */
4439 + * on UP we do not need to balance between CPUs:
4441 +static inline void idle_balance(int cpu, struct rq *rq)
4447 +DEFINE_PER_CPU(struct kernel_stat, kstat);
4449 +EXPORT_PER_CPU_SYMBOL(kstat);
4452 + * Return p->sum_exec_runtime plus any more ns on the sched_clock
4453 + * that have not yet been banked in case the task is currently running.
4455 +unsigned long long task_sched_runtime(struct task_struct *p)
4457 + unsigned long flags;
4458 + u64 ns, delta_exec;
4461 + rq = task_rq_lock(p, &flags);
4462 + ns = p->se.sum_exec_runtime;
4463 + if (task_current(rq, p)) {
4464 + update_rq_clock(rq);
4465 + delta_exec = rq->clock - p->se.exec_start;
4466 + if ((s64)delta_exec > 0)
4469 + task_rq_unlock(rq, &flags);
4475 + * Account user cpu time to a process.
4476 + * @p: the process that the cpu time gets accounted to
4477 + * @cputime: the cpu time spent in user space since the last update
4479 +void account_user_time(struct task_struct *p, cputime_t cputime)
4481 + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4482 + struct vx_info *vxi = p->vx_info; /* p is _always_ current */
4484 + int nice = (TASK_NICE(p) > 0);
4486 + p->utime = cputime_add(p->utime, cputime);
4487 + vx_account_user(vxi, cputime, nice);
4489 + /* Add user time to cpustat. */
4490 + tmp = cputime_to_cputime64(cputime);
4492 + cpustat->nice = cputime64_add(cpustat->nice, tmp);
4494 + cpustat->user = cputime64_add(cpustat->user, tmp);
4495 + /* Account for user time used */
4496 + acct_update_integrals(p);
4500 + * Account guest cpu time to a process.
4501 + * @p: the process that the cpu time gets accounted to
4502 + * @cputime: the cpu time spent in virtual machine since the last update
4504 +static void account_guest_time(struct task_struct *p, cputime_t cputime)
4507 + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4509 + tmp = cputime_to_cputime64(cputime);
4511 + p->utime = cputime_add(p->utime, cputime);
4512 + p->gtime = cputime_add(p->gtime, cputime);
4514 + cpustat->user = cputime64_add(cpustat->user, tmp);
4515 + cpustat->guest = cputime64_add(cpustat->guest, tmp);
4519 + * Account scaled user cpu time to a process.
4520 + * @p: the process that the cpu time gets accounted to
4521 + * @cputime: the cpu time spent in user space since the last update
4523 +void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
4525 + p->utimescaled = cputime_add(p->utimescaled, cputime);
4529 + * Account system cpu time to a process.
4530 + * @p: the process that the cpu time gets accounted to
4531 + * @hardirq_offset: the offset to subtract from hardirq_count()
4532 + * @cputime: the cpu time spent in kernel space since the last update
4534 +void account_system_time(struct task_struct *p, int hardirq_offset,
4535 + cputime_t cputime)
4537 + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4538 + struct vx_info *vxi = p->vx_info; /* p is _always_ current */
4539 + struct rq *rq = this_rq();
4542 + if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
4543 + account_guest_time(p, cputime);
4547 + p->stime = cputime_add(p->stime, cputime);
4548 + vx_account_system(vxi, cputime, (p == rq->idle));
4550 + /* Add system time to cpustat. */
4551 + tmp = cputime_to_cputime64(cputime);
4552 + if (hardirq_count() - hardirq_offset)
4553 + cpustat->irq = cputime64_add(cpustat->irq, tmp);
4554 + else if (softirq_count())
4555 + cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
4556 + else if (p != rq->idle)
4557 + cpustat->system = cputime64_add(cpustat->system, tmp);
4558 + else if (atomic_read(&rq->nr_iowait) > 0)
4559 + cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
4561 + cpustat->idle = cputime64_add(cpustat->idle, tmp);
4562 + /* Account for system time used */
4563 + acct_update_integrals(p);
4567 + * Account scaled system cpu time to a process.
4568 + * @p: the process that the cpu time gets accounted to
4569 + * @hardirq_offset: the offset to subtract from hardirq_count()
4570 + * @cputime: the cpu time spent in kernel space since the last update
4572 +void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
4574 + p->stimescaled = cputime_add(p->stimescaled, cputime);
4578 + * Account for involuntary wait time.
4579 + * @p: the process from which the cpu time has been stolen
4580 + * @steal: the cpu time spent in involuntary wait
4582 +void account_steal_time(struct task_struct *p, cputime_t steal)
4584 + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4585 + cputime64_t tmp = cputime_to_cputime64(steal);
4586 + struct rq *rq = this_rq();
4588 + if (p == rq->idle) {
4589 + p->stime = cputime_add(p->stime, steal);
4590 + if (atomic_read(&rq->nr_iowait) > 0)
4591 + cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
4593 + cpustat->idle = cputime64_add(cpustat->idle, tmp);
4595 + cpustat->steal = cputime64_add(cpustat->steal, tmp);
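The per-CPU counters updated above (user, nice, system, idle, iowait, irq, softirq, steal) are what /proc/stat reports, in USER_HZ ticks. A minimal user-space sketch that reads the aggregate line; the field order is the standard /proc/stat layout and is assumed to match this kernel.

#include <stdio.h>

int main(void)
{
	unsigned long long user, nice, sys, idle, iowait, irq, softirq, steal;
	FILE *f = fopen("/proc/stat", "r");

	if (!f)
		return 1;
	/* first line: aggregate "cpu" counters, in USER_HZ ticks */
	if (fscanf(f, "cpu %llu %llu %llu %llu %llu %llu %llu %llu",
		   &user, &nice, &sys, &idle, &iowait, &irq, &softirq, &steal) == 8)
		printf("user=%llu system=%llu idle=%llu iowait=%llu steal=%llu\n",
		       user, sys, idle, iowait, steal);
	fclose(f);
	return 0;
}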
4599 + * Use precise platform statistics if available:
4601 +#ifdef CONFIG_VIRT_CPU_ACCOUNTING
4602 +cputime_t task_utime(struct task_struct *p)
4607 +cputime_t task_stime(struct task_struct *p)
4612 +cputime_t task_utime(struct task_struct *p)
4614 + clock_t utime = cputime_to_clock_t(p->utime),
4615 + total = utime + cputime_to_clock_t(p->stime);
4619 + * Use CFS's precise accounting:
4621 + temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
4625 + do_div(temp, total);
4627 + utime = (clock_t)temp;
4629 + p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
4630 + return p->prev_utime;
4633 +cputime_t task_stime(struct task_struct *p)
4638 + * Use CFS's precise accounting. (we subtract utime from
4639 + * the total, to make sure the total observed by userspace
4640 + * grows monotonically - apps rely on that):
4642 + stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
4643 + cputime_to_clock_t(task_utime(p));
4646 + p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
4648 + return p->prev_stime;
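The split above scales sum_exec_runtime by the sampled utime/stime ratio, so both fields follow CFS's precise clock while the totals stay monotonic. A standalone sketch of that arithmetic; the tick values are made up for illustration.

#include <stdio.h>

/* mirror of the proportional split: scaled_utime = runtime * utime / total */
static unsigned long long split_utime(unsigned long long runtime,
				      unsigned long long utime,
				      unsigned long long stime)
{
	unsigned long long total = utime + stime;

	if (!total)
		return runtime;	/* no samples yet: count it all as user time */
	return runtime * utime / total;
}

int main(void)
{
	/* e.g. sampled ticks: 300 user, 100 system, 500 ticks of precise runtime */
	unsigned long long u = split_utime(500, 300, 100);

	printf("utime=%llu stime=%llu\n", u, 500 - u);	/* 375 / 125 */
	return 0;
}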
4652 +inline cputime_t task_gtime(struct task_struct *p)
4658 + * This function gets called by the timer code, with HZ frequency.
4659 + * We call it with interrupts disabled.
4661 + * It also gets called by the fork code, when changing the parent's
4664 +void scheduler_tick(void)
4666 + int cpu = smp_processor_id();
4667 + struct rq *rq = cpu_rq(cpu);
4668 + struct task_struct *curr = rq->curr;
4670 + sched_clock_tick();
4672 + spin_lock(&rq->lock);
4673 + update_rq_clock(rq);
4674 + update_cpu_load(rq);
4675 + curr->sched_class->task_tick(rq, curr, 0);
4676 + spin_unlock(&rq->lock);
4679 + rq->idle_at_tick = idle_cpu(cpu);
4680 + trigger_load_balance(rq, cpu);
4684 +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4685 + defined(CONFIG_PREEMPT_TRACER))
4687 +static inline unsigned long get_parent_ip(unsigned long addr)
4689 + if (in_lock_functions(addr)) {
4690 + addr = CALLER_ADDR2;
4691 + if (in_lock_functions(addr))
4692 + addr = CALLER_ADDR3;
4697 +void __kprobes add_preempt_count(int val)
4699 +#ifdef CONFIG_DEBUG_PREEMPT
4703 + if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
4706 + preempt_count() += val;
4707 +#ifdef CONFIG_DEBUG_PREEMPT
4709 + * Spinlock count overflowing soon?
4711 + DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
4712 + PREEMPT_MASK - 10);
4714 + if (preempt_count() == val)
4715 + trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4717 +EXPORT_SYMBOL(add_preempt_count);
4719 +void __kprobes sub_preempt_count(int val)
4721 +#ifdef CONFIG_DEBUG_PREEMPT
4725 + if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
4728 + * Is the spinlock portion underflowing?
4730 + if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4731 + !(preempt_count() & PREEMPT_MASK)))
4735 + if (preempt_count() == val)
4736 + trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4737 + preempt_count() -= val;
4739 +EXPORT_SYMBOL(sub_preempt_count);
4744 + * Print scheduling while atomic bug:
4746 +static noinline void __schedule_bug(struct task_struct *prev)
4748 + struct pt_regs *regs = get_irq_regs();
4750 + printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
4751 + prev->comm, prev->pid, preempt_count());
4753 + debug_show_held_locks(prev);
4755 + if (irqs_disabled())
4756 + print_irqtrace_events(prev);
4765 + * Various schedule()-time debugging checks and statistics:
4767 +static inline void schedule_debug(struct task_struct *prev)
4770 + * Test if we are atomic. Since do_exit() needs to call into
4771 + * schedule() atomically, we ignore that path for now.
4772 + * Otherwise, whine if we are scheduling when we should not be.
4774 + if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
4775 + __schedule_bug(prev);
4777 + profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4779 + schedstat_inc(this_rq(), sched_count);
4780 +#ifdef CONFIG_SCHEDSTATS
4781 + if (unlikely(prev->lock_depth >= 0)) {
4782 + schedstat_inc(this_rq(), bkl_count);
4783 + schedstat_inc(prev, sched_info.bkl_count);
4789 + * Pick up the highest-prio task:
4791 +static inline struct task_struct *
4792 +pick_next_task(struct rq *rq, struct task_struct *prev)
4794 + const struct sched_class *class;
4795 + struct task_struct *p;
4798 + * Optimization: we know that if all tasks are in
4799 + * the fair class we can call that function directly:
4801 + if (likely(rq->nr_running == rq->cfs.nr_running)) {
4802 + p = fair_sched_class.pick_next_task(rq);
4807 + class = sched_class_highest;
4809 + p = class->pick_next_task(rq);
4813 + * Will never be NULL as the idle class always
4814 + * returns a non-NULL p:
4816 + class = class->next;
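The comment above describes walking the scheduler classes in priority order and taking the first class that returns a task; the idle class terminates the walk because it never returns NULL. A small self-contained sketch of that selection pattern; the types and names here are illustrative, not the kernel's.

#include <stdio.h>
#include <stddef.h>

struct task { int id; };

struct sched_class_demo {
	const char *name;
	struct task *(*pick_next)(void);
	const struct sched_class_demo *next;	/* next lower-priority class */
};

static struct task the_idle_task = { 0 };

static struct task *pick_rt(void)   { return NULL; }		/* nothing runnable */
static struct task *pick_fair(void) { return NULL; }
static struct task *pick_idle(void) { return &the_idle_task; }	/* never NULL */

static const struct sched_class_demo idle_class = { "idle", pick_idle, NULL };
static const struct sched_class_demo fair_class = { "fair", pick_fair, &idle_class };
static const struct sched_class_demo rt_class   = { "rt",   pick_rt,   &fair_class };

int main(void)
{
	const struct sched_class_demo *class;

	for (class = &rt_class; class; class = class->next) {
		if (class->pick_next()) {
			printf("picked from %s class\n", class->name);
			break;
		}
	}
	return 0;
}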
4821 + * schedule() is the main scheduler function.
4823 +asmlinkage void __sched schedule(void)
4825 + struct task_struct *prev, *next;
4826 + unsigned long *switch_count;
4831 + preempt_disable();
4832 + cpu = smp_processor_id();
4834 + rcu_qsctr_inc(cpu);
4836 + switch_count = &prev->nivcsw;
4838 + release_kernel_lock(prev);
4839 +need_resched_nonpreemptible:
4841 + schedule_debug(prev);
4843 + if (sched_feat(HRTICK))
4847 + * Do the rq-clock update outside the rq lock:
4849 + local_irq_disable();
4850 + update_rq_clock(rq);
4851 + spin_lock(&rq->lock);
4852 + clear_tsk_need_resched(prev);
4854 + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
4855 + if (unlikely(signal_pending_state(prev->state, prev)))
4856 + prev->state = TASK_RUNNING;
4858 + deactivate_task(rq, prev, 1);
4859 + switch_count = &prev->nvcsw;
4863 + if (prev->sched_class->pre_schedule)
4864 + prev->sched_class->pre_schedule(rq, prev);
4867 + if (unlikely(!rq->nr_running))
4868 + idle_balance(cpu, rq);
4870 + prev->sched_class->put_prev_task(rq, prev);
4871 + next = pick_next_task(rq, prev);
4873 + if (likely(prev != next)) {
4874 + sched_info_switch(prev, next);
4876 + rq->nr_switches++;
4880 + context_switch(rq, prev, next); /* unlocks the rq */
4882 + * the context switch might have flipped the stack from under
4883 + * us, hence refresh the local variables.
4885 + cpu = smp_processor_id();
4888 + spin_unlock_irq(&rq->lock);
4890 + if (unlikely(reacquire_kernel_lock(current) < 0))
4891 + goto need_resched_nonpreemptible;
4893 + preempt_enable_no_resched();
4894 + if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
4895 + goto need_resched;
4897 +EXPORT_SYMBOL(schedule);
4899 +#ifdef CONFIG_PREEMPT
4901 + * this is the entry point to schedule() from in-kernel preemption
4902 + * off of preempt_enable. Kernel preemption off of return-from-interrupt
4903 + * is handled by preempt_schedule_irq() below, which calls schedule() directly.
4905 +asmlinkage void __sched preempt_schedule(void)
4907 + struct thread_info *ti = current_thread_info();
4910 + * If there is a non-zero preempt_count or interrupts are disabled,
4911 + * we do not want to preempt the current task. Just return..
4913 + if (likely(ti->preempt_count || irqs_disabled()))
4917 + add_preempt_count(PREEMPT_ACTIVE);
4919 + sub_preempt_count(PREEMPT_ACTIVE);
4922 + * Check again in case we missed a preemption opportunity
4923 + * between schedule and now.
4926 + } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
4928 +EXPORT_SYMBOL(preempt_schedule);
4931 + * this is the entry point to schedule() from kernel preemption
4932 + * off of irq context.
4933 + * Note that this is called and returns with irqs disabled. This will
4934 + * protect us against recursive calling from irq.
4936 +asmlinkage void __sched preempt_schedule_irq(void)
4938 + struct thread_info *ti = current_thread_info();
4940 + /* Catch callers which need to be fixed */
4941 + BUG_ON(ti->preempt_count || !irqs_disabled());
4944 + add_preempt_count(PREEMPT_ACTIVE);
4945 + local_irq_enable();
4947 + local_irq_disable();
4948 + sub_preempt_count(PREEMPT_ACTIVE);
4951 + * Check again in case we missed a preemption opportunity
4952 + * between schedule and now.
4955 + } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
4958 +#endif /* CONFIG_PREEMPT */
4960 +int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
4963 + return try_to_wake_up(curr->private, mode, sync);
4965 +EXPORT_SYMBOL(default_wake_function);
4968 + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
4969 + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
4970 + * number) then we wake all the non-exclusive tasks and one exclusive task.
4972 + * There are circumstances in which we can try to wake a task which has already
4973 + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
4974 + * zero in this (rare) case, and we handle it by continuing to scan the queue.
4976 +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
4977 + int nr_exclusive, int sync, void *key)
4979 + wait_queue_t *curr, *next;
4981 + list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
4982 + unsigned flags = curr->flags;
4984 + if (curr->func(curr, mode, sync, key) &&
4985 + (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
4991 + * __wake_up - wake up threads blocked on a waitqueue.
4992 + * @q: the waitqueue
4993 + * @mode: which threads
4994 + * @nr_exclusive: how many wake-one or wake-many threads to wake up
4995 + * @key: is directly passed to the wakeup function
4997 +void __wake_up(wait_queue_head_t *q, unsigned int mode,
4998 + int nr_exclusive, void *key)
5000 + unsigned long flags;
5002 + spin_lock_irqsave(&q->lock, flags);
5003 + __wake_up_common(q, mode, nr_exclusive, 0, key);
5004 + spin_unlock_irqrestore(&q->lock, flags);
5006 +EXPORT_SYMBOL(__wake_up);
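A hedged kernel-style sketch of the exclusive/non-exclusive distinction documented above: exclusive waiters queue with WQ_FLAG_EXCLUSIVE via prepare_to_wait_exclusive(), so wake_up() stops after one of them while wake_up_all() wakes every waiter. This is only the wait/wake pairing, not a complete driver; a real waiter would loop and handle signals.

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static int demo_ready;

/* consumer: queues itself as an exclusive waiter */
static void demo_wait(void)
{
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&demo_wq, &wait, TASK_INTERRUPTIBLE);
	if (!demo_ready)
		schedule();
	finish_wait(&demo_wq, &wait);
}

/* producer: wake_up() stops after one exclusive waiter ... */
static void demo_post_one(void) { demo_ready = 1; wake_up(&demo_wq); }

/* ... while wake_up_all() wakes every waiter regardless of WQ_FLAG_EXCLUSIVE */
static void demo_post_all(void) { demo_ready = 1; wake_up_all(&demo_wq); }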
5009 + * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
5011 +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
5013 + __wake_up_common(q, mode, 1, 0, NULL);
5017 + * __wake_up_sync - wake up threads blocked on a waitqueue.
5018 + * @q: the waitqueue
5019 + * @mode: which threads
5020 + * @nr_exclusive: how many wake-one or wake-many threads to wake up
5022 + * The sync wakeup differs in that the waker knows that it will schedule
5023 + * away soon, so while the target thread will be woken up, it will not
5024 + * be migrated to another CPU - i.e. the two threads are 'synchronized'
5025 + * with each other. This can prevent needless bouncing between CPUs.
5027 + * On UP it can prevent extra preemption.
5030 +__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
5032 + unsigned long flags;
5038 + if (unlikely(!nr_exclusive))
5041 + spin_lock_irqsave(&q->lock, flags);
5042 + __wake_up_common(q, mode, nr_exclusive, sync, NULL);
5043 + spin_unlock_irqrestore(&q->lock, flags);
5045 +EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
5047 +void complete(struct completion *x)
5049 + unsigned long flags;
5051 + spin_lock_irqsave(&x->wait.lock, flags);
5053 + __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
5054 + spin_unlock_irqrestore(&x->wait.lock, flags);
5056 +EXPORT_SYMBOL(complete);
5058 +void complete_all(struct completion *x)
5060 + unsigned long flags;
5062 + spin_lock_irqsave(&x->wait.lock, flags);
5063 + x->done += UINT_MAX/2;
5064 + __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
5065 + spin_unlock_irqrestore(&x->wait.lock, flags);
5067 +EXPORT_SYMBOL(complete_all);
5069 +static inline long __sched
5070 +do_wait_for_common(struct completion *x, long timeout, int state)
5073 + DECLARE_WAITQUEUE(wait, current);
5075 + wait.flags |= WQ_FLAG_EXCLUSIVE;
5076 + __add_wait_queue_tail(&x->wait, &wait);
5078 + if ((state == TASK_INTERRUPTIBLE &&
5079 + signal_pending(current)) ||
5080 + (state == TASK_KILLABLE &&
5081 + fatal_signal_pending(current))) {
5082 + timeout = -ERESTARTSYS;
5085 + __set_current_state(state);
5086 + spin_unlock_irq(&x->wait.lock);
5087 + timeout = schedule_timeout(timeout);
5088 + spin_lock_irq(&x->wait.lock);
5089 + } while (!x->done && timeout);
5090 + __remove_wait_queue(&x->wait, &wait);
5095 + return timeout ?: 1;
5098 +static long __sched
5099 +wait_for_common(struct completion *x, long timeout, int state)
5103 + spin_lock_irq(&x->wait.lock);
5104 + timeout = do_wait_for_common(x, timeout, state);
5105 + spin_unlock_irq(&x->wait.lock);
5109 +void __sched wait_for_completion(struct completion *x)
5111 + wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
5113 +EXPORT_SYMBOL(wait_for_completion);
5115 +unsigned long __sched
5116 +wait_for_completion_timeout(struct completion *x, unsigned long timeout)
5118 + return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
5120 +EXPORT_SYMBOL(wait_for_completion_timeout);
5122 +int __sched wait_for_completion_interruptible(struct completion *x)
5124 + long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
5125 + if (t == -ERESTARTSYS)
5129 +EXPORT_SYMBOL(wait_for_completion_interruptible);
5131 +unsigned long __sched
5132 +wait_for_completion_interruptible_timeout(struct completion *x,
5133 + unsigned long timeout)
5135 + return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
5137 +EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
5139 +int __sched wait_for_completion_killable(struct completion *x)
5141 + long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
5142 + if (t == -ERESTARTSYS)
5146 +EXPORT_SYMBOL(wait_for_completion_killable);
5149 + * try_wait_for_completion - try to decrement a completion without blocking
5150 + * @x: completion structure
5152 + * Returns: 0 if a decrement cannot be done without blocking
5153 + * 1 if a decrement succeeded.
5155 + * If a completion is being used as a counting completion,
5156 + * attempt to decrement the counter without blocking. This
5157 + * enables us to avoid waiting if the resource the completion
5158 + * is protecting is not available.
5160 +bool try_wait_for_completion(struct completion *x)
5164 + spin_lock_irq(&x->wait.lock);
5169 + spin_unlock_irq(&x->wait.lock);
5172 +EXPORT_SYMBOL(try_wait_for_completion);
5175 + * completion_done - Test to see if a completion has any waiters
5176 + * @x: completion structure
5178 + * Returns: 0 if there are waiters (wait_for_completion() in progress)
5179 + * 1 if there are no waiters.
5182 +bool completion_done(struct completion *x)
5186 + spin_lock_irq(&x->wait.lock);
5189 + spin_unlock_irq(&x->wait.lock);
5192 +EXPORT_SYMBOL(completion_done);
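A short kernel-style sketch of the counting usage that try_wait_for_completion() is meant for: each complete() banks one unit, and the non-blocking path takes a unit only if one is available. The names here are illustrative.

#include <linux/completion.h>

static DECLARE_COMPLETION(demo_done);

/* producer side: each call banks one "unit" */
static void demo_produce(void)
{
	complete(&demo_done);
}

/* consumer side: take a unit if one is banked, otherwise fall back */
static int demo_try_consume(void)
{
	if (try_wait_for_completion(&demo_done))
		return 1;	/* got a unit without sleeping */
	return 0;		/* would have had to block */
}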
5194 +static long __sched
5195 +sleep_on_common(wait_queue_head_t *q, int state, long timeout)
5197 + unsigned long flags;
5198 + wait_queue_t wait;
5200 + init_waitqueue_entry(&wait, current);
5202 + __set_current_state(state);
5204 + spin_lock_irqsave(&q->lock, flags);
5205 + __add_wait_queue(q, &wait);
5206 + spin_unlock(&q->lock);
5207 + timeout = schedule_timeout(timeout);
5208 + spin_lock_irq(&q->lock);
5209 + __remove_wait_queue(q, &wait);
5210 + spin_unlock_irqrestore(&q->lock, flags);
5215 +void __sched interruptible_sleep_on(wait_queue_head_t *q)
5217 + sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
5219 +EXPORT_SYMBOL(interruptible_sleep_on);
5222 +interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
5224 + return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
5226 +EXPORT_SYMBOL(interruptible_sleep_on_timeout);
5228 +void __sched sleep_on(wait_queue_head_t *q)
5230 + sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
5232 +EXPORT_SYMBOL(sleep_on);
5234 +long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
5236 + return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
5238 +EXPORT_SYMBOL(sleep_on_timeout);
5240 +#ifdef CONFIG_RT_MUTEXES
5243 + * rt_mutex_setprio - set the current priority of a task
5245 + * @prio: prio value (kernel-internal form)
5247 + * This function changes the 'effective' priority of a task. It does
5248 + * not touch ->normal_prio like __setscheduler().
5250 + * Used by the rt_mutex code to implement priority inheritance logic.
5252 +void rt_mutex_setprio(struct task_struct *p, int prio)
5254 + unsigned long flags;
5255 + int oldprio, on_rq, running;
5257 + const struct sched_class *prev_class = p->sched_class;
5259 + BUG_ON(prio < 0 || prio > MAX_PRIO);
5261 + rq = task_rq_lock(p, &flags);
5262 + update_rq_clock(rq);
5264 + oldprio = p->prio;
5265 + on_rq = p->se.on_rq;
5266 + running = task_current(rq, p);
5268 + dequeue_task(rq, p, 0);
5270 + p->sched_class->put_prev_task(rq, p);
5272 + if (rt_prio(prio))
5273 + p->sched_class = &rt_sched_class;
5275 + p->sched_class = &fair_sched_class;
5280 + p->sched_class->set_curr_task(rq);
5282 + enqueue_task(rq, p, 0);
5284 + check_class_changed(rq, p, prev_class, oldprio, running);
5286 + task_rq_unlock(rq, &flags);
5291 +void set_user_nice(struct task_struct *p, long nice)
5293 + int old_prio, delta, on_rq;
5294 + unsigned long flags;
5297 + if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
5300 + * We have to be careful, if called from sys_setpriority(),
5301 + * the task might be in the middle of scheduling on another CPU.
5303 + rq = task_rq_lock(p, &flags);
5304 + update_rq_clock(rq);
5306 + * The RT priorities are set via sched_setscheduler(), but we still
5307 + * allow the 'normal' nice value to be set - but as expected
5308 + * it won't have any effect on scheduling until the task is
5309 + * SCHED_FIFO/SCHED_RR:
5311 + if (task_has_rt_policy(p)) {
5312 + p->static_prio = NICE_TO_PRIO(nice);
5315 + on_rq = p->se.on_rq;
5317 + dequeue_task(rq, p, 0);
5319 + p->static_prio = NICE_TO_PRIO(nice);
5320 + set_load_weight(p);
5321 + old_prio = p->prio;
5322 + p->prio = effective_prio(p);
5323 + delta = p->prio - old_prio;
5326 + enqueue_task(rq, p, 0);
5328 + * If the task increased its priority or is running and
5329 + * lowered its priority, then reschedule its CPU:
5331 + if (delta < 0 || (delta > 0 && task_running(rq, p)))
5332 + resched_task(rq->curr);
5335 + task_rq_unlock(rq, &flags);
5337 +EXPORT_SYMBOL(set_user_nice);
5340 + * can_nice - check if a task can reduce its nice value
5342 + * @nice: nice value
5344 +int can_nice(const struct task_struct *p, const int nice)
5346 + /* convert nice value [19,-20] to rlimit style value [1,40] */
5347 + int nice_rlim = 20 - nice;
5349 + return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
5350 + capable(CAP_SYS_NICE));
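A tiny user-space illustration of the conversion can_nice() performs between nice values [19,-20] and the RLIMIT_NICE style range [1,40]:

#include <stdio.h>

int main(void)
{
	int nice;

	for (nice = -20; nice <= 19; nice += 39)
		printf("nice %3d -> rlimit style %2d\n", nice, 20 - nice);
	/* prints: nice -20 -> 40, nice  19 ->  1 */
	return 0;
}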
5353 +#ifdef __ARCH_WANT_SYS_NICE
5356 + * sys_nice - change the priority of the current process.
5357 + * @increment: priority increment
5359 + * sys_setpriority is a more generic, but much slower function that
5360 + * does similar things.
5362 +SYSCALL_DEFINE1(nice, int, increment)
5364 + long nice, retval;
5367 + * Setpriority might change our priority at the same moment.
5368 + * We don't have to worry. Conceptually one call occurs first
5369 + * and we have a single winner.
5371 + if (increment < -40)
5373 + if (increment > 40)
5376 + nice = PRIO_TO_NICE(current->static_prio) + increment;
5382 + if (increment < 0 && !can_nice(current, nice))
5383 + return vx_flags(VXF_IGNEG_NICE, 0) ? 0 : -EPERM;
5385 + retval = security_task_setnice(current, nice);
5389 + set_user_nice(current, nice);
5396 + * task_prio - return the priority value of a given task.
5397 + * @p: the task in question.
5399 + * This is the priority value as seen by users in /proc.
5400 + * RT tasks are offset by -200. Normal tasks are centered
5401 + * around 0, value goes from -16 to +15.
5403 +int task_prio(const struct task_struct *p)
5405 + return p->prio - MAX_RT_PRIO;
5409 + * task_nice - return the nice value of a given task.
5410 + * @p: the task in question.
5412 +int task_nice(const struct task_struct *p)
5414 + return TASK_NICE(p);
5416 +EXPORT_SYMBOL(task_nice);
5419 + * idle_cpu - is a given cpu idle currently?
5420 + * @cpu: the processor in question.
5422 +int idle_cpu(int cpu)
5424 + return cpu_curr(cpu) == cpu_rq(cpu)->idle;
5428 + * idle_task - return the idle task for a given cpu.
5429 + * @cpu: the processor in question.
5431 +struct task_struct *idle_task(int cpu)
5433 + return cpu_rq(cpu)->idle;
5437 + * find_process_by_pid - find a process with a matching PID value.
5438 + * @pid: the pid in question.
5440 +static struct task_struct *find_process_by_pid(pid_t pid)
5442 + return pid ? find_task_by_vpid(pid) : current;
5445 +/* Actually do priority change: must hold rq lock. */
5447 +__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
5449 + BUG_ON(p->se.on_rq);
5451 + p->policy = policy;
5452 + switch (p->policy) {
5453 + case SCHED_NORMAL:
5456 + p->sched_class = &fair_sched_class;
5460 + p->sched_class = &rt_sched_class;
5464 + p->rt_priority = prio;
5465 + p->normal_prio = normal_prio(p);
5466 + /* we are holding p->pi_lock already */
5467 + p->prio = rt_mutex_getprio(p);
5468 + set_load_weight(p);
5471 +static int __sched_setscheduler(struct task_struct *p, int policy,
5472 + struct sched_param *param, bool user)
5474 + int retval, oldprio, oldpolicy = -1, on_rq, running;
5475 + unsigned long flags;
5476 + const struct sched_class *prev_class = p->sched_class;
5479 + /* may grab non-irq protected spin_locks */
5480 + BUG_ON(in_interrupt());
5482 + /* double check policy once rq lock held */
5484 + policy = oldpolicy = p->policy;
5485 + else if (policy != SCHED_FIFO && policy != SCHED_RR &&
5486 + policy != SCHED_NORMAL && policy != SCHED_BATCH &&
5487 + policy != SCHED_IDLE)
5490 + * Valid priorities for SCHED_FIFO and SCHED_RR are
5491 + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
5492 + * SCHED_BATCH and SCHED_IDLE is 0.
5494 + if (param->sched_priority < 0 ||
5495 + (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
5496 + (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
5498 + if (rt_policy(policy) != (param->sched_priority != 0))
5502 + * Allow unprivileged RT tasks to decrease priority:
5504 + if (user && !capable(CAP_SYS_NICE)) {
5505 + if (rt_policy(policy)) {
5506 + unsigned long rlim_rtprio;
5508 + if (!lock_task_sighand(p, &flags))
5510 + rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
5511 + unlock_task_sighand(p, &flags);
5513 + /* can't set/change the rt policy */
5514 + if (policy != p->policy && !rlim_rtprio)
5517 + /* can't increase priority */
5518 + if (param->sched_priority > p->rt_priority &&
5519 + param->sched_priority > rlim_rtprio)
5523 + * Like positive nice levels, don't allow tasks to
5524 + * move out of SCHED_IDLE either:
5526 + if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
5529 + /* can't change other user's priorities */
5530 + if ((current->euid != p->euid) &&
5531 + (current->euid != p->uid))
5536 +#ifdef CONFIG_RT_GROUP_SCHED
5538 + * Do not allow realtime tasks into groups that have no runtime assigned.
5541 + if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
5545 + retval = security_task_setscheduler(p, policy, param);
5551 + * make sure no PI-waiters arrive (or leave) while we are
5552 + * changing the priority of the task:
5554 + spin_lock_irqsave(&p->pi_lock, flags);
5556 + * To be able to change p->policy safely, the appropriate
5557 + * runqueue lock must be held.
5559 + rq = __task_rq_lock(p);
5560 + /* recheck policy now with rq lock held */
5561 + if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
5562 + policy = oldpolicy = -1;
5563 + __task_rq_unlock(rq);
5564 + spin_unlock_irqrestore(&p->pi_lock, flags);
5567 + update_rq_clock(rq);
5568 + on_rq = p->se.on_rq;
5569 + running = task_current(rq, p);
5571 + deactivate_task(rq, p, 0);
5573 + p->sched_class->put_prev_task(rq, p);
5575 + oldprio = p->prio;
5576 + __setscheduler(rq, p, policy, param->sched_priority);
5579 + p->sched_class->set_curr_task(rq);
5581 + activate_task(rq, p, 0);
5583 + check_class_changed(rq, p, prev_class, oldprio, running);
5585 + __task_rq_unlock(rq);
5586 + spin_unlock_irqrestore(&p->pi_lock, flags);
5588 + rt_mutex_adjust_pi(p);
5594 + * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
5595 + * @p: the task in question.
5596 + * @policy: new policy.
5597 + * @param: structure containing the new RT priority.
5599 + * NOTE that the task may be already dead.
5601 +int sched_setscheduler(struct task_struct *p, int policy,
5602 + struct sched_param *param)
5604 + return __sched_setscheduler(p, policy, param, true);
5606 +EXPORT_SYMBOL_GPL(sched_setscheduler);
5609 + * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
5610 + * @p: the task in question.
5611 + * @policy: new policy.
5612 + * @param: structure containing the new RT priority.
5614 + * Just like sched_setscheduler, only don't bother checking if the
5615 + * current context has permission. For example, this is needed in
5616 + * stop_machine(): we create temporary high priority worker threads,
5617 + * but our caller might not have that capability.
5619 +int sched_setscheduler_nocheck(struct task_struct *p, int policy,
5620 + struct sched_param *param)
5622 + return __sched_setscheduler(p, policy, param, false);
5626 +do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
5628 + struct sched_param lparam;
5629 + struct task_struct *p;
5632 + if (!param || pid < 0)
5634 + if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
5639 + p = find_process_by_pid(pid);
5641 + retval = sched_setscheduler(p, policy, &lparam);
5642 + rcu_read_unlock();
5648 + * sys_sched_setscheduler - set/change the scheduler policy and RT priority
5649 + * @pid: the pid in question.
5650 + * @policy: new policy.
5651 + * @param: structure containing the new RT priority.
5653 +SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
5654 + struct sched_param __user *, param)
5656 + /* negative values for policy are not valid */
5660 + return do_sched_setscheduler(pid, policy, param);
5664 + * sys_sched_setparam - set/change the RT priority of a thread
5665 + * @pid: the pid in question.
5666 + * @param: structure containing the new RT priority.
5668 +SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
5670 + return do_sched_setscheduler(pid, -1, param);
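The user-space view of the two syscalls above, as a hedged example: sched_setscheduler(2) changes policy and RT priority together, while sched_setparam(2) changes only the priority. pid 0 means the calling process.

#include <sched.h>
#include <stdio.h>

int main(void)
{
	struct sched_param sp = { .sched_priority = 10 };

	/* needs CAP_SYS_NICE (or an adequate RLIMIT_RTPRIO) to succeed */
	if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1)
		perror("sched_setscheduler");

	sp.sched_priority = 20;
	if (sched_setparam(0, &sp) == -1)
		perror("sched_setparam");
	return 0;
}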
5674 + * sys_sched_getscheduler - get the policy (scheduling class) of a thread
5675 + * @pid: the pid in question.
5677 +SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
5679 + struct task_struct *p;
5686 + read_lock(&tasklist_lock);
5687 + p = find_process_by_pid(pid);
5689 + retval = security_task_getscheduler(p);
5691 + retval = p->policy;
5693 + read_unlock(&tasklist_lock);
5698 + * sys_sched_getparam - get the RT priority of a thread
5699 + * @pid: the pid in question.
5700 + * @param: structure containing the RT priority.
5702 +SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
5704 + struct sched_param lp;
5705 + struct task_struct *p;
5708 + if (!param || pid < 0)
5711 + read_lock(&tasklist_lock);
5712 + p = find_process_by_pid(pid);
5717 + retval = security_task_getscheduler(p);
5721 + lp.sched_priority = p->rt_priority;
5722 + read_unlock(&tasklist_lock);
5725 + * This one might sleep, we cannot do it with a spinlock held ...
5727 + retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
5732 + read_unlock(&tasklist_lock);
5736 +long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
5738 + cpumask_t cpus_allowed;
5739 + cpumask_t new_mask = *in_mask;
5740 + struct task_struct *p;
5743 + get_online_cpus();
5744 + read_lock(&tasklist_lock);
5746 + p = find_process_by_pid(pid);
5748 + read_unlock(&tasklist_lock);
5749 + put_online_cpus();
5754 + * It is not safe to call set_cpus_allowed with the
5755 + * tasklist_lock held. We will bump the task_struct's
5756 + * usage count and then drop tasklist_lock.
5758 + get_task_struct(p);
5759 + read_unlock(&tasklist_lock);
5763 + if ((current->euid != p->euid) && (current->euid != p->uid) &&
5764 + !capable(CAP_SYS_NICE))
5767 + retval = security_task_setscheduler(p, 0, NULL);
5771 + cpuset_cpus_allowed(p, &cpus_allowed);
5772 + cpus_and(new_mask, new_mask, cpus_allowed);
5774 + retval = set_cpus_allowed_ptr(p, &new_mask);
5777 + cpuset_cpus_allowed(p, &cpus_allowed);
5778 + if (!cpus_subset(new_mask, cpus_allowed)) {
5780 + * We must have raced with a concurrent cpuset
5781 + * update. Just reset the cpus_allowed to the
5782 + * cpuset's cpus_allowed
5784 + new_mask = cpus_allowed;
5789 + put_task_struct(p);
5790 + put_online_cpus();
5794 +static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5795 + cpumask_t *new_mask)
5797 + if (len < sizeof(cpumask_t)) {
5798 + memset(new_mask, 0, sizeof(cpumask_t));
5799 + } else if (len > sizeof(cpumask_t)) {
5800 + len = sizeof(cpumask_t);
5802 + return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5806 + * sys_sched_setaffinity - set the cpu affinity of a process
5807 + * @pid: pid of the process
5808 + * @len: length in bytes of the bitmask pointed to by user_mask_ptr
5809 + * @user_mask_ptr: user-space pointer to the new cpu mask
5811 +SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
5812 + unsigned long __user *, user_mask_ptr)
5814 + cpumask_t new_mask;
5817 + retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
5821 + return sched_setaffinity(pid, &new_mask);
5824 +long sched_getaffinity(pid_t pid, cpumask_t *mask)
5826 + struct task_struct *p;
5829 + get_online_cpus();
5830 + read_lock(&tasklist_lock);
5833 + p = find_process_by_pid(pid);
5837 + retval = security_task_getscheduler(p);
5841 + cpus_and(*mask, p->cpus_allowed, cpu_online_map);
5844 + read_unlock(&tasklist_lock);
5845 + put_online_cpus();
5851 + * sys_sched_getaffinity - get the cpu affinity of a process
5852 + * @pid: pid of the process
5853 + * @len: length in bytes of the bitmask pointed to by user_mask_ptr
5854 + * @user_mask_ptr: user-space pointer to hold the current cpu mask
5856 +SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
5857 + unsigned long __user *, user_mask_ptr)
5862 + if (len < sizeof(cpumask_t))
5865 + ret = sched_getaffinity(pid, &mask);
5869 + if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
5872 + return sizeof(cpumask_t);
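The matching user-space affinity calls, for reference; the kernel-side handlers above copy the mask in and out and clamp it against the online and cpuset-allowed CPUs.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);			/* pin the caller to CPU 0 */
	if (sched_setaffinity(0, sizeof(set), &set) == -1)
		perror("sched_setaffinity");

	CPU_ZERO(&set);
	if (sched_getaffinity(0, sizeof(set), &set) == 0)
		printf("CPU0 allowed: %d\n", CPU_ISSET(0, &set));
	return 0;
}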
5876 + * sys_sched_yield - yield the current processor to other threads.
5878 + * This function yields the current CPU to other tasks. If there are no
5879 + * other threads running on this CPU then this function will return.
5881 +SYSCALL_DEFINE0(sched_yield)
5883 + struct rq *rq = this_rq_lock();
5885 + schedstat_inc(rq, yld_count);
5886 + current->sched_class->yield_task(rq);
5889 + * Since we are going to call schedule() anyway, there's
5890 + * no need to preempt or enable interrupts:
5892 + __release(rq->lock);
5893 + spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
5894 + _raw_spin_unlock(&rq->lock);
5895 + preempt_enable_no_resched();
5902 +static void __cond_resched(void)
5904 +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
5905 + __might_sleep(__FILE__, __LINE__);
5908 + * The BKS might be reacquired before we have dropped
5909 + * PREEMPT_ACTIVE, which could trigger a second
5910 + * cond_resched() call.
5913 + add_preempt_count(PREEMPT_ACTIVE);
5915 + sub_preempt_count(PREEMPT_ACTIVE);
5916 + } while (need_resched());
5919 +int __sched _cond_resched(void)
5921 + if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
5922 + system_state == SYSTEM_RUNNING) {
5928 +EXPORT_SYMBOL(_cond_resched);
5931 + * cond_resched_lock() - if a reschedule is pending, drop the given lock,
5932 + * call schedule, and on return reacquire the lock.
5934 + * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
5935 + * operations here to prevent schedule() from being called twice (once via
5936 + * spin_unlock(), once by hand).
5938 +int cond_resched_lock(spinlock_t *lock)
5940 + int resched = need_resched() && system_state == SYSTEM_RUNNING;
5943 + if (spin_needbreak(lock) || resched) {
5944 + spin_unlock(lock);
5945 + if (resched && need_resched())
5954 +EXPORT_SYMBOL(cond_resched_lock);
5956 +int __sched cond_resched_softirq(void)
5958 + BUG_ON(!in_softirq());
5960 + if (need_resched() && system_state == SYSTEM_RUNNING) {
5961 + local_bh_enable();
5963 + local_bh_disable();
5968 +EXPORT_SYMBOL(cond_resched_softirq);
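A hedged kernel-style sketch of the intended use of cond_resched(): sprinkle it in long loops that run in process context so other tasks get to run even without CONFIG_PREEMPT.

#include <linux/sched.h>

/* process a large table in process context, yielding when needed */
static void demo_process_items(int *items, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		items[i] *= 2;		/* stand-in for real per-item work */
		cond_resched();		/* reschedule if TIF_NEED_RESCHED is set */
	}
}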
5971 + * yield - yield the current processor to other threads.
5973 + * This is a shortcut for kernel-space yielding - it marks the
5974 + * thread runnable and calls sys_sched_yield().
5976 +void __sched yield(void)
5978 + set_current_state(TASK_RUNNING);
5979 + sys_sched_yield();
5981 +EXPORT_SYMBOL(yield);
5984 + * This task is about to go to sleep on IO. Increment rq->nr_iowait so
5985 + * that process accounting knows that this is a task in IO wait state.
5987 + * But don't do that if it is a deliberate, throttling IO wait (this task
5988 + * has set its backing_dev_info: the queue against which it should throttle)
5990 +void __sched io_schedule(void)
5992 + struct rq *rq = &__raw_get_cpu_var(runqueues);
5994 + delayacct_blkio_start();
5995 + atomic_inc(&rq->nr_iowait);
5997 + atomic_dec(&rq->nr_iowait);
5998 + delayacct_blkio_end();
6000 +EXPORT_SYMBOL(io_schedule);
6002 +long __sched io_schedule_timeout(long timeout)
6004 + struct rq *rq = &__raw_get_cpu_var(runqueues);
6007 + delayacct_blkio_start();
6008 + atomic_inc(&rq->nr_iowait);
6009 + ret = schedule_timeout(timeout);
6010 + atomic_dec(&rq->nr_iowait);
6011 + delayacct_blkio_end();
6016 + * sys_sched_get_priority_max - return maximum RT priority.
6017 + * @policy: scheduling class.
6019 + * this syscall returns the maximum rt_priority that can be used
6020 + * by a given scheduling class.
6022 +SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
6024 + int ret = -EINVAL;
6029 + ret = MAX_USER_RT_PRIO-1;
6031 + case SCHED_NORMAL:
6041 + * sys_sched_get_priority_min - return minimum RT priority.
6042 + * @policy: scheduling class.
6044 + * this syscall returns the minimum rt_priority that can be used
6045 + * by a given scheduling class.
6047 +SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
6049 + int ret = -EINVAL;
6056 + case SCHED_NORMAL:
6065 + * sys_sched_rr_get_interval - return the default timeslice of a process.
6066 + * @pid: pid of the process.
6067 + * @interval: userspace pointer to the timeslice value.
6069 + * this syscall writes the default timeslice value of a given process
6070 + * into the user-space timespec buffer. A value of '0' means infinity.
6072 +SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6073 + struct timespec __user *, interval)
6075 + struct task_struct *p;
6076 + unsigned int time_slice;
6078 + struct timespec t;
6084 + read_lock(&tasklist_lock);
6085 + p = find_process_by_pid(pid);
6089 + retval = security_task_getscheduler(p);
6094 + * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER
6095 + * tasks that are on an otherwise idle runqueue:
6098 + if (p->policy == SCHED_RR) {
6099 + time_slice = DEF_TIMESLICE;
6100 + } else if (p->policy != SCHED_FIFO) {
6101 + struct sched_entity *se = &p->se;
6102 + unsigned long flags;
6105 + rq = task_rq_lock(p, &flags);
6106 + if (rq->cfs.load.weight)
6107 + time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
6108 + task_rq_unlock(rq, &flags);
6110 + read_unlock(&tasklist_lock);
6111 + jiffies_to_timespec(time_slice, &t);
6112 + retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
6116 + read_unlock(&tasklist_lock);
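From user space the syscall above is reachable as sched_rr_get_interval(2); a returned value of 0 means the task has no fixed timeslice.

#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts) == 0)	/* pid 0 = calling process */
		printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
	else
		perror("sched_rr_get_interval");
	return 0;
}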
6120 +static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
6122 +void sched_show_task(struct task_struct *p)
6124 + unsigned long free = 0;
6127 + state = p->state ? __ffs(p->state) + 1 : 0;
6128 + printk(KERN_INFO "%-13.13s %c", p->comm,
6129 + state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
6130 +#if BITS_PER_LONG == 32
6131 + if (state == TASK_RUNNING)
6132 + printk(KERN_CONT " running ");
6134 + printk(KERN_CONT " %08lx ", thread_saved_pc(p));
6136 + if (state == TASK_RUNNING)
6137 + printk(KERN_CONT " running task ");
6139 + printk(KERN_CONT " %016lx ", thread_saved_pc(p));
6141 +#ifdef CONFIG_DEBUG_STACK_USAGE
6143 + unsigned long *n = end_of_stack(p);
6146 + free = (unsigned long)n - (unsigned long)end_of_stack(p);
6149 + printk(KERN_CONT "%5lu %5d %6d\n", free,
6150 + task_pid_nr(p), task_pid_nr(p->real_parent));
6152 + show_stack(p, NULL);
6155 +void show_state_filter(unsigned long state_filter)
6157 + struct task_struct *g, *p;
6159 +#if BITS_PER_LONG == 32
6161 + " task PC stack pid father\n");
6164 + " task PC stack pid father\n");
6166 + read_lock(&tasklist_lock);
6167 + do_each_thread(g, p) {
6169 + * reset the NMI-timeout, listing all files on a slow
6170 + * console might take a lot of time:
6172 + touch_nmi_watchdog();
6173 + if (!state_filter || (p->state & state_filter))
6174 + sched_show_task(p);
6175 + } while_each_thread(g, p);
6177 + touch_all_softlockup_watchdogs();
6179 +#ifdef CONFIG_SCHED_DEBUG
6180 + sysrq_sched_debug_show();
6182 + read_unlock(&tasklist_lock);
6184 + * Only show locks if all tasks are dumped:
6186 + if (state_filter == -1)
6187 + debug_show_all_locks();
6190 +void __cpuinit init_idle_bootup_task(struct task_struct *idle)
6192 + idle->sched_class = &idle_sched_class;
6196 + * init_idle - set up an idle thread for a given CPU
6197 + * @idle: task in question
6198 + * @cpu: cpu the idle task belongs to
6200 + * NOTE: this function does not set the idle thread's NEED_RESCHED
6201 + * flag, to make booting more robust.
6203 +void __cpuinit init_idle(struct task_struct *idle, int cpu)
6205 + struct rq *rq = cpu_rq(cpu);
6206 + unsigned long flags;
6208 + __sched_fork(idle);
6209 + idle->se.exec_start = sched_clock();
6211 + idle->prio = idle->normal_prio = MAX_PRIO;
6212 + idle->cpus_allowed = cpumask_of_cpu(cpu);
6213 + __set_task_cpu(idle, cpu);
6215 + spin_lock_irqsave(&rq->lock, flags);
6216 + rq->curr = rq->idle = idle;
6217 +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
6220 + spin_unlock_irqrestore(&rq->lock, flags);
6222 + /* Set the preempt count _outside_ the spinlocks! */
6223 +#if defined(CONFIG_PREEMPT)
6224 + task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
6226 + task_thread_info(idle)->preempt_count = 0;
6229 + * The idle tasks have their own, simple scheduling class:
6231 + idle->sched_class = &idle_sched_class;
6235 + * In a system that switches off the HZ timer nohz_cpu_mask
6236 + * indicates which cpus entered this state. This is used
6237 + * in the rcu update to wait only for active cpus. For system
6238 + * which do not switch off the HZ timer nohz_cpu_mask should
6239 + * always be CPU_MASK_NONE.
6241 +cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
6244 + * Increase the granularity value when there are more CPUs,
6245 + * because with more CPUs the 'effective latency' as visible
6246 + * to users decreases. But the relationship is not linear,
6247 + * so pick a second-best guess by going with the log2 of the number of CPUs.
6250 + * This idea comes from the SD scheduler of Con Kolivas:
6252 +static inline void sched_init_granularity(void)
6254 + unsigned int factor = 1 + ilog2(num_online_cpus());
6255 + const unsigned long limit = 200000000;
6257 + sysctl_sched_min_granularity *= factor;
6258 + if (sysctl_sched_min_granularity > limit)
6259 + sysctl_sched_min_granularity = limit;
6261 + sysctl_sched_latency *= factor;
6262 + if (sysctl_sched_latency > limit)
6263 + sysctl_sched_latency = limit;
6265 + sysctl_sched_wakeup_granularity *= factor;
6267 + sysctl_sched_shares_ratelimit *= factor;
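A worked example of the scaling above, assuming the usual defaults: factor = 1 + ilog2(ncpus), so an 8-CPU box multiplies the base granularity and latency by 4, clamped at 200 ms. The 20 ms base latency is an assumed default for illustration.

#include <stdio.h>

static unsigned int ilog2_u(unsigned int v)
{
	unsigned int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

int main(void)
{
	const unsigned long limit = 200000000UL;	/* 200 ms in ns */
	unsigned long latency = 20000000UL;		/* assumed 20 ms base */
	unsigned int ncpus = 8;
	unsigned int factor = 1 + ilog2_u(ncpus);	/* 1 + 3 = 4 */

	latency *= factor;
	if (latency > limit)
		latency = limit;
	printf("factor=%u latency=%lu ns\n", factor, latency);	/* 4, 80000000 */
	return 0;
}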
6272 + * This is how migration works:
6274 + * 1) we queue a struct migration_req structure in the source CPU's
6275 + * runqueue and wake up that CPU's migration thread.
6276 + * 2) we down() the locked semaphore => thread blocks.
6277 + * 3) migration thread wakes up (implicitly it forces the migrated
6278 + * thread off the CPU)
6279 + * 4) it gets the migration request and checks whether the migrated
6280 + * task is still in the wrong runqueue.
6281 + * 5) if it's in the wrong runqueue then the migration thread removes
6282 + * it and puts it into the right queue.
6283 + * 6) migration thread up()s the semaphore.
6284 + * 7) we wake up and the migration is done.
6288 + * Change a given task's CPU affinity. Migrate the thread to a
6289 + * proper CPU and schedule it away if the CPU it's executing on
6290 + * is removed from the allowed bitmask.
6292 + * NOTE: the caller must have a valid reference to the task, the
6293 + * task must not exit() & deallocate itself prematurely. The
6294 + * call is not atomic; no spinlocks may be held.
6296 +int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
6298 + struct migration_req req;
6299 + unsigned long flags;
6303 + rq = task_rq_lock(p, &flags);
6304 + if (!cpus_intersects(*new_mask, cpu_online_map)) {
6309 + if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
6310 + !cpus_equal(p->cpus_allowed, *new_mask))) {
6315 + if (p->sched_class->set_cpus_allowed)
6316 + p->sched_class->set_cpus_allowed(p, new_mask);
6318 + p->cpus_allowed = *new_mask;
6319 + p->rt.nr_cpus_allowed = cpus_weight(*new_mask);
6322 + /* Can the task run on the task's current CPU? If so, we're done */
6323 + if (cpu_isset(task_cpu(p), *new_mask))
6326 + if (migrate_task(p, any_online_cpu(*new_mask), &req)) {
6327 + /* Need help from migration thread: drop lock and wait. */
6328 + task_rq_unlock(rq, &flags);
6329 + wake_up_process(rq->migration_thread);
6330 + wait_for_completion(&req.done);
6331 + tlb_migrate_finish(p->mm);
6335 + task_rq_unlock(rq, &flags);
6339 +EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
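A hedged kernel-style sketch of calling the helper above to pin a kernel thread to one CPU; error handling is kept minimal and the wrapper name is illustrative.

#include <linux/cpumask.h>
#include <linux/sched.h>

/* restrict @tsk to @cpu; the migration thread moves it if it is elsewhere */
static int demo_pin_task(struct task_struct *tsk, int cpu)
{
	cpumask_t mask = cpumask_of_cpu(cpu);

	return set_cpus_allowed_ptr(tsk, &mask);
}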
6342 + * Move (not current) task off this cpu, onto dest cpu. We're doing
6343 + * this because either it can't run here any more (set_cpus_allowed()
6344 + * away from this CPU, or CPU going down), or because we're
6345 + * attempting to rebalance this task on exec (sched_exec).
6347 + * So we race with normal scheduler movements, but that's OK, as long
6348 + * as the task is no longer on this CPU.
6350 + * Returns non-zero if task was successfully migrated.
6352 +static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
6354 + struct rq *rq_dest, *rq_src;
6355 + int ret = 0, on_rq;
6357 + if (unlikely(!cpu_active(dest_cpu)))
6360 + rq_src = cpu_rq(src_cpu);
6361 + rq_dest = cpu_rq(dest_cpu);
6363 + double_rq_lock(rq_src, rq_dest);
6364 + /* Already moved. */
6365 + if (task_cpu(p) != src_cpu)
6367 + /* Affinity changed (again). */
6368 + if (!cpu_isset(dest_cpu, p->cpus_allowed))
6371 + on_rq = p->se.on_rq;
6373 + deactivate_task(rq_src, p, 0);
6375 + set_task_cpu(p, dest_cpu);
6377 + activate_task(rq_dest, p, 0);
6378 + check_preempt_curr(rq_dest, p);
6383 + double_rq_unlock(rq_src, rq_dest);
6388 + * migration_thread - this is a highprio system thread that performs
6389 + * thread migration by bumping thread off CPU then 'pushing' onto
6390 + * another runqueue.
6392 +static int migration_thread(void *data)
6394 + int cpu = (long)data;
6398 + BUG_ON(rq->migration_thread != current);
6400 + set_current_state(TASK_INTERRUPTIBLE);
6401 + while (!kthread_should_stop()) {
6402 + struct migration_req *req;
6403 + struct list_head *head;
6405 + spin_lock_irq(&rq->lock);
6407 + if (cpu_is_offline(cpu)) {
6408 + spin_unlock_irq(&rq->lock);
6412 + if (rq->active_balance) {
6413 + active_load_balance(rq, cpu);
6414 + rq->active_balance = 0;
6417 + head = &rq->migration_queue;
6419 + if (list_empty(head)) {
6420 + spin_unlock_irq(&rq->lock);
6422 + set_current_state(TASK_INTERRUPTIBLE);
6425 + req = list_entry(head->next, struct migration_req, list);
6426 + list_del_init(head->next);
6428 + spin_unlock(&rq->lock);
6429 + __migrate_task(req->task, cpu, req->dest_cpu);
6430 + local_irq_enable();
6432 + complete(&req->done);
6434 + __set_current_state(TASK_RUNNING);
6438 + /* Wait for kthread_stop */
6439 + set_current_state(TASK_INTERRUPTIBLE);
6440 + while (!kthread_should_stop()) {
6442 + set_current_state(TASK_INTERRUPTIBLE);
6444 + __set_current_state(TASK_RUNNING);
6448 +#ifdef CONFIG_HOTPLUG_CPU
6450 +static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
6454 + local_irq_disable();
6455 + ret = __migrate_task(p, src_cpu, dest_cpu);
6456 + local_irq_enable();
6461 + * Figure out where task on dead CPU should go, use force if necessary.
6462 + * NOTE: interrupts should be disabled by the caller
6464 +static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
6466 + unsigned long flags;
6472 + /* On same node? */
6473 + mask = node_to_cpumask(cpu_to_node(dead_cpu));
6474 + cpus_and(mask, mask, p->cpus_allowed);
6475 + dest_cpu = any_online_cpu(mask);
6477 + /* On any allowed CPU? */
6478 + if (dest_cpu >= nr_cpu_ids)
6479 + dest_cpu = any_online_cpu(p->cpus_allowed);
6481 + /* No more Mr. Nice Guy. */
6482 + if (dest_cpu >= nr_cpu_ids) {
6483 + cpumask_t cpus_allowed;
6485 + cpuset_cpus_allowed_locked(p, &cpus_allowed);
6487 + * Try to stay on the same cpuset, where the
6488 + * current cpuset may be a subset of all cpus.
6489 + * The cpuset_cpus_allowed_locked() variant of
6490 + * cpuset_cpus_allowed() will not block. It must be
6491 + * called within calls to cpuset_lock/cpuset_unlock.
6493 + rq = task_rq_lock(p, &flags);
6494 + p->cpus_allowed = cpus_allowed;
6495 + dest_cpu = any_online_cpu(p->cpus_allowed);
6496 + task_rq_unlock(rq, &flags);
6499 + * Don't tell them about moving exiting tasks or
6500 + * kernel threads (both mm NULL), since they never
6503 + if (p->mm && printk_ratelimit()) {
6504 + printk(KERN_INFO "process %d (%s) no "
6505 + "longer affine to cpu%d\n",
6506 + task_pid_nr(p), p->comm, dead_cpu);
6509 + } while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
6513 + * While a dead CPU has no uninterruptible tasks queued at this point,
6514 + * it might still have a nonzero ->nr_uninterruptible counter, because
6515 + * for performance reasons the counter is not strictly tracking tasks to
6516 + * their home CPUs. So we just add the counter to another CPU's counter,
6517 + * to keep the global sum constant after CPU-down:
6519 +static void migrate_nr_uninterruptible(struct rq *rq_src)
6521 + struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR));
6522 + unsigned long flags;
6524 + local_irq_save(flags);
6525 + double_rq_lock(rq_src, rq_dest);
6526 + rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
6527 + rq_src->nr_uninterruptible = 0;
6528 + double_rq_unlock(rq_src, rq_dest);
6529 + local_irq_restore(flags);
6532 +/* Run through task list and migrate tasks from the dead cpu. */
6533 +static void migrate_live_tasks(int src_cpu)
6535 + struct task_struct *p, *t;
6537 + read_lock(&tasklist_lock);
6539 + do_each_thread(t, p) {
6543 + if (task_cpu(p) == src_cpu)
6544 + move_task_off_dead_cpu(src_cpu, p);
6545 + } while_each_thread(t, p);
6547 + read_unlock(&tasklist_lock);
6551 + * Schedules idle task to be the next runnable task on current CPU.
6552 + * It does so by boosting its priority to highest possible.
6553 + * Used by CPU offline code.
6555 +void sched_idle_next(void)
6557 + int this_cpu = smp_processor_id();
6558 + struct rq *rq = cpu_rq(this_cpu);
6559 + struct task_struct *p = rq->idle;
6560 + unsigned long flags;
6562 + /* cpu has to be offline */
6563 + BUG_ON(cpu_online(this_cpu));
6566 + * Strictly not necessary since rest of the CPUs are stopped by now
6567 + * and interrupts disabled on the current cpu.
6569 + spin_lock_irqsave(&rq->lock, flags);
6571 + __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
6573 + update_rq_clock(rq);
6574 + activate_task(rq, p, 0);
6576 + spin_unlock_irqrestore(&rq->lock, flags);
6580 + * Ensures that the idle task is using init_mm right before its cpu goes offline.
6583 +void idle_task_exit(void)
6585 + struct mm_struct *mm = current->active_mm;
6587 + BUG_ON(cpu_online(smp_processor_id()));
6589 + if (mm != &init_mm)
6590 + switch_mm(mm, &init_mm, current);
6594 +/* called under rq->lock with disabled interrupts */
6595 +static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
6597 + struct rq *rq = cpu_rq(dead_cpu);
6599 + /* Must be exiting, otherwise would be on tasklist. */
6600 + BUG_ON(!p->exit_state);
6602 + /* Cannot have done final schedule yet: would have vanished. */
6603 + BUG_ON(p->state == TASK_DEAD);
6605 + get_task_struct(p);
6608 + * Drop lock around migration; if someone else moves it,
6609 + * that's OK. No task can be added to this CPU, so iteration is safe.
6612 + spin_unlock_irq(&rq->lock);
6613 + move_task_off_dead_cpu(dead_cpu, p);
6614 + spin_lock_irq(&rq->lock);
6616 + put_task_struct(p);
6619 +/* release_task() removes task from tasklist, so we won't find dead tasks. */
6620 +static void migrate_dead_tasks(unsigned int dead_cpu)
6622 + struct rq *rq = cpu_rq(dead_cpu);
6623 + struct task_struct *next;
6626 + if (!rq->nr_running)
6628 + update_rq_clock(rq);
6629 + next = pick_next_task(rq, rq->curr);
6632 + next->sched_class->put_prev_task(rq, next);
6633 + migrate_dead(dead_cpu, next);
6637 +#endif /* CONFIG_HOTPLUG_CPU */
6639 +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
6641 +static struct ctl_table sd_ctl_dir[] = {
6643 + .procname = "sched_domain",
6649 +static struct ctl_table sd_ctl_root[] = {
6651 + .ctl_name = CTL_KERN,
6652 + .procname = "kernel",
6654 + .child = sd_ctl_dir,
6659 +static struct ctl_table *sd_alloc_ctl_entry(int n)
6661 + struct ctl_table *entry =
6662 + kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
6667 +static void sd_free_ctl_entry(struct ctl_table **tablep)
6669 + struct ctl_table *entry;
6672 + * In the intermediate directories, both the child directory and
6673 + * procname are dynamically allocated and could fail but the mode
6674 + * will always be set. In the lowest directory the names are
6675 + * static strings and all have proc handlers.
6677 + for (entry = *tablep; entry->mode; entry++) {
6679 + sd_free_ctl_entry(&entry->child);
6680 + if (entry->proc_handler == NULL)
6681 + kfree(entry->procname);
6689 +set_table_entry(struct ctl_table *entry,
6690 + const char *procname, void *data, int maxlen,
6691 + mode_t mode, proc_handler *proc_handler)
6693 + entry->procname = procname;
6694 + entry->data = data;
6695 + entry->maxlen = maxlen;
6696 + entry->mode = mode;
6697 + entry->proc_handler = proc_handler;
6700 +static struct ctl_table *
6701 +sd_alloc_ctl_domain_table(struct sched_domain *sd)
6703 + struct ctl_table *table = sd_alloc_ctl_entry(12);
6705 + if (table == NULL)
6708 + set_table_entry(&table[0], "min_interval", &sd->min_interval,
6709 + sizeof(long), 0644, proc_doulongvec_minmax);
6710 + set_table_entry(&table[1], "max_interval", &sd->max_interval,
6711 + sizeof(long), 0644, proc_doulongvec_minmax);
6712 + set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
6713 + sizeof(int), 0644, proc_dointvec_minmax);
6714 + set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
6715 + sizeof(int), 0644, proc_dointvec_minmax);
6716 + set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
6717 + sizeof(int), 0644, proc_dointvec_minmax);
6718 + set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
6719 + sizeof(int), 0644, proc_dointvec_minmax);
6720 + set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
6721 + sizeof(int), 0644, proc_dointvec_minmax);
6722 + set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
6723 + sizeof(int), 0644, proc_dointvec_minmax);
6724 + set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
6725 + sizeof(int), 0644, proc_dointvec_minmax);
6726 + set_table_entry(&table[9], "cache_nice_tries",
6727 + &sd->cache_nice_tries,
6728 + sizeof(int), 0644, proc_dointvec_minmax);
6729 + set_table_entry(&table[10], "flags", &sd->flags,
6730 + sizeof(int), 0644, proc_dointvec_minmax);
6731 + /* &table[11] is terminator */
6736 +static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
6738 + struct ctl_table *entry, *table;
6739 + struct sched_domain *sd;
6740 + int domain_num = 0, i;
6743 + for_each_domain(cpu, sd)
6745 + entry = table = sd_alloc_ctl_entry(domain_num + 1);
6746 + if (table == NULL)
6750 + for_each_domain(cpu, sd) {
6751 + snprintf(buf, 32, "domain%d", i);
6752 + entry->procname = kstrdup(buf, GFP_KERNEL);
6753 + entry->mode = 0555;
6754 + entry->child = sd_alloc_ctl_domain_table(sd);
6761 +static struct ctl_table_header *sd_sysctl_header;
6762 +static void register_sched_domain_sysctl(void)
6764 + int i, cpu_num = num_online_cpus();
6765 + struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
6768 + WARN_ON(sd_ctl_dir[0].child);
6769 + sd_ctl_dir[0].child = entry;
6771 + if (entry == NULL)
6774 + for_each_online_cpu(i) {
6775 + snprintf(buf, 32, "cpu%d", i);
6776 + entry->procname = kstrdup(buf, GFP_KERNEL);
6777 + entry->mode = 0555;
6778 + entry->child = sd_alloc_ctl_cpu_table(i);
6782 + WARN_ON(sd_sysctl_header);
6783 + sd_sysctl_header = register_sysctl_table(sd_ctl_root);
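The tables built above surface as files such as /proc/sys/kernel/sched_domain/cpu0/domain0/min_interval once register_sysctl_table() runs. A small user-space read of one such path; the exact path assumes at least one scheduling domain exists on cpu0.

#include <stdio.h>

int main(void)
{
	const char *path = "/proc/sys/kernel/sched_domain/cpu0/domain0/min_interval";
	char buf[64];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%s = %s", path, buf);
	fclose(f);
	return 0;
}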
6786 +/* may be called multiple times per register */
6787 +static void unregister_sched_domain_sysctl(void)
6789 + if (sd_sysctl_header)
6790 + unregister_sysctl_table(sd_sysctl_header);
6791 + sd_sysctl_header = NULL;
6792 + if (sd_ctl_dir[0].child)
6793 + sd_free_ctl_entry(&sd_ctl_dir[0].child);
6796 +static void register_sched_domain_sysctl(void)
6799 +static void unregister_sched_domain_sysctl(void)
6804 +static void set_rq_online(struct rq *rq)
6806 + if (!rq->online) {
6807 + const struct sched_class *class;
6809 + cpu_set(rq->cpu, rq->rd->online);
6812 + for_each_class(class) {
6813 + if (class->rq_online)
6814 + class->rq_online(rq);
6819 +static void set_rq_offline(struct rq *rq)
6822 + const struct sched_class *class;
6824 + for_each_class(class) {
6825 + if (class->rq_offline)
6826 + class->rq_offline(rq);
6829 + cpu_clear(rq->cpu, rq->rd->online);
6835 + * migration_call - callback that gets triggered when a CPU is added.
6836 + * Here we can start up the necessary migration thread for the new CPU.
6838 +static int __cpuinit
6839 +migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6841 + struct task_struct *p;
6842 + int cpu = (long)hcpu;
6843 + unsigned long flags;
6848 + case CPU_UP_PREPARE:
6849 + case CPU_UP_PREPARE_FROZEN:
6850 + p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
6852 + return NOTIFY_BAD;
6853 + kthread_bind(p, cpu);
6854 + /* Must be high prio: stop_machine expects to yield to it. */
6855 + rq = task_rq_lock(p, &flags);
6856 + __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
6857 + task_rq_unlock(rq, &flags);
6858 + cpu_rq(cpu)->migration_thread = p;
6862 + case CPU_ONLINE_FROZEN:
6863 + /* Strictly unnecessary, as first user will wake it. */
6864 + wake_up_process(cpu_rq(cpu)->migration_thread);
6866 + /* Update our root-domain */
6868 + spin_lock_irqsave(&rq->lock, flags);
6870 + BUG_ON(!cpu_isset(cpu, rq->rd->span));
6872 + set_rq_online(rq);
6874 + spin_unlock_irqrestore(&rq->lock, flags);
6877 +#ifdef CONFIG_HOTPLUG_CPU
6878 + case CPU_UP_CANCELED:
6879 + case CPU_UP_CANCELED_FROZEN:
6880 + if (!cpu_rq(cpu)->migration_thread)
6882 + /* Unbind it from offline cpu so it can run. Fall thru. */
6883 + kthread_bind(cpu_rq(cpu)->migration_thread,
6884 + any_online_cpu(cpu_online_map));
6885 + kthread_stop(cpu_rq(cpu)->migration_thread);
6886 + cpu_rq(cpu)->migration_thread = NULL;
6890 + case CPU_DEAD_FROZEN:
6891 + cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
6892 + migrate_live_tasks(cpu);
6894 + kthread_stop(rq->migration_thread);
6895 + rq->migration_thread = NULL;
6896 + /* Idle task back to normal (off runqueue, low prio) */
6897 + spin_lock_irq(&rq->lock);
6898 + update_rq_clock(rq);
6899 + deactivate_task(rq, rq->idle, 0);
6900 + rq->idle->static_prio = MAX_PRIO;
6901 + __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
6902 + rq->idle->sched_class = &idle_sched_class;
6903 + migrate_dead_tasks(cpu);
6904 + spin_unlock_irq(&rq->lock);
6906 + migrate_nr_uninterruptible(rq);
6907 + BUG_ON(rq->nr_running != 0);
6910 + * No need to migrate the tasks: it was best-effort if
6911 + * they didn't take sched_hotcpu_mutex. Just wake up
6914 + spin_lock_irq(&rq->lock);
6915 + while (!list_empty(&rq->migration_queue)) {
6916 + struct migration_req *req;
6918 + req = list_entry(rq->migration_queue.next,
6919 + struct migration_req, list);
6920 + list_del_init(&req->list);
6921 + spin_unlock_irq(&rq->lock);
6922 + complete(&req->done);
6923 + spin_lock_irq(&rq->lock);
6925 + spin_unlock_irq(&rq->lock);
6929 + case CPU_DYING_FROZEN:
6930 + /* Update our root-domain */
6932 + spin_lock_irqsave(&rq->lock, flags);
6934 + BUG_ON(!cpu_isset(cpu, rq->rd->span));
6935 + set_rq_offline(rq);
6937 + spin_unlock_irqrestore(&rq->lock, flags);
6944 +/* Register at highest priority so that task migration (migrate_all_tasks)
6945 + * happens before everything else.
6947 +static struct notifier_block __cpuinitdata migration_notifier = {
6948 + .notifier_call = migration_call,
6952 +static int __init migration_init(void)
6954 + void *cpu = (void *)(long)smp_processor_id();
6957 + /* Start one for the boot CPU: */
6958 + err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
6959 + BUG_ON(err == NOTIFY_BAD);
6960 + migration_call(&migration_notifier, CPU_ONLINE, cpu);
6961 + register_cpu_notifier(&migration_notifier);
6965 +early_initcall(migration_init);
6970 +#ifdef CONFIG_SCHED_DEBUG
6972 +static inline const char *sd_level_to_string(enum sched_domain_level lvl)
6977 + case SD_LV_SIBLING:
6985 + case SD_LV_ALLNODES:
6986 + return "ALLNODES";
6994 +static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6995 + cpumask_t *groupmask)
6997 + struct sched_group *group = sd->groups;
7000 + cpulist_scnprintf(str, sizeof(str), sd->span);
7001 + cpus_clear(*groupmask);
7003 + printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
7005 + if (!(sd->flags & SD_LOAD_BALANCE)) {
7006 + printk("does not load-balance\n");
7008 + printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
7013 + printk(KERN_CONT "span %s level %s\n",
7014 + str, sd_level_to_string(sd->level));
7016 + if (!cpu_isset(cpu, sd->span)) {
7017 + printk(KERN_ERR "ERROR: domain->span does not contain "
7020 + if (!cpu_isset(cpu, group->cpumask)) {
7021 + printk(KERN_ERR "ERROR: domain->groups does not contain"
7025 + printk(KERN_DEBUG "%*s groups:", level + 1, "");
7029 + printk(KERN_ERR "ERROR: group is NULL\n");
7033 + if (!group->__cpu_power) {
7034 + printk(KERN_CONT "\n");
7035 + printk(KERN_ERR "ERROR: domain->cpu_power not "
7040 + if (!cpus_weight(group->cpumask)) {
7041 + printk(KERN_CONT "\n");
7042 + printk(KERN_ERR "ERROR: empty group\n");
7046 + if (cpus_intersects(*groupmask, group->cpumask)) {
7047 + printk(KERN_CONT "\n");
7048 + printk(KERN_ERR "ERROR: repeated CPUs\n");
7052 + cpus_or(*groupmask, *groupmask, group->cpumask);
7054 + cpulist_scnprintf(str, sizeof(str), group->cpumask);
7055 + printk(KERN_CONT " %s", str);
7057 + group = group->next;
7058 + } while (group != sd->groups);
7059 + printk(KERN_CONT "\n");
7061 + if (!cpus_equal(sd->span, *groupmask))
7062 + printk(KERN_ERR "ERROR: groups don't span domain->span\n");
7064 + if (sd->parent && !cpus_subset(*groupmask, sd->parent->span))
7065 + printk(KERN_ERR "ERROR: parent span is not a superset "
7066 + "of domain->span\n");
7070 +static void sched_domain_debug(struct sched_domain *sd, int cpu)
7072 + cpumask_t *groupmask;
7076 + printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
7080 + printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
7082 + groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
7084 + printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
7089 + if (sched_domain_debug_one(sd, cpu, level, groupmask))
7098 +#else /* !CONFIG_SCHED_DEBUG */
7099 +# define sched_domain_debug(sd, cpu) do { } while (0)
7100 +#endif /* CONFIG_SCHED_DEBUG */
7102 +static int sd_degenerate(struct sched_domain *sd)
7104 + if (cpus_weight(sd->span) == 1)
7107 + /* Following flags need at least 2 groups */
7108 + if (sd->flags & (SD_LOAD_BALANCE |
7109 + SD_BALANCE_NEWIDLE |
7112 + SD_SHARE_CPUPOWER |
7113 + SD_SHARE_PKG_RESOURCES)) {
7114 + if (sd->groups != sd->groups->next)
7118 + /* Following flags don't use groups */
7119 + if (sd->flags & (SD_WAKE_IDLE |
7128 +sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
7130 + unsigned long cflags = sd->flags, pflags = parent->flags;
7132 + if (sd_degenerate(parent))
7135 + if (!cpus_equal(sd->span, parent->span))
7138 + /* Does parent contain flags not in child? */
7139 + /* WAKE_BALANCE is a subset of WAKE_AFFINE */
7140 + if (cflags & SD_WAKE_AFFINE)
7141 + pflags &= ~SD_WAKE_BALANCE;
7142 + /* Flags needing groups don't count if only 1 group in parent */
7143 + if (parent->groups == parent->groups->next) {
7144 + pflags &= ~(SD_LOAD_BALANCE |
7145 + SD_BALANCE_NEWIDLE |
7148 + SD_SHARE_CPUPOWER |
7149 + SD_SHARE_PKG_RESOURCES);
7151 + if (~cflags & pflags)
7157 +static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7159 + unsigned long flags;
7161 + spin_lock_irqsave(&rq->lock, flags);
7164 + struct root_domain *old_rd = rq->rd;
7166 + if (cpu_isset(rq->cpu, old_rd->online))
7167 + set_rq_offline(rq);
7169 + cpu_clear(rq->cpu, old_rd->span);
7171 + if (atomic_dec_and_test(&old_rd->refcount))
7175 + atomic_inc(&rd->refcount);
7178 + cpu_set(rq->cpu, rd->span);
7179 + if (cpu_isset(rq->cpu, cpu_online_map))
7180 + set_rq_online(rq);
7182 + spin_unlock_irqrestore(&rq->lock, flags);
7185 +static void init_rootdomain(struct root_domain *rd)
7187 + memset(rd, 0, sizeof(*rd));
7189 + cpus_clear(rd->span);
7190 + cpus_clear(rd->online);
7192 + cpupri_init(&rd->cpupri);
7195 +static void init_defrootdomain(void)
7197 + init_rootdomain(&def_root_domain);
7198 + atomic_set(&def_root_domain.refcount, 1);
7201 +static struct root_domain *alloc_rootdomain(void)
7203 + struct root_domain *rd;
7205 + rd = kmalloc(sizeof(*rd), GFP_KERNEL);
7209 + init_rootdomain(rd);
7215 + * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
7216 + * hold the hotplug lock.
7219 +cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
7221 + struct rq *rq = cpu_rq(cpu);
7222 + struct sched_domain *tmp;
7224 + /* Remove the sched domains which do not contribute to scheduling. */
7225 + for (tmp = sd; tmp; ) {
7226 + struct sched_domain *parent = tmp->parent;
7230 + if (sd_parent_degenerate(tmp, parent)) {
7231 + tmp->parent = parent->parent;
7232 + if (parent->parent)
7233 + parent->parent->child = tmp;
7235 + tmp = tmp->parent;
7238 + if (sd && sd_degenerate(sd)) {
7244 + sched_domain_debug(sd, cpu);
7246 + rq_attach_root(rq, rd);
7247 + rcu_assign_pointer(rq->sd, sd);
7250 +/* cpus with isolated domains */
7251 +static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
7253 +/* Setup the mask of cpus configured for isolated domains */
7254 +static int __init isolated_cpu_setup(char *str)
7256 + static int __initdata ints[NR_CPUS];
7259 + str = get_options(str, ARRAY_SIZE(ints), ints);
7260 + cpus_clear(cpu_isolated_map);
7261 + for (i = 1; i <= ints[0]; i++)
7262 + if (ints[i] < NR_CPUS)
7263 + cpu_set(ints[i], cpu_isolated_map);
7267 +__setup("isolcpus=", isolated_cpu_setup);
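+/*
+ * Usage sketch: booting with e.g. "isolcpus=2,3" should keep CPUs 2 and 3
+ * out of the domains built below, so only tasks explicitly affined to them
+ * will run there.
+ */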
7270 + * init_sched_build_groups takes the cpumask we wish to span, and a pointer
7271 + * to a function which identifies what group (along with sched group) a CPU
7272 + * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
7273 + * (due to the fact that we keep track of groups covered with a cpumask_t).
7275 + * init_sched_build_groups will build a circular linked list of the groups
7276 + * covered by the given span, and will set each group's ->cpumask correctly,
7277 + * and ->cpu_power to 0.
7280 +init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
7281 + int (*group_fn)(int cpu, const cpumask_t *cpu_map,
7282 + struct sched_group **sg,
7283 + cpumask_t *tmpmask),
7284 + cpumask_t *covered, cpumask_t *tmpmask)
7286 + struct sched_group *first = NULL, *last = NULL;
7289 + cpus_clear(*covered);
7291 + for_each_cpu_mask_nr(i, *span) {
7292 + struct sched_group *sg;
7293 + int group = group_fn(i, cpu_map, &sg, tmpmask);
7296 + if (cpu_isset(i, *covered))
7299 + cpus_clear(sg->cpumask);
7300 + sg->__cpu_power = 0;
7302 + for_each_cpu_mask_nr(j, *span) {
7303 + if (group_fn(j, cpu_map, NULL, tmpmask) != group)
7306 + cpu_set(j, *covered);
7307 + cpu_set(j, sg->cpumask);
7315 + last->next = first;
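+/*
+ * Hedged sketch of the group_fn contract documented above; the SMT-level
+ * cpu_to_cpu_group() further down is the simplest real instance, filling
+ * *sg with that CPU's per-cpu group and returning the CPU number itself.
+ * A hypothetical conforming callback would look like:
+ *
+ *	static int example_group_fn(int cpu, const cpumask_t *cpu_map,
+ *				    struct sched_group **sg, cpumask_t *unused)
+ *	{
+ *		if (sg)
+ *			*sg = &per_cpu(sched_group_cpus, cpu);
+ *		return cpu;	(always >= 0 and < NR_CPUS)
+ *	}
+ */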
7318 +#define SD_NODES_PER_DOMAIN 16
7323 + * find_next_best_node - find the next node to include in a sched_domain
7324 + * @node: node whose sched_domain we're building
7325 + * @used_nodes: nodes already in the sched_domain
7327 + * Find the next node to include in a given scheduling domain. Simply
7328 + * finds the closest node not already in the @used_nodes map.
7330 + * Should use nodemask_t.
7332 +static int find_next_best_node(int node, nodemask_t *used_nodes)
7334 + int i, n, val, min_val, best_node = 0;
7336 + min_val = INT_MAX;
7338 + for (i = 0; i < nr_node_ids; i++) {
7339 + /* Start at @node */
7340 + n = (node + i) % nr_node_ids;
7342 + if (!nr_cpus_node(n))
7345 + /* Skip already used nodes */
7346 + if (node_isset(n, *used_nodes))
7349 + /* Simple min distance search */
7350 + val = node_distance(node, n);
7352 + if (val < min_val) {
7358 + node_set(best_node, *used_nodes);
7363 + * sched_domain_node_span - get a cpumask for a node's sched_domain
7364 + * @node: node whose cpumask we're constructing
7365 + * @span: resulting cpumask
7367 + * Given a node, construct a good cpumask for its sched_domain to span. It
7368 + * should be one that prevents unnecessary balancing, but also spreads tasks
7371 +static void sched_domain_node_span(int node, cpumask_t *span)
7373 + nodemask_t used_nodes;
7374 + node_to_cpumask_ptr(nodemask, node);
7377 + cpus_clear(*span);
7378 + nodes_clear(used_nodes);
7380 + cpus_or(*span, *span, *nodemask);
7381 + node_set(node, used_nodes);
7383 + for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
7384 + int next_node = find_next_best_node(node, &used_nodes);
7386 + node_to_cpumask_ptr_next(nodemask, next_node);
7387 + cpus_or(*span, *span, *nodemask);
7390 +#endif /* CONFIG_NUMA */
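+/*
+ * Worked example, using the SD_NODES_PER_DOMAIN value of 16 above: on a
+ * system with more than 16 NUMA nodes, each node-level domain spans the CPUs
+ * of the node itself plus its 15 nearest neighbours as ranked by
+ * node_distance(), rather than the whole machine.
+ */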
7392 +int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
7395 + * SMT sched-domains:
7397 +#ifdef CONFIG_SCHED_SMT
7398 +static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
7399 +static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
7402 +cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7403 + cpumask_t *unused)
7406 + *sg = &per_cpu(sched_group_cpus, cpu);
7409 +#endif /* CONFIG_SCHED_SMT */
7412 + * multi-core sched-domains:
7414 +#ifdef CONFIG_SCHED_MC
7415 +static DEFINE_PER_CPU(struct sched_domain, core_domains);
7416 +static DEFINE_PER_CPU(struct sched_group, sched_group_core);
7417 +#endif /* CONFIG_SCHED_MC */
7419 +#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
7421 +cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7426 + *mask = per_cpu(cpu_sibling_map, cpu);
7427 + cpus_and(*mask, *mask, *cpu_map);
7428 + group = first_cpu(*mask);
7430 + *sg = &per_cpu(sched_group_core, group);
7433 +#elif defined(CONFIG_SCHED_MC)
7435 +cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7436 + cpumask_t *unused)
7439 + *sg = &per_cpu(sched_group_core, cpu);
7444 +static DEFINE_PER_CPU(struct sched_domain, phys_domains);
7445 +static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
7448 +cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7452 +#ifdef CONFIG_SCHED_MC
7453 + *mask = cpu_coregroup_map(cpu);
7454 + cpus_and(*mask, *mask, *cpu_map);
7455 + group = first_cpu(*mask);
7456 +#elif defined(CONFIG_SCHED_SMT)
7457 + *mask = per_cpu(cpu_sibling_map, cpu);
7458 + cpus_and(*mask, *mask, *cpu_map);
7459 + group = first_cpu(*mask);
7464 + *sg = &per_cpu(sched_group_phys, group);
7470 + * The init_sched_build_groups can't handle what we want to do with node
7471 + * groups, so roll our own. Now each node has its own list of groups which
7472 + * gets dynamically allocated.
7474 +static DEFINE_PER_CPU(struct sched_domain, node_domains);
7475 +static struct sched_group ***sched_group_nodes_bycpu;
7477 +static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
7478 +static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
7480 +static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
7481 + struct sched_group **sg, cpumask_t *nodemask)
7485 + *nodemask = node_to_cpumask(cpu_to_node(cpu));
7486 + cpus_and(*nodemask, *nodemask, *cpu_map);
7487 + group = first_cpu(*nodemask);
7490 + *sg = &per_cpu(sched_group_allnodes, group);
7494 +static void init_numa_sched_groups_power(struct sched_group *group_head)
7496 + struct sched_group *sg = group_head;
7502 + for_each_cpu_mask_nr(j, sg->cpumask) {
7503 + struct sched_domain *sd;
7505 + sd = &per_cpu(phys_domains, j);
7506 + if (j != first_cpu(sd->groups->cpumask)) {
7508 + * Only add "power" once for each
7509 + * physical package.
7514 + sg_inc_cpu_power(sg, sd->groups->__cpu_power);
7517 + } while (sg != group_head);
7519 +#endif /* CONFIG_NUMA */
7522 +/* Free memory allocated for various sched_group structures */
7523 +static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
7527 + for_each_cpu_mask_nr(cpu, *cpu_map) {
7528 + struct sched_group **sched_group_nodes
7529 + = sched_group_nodes_bycpu[cpu];
7531 + if (!sched_group_nodes)
7534 + for (i = 0; i < nr_node_ids; i++) {
7535 + struct sched_group *oldsg, *sg = sched_group_nodes[i];
7537 + *nodemask = node_to_cpumask(i);
7538 + cpus_and(*nodemask, *nodemask, *cpu_map);
7539 + if (cpus_empty(*nodemask))
7549 + if (oldsg != sched_group_nodes[i])
7552 + kfree(sched_group_nodes);
7553 + sched_group_nodes_bycpu[cpu] = NULL;
7556 +#else /* !CONFIG_NUMA */
7557 +static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
7560 +#endif /* CONFIG_NUMA */
7563 + * Initialize sched groups cpu_power.
7565 + * cpu_power indicates the capacity of sched group, which is used while
7566 + * distributing the load between different sched groups in a sched domain.
7567 + * Typically cpu_power for all the groups in a sched domain will be the same unless
7568 + * there are asymmetries in the topology. If there are asymmetries, group
7569 + * having more cpu_power will pick up more load compared to the group having
7572 + * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
7573 + * the maximum number of tasks a group can handle in the presence of other idle
7574 + * or lightly loaded groups in the same sched domain.
7576 +static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7578 + struct sched_domain *child;
7579 + struct sched_group *group;
7581 + WARN_ON(!sd || !sd->groups);
7583 + if (cpu != first_cpu(sd->groups->cpumask))
7586 + child = sd->child;
7588 + sd->groups->__cpu_power = 0;
7591 + * For perf policy, if the groups in child domain share resources
7592 + * (for example cores sharing some portions of the cache hierarchy
7593 + * or SMT), then set this domain's group cpu_power such that each group
7594 + * can handle only one task, when there are other idle groups in the
7595 + * same sched domain.
7597 + if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
7599 + (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
7600 + sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
7605 + * add cpu_power of each child group to this group's cpu_power
7607 + group = child->groups;
7609 + sg_inc_cpu_power(sd->groups, group->__cpu_power);
7610 + group = group->next;
7611 + } while (group != child->groups);
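+/*
+ * Worked example (assuming SCHED_LOAD_SCALE == 1024): a node-level domain
+ * whose child physical domain has two package groups of __cpu_power 1024
+ * each ends up with a group power of 2048 from the loop above, i.e. it is
+ * expected to carry roughly two runnable tasks.  A domain whose child shares
+ * CPU power or package resources (SMT/MC) instead gets a single
+ * SCHED_LOAD_SCALE under the performance policy: one task per group.
+ */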
7615 + * Initializers for schedule domains
7616 + * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
7619 +#define SD_INIT(sd, type) sd_init_##type(sd)
7620 +#define SD_INIT_FUNC(type) \
7621 +static noinline void sd_init_##type(struct sched_domain *sd) \
7623 + memset(sd, 0, sizeof(*sd)); \
7624 + *sd = SD_##type##_INIT; \
7625 + sd->level = SD_LV_##type; \
7630 + SD_INIT_FUNC(ALLNODES)
7631 + SD_INIT_FUNC(NODE)
7633 +#ifdef CONFIG_SCHED_SMT
7634 + SD_INIT_FUNC(SIBLING)
7636 +#ifdef CONFIG_SCHED_MC
7641 + * To minimize stack usage, kmalloc room for cpumasks and share the
7642 + * space as the usage in build_sched_domains() dictates. Used only
7643 + * if the amount of space is significant.
7646 + cpumask_t tmpmask; /* make this one first */
7648 + cpumask_t nodemask;
7649 + cpumask_t this_sibling_map;
7650 + cpumask_t this_core_map;
7652 + cpumask_t send_covered;
7655 + cpumask_t domainspan;
7656 + cpumask_t covered;
7657 + cpumask_t notcovered;
7662 +#define SCHED_CPUMASK_ALLOC 1
7663 +#define SCHED_CPUMASK_FREE(v) kfree(v)
7664 +#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v
7666 +#define SCHED_CPUMASK_ALLOC 0
7667 +#define SCHED_CPUMASK_FREE(v)
7668 +#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v
7671 +#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \
7672 + ((unsigned long)(a) + offsetof(struct allmasks, v))
7674 +static int default_relax_domain_level = -1;
7676 +static int __init setup_relax_domain_level(char *str)
7678 + unsigned long val;
7680 + val = simple_strtoul(str, NULL, 0);
7681 + if (val < SD_LV_MAX)
7682 + default_relax_domain_level = val;
7686 +__setup("relax_domain_level=", setup_relax_domain_level);
7688 +static void set_domain_attribute(struct sched_domain *sd,
7689 + struct sched_domain_attr *attr)
7693 + if (!attr || attr->relax_domain_level < 0) {
7694 + if (default_relax_domain_level < 0)
7697 + request = default_relax_domain_level;
7699 + request = attr->relax_domain_level;
7700 + if (request < sd->level) {
7701 + /* turn off idle balance on this domain */
7702 + sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE);
7704 + /* turn on idle balance on this domain */
7705 + sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE);
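+/*
+ * Example, assuming the SD_LV_* levels are ordered SIBLING < MC < CPU <
+ * NODE < ALLNODES: booting with "relax_domain_level=2" requests level 2, so
+ * domains above that level take the first branch (idle balancing turned
+ * off) and domains at or below it take the second (idle balancing turned
+ * on).
+ */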
7710 + * Build sched domains for a given set of cpus and attach the sched domains
7711 + * to the individual cpus
7713 +static int __build_sched_domains(const cpumask_t *cpu_map,
7714 + struct sched_domain_attr *attr)
7717 + struct root_domain *rd;
7718 + SCHED_CPUMASK_DECLARE(allmasks);
7719 + cpumask_t *tmpmask;
7721 + struct sched_group **sched_group_nodes = NULL;
7722 + int sd_allnodes = 0;
7725 + * Allocate the per-node list of sched groups
7727 + sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
7729 + if (!sched_group_nodes) {
7730 + printk(KERN_WARNING "Can not alloc sched group node list\n");
7735 + rd = alloc_rootdomain();
7737 + printk(KERN_WARNING "Cannot alloc root domain\n");
7739 + kfree(sched_group_nodes);
7744 +#if SCHED_CPUMASK_ALLOC
7745 + /* get space for all scratch cpumask variables */
7746 + allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
7748 + printk(KERN_WARNING "Cannot alloc cpumask array\n");
7751 + kfree(sched_group_nodes);
7756 + tmpmask = (cpumask_t *)allmasks;
7760 + sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
7764 + * Set up domains for cpus specified by the cpu_map.
7766 + for_each_cpu_mask_nr(i, *cpu_map) {
7767 + struct sched_domain *sd = NULL, *p;
7768 + SCHED_CPUMASK_VAR(nodemask, allmasks);
7770 + *nodemask = node_to_cpumask(cpu_to_node(i));
7771 + cpus_and(*nodemask, *nodemask, *cpu_map);
7774 + if (cpus_weight(*cpu_map) >
7775 + SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) {
7776 + sd = &per_cpu(allnodes_domains, i);
7777 + SD_INIT(sd, ALLNODES);
7778 + set_domain_attribute(sd, attr);
7779 + sd->span = *cpu_map;
7780 + cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
7786 + sd = &per_cpu(node_domains, i);
7787 + SD_INIT(sd, NODE);
7788 + set_domain_attribute(sd, attr);
7789 + sched_domain_node_span(cpu_to_node(i), &sd->span);
7793 + cpus_and(sd->span, sd->span, *cpu_map);
7797 + sd = &per_cpu(phys_domains, i);
7799 + set_domain_attribute(sd, attr);
7800 + sd->span = *nodemask;
7804 + cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask);
7806 +#ifdef CONFIG_SCHED_MC
7808 + sd = &per_cpu(core_domains, i);
7810 + set_domain_attribute(sd, attr);
7811 + sd->span = cpu_coregroup_map(i);
7812 + cpus_and(sd->span, sd->span, *cpu_map);
7815 + cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
7818 +#ifdef CONFIG_SCHED_SMT
7820 + sd = &per_cpu(cpu_domains, i);
7821 + SD_INIT(sd, SIBLING);
7822 + set_domain_attribute(sd, attr);
7823 + sd->span = per_cpu(cpu_sibling_map, i);
7824 + cpus_and(sd->span, sd->span, *cpu_map);
7827 + cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
7831 +#ifdef CONFIG_SCHED_SMT
7832 + /* Set up CPU (sibling) groups */
7833 + for_each_cpu_mask_nr(i, *cpu_map) {
7834 + SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
7835 + SCHED_CPUMASK_VAR(send_covered, allmasks);
7837 + *this_sibling_map = per_cpu(cpu_sibling_map, i);
7838 + cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
7839 + if (i != first_cpu(*this_sibling_map))
7842 + init_sched_build_groups(this_sibling_map, cpu_map,
7843 + &cpu_to_cpu_group,
7844 + send_covered, tmpmask);
7848 +#ifdef CONFIG_SCHED_MC
7849 + /* Set up multi-core groups */
7850 + for_each_cpu_mask_nr(i, *cpu_map) {
7851 + SCHED_CPUMASK_VAR(this_core_map, allmasks);
7852 + SCHED_CPUMASK_VAR(send_covered, allmasks);
7854 + *this_core_map = cpu_coregroup_map(i);
7855 + cpus_and(*this_core_map, *this_core_map, *cpu_map);
7856 + if (i != first_cpu(*this_core_map))
7859 + init_sched_build_groups(this_core_map, cpu_map,
7860 + &cpu_to_core_group,
7861 + send_covered, tmpmask);
7865 + /* Set up physical groups */
7866 + for (i = 0; i < nr_node_ids; i++) {
7867 + SCHED_CPUMASK_VAR(nodemask, allmasks);
7868 + SCHED_CPUMASK_VAR(send_covered, allmasks);
7870 + *nodemask = node_to_cpumask(i);
7871 + cpus_and(*nodemask, *nodemask, *cpu_map);
7872 + if (cpus_empty(*nodemask))
7875 + init_sched_build_groups(nodemask, cpu_map,
7876 + &cpu_to_phys_group,
7877 + send_covered, tmpmask);
7881 + /* Set up node groups */
7882 + if (sd_allnodes) {
7883 + SCHED_CPUMASK_VAR(send_covered, allmasks);
7885 + init_sched_build_groups(cpu_map, cpu_map,
7886 + &cpu_to_allnodes_group,
7887 + send_covered, tmpmask);
7890 + for (i = 0; i < nr_node_ids; i++) {
7891 + /* Set up node groups */
7892 + struct sched_group *sg, *prev;
7893 + SCHED_CPUMASK_VAR(nodemask, allmasks);
7894 + SCHED_CPUMASK_VAR(domainspan, allmasks);
7895 + SCHED_CPUMASK_VAR(covered, allmasks);
7898 + *nodemask = node_to_cpumask(i);
7899 + cpus_clear(*covered);
7901 + cpus_and(*nodemask, *nodemask, *cpu_map);
7902 + if (cpus_empty(*nodemask)) {
7903 + sched_group_nodes[i] = NULL;
7907 + sched_domain_node_span(i, domainspan);
7908 + cpus_and(*domainspan, *domainspan, *cpu_map);
7910 + sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
7912 + printk(KERN_WARNING "Can not alloc domain group for "
7916 + sched_group_nodes[i] = sg;
7917 + for_each_cpu_mask_nr(j, *nodemask) {
7918 + struct sched_domain *sd;
7920 + sd = &per_cpu(node_domains, j);
7923 + sg->__cpu_power = 0;
7924 + sg->cpumask = *nodemask;
7926 + cpus_or(*covered, *covered, *nodemask);
7929 + for (j = 0; j < nr_node_ids; j++) {
7930 + SCHED_CPUMASK_VAR(notcovered, allmasks);
7931 + int n = (i + j) % nr_node_ids;
7932 + node_to_cpumask_ptr(pnodemask, n);
7934 + cpus_complement(*notcovered, *covered);
7935 + cpus_and(*tmpmask, *notcovered, *cpu_map);
7936 + cpus_and(*tmpmask, *tmpmask, *domainspan);
7937 + if (cpus_empty(*tmpmask))
7940 + cpus_and(*tmpmask, *tmpmask, *pnodemask);
7941 + if (cpus_empty(*tmpmask))
7944 + sg = kmalloc_node(sizeof(struct sched_group),
7947 + printk(KERN_WARNING
7948 + "Can not alloc domain group for node %d\n", j);
7951 + sg->__cpu_power = 0;
7952 + sg->cpumask = *tmpmask;
7953 + sg->next = prev->next;
7954 + cpus_or(*covered, *covered, *tmpmask);
7961 + /* Calculate CPU power for physical packages and nodes */
7962 +#ifdef CONFIG_SCHED_SMT
7963 + for_each_cpu_mask_nr(i, *cpu_map) {
7964 + struct sched_domain *sd = &per_cpu(cpu_domains, i);
7966 + init_sched_groups_power(i, sd);
7969 +#ifdef CONFIG_SCHED_MC
7970 + for_each_cpu_mask_nr(i, *cpu_map) {
7971 + struct sched_domain *sd = &per_cpu(core_domains, i);
7973 + init_sched_groups_power(i, sd);
7977 + for_each_cpu_mask_nr(i, *cpu_map) {
7978 + struct sched_domain *sd = &per_cpu(phys_domains, i);
7980 + init_sched_groups_power(i, sd);
7984 + for (i = 0; i < nr_node_ids; i++)
7985 + init_numa_sched_groups_power(sched_group_nodes[i]);
7987 + if (sd_allnodes) {
7988 + struct sched_group *sg;
7990 + cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg,
7992 + init_numa_sched_groups_power(sg);
7996 + /* Attach the domains */
7997 + for_each_cpu_mask_nr(i, *cpu_map) {
7998 + struct sched_domain *sd;
7999 +#ifdef CONFIG_SCHED_SMT
8000 + sd = &per_cpu(cpu_domains, i);
8001 +#elif defined(CONFIG_SCHED_MC)
8002 + sd = &per_cpu(core_domains, i);
8004 + sd = &per_cpu(phys_domains, i);
8006 + cpu_attach_domain(sd, rd, i);
8009 + SCHED_CPUMASK_FREE((void *)allmasks);
8014 + free_sched_groups(cpu_map, tmpmask);
8015 + SCHED_CPUMASK_FREE((void *)allmasks);
8020 +static int build_sched_domains(const cpumask_t *cpu_map)
8022 + return __build_sched_domains(cpu_map, NULL);
8025 +static cpumask_t *doms_cur; /* current sched domains */
8026 +static int ndoms_cur; /* number of sched domains in 'doms_cur' */
8027 +static struct sched_domain_attr *dattr_cur;
8028 + /* attributes of custom domains in 'doms_cur' */
8031 + * Special case: If a kmalloc of a doms_cur partition (array of
8032 + * cpumask_t) fails, then fall back to a single sched domain,
8033 + * as determined by the single cpumask_t fallback_doms.
8035 +static cpumask_t fallback_doms;
8037 +void __attribute__((weak)) arch_update_cpu_topology(void)
8042 + * Set up scheduler domains and groups. Callers must hold the hotplug lock.
8043 + * For now this just excludes isolated cpus, but could be used to
8044 + * exclude other special cases in the future.
8046 +static int arch_init_sched_domains(const cpumask_t *cpu_map)
8050 + arch_update_cpu_topology();
8052 + doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
8054 + doms_cur = &fallback_doms;
8055 + cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
8057 + err = build_sched_domains(doms_cur);
8058 + register_sched_domain_sysctl();
8063 +static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
8064 + cpumask_t *tmpmask)
8066 + free_sched_groups(cpu_map, tmpmask);
8070 + * Detach sched domains from a group of cpus specified in cpu_map
8071 + * These cpus will now be attached to the NULL domain
8073 +static void detach_destroy_domains(const cpumask_t *cpu_map)
8075 + cpumask_t tmpmask;
8078 + unregister_sched_domain_sysctl();
8080 + for_each_cpu_mask_nr(i, *cpu_map)
8081 + cpu_attach_domain(NULL, &def_root_domain, i);
8082 + synchronize_sched();
8083 + arch_destroy_sched_domains(cpu_map, &tmpmask);
8086 +/* handle null as "default" */
8087 +static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
8088 + struct sched_domain_attr *new, int idx_new)
8090 + struct sched_domain_attr tmp;
8096 + tmp = SD_ATTR_INIT;
8097 + return !memcmp(cur ? (cur + idx_cur) : &tmp,
8098 + new ? (new + idx_new) : &tmp,
8099 + sizeof(struct sched_domain_attr));
8103 + * Partition sched domains as specified by the 'ndoms_new'
8104 + * cpumasks in the array doms_new[] of cpumasks. This compares
8105 + * doms_new[] to the current sched domain partitioning, doms_cur[].
8106 + * It destroys each deleted domain and builds each new domain.
8108 + * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
8109 + * The masks don't intersect (don't overlap). We should set up one
8110 + * sched domain for each mask. CPUs not in any of the cpumasks will
8111 + * not be load balanced. If the same cpumask appears both in the
8112 + * current 'doms_cur' domains and in the new 'doms_new', we can leave
8115 + * The passed in 'doms_new' should be kmalloc'd. This routine takes
8116 + * ownership of it and will kfree it when done with it. If the caller
8117 + * failed the kmalloc call, then it can pass in doms_new == NULL &&
8118 + * ndoms_new == 1, and partition_sched_domains() will fall back to
8119 + * the single partition 'fallback_doms'; it also forces the domains
8122 + * If doms_new == NULL it will be replaced with cpu_online_map.
8123 + * ndoms_new == 0 is a special case for destroying existing domains,
8124 + * and it will not create the default domain.
8126 + * Call with hotplug lock held
8128 +void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
8129 + struct sched_domain_attr *dattr_new)
8133 + mutex_lock(&sched_domains_mutex);
8135 + /* always unregister in case we don't destroy any domains */
8136 + unregister_sched_domain_sysctl();
8138 + n = doms_new ? ndoms_new : 0;
8140 + /* Destroy deleted domains */
8141 + for (i = 0; i < ndoms_cur; i++) {
8142 + for (j = 0; j < n; j++) {
8143 + if (cpus_equal(doms_cur[i], doms_new[j])
8144 + && dattrs_equal(dattr_cur, i, dattr_new, j))
8147 + /* no match - a current sched domain not in new doms_new[] */
8148 + detach_destroy_domains(doms_cur + i);
8153 + if (doms_new == NULL) {
8155 + doms_new = &fallback_doms;
8156 + cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
8160 + /* Build new domains */
8161 + for (i = 0; i < ndoms_new; i++) {
8162 + for (j = 0; j < ndoms_cur; j++) {
8163 + if (cpus_equal(doms_new[i], doms_cur[j])
8164 + && dattrs_equal(dattr_new, i, dattr_cur, j))
8167 + /* no match - add a new doms_new */
8168 + __build_sched_domains(doms_new + i,
8169 + dattr_new ? dattr_new + i : NULL);
8174 + /* Remember the new sched domains */
8175 + if (doms_cur != &fallback_doms)
8177 + kfree(dattr_cur); /* kfree(NULL) is safe */
8178 + doms_cur = doms_new;
8179 + dattr_cur = dattr_new;
8180 + ndoms_cur = ndoms_new;
8182 + register_sched_domain_sysctl();
8184 + mutex_unlock(&sched_domains_mutex);
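+/*
+ * Example of the interface documented above: cpusets passing ndoms_new == 2
+ * with disjoint masks {0,1} and {2,3} get two independent sched domains, and
+ * any CPU left out of both masks is simply never load balanced.  Passing
+ * doms_new == NULL with ndoms_new == 1 rebuilds the single default domain
+ * from cpu_online_map minus cpu_isolated_map, as in the fallback path above.
+ */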
8187 +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
8188 +int arch_reinit_sched_domains(void)
8190 + get_online_cpus();
8192 + /* Destroy domains first to force the rebuild */
8193 + partition_sched_domains(0, NULL, NULL);
8195 + rebuild_sched_domains();
8196 + put_online_cpus();
8201 +static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
8205 + if (buf[0] != '0' && buf[0] != '1')
8209 + sched_smt_power_savings = (buf[0] == '1');
8211 + sched_mc_power_savings = (buf[0] == '1');
8213 + ret = arch_reinit_sched_domains();
8215 + return ret ? ret : count;
8218 +#ifdef CONFIG_SCHED_MC
8219 +static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
8222 + return sprintf(page, "%u\n", sched_mc_power_savings);
8224 +static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
8225 + const char *buf, size_t count)
8227 + return sched_power_savings_store(buf, count, 0);
8229 +static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
8230 + sched_mc_power_savings_show,
8231 + sched_mc_power_savings_store);
8234 +#ifdef CONFIG_SCHED_SMT
8235 +static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
8238 + return sprintf(page, "%u\n", sched_smt_power_savings);
8240 +static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
8241 + const char *buf, size_t count)
8243 + return sched_power_savings_store(buf, count, 1);
8245 +static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
8246 + sched_smt_power_savings_show,
8247 + sched_smt_power_savings_store);
8250 +int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
8254 +#ifdef CONFIG_SCHED_SMT
8255 + if (smt_capable())
8256 + err = sysfs_create_file(&cls->kset.kobj,
8257 + &attr_sched_smt_power_savings.attr);
8259 +#ifdef CONFIG_SCHED_MC
8260 + if (!err && mc_capable())
8261 + err = sysfs_create_file(&cls->kset.kobj,
8262 + &attr_sched_mc_power_savings.attr);
8266 +#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
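+/*
+ * Usage sketch (sysfs path assumed from the cpu sysdev class): the
+ * attributes above accept only '0' or '1', e.g.
+ *	echo 1 > /sys/devices/system/cpu/sched_mc_power_savings
+ * which flips sched_mc_power_savings and triggers a full sched domain
+ * rebuild via arch_reinit_sched_domains().
+ */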
8268 +#ifndef CONFIG_CPUSETS
8270 + * Add online and remove offline CPUs from the scheduler domains.
8271 + * When cpusets are enabled they take over this function.
8273 +static int update_sched_domains(struct notifier_block *nfb,
8274 + unsigned long action, void *hcpu)
8278 + case CPU_ONLINE_FROZEN:
8280 + case CPU_DEAD_FROZEN:
8281 + partition_sched_domains(1, NULL, NULL);
8285 + return NOTIFY_DONE;
8290 +static int update_runtime(struct notifier_block *nfb,
8291 + unsigned long action, void *hcpu)
8293 + int cpu = (int)(long)hcpu;
8296 + case CPU_DOWN_PREPARE:
8297 + case CPU_DOWN_PREPARE_FROZEN:
8298 + disable_runtime(cpu_rq(cpu));
8301 + case CPU_DOWN_FAILED:
8302 + case CPU_DOWN_FAILED_FROZEN:
8304 + case CPU_ONLINE_FROZEN:
8305 + enable_runtime(cpu_rq(cpu));
8309 + return NOTIFY_DONE;
8313 +void __init sched_init_smp(void)
8315 + cpumask_t non_isolated_cpus;
8317 +#if defined(CONFIG_NUMA)
8318 + sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
8320 + BUG_ON(sched_group_nodes_bycpu == NULL);
8322 + get_online_cpus();
8323 + mutex_lock(&sched_domains_mutex);
8324 + arch_init_sched_domains(&cpu_online_map);
8325 + cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
8326 + if (cpus_empty(non_isolated_cpus))
8327 + cpu_set(smp_processor_id(), non_isolated_cpus);
8328 + mutex_unlock(&sched_domains_mutex);
8329 + put_online_cpus();
8331 +#ifndef CONFIG_CPUSETS
8332 + /* XXX: Theoretical race here - CPU may be hotplugged now */
8333 + hotcpu_notifier(update_sched_domains, 0);
8336 + /* RT runtime code needs to handle some hotplug events */
8337 + hotcpu_notifier(update_runtime, 0);
8341 + /* Move init over to a non-isolated CPU */
8342 + if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
8344 + sched_init_granularity();
8347 +void __init sched_init_smp(void)
8349 + sched_init_granularity();
8351 +#endif /* CONFIG_SMP */
8353 +int in_sched_functions(unsigned long addr)
8355 + return in_lock_functions(addr) ||
8356 + (addr >= (unsigned long)__sched_text_start
8357 + && addr < (unsigned long)__sched_text_end);
8360 +static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
8362 + cfs_rq->tasks_timeline = RB_ROOT;
8363 + INIT_LIST_HEAD(&cfs_rq->tasks);
8364 +#ifdef CONFIG_FAIR_GROUP_SCHED
8367 + cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8370 +static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8372 + struct rt_prio_array *array;
8375 + array = &rt_rq->active;
8376 + for (i = 0; i < MAX_RT_PRIO; i++) {
8377 + INIT_LIST_HEAD(array->queue + i);
8378 + __clear_bit(i, array->bitmap);
8380 + /* delimiter for bitsearch: */
8381 + __set_bit(MAX_RT_PRIO, array->bitmap);
8383 +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
8384 + rt_rq->highest_prio = MAX_RT_PRIO;
8387 + rt_rq->rt_nr_migratory = 0;
8388 + rt_rq->overloaded = 0;
8391 + rt_rq->rt_time = 0;
8392 + rt_rq->rt_throttled = 0;
8393 + rt_rq->rt_runtime = 0;
8394 + spin_lock_init(&rt_rq->rt_runtime_lock);
8396 +#ifdef CONFIG_RT_GROUP_SCHED
8397 + rt_rq->rt_nr_boosted = 0;
8402 +#ifdef CONFIG_FAIR_GROUP_SCHED
8403 +static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8404 + struct sched_entity *se, int cpu, int add,
8405 + struct sched_entity *parent)
8407 + struct rq *rq = cpu_rq(cpu);
8408 + tg->cfs_rq[cpu] = cfs_rq;
8409 + init_cfs_rq(cfs_rq, rq);
8412 + list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
8415 + /* se could be NULL for init_task_group */
8420 + se->cfs_rq = &rq->cfs;
8422 + se->cfs_rq = parent->my_q;
8424 + se->my_q = cfs_rq;
8425 + se->load.weight = tg->shares;
8426 + se->load.inv_weight = 0;
8427 + se->parent = parent;
8431 +#ifdef CONFIG_RT_GROUP_SCHED
8432 +static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8433 + struct sched_rt_entity *rt_se, int cpu, int add,
8434 + struct sched_rt_entity *parent)
8436 + struct rq *rq = cpu_rq(cpu);
8438 + tg->rt_rq[cpu] = rt_rq;
8439 + init_rt_rq(rt_rq, rq);
8441 + rt_rq->rt_se = rt_se;
8442 + rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8444 + list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
8446 + tg->rt_se[cpu] = rt_se;
8451 + rt_se->rt_rq = &rq->rt;
8453 + rt_se->rt_rq = parent->my_q;
8455 + rt_se->my_q = rt_rq;
8456 + rt_se->parent = parent;
8457 + INIT_LIST_HEAD(&rt_se->run_list);
8461 +void __init sched_init(void)
8464 + unsigned long alloc_size = 0, ptr;
8466 +#ifdef CONFIG_FAIR_GROUP_SCHED
8467 + alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8469 +#ifdef CONFIG_RT_GROUP_SCHED
8470 + alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8472 +#ifdef CONFIG_USER_SCHED
8476 + * As sched_init() is called before page_alloc is set up,
8477 + * we use alloc_bootmem().
8480 + ptr = (unsigned long)alloc_bootmem(alloc_size);
8482 +#ifdef CONFIG_FAIR_GROUP_SCHED
8483 + init_task_group.se = (struct sched_entity **)ptr;
8484 + ptr += nr_cpu_ids * sizeof(void **);
8486 + init_task_group.cfs_rq = (struct cfs_rq **)ptr;
8487 + ptr += nr_cpu_ids * sizeof(void **);
8489 +#ifdef CONFIG_USER_SCHED
8490 + root_task_group.se = (struct sched_entity **)ptr;
8491 + ptr += nr_cpu_ids * sizeof(void **);
8493 + root_task_group.cfs_rq = (struct cfs_rq **)ptr;
8494 + ptr += nr_cpu_ids * sizeof(void **);
8495 +#endif /* CONFIG_USER_SCHED */
8496 +#endif /* CONFIG_FAIR_GROUP_SCHED */
8497 +#ifdef CONFIG_RT_GROUP_SCHED
8498 + init_task_group.rt_se = (struct sched_rt_entity **)ptr;
8499 + ptr += nr_cpu_ids * sizeof(void **);
8501 + init_task_group.rt_rq = (struct rt_rq **)ptr;
8502 + ptr += nr_cpu_ids * sizeof(void **);
8504 +#ifdef CONFIG_USER_SCHED
8505 + root_task_group.rt_se = (struct sched_rt_entity **)ptr;
8506 + ptr += nr_cpu_ids * sizeof(void **);
8508 + root_task_group.rt_rq = (struct rt_rq **)ptr;
8509 + ptr += nr_cpu_ids * sizeof(void **);
8510 +#endif /* CONFIG_USER_SCHED */
8511 +#endif /* CONFIG_RT_GROUP_SCHED */
8515 + init_defrootdomain();
8518 + init_rt_bandwidth(&def_rt_bandwidth,
8519 + global_rt_period(), global_rt_runtime());
8521 +#ifdef CONFIG_RT_GROUP_SCHED
8522 + init_rt_bandwidth(&init_task_group.rt_bandwidth,
8523 + global_rt_period(), global_rt_runtime());
8524 +#ifdef CONFIG_USER_SCHED
8525 + init_rt_bandwidth(&root_task_group.rt_bandwidth,
8526 + global_rt_period(), RUNTIME_INF);
8527 +#endif /* CONFIG_USER_SCHED */
8528 +#endif /* CONFIG_RT_GROUP_SCHED */
8530 +#ifdef CONFIG_GROUP_SCHED
8531 + list_add(&init_task_group.list, &task_groups);
8532 + INIT_LIST_HEAD(&init_task_group.children);
8534 +#ifdef CONFIG_USER_SCHED
8535 + INIT_LIST_HEAD(&root_task_group.children);
8536 + init_task_group.parent = &root_task_group;
8537 + list_add(&init_task_group.siblings, &root_task_group.children);
8538 +#endif /* CONFIG_USER_SCHED */
8539 +#endif /* CONFIG_GROUP_SCHED */
8541 + for_each_possible_cpu(i) {
8545 + spin_lock_init(&rq->lock);
8546 + rq->nr_running = 0;
8547 + init_cfs_rq(&rq->cfs, rq);
8548 + init_rt_rq(&rq->rt, rq);
8549 +#ifdef CONFIG_FAIR_GROUP_SCHED
8550 + init_task_group.shares = init_task_group_load;
8551 + INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8552 +#ifdef CONFIG_CGROUP_SCHED
8554 + * How much cpu bandwidth does init_task_group get?
8556 + * In case of task-groups formed through the cgroup filesystem, it
8557 + * gets 100% of the cpu resources in the system. This overall
8558 + * system cpu resource is divided among the tasks of
8559 + * init_task_group and its child task-groups in a fair manner,
8560 + * based on each entity's (task or task-group's) weight
8561 + * (se->load.weight).
8563 + * In other words, if init_task_group has 10 tasks of weight
8564 + * 1024 and two child groups A0 and A1 (of weight 1024 each),
8565 + * then A0's share of the cpu resource is:
8567 + * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
8569 + * We achieve this by letting init_task_group's tasks sit
8570 + * directly in rq->cfs (i.e init_task_group->se[] = NULL).
8572 + init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
8573 +#elif defined CONFIG_USER_SCHED
8574 + root_task_group.shares = NICE_0_LOAD;
8575 + init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
8577 + * In case of task-groups formed through the user id of tasks,
8578 + * init_task_group represents tasks belonging to root user.
8579 + * Hence it forms a sibling of all subsequent groups formed.
8580 + * In this case, init_task_group gets only a fraction of overall
8581 + * system cpu resource, based on the weight assigned to root
8582 + * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
8583 + * by letting tasks of init_task_group sit in a separate cfs_rq
8584 + * (init_cfs_rq) and having one entity represent this group of
8585 + * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
8587 + init_tg_cfs_entry(&init_task_group,
8588 + &per_cpu(init_cfs_rq, i),
8589 + &per_cpu(init_sched_entity, i), i, 1,
8590 + root_task_group.se[i]);
8593 +#endif /* CONFIG_FAIR_GROUP_SCHED */
8595 + rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
8596 +#ifdef CONFIG_RT_GROUP_SCHED
8597 + INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
8598 +#ifdef CONFIG_CGROUP_SCHED
8599 + init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
8600 +#elif defined CONFIG_USER_SCHED
8601 + init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
8602 + init_tg_rt_entry(&init_task_group,
8603 + &per_cpu(init_rt_rq, i),
8604 + &per_cpu(init_sched_rt_entity, i), i, 1,
8605 + root_task_group.rt_se[i]);
8609 + for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
8610 + rq->cpu_load[j] = 0;
8614 + rq->active_balance = 0;
8615 + rq->next_balance = jiffies;
8619 + rq->migration_thread = NULL;
8620 + INIT_LIST_HEAD(&rq->migration_queue);
8621 + rq_attach_root(rq, &def_root_domain);
8623 + init_rq_hrtick(rq);
8624 + atomic_set(&rq->nr_iowait, 0);
8627 + set_load_weight(&init_task);
8629 +#ifdef CONFIG_PREEMPT_NOTIFIERS
8630 + INIT_HLIST_HEAD(&init_task.preempt_notifiers);
8634 + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8637 +#ifdef CONFIG_RT_MUTEXES
8638 + plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
8642 + * The boot idle thread does lazy MMU switching as well:
8644 + atomic_inc(&init_mm.mm_count);
8645 + enter_lazy_tlb(&init_mm, current);
8648 + * Make us the idle thread. Technically, schedule() should not be
8649 + * called from this thread; however, somewhere below it might be,
8650 + * but because we are the idle thread, we just pick up running again
8651 + * when this runqueue becomes "idle".
8653 + init_idle(current, smp_processor_id());
8655 + * During early bootup we pretend to be a normal task:
8657 + current->sched_class = &fair_sched_class;
8659 + scheduler_running = 1;
8662 +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
8663 +void __might_sleep(char *file, int line)
8666 + static unsigned long prev_jiffy; /* ratelimiting */
8668 + if ((in_atomic() || irqs_disabled()) &&
8669 + system_state == SYSTEM_RUNNING && !oops_in_progress) {
8670 + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8672 + prev_jiffy = jiffies;
8673 + printk(KERN_ERR "BUG: sleeping function called from invalid"
8674 + " context at %s:%d\n", file, line);
8675 + printk("in_atomic():%d, irqs_disabled():%d\n",
8676 + in_atomic(), irqs_disabled());
8677 + debug_show_held_locks(current);
8678 + if (irqs_disabled())
8679 + print_irqtrace_events(current);
8684 +EXPORT_SYMBOL(__might_sleep);
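+/*
+ * Typical trigger (illustrative): calling a function that may sleep, such as
+ * kmalloc(..., GFP_KERNEL) or mutex_lock(), from inside a spin_lock()ed
+ * region makes in_atomic() true and produces the "sleeping function called
+ * from invalid context" report above when CONFIG_DEBUG_SPINLOCK_SLEEP is set.
+ */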
8687 +#ifdef CONFIG_MAGIC_SYSRQ
8688 +static void normalize_task(struct rq *rq, struct task_struct *p)
8692 + update_rq_clock(rq);
8693 + on_rq = p->se.on_rq;
8695 + deactivate_task(rq, p, 0);
8696 + __setscheduler(rq, p, SCHED_NORMAL, 0);
8698 + activate_task(rq, p, 0);
8699 + resched_task(rq->curr);
8703 +void normalize_rt_tasks(void)
8705 + struct task_struct *g, *p;
8706 + unsigned long flags;
8709 + read_lock_irqsave(&tasklist_lock, flags);
8710 + do_each_thread(g, p) {
8712 + * Only normalize user tasks:
8717 + p->se.exec_start = 0;
8718 +#ifdef CONFIG_SCHEDSTATS
8719 + p->se.wait_start = 0;
8720 + p->se.sleep_start = 0;
8721 + p->se.block_start = 0;
8724 + if (!rt_task(p)) {
8726 + * Renice negative nice level userspace
8727 + * tasks back to 0:
8729 + if (TASK_NICE(p) < 0 && p->mm)
8730 + set_user_nice(p, 0);
8734 + spin_lock(&p->pi_lock);
8735 + rq = __task_rq_lock(p);
8737 + normalize_task(rq, p);
8739 + __task_rq_unlock(rq);
8740 + spin_unlock(&p->pi_lock);
8741 + } while_each_thread(g, p);
8743 + read_unlock_irqrestore(&tasklist_lock, flags);
8746 +#endif /* CONFIG_MAGIC_SYSRQ */
8750 + * These functions are only useful for the IA64 MCA handling.
8752 + * They can only be called when the whole system has been
8753 + * stopped - every CPU needs to be quiescent, and no scheduling
8754 + * activity can take place. Using them for anything else would
8755 + * be a serious bug, and as a result, they aren't even visible
8756 + * under any other configuration.
8760 + * curr_task - return the current task for a given cpu.
8761 + * @cpu: the processor in question.
8763 + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
8765 +struct task_struct *curr_task(int cpu)
8767 + return cpu_curr(cpu);
8771 + * set_curr_task - set the current task for a given cpu.
8772 + * @cpu: the processor in question.
8773 + * @p: the task pointer to set.
8775 + * Description: This function must only be used when non-maskable interrupts
8776 + * are serviced on a separate stack. It allows the architecture to switch the
8777 + * notion of the current task on a cpu in a non-blocking manner. This function
8778 + * must be called with all CPUs synchronized and interrupts disabled; the
8779 + * caller must save the original value of the current task (see
8780 + * curr_task() above) and restore that value before reenabling interrupts and
8781 + * re-starting the system.
8783 + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
8785 +void set_curr_task(int cpu, struct task_struct *p)
8787 + cpu_curr(cpu) = p;
8792 +#ifdef CONFIG_FAIR_GROUP_SCHED
8793 +static void free_fair_sched_group(struct task_group *tg)
8797 + for_each_possible_cpu(i) {
8799 + kfree(tg->cfs_rq[i]);
8804 + kfree(tg->cfs_rq);
8809 +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8811 + struct cfs_rq *cfs_rq;
8812 + struct sched_entity *se, *parent_se;
8816 + tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8819 + tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8823 + tg->shares = NICE_0_LOAD;
8825 + for_each_possible_cpu(i) {
8828 + cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
8829 + GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
8833 + se = kmalloc_node(sizeof(struct sched_entity),
8834 + GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
8838 + parent_se = parent ? parent->se[i] : NULL;
8839 + init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
8848 +static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8850 + list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
8851 + &cpu_rq(cpu)->leaf_cfs_rq_list);
8854 +static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8856 + list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
8858 +#else /* !CONFIG_FAIR_GROUP_SCHED */
8859 +static inline void free_fair_sched_group(struct task_group *tg)
8864 +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8869 +static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8873 +static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8876 +#endif /* CONFIG_FAIR_GROUP_SCHED */
8878 +#ifdef CONFIG_RT_GROUP_SCHED
8879 +static void free_rt_sched_group(struct task_group *tg)
8883 + destroy_rt_bandwidth(&tg->rt_bandwidth);
8885 + for_each_possible_cpu(i) {
8887 + kfree(tg->rt_rq[i]);
8889 + kfree(tg->rt_se[i]);
8897 +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8899 + struct rt_rq *rt_rq;
8900 + struct sched_rt_entity *rt_se, *parent_se;
8904 + tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
8907 + tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
8911 + init_rt_bandwidth(&tg->rt_bandwidth,
8912 + ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8914 + for_each_possible_cpu(i) {
8917 + rt_rq = kmalloc_node(sizeof(struct rt_rq),
8918 + GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
8922 + rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
8923 + GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
8927 + parent_se = parent ? parent->rt_se[i] : NULL;
8928 + init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
8937 +static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8939 + list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
8940 + &cpu_rq(cpu)->leaf_rt_rq_list);
8943 +static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8945 + list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8947 +#else /* !CONFIG_RT_GROUP_SCHED */
8948 +static inline void free_rt_sched_group(struct task_group *tg)
8953 +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8958 +static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8962 +static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8965 +#endif /* CONFIG_RT_GROUP_SCHED */
8967 +#ifdef CONFIG_GROUP_SCHED
8968 +static void free_sched_group(struct task_group *tg)
8970 + free_fair_sched_group(tg);
8971 + free_rt_sched_group(tg);
8975 +/* allocate runqueue etc for a new task group */
8976 +struct task_group *sched_create_group(struct task_group *parent)
8978 + struct task_group *tg;
8979 + unsigned long flags;
8982 + tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8984 + return ERR_PTR(-ENOMEM);
8986 + if (!alloc_fair_sched_group(tg, parent))
8989 + if (!alloc_rt_sched_group(tg, parent))
8992 + spin_lock_irqsave(&task_group_lock, flags);
8993 + for_each_possible_cpu(i) {
8994 + register_fair_sched_group(tg, i);
8995 + register_rt_sched_group(tg, i);
8997 + list_add_rcu(&tg->list, &task_groups);
8999 + WARN_ON(!parent); /* root should already exist */
9001 + tg->parent = parent;
9002 + INIT_LIST_HEAD(&tg->children);
9003 + list_add_rcu(&tg->siblings, &parent->children);
9004 + spin_unlock_irqrestore(&task_group_lock, flags);
9009 + free_sched_group(tg);
9010 + return ERR_PTR(-ENOMEM);
9013 +/* rcu callback to free various structures associated with a task group */
9014 +static void free_sched_group_rcu(struct rcu_head *rhp)
9016 + /* now it should be safe to free those cfs_rqs */
9017 + free_sched_group(container_of(rhp, struct task_group, rcu));
9020 +/* Destroy runqueue etc associated with a task group */
9021 +void sched_destroy_group(struct task_group *tg)
9023 + unsigned long flags;
9026 + spin_lock_irqsave(&task_group_lock, flags);
9027 + for_each_possible_cpu(i) {
9028 + unregister_fair_sched_group(tg, i);
9029 + unregister_rt_sched_group(tg, i);
9031 + list_del_rcu(&tg->list);
9032 + list_del_rcu(&tg->siblings);
9033 + spin_unlock_irqrestore(&task_group_lock, flags);
9035 + /* wait for possible concurrent references to cfs_rqs to complete */
9036 + call_rcu(&tg->rcu, free_sched_group_rcu);
9039 +/* change task's runqueue when it moves between groups.
9040 + * The caller of this function should have put the task in its new group
9041 + * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
9042 + * reflect its new group.
9044 +void sched_move_task(struct task_struct *tsk)
9046 + int on_rq, running;
9047 + unsigned long flags;
9050 + rq = task_rq_lock(tsk, &flags);
9052 + update_rq_clock(rq);
9054 + running = task_current(rq, tsk);
9055 + on_rq = tsk->se.on_rq;
9058 + dequeue_task(rq, tsk, 0);
9059 + if (unlikely(running))
9060 + tsk->sched_class->put_prev_task(rq, tsk);
9062 + set_task_rq(tsk, task_cpu(tsk));
9064 +#ifdef CONFIG_FAIR_GROUP_SCHED
9065 + if (tsk->sched_class->moved_group)
9066 + tsk->sched_class->moved_group(tsk);
9069 + if (unlikely(running))
9070 + tsk->sched_class->set_curr_task(rq);
9072 + enqueue_task(rq, tsk, 0);
9074 + task_rq_unlock(rq, &flags);
9076 +#endif /* CONFIG_GROUP_SCHED */
9078 +#ifdef CONFIG_FAIR_GROUP_SCHED
9079 +static void __set_se_shares(struct sched_entity *se, unsigned long shares)
9081 + struct cfs_rq *cfs_rq = se->cfs_rq;
9084 + on_rq = se->on_rq;
9086 + dequeue_entity(cfs_rq, se, 0);
9088 + se->load.weight = shares;
9089 + se->load.inv_weight = 0;
9092 + enqueue_entity(cfs_rq, se, 0);
9095 +static void set_se_shares(struct sched_entity *se, unsigned long shares)
9097 + struct cfs_rq *cfs_rq = se->cfs_rq;
9098 + struct rq *rq = cfs_rq->rq;
9099 + unsigned long flags;
9101 + spin_lock_irqsave(&rq->lock, flags);
9102 + __set_se_shares(se, shares);
9103 + spin_unlock_irqrestore(&rq->lock, flags);
9106 +static DEFINE_MUTEX(shares_mutex);
9108 +int sched_group_set_shares(struct task_group *tg, unsigned long shares)
9111 + unsigned long flags;
9114 + * We can't change the weight of the root cgroup.
9119 + if (shares < MIN_SHARES)
9120 + shares = MIN_SHARES;
9121 + else if (shares > MAX_SHARES)
9122 + shares = MAX_SHARES;
9124 + mutex_lock(&shares_mutex);
9125 + if (tg->shares == shares)
9128 + spin_lock_irqsave(&task_group_lock, flags);
9129 + for_each_possible_cpu(i)
9130 + unregister_fair_sched_group(tg, i);
9131 + list_del_rcu(&tg->siblings);
9132 + spin_unlock_irqrestore(&task_group_lock, flags);
9134 + /* wait for any ongoing reference to this group to finish */
9135 + synchronize_sched();
9138 + * Now we are free to modify the group's share on each cpu
9139 + * w/o tripping rebalance_share or load_balance_fair.
9141 + tg->shares = shares;
9142 + for_each_possible_cpu(i) {
9144 + * force a rebalance
9146 + cfs_rq_set_shares(tg->cfs_rq[i], 0);
9147 + set_se_shares(tg->se[i], shares);
9151 + * Enable load balance activity on this group, by inserting it back on
9152 + * each cpu's rq->leaf_cfs_rq_list.
9154 + spin_lock_irqsave(&task_group_lock, flags);
9155 + for_each_possible_cpu(i)
9156 + register_fair_sched_group(tg, i);
9157 + list_add_rcu(&tg->siblings, &tg->parent->children);
9158 + spin_unlock_irqrestore(&task_group_lock, flags);
9160 + mutex_unlock(&shares_mutex);
9164 +unsigned long sched_group_shares(struct task_group *tg)
9166 + return tg->shares;
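+/*
+ * Example (group name hypothetical): shares are relative weights clamped to
+ * [MIN_SHARES, MAX_SHARES], so a call such as
+ *	sched_group_set_shares(tg_batch, NICE_0_LOAD / 2);
+ * should give that group roughly half the CPU of a sibling left at the
+ * default NICE_0_LOAD when both are fully busy.
+ */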
9170 +#ifdef CONFIG_RT_GROUP_SCHED
9172 + * Ensure that the real time constraints are schedulable.
9174 +static DEFINE_MUTEX(rt_constraints_mutex);
9176 +static unsigned long to_ratio(u64 period, u64 runtime)
9178 + if (runtime == RUNTIME_INF)
9179 + return 1ULL << 16;
9181 + return div64_u64(runtime << 16, period);
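+/*
+ * Worked example: to_ratio() yields a 16-bit fixed-point utilisation, so with
+ * the (assumed) default 1s period and 950ms runtime, both in nanoseconds:
+ *	to_ratio(1000000000ULL, 950000000ULL)
+ *		== (950000000 << 16) / 1000000000 ~= 62259, about 95% of 1 << 16,
+ * while RUNTIME_INF maps to the full 1 << 16 (100%).
+ */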
9184 +#ifdef CONFIG_CGROUP_SCHED
9185 +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
9187 + struct task_group *tgi, *parent = tg->parent;
9188 + unsigned long total = 0;
9191 + if (global_rt_period() < period)
9194 + return to_ratio(period, runtime) <
9195 + to_ratio(global_rt_period(), global_rt_runtime());
9198 + if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
9202 + list_for_each_entry_rcu(tgi, &parent->children, siblings) {
9206 + total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
9207 + tgi->rt_bandwidth.rt_runtime);
9209 + rcu_read_unlock();
9211 + return total + to_ratio(period, runtime) <=
9212 + to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
9213 + parent->rt_bandwidth.rt_runtime);
9215 +#elif defined CONFIG_USER_SCHED
9216 +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
9218 + struct task_group *tgi;
9219 + unsigned long total = 0;
9220 + unsigned long global_ratio =
9221 + to_ratio(global_rt_period(), global_rt_runtime());
9224 + list_for_each_entry_rcu(tgi, &task_groups, list) {
9228 + total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
9229 + tgi->rt_bandwidth.rt_runtime);
9231 + rcu_read_unlock();
9233 + return total + to_ratio(period, runtime) < global_ratio;
9237 +/* Must be called with tasklist_lock held */
9238 +static inline int tg_has_rt_tasks(struct task_group *tg)
9240 + struct task_struct *g, *p;
9241 + do_each_thread(g, p) {
9242 + if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
9244 + } while_each_thread(g, p);
9248 +static int tg_set_bandwidth(struct task_group *tg,
9249 + u64 rt_period, u64 rt_runtime)
9253 + mutex_lock(&rt_constraints_mutex);
9254 + read_lock(&tasklist_lock);
9255 + if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
9259 + if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
9264 + spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
9265 + tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
9266 + tg->rt_bandwidth.rt_runtime = rt_runtime;
9268 + for_each_possible_cpu(i) {
9269 + struct rt_rq *rt_rq = tg->rt_rq[i];
9271 + spin_lock(&rt_rq->rt_runtime_lock);
9272 + rt_rq->rt_runtime = rt_runtime;
9273 + spin_unlock(&rt_rq->rt_runtime_lock);
9275 + spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
9277 + read_unlock(&tasklist_lock);
9278 + mutex_unlock(&rt_constraints_mutex);
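Note that tg_set_bandwidth() copies the full rt_runtime into every CPU's rt_rq rather than dividing it: the budget applies per CPU per period, and any balancing of unused runtime between CPUs is handled separately by the RT scheduler. A small sketch of the same "replicate, don't split" update; NCPUS and the plain array are stand-ins for the per-cpu data.

    #include <stdio.h>

    #define NCPUS 4    /* assumption for the sketch */

    int main(void)
    {
            unsigned long long rt_runtime_ns = 950000000ULL;    /* 950ms */
            unsigned long long rt_rq_runtime[NCPUS];
            int i;

            /* every CPU gets the whole budget for each period */
            for (i = 0; i < NCPUS; i++)
                    rt_rq_runtime[i] = rt_runtime_ns;

            printf("cpu0 budget: %llu ns per period\n", rt_rq_runtime[0]);
            return 0;
    }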
9283 +int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
9285 + u64 rt_runtime, rt_period;
9287 + rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
9288 + rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
9289 + if (rt_runtime_us < 0)
9290 + rt_runtime = RUNTIME_INF;
9292 + return tg_set_bandwidth(tg, rt_period, rt_runtime);
9295 +long sched_group_rt_runtime(struct task_group *tg)
9297 + u64 rt_runtime_us;
9299 + if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
9302 + rt_runtime_us = tg->rt_bandwidth.rt_runtime;
9303 + do_div(rt_runtime_us, NSEC_PER_USEC);
9304 + return rt_runtime_us;
9307 +int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
9309 + u64 rt_runtime, rt_period;
9311 + rt_period = (u64)rt_period_us * NSEC_PER_USEC;
9312 + rt_runtime = tg->rt_bandwidth.rt_runtime;
9314 + if (rt_period == 0)
9317 + return tg_set_bandwidth(tg, rt_period, rt_runtime);
9320 +long sched_group_rt_period(struct task_group *tg)
9324 + rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
9325 + do_div(rt_period_us, NSEC_PER_USEC);
9326 + return rt_period_us;
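The setters and getters above work in microseconds while rt_bandwidth stores nanoseconds, and a negative runtime is mapped to RUNTIME_INF ("no limit"). A standalone sketch of that convention; the RUNTIME_INF value defined here is an assumption for illustration only.

    #include <stdio.h>
    #include <stdint.h>

    #define NSEC_PER_USEC 1000ULL
    #define RUNTIME_INF   ((uint64_t)~0ULL)    /* assumed sentinel for "unlimited" */

    /* user-space sketch of the us -> ns conversion done by the setters */
    static uint64_t runtime_us_to_ns(long long us)
    {
            if (us < 0)
                    return RUNTIME_INF;    /* negative means "no limit" */
            return (uint64_t)us * NSEC_PER_USEC;
    }

    int main(void)
    {
            printf("%llu\n", (unsigned long long)runtime_us_to_ns(950000));
            printf("%llu\n", (unsigned long long)runtime_us_to_ns(-1));
            return 0;
    }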
9329 +static int sched_rt_global_constraints(void)
9331 + struct task_group *tg = &root_task_group;
9332 + u64 rt_runtime, rt_period;
9335 + if (sysctl_sched_rt_period <= 0)
9338 + rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
9339 + rt_runtime = tg->rt_bandwidth.rt_runtime;
9341 + mutex_lock(&rt_constraints_mutex);
9342 + if (!__rt_schedulable(tg, rt_period, rt_runtime))
9344 + mutex_unlock(&rt_constraints_mutex);
9348 +#else /* !CONFIG_RT_GROUP_SCHED */
9349 +static int sched_rt_global_constraints(void)
9351 + unsigned long flags;
9354 + if (sysctl_sched_rt_period <= 0)
9357 + spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
9358 + for_each_possible_cpu(i) {
9359 + struct rt_rq *rt_rq = &cpu_rq(i)->rt;
9361 + spin_lock(&rt_rq->rt_runtime_lock);
9362 + rt_rq->rt_runtime = global_rt_runtime();
9363 + spin_unlock(&rt_rq->rt_runtime_lock);
9365 + spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
9369 +#endif /* CONFIG_RT_GROUP_SCHED */
9371 +int sched_rt_handler(struct ctl_table *table, int write,
9372 + struct file *filp, void __user *buffer, size_t *lenp,
9376 + int old_period, old_runtime;
9377 + static DEFINE_MUTEX(mutex);
9379 + mutex_lock(&mutex);
9380 + old_period = sysctl_sched_rt_period;
9381 + old_runtime = sysctl_sched_rt_runtime;
9383 + ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
9385 + if (!ret && write) {
9386 + ret = sched_rt_global_constraints();
9388 + sysctl_sched_rt_period = old_period;
9389 + sysctl_sched_rt_runtime = old_runtime;
9391 + def_rt_bandwidth.rt_runtime = global_rt_runtime();
9392 + def_rt_bandwidth.rt_period =
9393 + ns_to_ktime(global_rt_period());
9396 + mutex_unlock(&mutex);
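sched_rt_handler() follows a save / write / validate / roll-back pattern: the old sysctl values are remembered, proc_dointvec() applies the user's write, and if the new global constraints are not schedulable the old values are restored. A standalone sketch of the same pattern; validate() and the globals are placeholders, not kernel interfaces.

    #include <stdio.h>

    static int period = 1000000, runtime = 950000;    /* stand-ins for the sysctls */

    static int validate(void)
    {
            return runtime <= period;    /* placeholder constraint check */
    }

    static int set_runtime(int new_runtime)
    {
            int old = runtime;

            runtime = new_runtime;       /* "proc_dointvec() wrote it" */
            if (!validate()) {
                    runtime = old;       /* roll back on failure */
                    return -1;
            }
            return 0;
    }

    int main(void)
    {
            if (set_runtime(2000000) < 0)
                    printf("rejected, runtime still %d\n", runtime);
            if (set_runtime(500000) == 0)
                    printf("accepted, runtime now %d\n", runtime);
            return 0;
    }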
9401 +#ifdef CONFIG_CGROUP_SCHED
9403 +/* return corresponding task_group object of a cgroup */
9404 +static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
9406 + return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
9407 + struct task_group, css);
9410 +static struct cgroup_subsys_state *
9411 +cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
9413 + struct task_group *tg, *parent;
9415 + if (!cgrp->parent) {
9416 + /* This is early initialization for the top cgroup */
9417 + init_task_group.css.cgroup = cgrp;
9418 + return &init_task_group.css;
9421 + parent = cgroup_tg(cgrp->parent);
9422 + tg = sched_create_group(parent);
9424 + return ERR_PTR(-ENOMEM);
9426 + /* Bind the cgroup to the task_group object we just created */
9427 + tg->css.cgroup = cgrp;
9433 +cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9435 + struct task_group *tg = cgroup_tg(cgrp);
9437 + sched_destroy_group(tg);
9441 +cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9442 + struct task_struct *tsk)
9444 +#ifdef CONFIG_RT_GROUP_SCHED
9445 + /* Don't accept realtime tasks when there is no way for them to run */
9446 + if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
9449 + /* We don't support RT-tasks being in separate groups */
9450 + if (tsk->sched_class != &fair_sched_class)
9458 +cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9459 + struct cgroup *old_cont, struct task_struct *tsk)
9461 + sched_move_task(tsk);
9464 +#ifdef CONFIG_FAIR_GROUP_SCHED
9465 +static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
9468 + return sched_group_set_shares(cgroup_tg(cgrp), shareval);
9471 +static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
9473 + struct task_group *tg = cgroup_tg(cgrp);
9475 + return (u64) tg->shares;
9477 +#endif /* CONFIG_FAIR_GROUP_SCHED */
9479 +#ifdef CONFIG_RT_GROUP_SCHED
9480 +static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
9483 + return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
9486 +static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
9488 + return sched_group_rt_runtime(cgroup_tg(cgrp));
9491 +static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
9494 + return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
9497 +static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
9499 + return sched_group_rt_period(cgroup_tg(cgrp));
9501 +#endif /* CONFIG_RT_GROUP_SCHED */
9503 +static struct cftype cpu_files[] = {
9504 +#ifdef CONFIG_FAIR_GROUP_SCHED
9507 + .read_u64 = cpu_shares_read_u64,
9508 + .write_u64 = cpu_shares_write_u64,
9511 +#ifdef CONFIG_RT_GROUP_SCHED
9513 + .name = "rt_runtime_us",
9514 + .read_s64 = cpu_rt_runtime_read,
9515 + .write_s64 = cpu_rt_runtime_write,
9518 + .name = "rt_period_us",
9519 + .read_u64 = cpu_rt_period_read_uint,
9520 + .write_u64 = cpu_rt_period_write_uint,
9525 +static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
9527 + return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
9530 +struct cgroup_subsys cpu_cgroup_subsys = {
9532 + .create = cpu_cgroup_create,
9533 + .destroy = cpu_cgroup_destroy,
9534 + .can_attach = cpu_cgroup_can_attach,
9535 + .attach = cpu_cgroup_attach,
9536 + .populate = cpu_cgroup_populate,
9537 + .subsys_id = cpu_cgroup_subsys_id,
9541 +#endif /* CONFIG_CGROUP_SCHED */
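From user space the cftype entries above appear as files in each cpu cgroup directory; writing cpu.shares ends up in cpu_shares_write_u64() and then sched_group_set_shares(). A hedged usage sketch: the mount point and group name are assumptions, and the cpu cgroup must already be mounted.

    #include <stdio.h>

    /* Give the hypothetical cgroup "slice_a" twice the default weight (1024)
     * by writing to the cpu.shares file exported above. */
    int main(void)
    {
            FILE *f = fopen("/dev/cgroup/slice_a/cpu.shares", "w");

            if (!f) {
                    perror("cpu.shares");
                    return 1;
            }
            fprintf(f, "2048\n");
            fclose(f);
            return 0;
    }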
9543 +#ifdef CONFIG_CGROUP_CPUACCT
9546 + * CPU accounting code for task groups.
9548 + * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
9549 + * (balbir@in.ibm.com).
9552 +/* track cpu usage of a group of tasks */
9554 + struct cgroup_subsys_state css;
9555 + /* cpuusage holds pointer to a u64-type object on every cpu */
9559 +struct cgroup_subsys cpuacct_subsys;
9561 +/* return cpu accounting group corresponding to this container */
9562 +static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
9564 + return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
9565 + struct cpuacct, css);
9568 +/* return cpu accounting group to which this task belongs */
9569 +static inline struct cpuacct *task_ca(struct task_struct *tsk)
9571 + return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
9572 + struct cpuacct, css);
9575 +/* create a new cpu accounting group */
9576 +static struct cgroup_subsys_state *cpuacct_create(
9577 + struct cgroup_subsys *ss, struct cgroup *cgrp)
9579 + struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
9582 + return ERR_PTR(-ENOMEM);
9584 + ca->cpuusage = alloc_percpu(u64);
9585 + if (!ca->cpuusage) {
9587 + return ERR_PTR(-ENOMEM);
9593 +/* destroy an existing cpu accounting group */
9595 +cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9597 + struct cpuacct *ca = cgroup_ca(cgrp);
9599 + free_percpu(ca->cpuusage);
9603 +/* return total cpu usage (in nanoseconds) of a group */
9604 +static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
9606 + struct cpuacct *ca = cgroup_ca(cgrp);
9607 + u64 totalcpuusage = 0;
9610 + for_each_possible_cpu(i) {
9611 + u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
9614 + * Take rq->lock to make 64-bit addition safe on 32-bit
9617 + spin_lock_irq(&cpu_rq(i)->lock);
9618 + totalcpuusage += *cpuusage;
9619 + spin_unlock_irq(&cpu_rq(i)->lock);
9622 + return totalcpuusage;
9625 +static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
9628 + struct cpuacct *ca = cgroup_ca(cgrp);
9637 + for_each_possible_cpu(i) {
9638 + u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
9640 + spin_lock_irq(&cpu_rq(i)->lock);
9642 + spin_unlock_irq(&cpu_rq(i)->lock);
9648 +static struct cftype files[] = {
9651 + .read_u64 = cpuusage_read,
9652 + .write_u64 = cpuusage_write,
9656 +static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9658 + return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
9662 + * charge this task's execution time to its accounting group.
9664 + * called with rq->lock held.
9666 +static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9668 + struct cpuacct *ca;
9670 + if (!cpuacct_subsys.active)
9673 + ca = task_ca(tsk);
9675 + u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
9677 + *cpuusage += cputime;
9681 +struct cgroup_subsys cpuacct_subsys = {
9682 + .name = "cpuacct",
9683 + .create = cpuacct_create,
9684 + .destroy = cpuacct_destroy,
9685 + .populate = cpuacct_populate,
9686 + .subsys_id = cpuacct_subsys_id,
9688 +#endif /* CONFIG_CGROUP_CPUACCT */
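cpuacct keeps one u64 counter per CPU and only sums them when user space reads cpuusage, so the hot path (cpuacct_charge) touches only CPU-local data. A standalone sketch of that pattern, with a plain array standing in for the alloc_percpu() allocation and without the locking the kernel needs on 32-bit.

    #include <stdio.h>
    #include <stdint.h>

    #define NCPUS 4    /* assumption for the sketch */

    static uint64_t cpuusage[NCPUS];    /* stand-in for alloc_percpu(u64) */

    /* hot path: charge the running task's time to its current CPU only */
    static void charge(int cpu, uint64_t cputime)
    {
            cpuusage[cpu] += cputime;
    }

    /* slow path: aggregate only when read, as cpuusage_read() does */
    static uint64_t total(void)
    {
            uint64_t sum = 0;
            int i;

            for (i = 0; i < NCPUS; i++)
                    sum += cpuusage[i];
            return sum;
    }

    int main(void)
    {
            charge(0, 1000);
            charge(2, 500);
            printf("total = %llu ns\n", (unsigned long long)total());
            return 0;
    }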
9689 diff -Nurb linux-2.6.27-590/kernel/sched.c.rej linux-2.6.27-591/kernel/sched.c.rej
9690 --- linux-2.6.27-590/kernel/sched.c.rej 1969-12-31 19:00:00.000000000 -0500
9691 +++ linux-2.6.27-591/kernel/sched.c.rej 2010-01-29 16:30:22.000000000 -0500
9695 + #include <linux/nmi.h>
9696 + #include <linux/init.h>
9697 + #include <asm/uaccess.h>
9698 + #include <linux/highmem.h>
9699 + #include <linux/smp_lock.h>
9700 + #include <asm/mmu_context.h>
9702 + #include <linux/nmi.h>
9703 + #include <linux/init.h>
9704 + #include <asm/uaccess.h>
9705 ++ #include <linux/arrays.h>
9706 + #include <linux/highmem.h>
9707 + #include <linux/smp_lock.h>
9708 + #include <asm/mmu_context.h>
9714 + spin_lock(&rq->lock);
9715 + if (unlikely(rq != task_rq(p))) {
9716 + spin_unlock(&rq->lock);
9722 + spin_lock(&rq->lock);
9723 + if (unlikely(rq != task_rq(p))) {
9724 + spin_unlock(&rq->lock);
9727 + * event cannot wake it up and insert it on the runqueue either.
9729 + p->state = TASK_RUNNING;
9732 + * Make sure we do not leak PI boosting priority to the child:
9734 + * event cannot wake it up and insert it on the runqueue either.
9736 + p->state = TASK_RUNNING;
9737 ++ #ifdef CONFIG_CHOPSTIX
9738 ++ /* The jiffy of last interruption */
9739 ++ if (p->state & TASK_UNINTERRUPTIBLE) {
9740 ++ p->last_interrupted=jiffies;
9743 ++ if (p->state & TASK_INTERRUPTIBLE) {
9744 ++ p->last_interrupted=INTERRUPTIBLE;
9747 ++ p->last_interrupted=RUNNING;
9749 ++ /* The jiffy of last execution */
9750 ++ p->last_ran_j=jiffies;
9754 + * Make sure we do not leak PI boosting priority to the child:
9760 + static inline int interactive_sleep(enum sleep_type sleep_type)
9762 + return (sleep_type == SLEEP_INTERACTIVE ||
9768 + static inline int interactive_sleep(enum sleep_type sleep_type)
9770 + return (sleep_type == SLEEP_INTERACTIVE ||
9774 + * schedule() is the main scheduler function.
9776 + asmlinkage void __sched schedule(void)
9778 + struct task_struct *prev, *next;
9779 + struct prio_array *array;
9780 + struct list_head *queue;
9781 + unsigned long long now;
9782 +- unsigned long run_time;
9783 + int cpu, idx, new_prio;
9784 + long *switch_count;
9788 + * Test if we are atomic. Since do_exit() needs to call into
9791 + * schedule() is the main scheduler function.
9794 ++ #ifdef CONFIG_CHOPSTIX
9795 ++ extern void (*rec_event)(void *,unsigned int);
9796 ++ struct event_spec {
9797 ++ unsigned long pc;
9798 ++ unsigned long dcookie;
9799 ++ unsigned int count;
9800 ++ unsigned int reason;
9804 + asmlinkage void __sched schedule(void)
9806 + struct task_struct *prev, *next;
9807 + struct prio_array *array;
9808 + struct list_head *queue;
9809 + unsigned long long now;
9810 ++ unsigned long run_time, diff;
9811 + int cpu, idx, new_prio;
9812 + long *switch_count;
9814 ++ int sampling_reason;
9817 + * Test if we are atomic. Since do_exit() needs to call into
9820 + switch_count = &prev->nivcsw;
9821 + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
9822 + switch_count = &prev->nvcsw;
9823 + if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
9824 + unlikely(signal_pending(prev))))
9825 + prev->state = TASK_RUNNING;
9827 + switch_count = &prev->nivcsw;
9828 + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
9829 + switch_count = &prev->nvcsw;
9831 + if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
9832 + unlikely(signal_pending(prev))))
9833 + prev->state = TASK_RUNNING;
9836 + vx_uninterruptible_inc(prev);
9838 + deactivate_task(prev, rq);
9843 + vx_uninterruptible_inc(prev);
9845 + deactivate_task(prev, rq);
9846 ++ #ifdef CONFIG_CHOPSTIX
9847 ++ /* An uninterruptible process just yielded. Record the current jiffy */
9848 ++ if (prev->state & TASK_UNINTERRUPTIBLE) {
9849 ++ prev->last_interrupted=jiffies;
9851 ++ /* An interruptible process just yielded, or it got preempted.
9852 ++ * Mark it as interruptible */
9853 ++ else if (prev->state & TASK_INTERRUPTIBLE) {
9854 ++ prev->last_interrupted=INTERRUPTIBLE;
9862 + prev->sleep_avg = 0;
9863 + prev->timestamp = prev->last_ran = now;
9865 + sched_info_switch(prev, next);
9866 + if (likely(prev != next)) {
9867 + next->timestamp = next->last_ran = now;
9869 + prev->sleep_avg = 0;
9870 + prev->timestamp = prev->last_ran = now;
9872 ++ #ifdef CONFIG_CHOPSTIX
9873 ++ /* Record a sample only if the Chopstix module has registered rec_event */
9875 ++ prev->last_ran_j = jiffies;
9876 ++ if (next->last_interrupted!=INTERRUPTIBLE) {
9877 ++ if (next->last_interrupted!=RUNNING) {
9878 ++ diff = (jiffies-next->last_interrupted);
9879 ++ sampling_reason = 0;/* BLOCKING */
9882 ++ diff = jiffies-next->last_ran_j;
9883 ++ sampling_reason = 1;/* PREEMPTION */
9886 ++ if (diff >= HZ/10) {
9887 ++ struct event event;
9888 ++ struct event_spec espec;
9889 ++ struct pt_regs *regs;
9890 ++ regs = task_pt_regs(current);
9892 ++ espec.reason = sampling_reason;
9893 ++ event.event_data=&espec;
9895 ++ espec.pc=regs->eip;
9896 ++ event.event_type=2;
9897 ++ /* index into the event array the Chopstix module currently has set up */
9898 ++ /* make sure the counters are loaded in the order we want them to show up */
9899 ++ (*rec_event)(&event, diff);
9902 ++ /* next has been elected to run */
9903 ++ next->last_interrupted=0;
9906 + sched_info_switch(prev, next);
9907 + if (likely(prev != next)) {
9908 + next->timestamp = next->last_ran = now;
9911 + jiffies_to_timespec(p->policy == SCHED_FIFO ?
9912 + 0 : task_timeslice(p), &t);
9913 + read_unlock(&tasklist_lock);
9914 + retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
9918 + jiffies_to_timespec(p->policy == SCHED_FIFO ?
9919 + 0 : task_timeslice(p), &t);
9920 + read_unlock(&tasklist_lock);
9922 + retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
9935 ++ #ifdef CONFIG_CHOPSTIX
9936 ++ void (*rec_event)(void *,unsigned int) = NULL;
9938 ++ /* To support safe calling from asm */
9939 ++ asmlinkage void rec_event_asm (struct event *event_signature_in, unsigned int count) {
9940 ++ struct pt_regs *regs;
9941 ++ struct event_spec *es = event_signature_in->event_data;
9942 ++ regs = task_pt_regs(current);
9943 ++ event_signature_in->task=current;
9944 ++ es->pc=regs->eip;
9945 ++ event_signature_in->count=1;
9946 ++ (*rec_event)(event_signature_in, count);
9948 ++ EXPORT_SYMBOL(rec_event);
9949 ++ EXPORT_SYMBOL(in_sched_functions);
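The schedule() instrumentation above classifies the delay of the task about to run: if it was last seen blocked (uninterruptible), the delay since last_interrupted counts as blocking; if it was marked runnable, the delay since last_ran_j counts as preemption; in either case an event is recorded only when the delay reaches HZ/10 jiffies (about 100ms). A user-space sketch of that classification; the HZ value and the RUNNING/INTERRUPTIBLE sentinels are assumptions for illustration.

    #include <stdio.h>

    #define HZ            1000    /* assumption for the sketch */
    #define RUNNING       0       /* assumed sentinel values for last_interrupted */
    #define INTERRUPTIBLE 1

    /* Classify the wakeup delay of the next task, as the schedule() hook does. */
    static void sample(unsigned long jiffies, unsigned long last_interrupted,
                       unsigned long last_ran_j)
    {
            unsigned long diff;
            int reason;

            if (last_interrupted == INTERRUPTIBLE)
                    return;                     /* nothing worth accounting */

            if (last_interrupted != RUNNING) {
                    diff = jiffies - last_interrupted;
                    reason = 0;                 /* BLOCKING */
            } else {
                    diff = jiffies - last_ran_j;
                    reason = 1;                 /* PREEMPTION */
            }

            if (diff >= HZ / 10)                /* at least ~100ms of delay */
                    printf("record event: reason=%d delay=%lu jiffies\n",
                           reason, diff);
    }

    int main(void)
    {
            sample(1000, 850, 990);       /* blocked for 150 jiffies -> recorded */
            sample(1000, RUNNING, 950);   /* preempted for 50 jiffies -> skipped */
            return 0;
    }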
9951 diff -Nurb linux-2.6.27-590/mm/memory.c linux-2.6.27-591/mm/memory.c
9952 --- linux-2.6.27-590/mm/memory.c 2010-01-29 16:29:48.000000000 -0500
9953 +++ linux-2.6.27-591/mm/memory.c 2010-01-29 16:30:22.000000000 -0500
9956 #include <linux/swapops.h>
9957 #include <linux/elf.h>
9958 +#include <linux/arrays.h>
9960 #include "internal.h"
9962 @@ -2690,6 +2691,15 @@
9966 +extern void (*rec_event)(void *,unsigned int);
9967 +struct event_spec {
9969 + unsigned long dcookie;
9971 + unsigned char reason;
9976 * By the time we get here, we already hold the mm semaphore
9978 @@ -2719,6 +2729,24 @@
9980 return VM_FAULT_OOM;
9982 +#ifdef CONFIG_CHOPSTIX
9984 + struct event event;
9985 + struct event_spec espec;
9986 + struct pt_regs *regs;
9988 + regs = task_pt_regs(current);
9989 + pc = regs->eip & (unsigned int) ~4095;
9991 + espec.reason = 0; /* alloc */
9992 + event.event_data=&espec;
9993 + event.task = current;
9995 + event.event_type=5;
9996 + (*rec_event)(&event, 1);
10000 return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
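The fault hook records the faulting instruction pointer rounded down to a 4KiB page boundary (the & ~4095 mask above), so samples aggregate per code page rather than per instruction. A tiny standalone illustration of the masking; the address is hypothetical.

    #include <stdio.h>

    int main(void)
    {
            unsigned long eip  = 0xb7e4321fUL;        /* hypothetical faulting PC */
            unsigned long page = eip & ~4095UL;       /* round down to 4KiB page */

            printf("%#lx -> %#lx\n", eip, page);      /* 0xb7e4321f -> 0xb7e43000 */
            return 0;
    }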
10003 diff -Nurb linux-2.6.27-590/mm/slab.c linux-2.6.27-591/mm/slab.c
10004 --- linux-2.6.27-590/mm/slab.c 2010-01-29 16:29:48.000000000 -0500
10005 +++ linux-2.6.27-591/mm/slab.c 2010-01-29 16:30:22.000000000 -0500
10006 @@ -110,6 +110,7 @@
10007 #include <linux/fault-inject.h>
10008 #include <linux/rtmutex.h>
10009 #include <linux/reciprocal_div.h>
10010 +#include <linux/arrays.h>
10011 #include <linux/debugobjects.h>
10013 #include <asm/cacheflush.h>
10014 @@ -248,6 +249,14 @@
10018 +extern void (*rec_event)(void *,unsigned int);
10019 +struct event_spec {
10020 + unsigned long pc;
10021 + unsigned long dcookie;
10023 + unsigned char reason;
10027 * struct array_cache
10029 @@ -3469,6 +3478,19 @@
10030 local_irq_restore(save_flags);
10031 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
10033 +#ifdef CONFIG_CHOPSTIX
10034 + if (rec_event && objp) {
10035 + struct event event;
10036 + struct event_spec espec;
10038 + espec.reason = 0; /* alloc */
10039 + event.event_data=&espec;
10040 + event.task = current;
10042 + event.event_type=5;
10043 + (*rec_event)(&event, cachep->buffer_size);
10047 if (unlikely((flags & __GFP_ZERO) && objp))
10048 memset(objp, 0, obj_size(cachep));
10049 @@ -3578,12 +3600,26 @@
10050 * Release an obj back to its cache. If the obj has a constructed state, it must
10051 * be in this state _before_ it is released. Called with disabled ints.
10053 -static inline void __cache_free(struct kmem_cache *cachep, void *objp)
10054 +static inline void __cache_free(struct kmem_cache *cachep, void *objp, void *caller)
10056 struct array_cache *ac = cpu_cache_get(cachep);
10059 - objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
10060 + objp = cache_free_debugcheck(cachep, objp, caller);
10061 + #ifdef CONFIG_CHOPSTIX
10062 + if (rec_event && objp) {
10063 + struct event event;
10064 + struct event_spec espec;
10066 + espec.reason = 1; /* free */
10067 + event.event_data=&espec;
10068 + event.task = current;
10070 + event.event_type=4;
10071 + (*rec_event)(&event, cachep->buffer_size);
10075 vx_slab_free(cachep);
10078 @@ -3714,6 +3750,7 @@
10081 struct kmem_cache *cachep;
10084 /* If you want to save a few bytes .text space: replace
10086 @@ -3741,10 +3778,17 @@
10087 EXPORT_SYMBOL(__kmalloc_track_caller);
10090 +#ifdef CONFIG_CHOPSTIX
10091 +void *__kmalloc(size_t size, gfp_t flags)
10093 + return __do_kmalloc(size, flags, __builtin_return_address(0));
10096 void *__kmalloc(size_t size, gfp_t flags)
10098 return __do_kmalloc(size, flags, NULL);
10101 EXPORT_SYMBOL(__kmalloc);
10104 @@ -3764,7 +3808,7 @@
10105 debug_check_no_locks_freed(objp, obj_size(cachep));
10106 if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
10107 debug_check_no_obj_freed(objp, obj_size(cachep));
10108 - __cache_free(cachep, objp);
10109 + __cache_free(cachep, objp,__builtin_return_address(0));
10110 local_irq_restore(flags);
10112 EXPORT_SYMBOL(kmem_cache_free);
10113 @@ -3790,7 +3834,7 @@
10114 c = virt_to_cache(objp);
10115 debug_check_no_locks_freed(objp, obj_size(c));
10116 debug_check_no_obj_freed(objp, obj_size(c));
10117 - __cache_free(c, (void *)objp);
10118 + __cache_free(c, (void *)objp,__builtin_return_address(0));
10119 local_irq_restore(flags);
10121 EXPORT_SYMBOL(kfree);
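The slab changes thread the freeing call site down into __cache_free() as an explicit caller argument, presumably so the recorded free event can be attributed to the caller of kfree() or kmem_cache_free() rather than to an internal slab frame. A minimal user-space illustration (GCC-specific) of capturing a call site with __builtin_return_address; the function names here are invented for the sketch.

    #include <stdio.h>

    /* noinline so the captured return address really is the caller's call site */
    static __attribute__((noinline)) void record_free(void *obj, void *caller)
    {
            printf("free of %p requested from %p\n", obj, caller);
    }

    static __attribute__((noinline)) void my_free(void *obj)
    {
            /* pass the call site down, as the patched kfree() does */
            record_free(obj, __builtin_return_address(0));
    }

    int main(void)
    {
            int x;

            my_free(&x);
            return 0;
    }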