OK, scheduler changes, this time applied on top of Caglar's fixes.
linux-2.6-591-chopstix-intern.patch (linux-2.6.git)
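Summary, for review context: CONFIG_CHOPSTIX adds lightweight event probes to the syscall entry path (entry_32.S), the page-fault handler, the OProfile sample path, bio completion (left under #if 0 here) and schedule(). The kernel side only exports a rec_event function pointer (EXPORT_SYMBOL in kernel/sched.c) and fills in struct event / struct event_spec records at each probe site; the out-of-tree Chopstix module is expected to install the real handler, which summarizes events into the lossy array_handler structures declared in include/linux/arrays.h. A minimal consumer sketch follows, assuming only the declarations this patch adds; the handler name and the printk are illustrative, not the actual Chopstix module:

/*
 * Hypothetical consumer sketch: how a module could hook the rec_event
 * pointer exported by kernel/sched.c in this patch.  Everything except
 * rec_event and struct event (from <linux/arrays.h>) is illustrative.
 */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/arrays.h>	/* struct event, extern rec_event (added by this patch) */

/* Matches the void (*rec_event)(void *, unsigned int) hook signature. */
static void chopstix_sketch_handler(void *data, unsigned int count)
{
	struct event *ev = data;

	/*
	 * event_type is the index chosen at the probe site (this patch uses
	 * 2 for scheduler samples and 6 for syscalls); event_data points to
	 * the probe-specific struct event_spec.
	 */
	printk(KERN_DEBUG "chopstix: type=%u count=%u pid=%d\n",
	       ev->event_type, count, ev->task ? ev->task->pid : -1);
}

static int __init chopstix_sketch_init(void)
{
	rec_event = chopstix_sketch_handler;	/* probe sites fire only while non-NULL */
	return 0;
}

static void __exit chopstix_sketch_exit(void)
{
	/* Note: no synchronization against in-flight probes; a real module must handle that. */
	rec_event = NULL;
}

module_init(chopstix_sketch_init);
module_exit(chopstix_sketch_exit);
MODULE_LICENSE("GPL");

Every probe site checks rec_event for NULL first (testl %ecx, %ecx / jz carry_on in entry_32.S, if (rec_event) elsewhere), so the hooks stay effectively free until such a module loads.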
1 diff -Nurb linux-2.6.27-590/arch/Kconfig linux-2.6.27-591/arch/Kconfig
2 --- linux-2.6.27-590/arch/Kconfig       2010-02-01 19:42:05.000000000 -0500
3 +++ linux-2.6.27-591/arch/Kconfig       2010-02-01 19:42:30.000000000 -0500
4 @@ -13,9 +13,18 @@
5  
6           If unsure, say N.
7  
8 +config CHOPSTIX
9 +       bool "Chopstix (PlanetLab)"
10 +       depends on MODULES && OPROFILE
11 +       help
12 +         Chopstix allows you to monitor various events by summarizing them
13 +         in lossy data structures and transferring these data structures
14 +         into user space. If unsure, say N.
15 +
16  config HAVE_OPROFILE
17         def_bool n
18  
19 +
20  config KPROBES
21         bool "Kprobes"
22         depends on KALLSYMS && MODULES
23 diff -Nurb linux-2.6.27-590/arch/x86/kernel/asm-offsets_32.c linux-2.6.27-591/arch/x86/kernel/asm-offsets_32.c
24 --- linux-2.6.27-590/arch/x86/kernel/asm-offsets_32.c   2008-10-09 18:13:53.000000000 -0400
25 +++ linux-2.6.27-591/arch/x86/kernel/asm-offsets_32.c   2010-02-01 19:42:30.000000000 -0500
26 @@ -9,6 +9,7 @@
27  #include <linux/signal.h>
28  #include <linux/personality.h>
29  #include <linux/suspend.h>
30 +#include <linux/arrays.h>
31  #include <linux/kbuild.h>
32  #include <asm/ucontext.h>
33  #include "sigframe.h"
34 @@ -24,9 +25,20 @@
35  #include <linux/lguest.h>
36  #include "../../../drivers/lguest/lg.h"
37  
38 +
39 +#define STACKOFFSET(sym, str, mem) \
40 +       DEFINE(sym, offsetof(struct str, mem)-sizeof(struct str));
41 +
42  /* workaround for a warning with -Wmissing-prototypes */
43  void foo(void);
44  
45 +struct event_spec {
46 +       unsigned long pc;
47 +       unsigned long dcookie;
48 +       unsigned count;
49 +       unsigned int number;
50 +};
51 +
52  void foo(void)
53  {
54         OFFSET(IA32_SIGCONTEXT_ax, sigcontext, ax);
55 @@ -50,6 +62,16 @@
56         OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
57         BLANK();
58  
59 +    STACKOFFSET(TASK_thread, task_struct, thread);
60 +    STACKOFFSET(THREAD_esp, thread_struct, sp);
61 +    STACKOFFSET(EVENT_event_data, event, event_data);
62 +    STACKOFFSET(EVENT_task, event, task);
63 +    STACKOFFSET(EVENT_event_type, event, event_type);
64 +    STACKOFFSET(SPEC_number, event_spec, number);
65 +    DEFINE(EVENT_SIZE, sizeof(struct event));
66 +    DEFINE(SPEC_SIZE, sizeof(struct event_spec));
67 +    DEFINE(SPEC_EVENT_SIZE, sizeof(struct event_spec)+sizeof(struct event));
68 +
69         OFFSET(TI_task, thread_info, task);
70         OFFSET(TI_exec_domain, thread_info, exec_domain);
71         OFFSET(TI_flags, thread_info, flags);
72 diff -Nurb linux-2.6.27-590/arch/x86/kernel/entry_32.S linux-2.6.27-591/arch/x86/kernel/entry_32.S
73 --- linux-2.6.27-590/arch/x86/kernel/entry_32.S 2008-10-09 18:13:53.000000000 -0400
74 +++ linux-2.6.27-591/arch/x86/kernel/entry_32.S 2010-02-01 19:42:30.000000000 -0500
75 @@ -426,6 +426,33 @@
76         cmpl $(nr_syscalls), %eax
77         jae syscall_badsys
78  syscall_call:
79 +    /* Chopstix syscall probe */
80 +    /* Save and clobber: eax, ecx, ebp  */
81 +    pushl   %eax
82 +    pushl   %ecx
83 +    pushl   %ebp
84 +    movl    %esp, %ebp
85 +    subl    $SPEC_EVENT_SIZE, %esp 
86 +    movl    rec_event, %ecx
87 +    testl   %ecx, %ecx
88 +    jz  carry_on
89 +    # struct event is first, just below %ebp
90 +    movl    %eax, (SPEC_number-EVENT_SIZE)(%ebp)
91 +    leal    -SPEC_EVENT_SIZE(%ebp), %eax
92 +    movl    %eax, EVENT_event_data(%ebp)
93 +    movl    $6, EVENT_event_type(%ebp)
94 +    movl    rec_event, %edx
95 +    movl    $1, 4(%esp)
96 +    leal    -EVENT_SIZE(%ebp), %eax
97 +    movl    %eax, (%esp)
98 +    call    rec_event_asm 
99 +carry_on: 
100 +    addl $SPEC_EVENT_SIZE, %esp
101 +    popl %ebp
102 +    popl %ecx
103 +    popl %eax
104 +     /* End chopstix */
105 +
106         call *sys_call_table(,%eax,4)
107         movl %eax,PT_EAX(%esp)          # store the return value
108  syscall_exit:
109 diff -Nurb linux-2.6.27-590/arch/x86/mm/fault.c linux-2.6.27-591/arch/x86/mm/fault.c
110 --- linux-2.6.27-590/arch/x86/mm/fault.c        2010-02-01 19:42:05.000000000 -0500
111 +++ linux-2.6.27-591/arch/x86/mm/fault.c        2010-02-01 19:42:30.000000000 -0500
112 @@ -79,6 +79,15 @@
113  #endif
114  }
115  
116 +
117 +extern void (*rec_event)(void *,unsigned int);
118 +struct event_spec {
119 +       unsigned long pc;
120 +       unsigned long dcookie; 
121 +       unsigned count;
122 +       unsigned char reason;
123 +};
124 +
125  /*
126   * X86_32
127   * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
128 diff -Nurb linux-2.6.27-590/drivers/oprofile/cpu_buffer.c linux-2.6.27-591/drivers/oprofile/cpu_buffer.c
129 --- linux-2.6.27-590/drivers/oprofile/cpu_buffer.c      2008-10-09 18:13:53.000000000 -0400
130 +++ linux-2.6.27-591/drivers/oprofile/cpu_buffer.c      2010-02-01 19:42:30.000000000 -0500
131 @@ -21,6 +21,7 @@
132  #include <linux/oprofile.h>
133  #include <linux/vmalloc.h>
134  #include <linux/errno.h>
135 +#include <linux/arrays.h>
136   
137  #include "event_buffer.h"
138  #include "cpu_buffer.h"
139 @@ -147,6 +148,17 @@
140                 b->head_pos = 0;
141  }
142  
143 +#ifdef CONFIG_CHOPSTIX
144 +
145 +struct event_spec {
146 +       unsigned int pc;
147 +       unsigned long dcookie;
148 +       unsigned count;
149 +};
150 +
151 +extern void (*rec_event)(void *,unsigned int);
152 +#endif
153 +
154  static inline void
155  add_sample(struct oprofile_cpu_buffer * cpu_buf,
156             unsigned long pc, unsigned long event)
157 @@ -155,6 +167,7 @@
158         entry->eip = pc;
159         entry->event = event;
160         increment_head(cpu_buf);
161 +
162  }
163  
164  static inline void
165 @@ -250,8 +263,28 @@
166  {
167         int is_kernel = !user_mode(regs);
168         unsigned long pc = profile_pc(regs);
169 +       int res=0;
170  
171 +#ifdef CONFIG_CHOPSTIX
172 +       if (rec_event) {
173 +               struct event esig;
174 +               struct event_spec espec;
175 +               esig.task = current;
176 +               espec.pc=pc;
177 +               espec.count=1;
178 +               esig.event_data=&espec;
179 +               esig.event_type=event; /* index in the event array currently set up */
180 +                                       /* make sure the counters are loaded in the order we want them to show up*/ 
181 +               (*rec_event)(&esig, 1);
182 +       }
183 +       else {
184         oprofile_add_ext_sample(pc, regs, event, is_kernel);
185 +       }
186 +#else
187 +       oprofile_add_ext_sample(pc, regs, event, is_kernel);
188 +#endif
189 +
190 +
191  }
192  
193  void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event)
194 diff -Nurb linux-2.6.27-590/fs/bio.c linux-2.6.27-591/fs/bio.c
195 --- linux-2.6.27-590/fs/bio.c   2008-10-09 18:13:53.000000000 -0400
196 +++ linux-2.6.27-591/fs/bio.c   2010-02-01 19:42:30.000000000 -0500
197 @@ -27,6 +27,7 @@
198  #include <linux/workqueue.h>
199  #include <linux/blktrace_api.h>
200  #include <scsi/sg.h>           /* for struct sg_iovec */
201 +#include <linux/arrays.h>
202  
203  static struct kmem_cache *bio_slab __read_mostly;
204  
205 @@ -44,6 +45,7 @@
206  };
207  #undef BV
208  
209 +
210  /*
211   * fs_bio_set is the bio_set containing bio and iovec memory pools used by
212   * IO code that does not need private memory pools.
213 @@ -1171,6 +1173,14 @@
214         }
215  }
216  
217 +struct event_spec {
218 +       unsigned long pc;
219 +       unsigned long dcookie;
220 +       unsigned count;
221 +       unsigned char reason;
222 +};
223 +
224 +extern void (*rec_event)(void *,unsigned int);
225  /**
226   * bio_endio - end I/O on a bio
227   * @bio:       bio
228 @@ -1192,6 +1202,24 @@
229         else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
230                 error = -EIO;
231  
232 +#if 0
233 +               if (rec_event) {
234 +                       struct event event;
235 +                       struct event_spec espec;
236 +                       unsigned long eip;
237 +                       
238 +                       espec.reason = 1;/*response */
239 +
240 +                       eip = bio->bi_end_io;
241 +                       event.event_data=&espec;
242 +                       espec.pc=eip;
243 +                       event.event_type=3; 
244 +                       /* index in the event array currently set up */
245 +                       /* make sure the counters are loaded in the order we want them to show up*/ 
246 +                       (*rec_event)(&event, bytes_done);
247 +               }
248 +#endif
249 +
250         if (bio->bi_end_io)
251                 bio->bi_end_io(bio, error);
252  }
253 diff -Nurb linux-2.6.27-590/fs/exec.c linux-2.6.27-591/fs/exec.c
254 --- linux-2.6.27-590/fs/exec.c  2010-02-01 19:42:07.000000000 -0500
255 +++ linux-2.6.27-591/fs/exec.c  2010-02-01 19:42:31.000000000 -0500
256 @@ -27,6 +27,7 @@
257  #include <linux/fdtable.h>
258  #include <linux/mm.h>
259  #include <linux/stat.h>
260 +#include <linux/dcookies.h>
261  #include <linux/fcntl.h>
262  #include <linux/smp_lock.h>
263  #include <linux/swap.h>
264 @@ -698,6 +699,13 @@
265                 goto out;
266         }
267  
268 + #ifdef CONFIG_CHOPSTIX
269 +    unsigned long cookie;
270 +    extern void (*rec_event)(void *, unsigned int);
271 +    if (rec_event && !nd.path.dentry->d_cookie)
272 +        get_dcookie(&nd.path, &cookie);
273 + #endif
274 +
275         return file;
276  
277   out_path_put:
278 diff -Nurb linux-2.6.27-590/include/linux/arrays.h linux-2.6.27-591/include/linux/arrays.h
279 --- linux-2.6.27-590/include/linux/arrays.h     1969-12-31 19:00:00.000000000 -0500
280 +++ linux-2.6.27-591/include/linux/arrays.h     2010-02-01 19:42:31.000000000 -0500
281 @@ -0,0 +1,36 @@
282 +#ifndef __ARRAYS_H__
283 +#define __ARRAYS_H__
284 +#include <linux/list.h>
285 +
286 +#define SAMPLING_METHOD_DEFAULT 0
287 +#define SAMPLING_METHOD_LOG 1
288 +
289 +/* Every probe has an array handler */
290 +
291 +/* XXX - Optimize this structure */
292 +
293 +extern void (*rec_event)(void *,unsigned int);
294 +struct array_handler {
295 +       struct list_head link;
296 +       unsigned int (*hash_func)(void *);
297 +       unsigned int (*sampling_func)(void *,int,void *);
298 +       unsigned short size;
299 +       unsigned int threshold;
300 +       unsigned char **expcount;
301 +       unsigned int sampling_method;
302 +       unsigned int **arrays;
303 +       unsigned int arraysize;
304 +       unsigned int num_samples[2];
305 +       void **epoch_samples; /* size-sized lists of samples */
306 +       unsigned int (*serialize)(void *, void *);
307 +       unsigned char code[5];
308 +};
309 +
310 +struct event {
311 +       struct list_head link;
312 +       void *event_data;
313 +       unsigned int count;
314 +       unsigned int event_type;
315 +       struct task_struct *task;
316 +};
317 +#endif
318 diff -Nurb linux-2.6.27-590/include/linux/sched.h linux-2.6.27-591/include/linux/sched.h
319 --- linux-2.6.27-590/include/linux/sched.h      2010-02-01 19:42:07.000000000 -0500
320 +++ linux-2.6.27-591/include/linux/sched.h      2010-02-01 19:47:30.000000000 -0500
321 @@ -1133,6 +1133,11 @@
322         cputime_t utime, stime, utimescaled, stimescaled;
323         cputime_t gtime;
324         cputime_t prev_utime, prev_stime;
325 +
326 +    #ifdef CONFIG_CHOPSTIX
327 +            unsigned long last_interrupted, last_ran_j;
328 +    #endif
329 +
330         unsigned long nvcsw, nivcsw; /* context switch counts */
331         struct timespec start_time;             /* monotonic time */
332         struct timespec real_start_time;        /* boot based time */
333 diff -Nurb linux-2.6.27-590/include/linux/sched.h.rej linux-2.6.27-591/include/linux/sched.h.rej
334 --- linux-2.6.27-590/include/linux/sched.h.rej  1969-12-31 19:00:00.000000000 -0500
335 +++ linux-2.6.27-591/include/linux/sched.h.rej  2010-02-01 19:42:31.000000000 -0500
336 @@ -0,0 +1,19 @@
337 +***************
338 +*** 850,855 ****
339 +  #endif
340 +       unsigned long sleep_avg;
341 +       unsigned long long timestamp, last_ran;
342 +       unsigned long long sched_time; /* sched_clock time spent running */
343 +       enum sleep_type sleep_type;
344 +  
345 +--- 850,859 ----
346 +  #endif
347 +       unsigned long sleep_avg;
348 +       unsigned long long timestamp, last_ran;
349 ++ #ifdef CONFIG_CHOPSTIX
350 ++      unsigned long last_interrupted, last_ran_j;
351 ++ #endif
352 ++ 
353 +       unsigned long long sched_time; /* sched_clock time spent running */
354 +       enum sleep_type sleep_type;
355 +  
356 diff -Nurb linux-2.6.27-590/kernel/sched.c linux-2.6.27-591/kernel/sched.c
357 --- linux-2.6.27-590/kernel/sched.c     2010-02-01 19:42:07.000000000 -0500
358 +++ linux-2.6.27-591/kernel/sched.c     2010-02-01 19:47:30.000000000 -0500
359 @@ -10,7 +10,7 @@
360   *  1998-11-19 Implemented schedule_timeout() and related stuff
361   *             by Andrea Arcangeli
362   *  2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
363 - *             hybrid priority-list and round-robin design with
364 + *             hybrid priority-list and round-robin design with
365   *             an array-switch method of distributing timeslices
366   *             and per-CPU runqueues.  Cleanups and useful suggestions
367   *             by Davide Libenzi, preemptible kernel bits by Robert Love.
368 @@ -73,12 +73,16 @@
369  #include <linux/ftrace.h>
370  #include <linux/vs_sched.h>
371  #include <linux/vs_cvirt.h>
372 +#include <linux/arrays.h>
373  
374  #include <asm/tlb.h>
375  #include <asm/irq_regs.h>
376  
377  #include "sched_cpupri.h"
378  
379 +#define INTERRUPTIBLE   -1
380 +#define RUNNING         0
381 +
382  /*
383   * Convert user-nice values [ -20 ... 0 ... 19 ]
384   * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
385 @@ -2368,6 +2372,10 @@
386         INIT_HLIST_HEAD(&p->preempt_notifiers);
387  #endif
388  
389 +#ifdef CONFIG_CHOPSTIX
390 +    p->last_ran_j = jiffies;
391 +    p->last_interrupted = INTERRUPTIBLE;
392 +#endif
393         /*
394          * We mark the process as running here, but have not actually
395          * inserted it onto the runqueue yet. This guarantees that
396 @@ -4428,6 +4436,29 @@
397         }
398  }
399  
400 +void (*rec_event)(void *,unsigned int) = NULL;
401 +EXPORT_SYMBOL(rec_event);
402 +#ifdef CONFIG_CHOPSTIX
403 +
404 +struct event_spec {
405 +    unsigned long pc;
406 +    unsigned long dcookie;
407 +    unsigned int count;
408 +    unsigned int reason;
409 +};
410 +
411 +/* To support safe calling from asm */
412 +asmlinkage void rec_event_asm (struct event *event_signature_in, unsigned int count) {
413 +    struct pt_regs *regs;
414 +    struct event_spec *es = event_signature_in->event_data;
415 +    regs = task_pt_regs(current);
416 +    event_signature_in->task=current;
417 +    es->pc=regs->ip;
418 +    event_signature_in->count=1;
419 +    (*rec_event)(event_signature_in, count);
420 +}
421 +#endif
422 +
423  /*
424   * schedule() is the main scheduler function.
425   */
426 @@ -4482,6 +4513,61 @@
427         next = pick_next_task(rq, prev);
428  
429         if (likely(prev != next)) {
430 +
431 +#ifdef CONFIG_CHOPSTIX
432 +       /* Run only if the Chopstix module so decrees it */
433 +       if (rec_event) {
434 +        unsigned long diff;
435 +        int sampling_reason;
436 +               prev->last_ran_j = jiffies;
437 +               if (next->last_interrupted!=INTERRUPTIBLE) {
438 +                       if (next->last_interrupted!=RUNNING) {
439 +                               diff = (jiffies-next->last_interrupted);
440 +                               sampling_reason = 0;/* BLOCKING */
441 +                       }
442 +                       else {
443 +                               diff = jiffies-next->last_ran_j; 
444 +                               sampling_reason = 1;/* PREEMPTION */
445 +                       }
446 +
447 +                       if (diff >= HZ/10) {
448 +                struct event_spec {
449 +                   unsigned long pc;
450 +                   unsigned long dcookie;
451 +                   unsigned int count;
452 +                   unsigned int reason;
453 +                };
454 +
455 +                               struct event event;
456 +                               struct event_spec espec;
457 +                struct pt_regs *regs;
458 +                regs = task_pt_regs(current);
459 +
460 +                               espec.reason = sampling_reason;
461 +                               event.event_data=&espec;
462 +                               event.task=next;
463 +                               espec.pc=regs->ip;
464 +                               event.event_type=2; 
465 +                               /* index in the event array currently set up */
466 +                               /* make sure the counters are loaded in the order we want them to show up*/ 
467 +                               (*rec_event)(&event, diff);
468 +                       }
469 +               }
470 +        /* next has been elected to run */
471 +               next->last_interrupted=0;
472 +
473 +        /* An uninterruptible process just yielded. Record the current jiffy */
474 +        if (prev->state & TASK_UNINTERRUPTIBLE) {
475 +            prev->last_interrupted=jiffies;
476 +        }
477 +         /* An interruptible process just yielded, or it got preempted. 
478 +          * Mark it as interruptible */
479 +        else if (prev->state & TASK_INTERRUPTIBLE) {
480 +            prev->last_interrupted=INTERRUPTIBLE;
481 +        }
482 +       }
483 +#endif
484 +
485                 sched_info_switch(prev, next);
486  
487                 rq->nr_switches++;
488 @@ -5369,6 +5455,7 @@
489         get_task_struct(p);
490         read_unlock(&tasklist_lock);
491  
492 +
493         retval = -EPERM;
494         if ((current->euid != p->euid) && (current->euid != p->uid) &&
495                         !capable(CAP_SYS_NICE))
496 diff -Nurb linux-2.6.27-590/kernel/sched.c.orig linux-2.6.27-591/kernel/sched.c.orig
497 --- linux-2.6.27-590/kernel/sched.c.orig        1969-12-31 19:00:00.000000000 -0500
498 +++ linux-2.6.27-591/kernel/sched.c.orig        2010-02-01 19:43:07.000000000 -0500
499 @@ -0,0 +1,9326 @@
500 +/*
501 + *  kernel/sched.c
502 + *
503 + *  Kernel scheduler and related syscalls
504 + *
505 + *  Copyright (C) 1991-2002  Linus Torvalds
506 + *
507 + *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
508 + *             make semaphores SMP safe
509 + *  1998-11-19 Implemented schedule_timeout() and related stuff
510 + *             by Andrea Arcangeli
511 + *  2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
512 + *             hybrid priority-list and round-robin deventn with
513 + *             an array-switch method of distributing timeslices
514 + *             and per-CPU runqueues.  Cleanups and useful suggestions
515 + *             by Davide Libenzi, preemptible kernel bits by Robert Love.
516 + *  2003-09-03 Interactivity tuning by Con Kolivas.
517 + *  2004-04-02 Scheduler domains code by Nick Piggin
518 + *  2007-04-15  Work begun on replacing all interactivity tuning with a
519 + *              fair scheduling design by Con Kolivas.
520 + *  2007-05-05  Load balancing (smp-nice) and other improvements
521 + *              by Peter Williams
522 + *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
523 + *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
524 + *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
525 + *              Thomas Gleixner, Mike Kravetz
526 + */
527 +
528 +#include <linux/mm.h>
529 +#include <linux/module.h>
530 +#include <linux/nmi.h>
531 +#include <linux/init.h>
532 +#include <linux/uaccess.h>
533 +#include <linux/highmem.h>
534 +#include <linux/smp_lock.h>
535 +#include <asm/mmu_context.h>
536 +#include <linux/interrupt.h>
537 +#include <linux/capability.h>
538 +#include <linux/completion.h>
539 +#include <linux/kernel_stat.h>
540 +#include <linux/debug_locks.h>
541 +#include <linux/security.h>
542 +#include <linux/notifier.h>
543 +#include <linux/profile.h>
544 +#include <linux/freezer.h>
545 +#include <linux/vmalloc.h>
546 +#include <linux/blkdev.h>
547 +#include <linux/delay.h>
548 +#include <linux/pid_namespace.h>
549 +#include <linux/smp.h>
550 +#include <linux/threads.h>
551 +#include <linux/timer.h>
552 +#include <linux/rcupdate.h>
553 +#include <linux/cpu.h>
554 +#include <linux/cpuset.h>
555 +#include <linux/percpu.h>
556 +#include <linux/kthread.h>
557 +#include <linux/seq_file.h>
558 +#include <linux/sysctl.h>
559 +#include <linux/syscalls.h>
560 +#include <linux/times.h>
561 +#include <linux/tsacct_kern.h>
562 +#include <linux/kprobes.h>
563 +#include <linux/delayacct.h>
564 +#include <linux/reciprocal_div.h>
565 +#include <linux/unistd.h>
566 +#include <linux/pagemap.h>
567 +#include <linux/hrtimer.h>
568 +#include <linux/tick.h>
569 +#include <linux/bootmem.h>
570 +#include <linux/debugfs.h>
571 +#include <linux/ctype.h>
572 +#include <linux/ftrace.h>
573 +#include <linux/vs_sched.h>
574 +#include <linux/vs_cvirt.h>
575 +#include <linux/arrays.h>
576 +
577 +#include <asm/tlb.h>
578 +#include <asm/irq_regs.h>
579 +
580 +#include "sched_cpupri.h"
581 +
582 +#define INTERRUPTIBLE   -1
583 +#define RUNNING         0
584 +
585 +/*
586 + * Convert user-nice values [ -20 ... 0 ... 19 ]
587 + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
588 + * and back.
589 + */
590 +#define NICE_TO_PRIO(nice)     (MAX_RT_PRIO + (nice) + 20)
591 +#define PRIO_TO_NICE(prio)     ((prio) - MAX_RT_PRIO - 20)
592 +#define TASK_NICE(p)           PRIO_TO_NICE((p)->static_prio)
593 +
594 +/*
595 + * 'User priority' is the nice value converted to something we
596 + * can work with better when scaling various scheduler parameters,
597 + * it's a [ 0 ... 39 ] range.
598 + */
599 +#define USER_PRIO(p)           ((p)-MAX_RT_PRIO)
600 +#define TASK_USER_PRIO(p)      USER_PRIO((p)->static_prio)
601 +#define MAX_USER_PRIO          (USER_PRIO(MAX_PRIO))
602 +
603 +/*
604 + * Helpers for converting nanosecond timing to jiffy resolution
605 + */
606 +#define NS_TO_JIFFIES(TIME)    ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
607 +
608 +#define NICE_0_LOAD            SCHED_LOAD_SCALE
609 +#define NICE_0_SHIFT           SCHED_LOAD_SHIFT
610 +
611 +/*
612 + * These are the 'tuning knobs' of the scheduler:
613 + *
614 + * default timeslice is 100 msecs (used only for SCHED_RR tasks).
615 + * Timeslices get refilled after they expire.
616 + */
617 +#define DEF_TIMESLICE          (100 * HZ / 1000)
618 +
619 +/*
620 + * single value that denotes runtime == period, ie unlimited time.
621 + */
622 +#define RUNTIME_INF    ((u64)~0ULL)
623 +
624 +#ifdef CONFIG_SMP
625 +/*
626 + * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
627 + * Since cpu_power is a 'constant', we can use a reciprocal divide.
628 + */
629 +static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
630 +{
631 +       return reciprocal_divide(load, sg->reciprocal_cpu_power);
632 +}
633 +
634 +/*
635 + * Each time a sched group cpu_power is changed,
636 + * we must compute its reciprocal value
637 + */
638 +static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
639 +{
640 +       sg->__cpu_power += val;
641 +       sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
642 +}
643 +#endif
644 +
645 +static inline int rt_policy(int policy)
646 +{
647 +       if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
648 +               return 1;
649 +       return 0;
650 +}
651 +
652 +static inline int task_has_rt_policy(struct task_struct *p)
653 +{
654 +       return rt_policy(p->policy);
655 +}
656 +
657 +/*
658 + * This is the priority-queue data structure of the RT scheduling class:
659 + */
660 +struct rt_prio_array {
661 +       DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
662 +       struct list_head queue[MAX_RT_PRIO];
663 +};
664 +
665 +struct rt_bandwidth {
666 +       /* nests inside the rq lock: */
667 +       spinlock_t              rt_runtime_lock;
668 +       ktime_t                 rt_period;
669 +       u64                     rt_runtime;
670 +       struct hrtimer          rt_period_timer;
671 +};
672 +
673 +static struct rt_bandwidth def_rt_bandwidth;
674 +
675 +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
676 +
677 +static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
678 +{
679 +       struct rt_bandwidth *rt_b =
680 +               container_of(timer, struct rt_bandwidth, rt_period_timer);
681 +       ktime_t now;
682 +       int overrun;
683 +       int idle = 0;
684 +
685 +       for (;;) {
686 +               now = hrtimer_cb_get_time(timer);
687 +               overrun = hrtimer_forward(timer, now, rt_b->rt_period);
688 +
689 +               if (!overrun)
690 +                       break;
691 +
692 +               idle = do_sched_rt_period_timer(rt_b, overrun);
693 +       }
694 +
695 +       return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
696 +}
697 +
698 +static
699 +void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
700 +{
701 +       rt_b->rt_period = ns_to_ktime(period);
702 +       rt_b->rt_runtime = runtime;
703 +
704 +       spin_lock_init(&rt_b->rt_runtime_lock);
705 +
706 +       hrtimer_init(&rt_b->rt_period_timer,
707 +                       CLOCK_MONOTONIC, HRTIMER_MODE_REL);
708 +       rt_b->rt_period_timer.function = sched_rt_period_timer;
709 +       rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
710 +}
711 +
712 +static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
713 +{
714 +       ktime_t now;
715 +
716 +       if (rt_b->rt_runtime == RUNTIME_INF)
717 +               return;
718 +
719 +       if (hrtimer_active(&rt_b->rt_period_timer))
720 +               return;
721 +
722 +       spin_lock(&rt_b->rt_runtime_lock);
723 +       for (;;) {
724 +               if (hrtimer_active(&rt_b->rt_period_timer))
725 +                       break;
726 +
727 +               now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
728 +               hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
729 +               hrtimer_start(&rt_b->rt_period_timer,
730 +                             rt_b->rt_period_timer.expires,
731 +                             HRTIMER_MODE_ABS);
732 +       }
733 +       spin_unlock(&rt_b->rt_runtime_lock);
734 +}
735 +
736 +#ifdef CONFIG_RT_GROUP_SCHED
737 +static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
738 +{
739 +       hrtimer_cancel(&rt_b->rt_period_timer);
740 +}
741 +#endif
742 +
743 +/*
744 + * sched_domains_mutex serializes calls to arch_init_sched_domains,
745 + * detach_destroy_domains and partition_sched_domains.
746 + */
747 +static DEFINE_MUTEX(sched_domains_mutex);
748 +
749 +#ifdef CONFIG_GROUP_SCHED
750 +
751 +#include <linux/cgroup.h>
752 +
753 +struct cfs_rq;
754 +
755 +static LIST_HEAD(task_groups);
756 +
757 +/* task group related information */
758 +struct task_group {
759 +#ifdef CONFIG_CGROUP_SCHED
760 +       struct cgroup_subsys_state css;
761 +#endif
762 +
763 +#ifdef CONFIG_FAIR_GROUP_SCHED
764 +       /* schedulable entities of this group on each cpu */
765 +       struct sched_entity **se;
766 +       /* runqueue "owned" by this group on each cpu */
767 +       struct cfs_rq **cfs_rq;
768 +       unsigned long shares;
769 +#endif
770 +
771 +#ifdef CONFIG_RT_GROUP_SCHED
772 +       struct sched_rt_entity **rt_se;
773 +       struct rt_rq **rt_rq;
774 +
775 +       struct rt_bandwidth rt_bandwidth;
776 +#endif
777 +
778 +       struct rcu_head rcu;
779 +       struct list_head list;
780 +
781 +       struct task_group *parent;
782 +       struct list_head siblings;
783 +       struct list_head children;
784 +};
785 +
786 +#ifdef CONFIG_USER_SCHED
787 +
788 +/*
789 + * Root task group.
790 + *     Every UID task group (including init_task_group aka UID-0) will
791 + *     be a child to this group.
792 + */
793 +struct task_group root_task_group;
794 +
795 +#ifdef CONFIG_FAIR_GROUP_SCHED
796 +/* Default task group's sched entity on each cpu */
797 +static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
798 +/* Default task group's cfs_rq on each cpu */
799 +static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
800 +#endif /* CONFIG_FAIR_GROUP_SCHED */
801 +
802 +#ifdef CONFIG_RT_GROUP_SCHED
803 +static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
804 +static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
805 +#endif /* CONFIG_RT_GROUP_SCHED */
806 +#else /* !CONFIG_FAIR_GROUP_SCHED */
807 +#define root_task_group init_task_group
808 +#endif /* CONFIG_FAIR_GROUP_SCHED */
809 +
810 +/* task_group_lock serializes add/remove of task groups and also changes to
811 + * a task group's cpu shares.
812 + */
813 +static DEFINE_SPINLOCK(task_group_lock);
814 +
815 +#ifdef CONFIG_FAIR_GROUP_SCHED
816 +#ifdef CONFIG_USER_SCHED
817 +# define INIT_TASK_GROUP_LOAD  (2*NICE_0_LOAD)
818 +#else /* !CONFIG_USER_SCHED */
819 +# define INIT_TASK_GROUP_LOAD  NICE_0_LOAD
820 +#endif /* CONFIG_USER_SCHED */
821 +
822 +/*
823 + * A weight of 0 or 1 can cause arithmetics problems.
824 + * A weight of a cfs_rq is the sum of weights of which entities
825 + * are queued on this cfs_rq, so a weight of a entity should not be
826 + * too large, so as the shares value of a task group.
827 + * (The default weight is 1024 - so there's no practical
828 + *  limitation from this.)
829 + */
830 +#define MIN_SHARES     2
831 +#define MAX_SHARES     (1UL << 18)
832 +
833 +static int init_task_group_load = INIT_TASK_GROUP_LOAD;
834 +#endif
835 +
836 +/* Default task group.
837 + *     Every task in system belong to this group at bootup.
838 + */
839 +struct task_group init_task_group;
840 +
841 +/* return group to which a task belongs */
842 +static inline struct task_group *task_group(struct task_struct *p)
843 +{
844 +       struct task_group *tg;
845 +
846 +#ifdef CONFIG_USER_SCHED
847 +       tg = p->user->tg;
848 +#elif defined(CONFIG_CGROUP_SCHED)
849 +       tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
850 +                               struct task_group, css);
851 +#else
852 +       tg = &init_task_group;
853 +#endif
854 +       return tg;
855 +}
856 +
857 +/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
858 +static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
859 +{
860 +#ifdef CONFIG_FAIR_GROUP_SCHED
861 +       p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
862 +       p->se.parent = task_group(p)->se[cpu];
863 +#endif
864 +
865 +#ifdef CONFIG_RT_GROUP_SCHED
866 +       p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
867 +       p->rt.parent = task_group(p)->rt_se[cpu];
868 +#endif
869 +}
870 +
871 +#else
872 +
873 +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
874 +static inline struct task_group *task_group(struct task_struct *p)
875 +{
876 +       return NULL;
877 +}
878 +
879 +#endif /* CONFIG_GROUP_SCHED */
880 +
881 +/* CFS-related fields in a runqueue */
882 +struct cfs_rq {
883 +       struct load_weight load;
884 +       unsigned long nr_running;
885 +
886 +       u64 exec_clock;
887 +       u64 min_vruntime;
888 +       u64 pair_start;
889 +
890 +       struct rb_root tasks_timeline;
891 +       struct rb_node *rb_leftmost;
892 +
893 +       struct list_head tasks;
894 +       struct list_head *balance_iterator;
895 +
896 +       /*
897 +        * 'curr' points to currently running entity on this cfs_rq.
898 +        * It is set to NULL otherwise (i.e when none are currently running).
899 +        */
900 +       struct sched_entity *curr, *next;
901 +
902 +       unsigned long nr_spread_over;
903 +
904 +#ifdef CONFIG_FAIR_GROUP_SCHED
905 +       struct rq *rq;  /* cpu runqueue to which this cfs_rq is attached */
906 +
907 +       /*
908 +        * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
909 +        * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
910 +        * (like users, containers etc.)
911 +        *
912 +        * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
913 +        * list is used during load balance.
914 +        */
915 +       struct list_head leaf_cfs_rq_list;
916 +       struct task_group *tg;  /* group that "owns" this runqueue */
917 +
918 +#ifdef CONFIG_SMP
919 +       /*
920 +        * the part of load.weight contributed by tasks
921 +        */
922 +       unsigned long task_weight;
923 +
924 +       /*
925 +        *   h_load = weight * f(tg)
926 +        *
927 +        * Where f(tg) is the recursive weight fraction assigned to
928 +        * this group.
929 +        */
930 +       unsigned long h_load;
931 +
932 +       /*
933 +        * this cpu's part of tg->shares
934 +        */
935 +       unsigned long shares;
936 +
937 +       /*
938 +        * load.weight at the time we set shares
939 +        */
940 +       unsigned long rq_weight;
941 +#endif
942 +#endif
943 +};
944 +
945 +/* Real-Time classes' related field in a runqueue: */
946 +struct rt_rq {
947 +       struct rt_prio_array active;
948 +       unsigned long rt_nr_running;
949 +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
950 +       int highest_prio; /* highest queued rt task prio */
951 +#endif
952 +#ifdef CONFIG_SMP
953 +       unsigned long rt_nr_migratory;
954 +       int overloaded;
955 +#endif
956 +       int rt_throttled;
957 +       u64 rt_time;
958 +       u64 rt_runtime;
959 +       /* Nests inside the rq lock: */
960 +       spinlock_t rt_runtime_lock;
961 +
962 +#ifdef CONFIG_RT_GROUP_SCHED
963 +       unsigned long rt_nr_boosted;
964 +
965 +       struct rq *rq;
966 +       struct list_head leaf_rt_rq_list;
967 +       struct task_group *tg;
968 +       struct sched_rt_entity *rt_se;
969 +#endif
970 +};
971 +
972 +#ifdef CONFIG_SMP
973 +
974 +/*
975 + * We add the notion of a root-domain which will be used to define per-domain
976 + * variables. Each exclusive cpuset essentially defines an island domain by
977 + * fully partitioning the member cpus from any other cpuset. Whenever a new
978 + * exclusive cpuset is created, we also create and attach a new root-domain
979 + * object.
980 + *
981 + */
982 +struct root_domain {
983 +       atomic_t refcount;
984 +       cpumask_t span;
985 +       cpumask_t online;
986 +
987 +       /*
988 +        * The "RT overload" flag: it gets set if a CPU has more than
989 +        * one runnable RT task.
990 +        */
991 +       cpumask_t rto_mask;
992 +       atomic_t rto_count;
993 +#ifdef CONFIG_SMP
994 +       struct cpupri cpupri;
995 +#endif
996 +};
997 +
998 +/*
999 + * By default the system creates a single root-domain with all cpus as
1000 + * members (mimicking the global state we have today).
1001 + */
1002 +static struct root_domain def_root_domain;
1003 +
1004 +#endif
1005 +       unsigned long norm_time;
1006 +       unsigned long idle_time;
1007 +#ifdef CONFIG_VSERVER_IDLETIME
1008 +       int idle_skip;
1009 +#endif
1010 +#ifdef CONFIG_VSERVER_HARDCPU
1011 +       struct list_head hold_queue;
1012 +       unsigned long nr_onhold;
1013 +       int idle_tokens;
1014 +#endif
1015 +
1016 +/*
1017 + * This is the main, per-CPU runqueue data structure.
1018 + *
1019 + * Locking rule: those places that want to lock multiple runqueues
1020 + * (such as the load balancing or the thread migration code), lock
1021 + * acquire operations must be ordered by ascending &runqueue.
1022 + */
1023 +struct rq {
1024 +       /* runqueue lock: */
1025 +       spinlock_t lock;
1026 +
1027 +       /*
1028 +        * nr_running and cpu_load should be in the same cacheline because
1029 +        * remote CPUs use both these fields when doing load calculation.
1030 +        */
1031 +       unsigned long nr_running;
1032 +       #define CPU_LOAD_IDX_MAX 5
1033 +       unsigned long cpu_load[CPU_LOAD_IDX_MAX];
1034 +       unsigned char idle_at_tick;
1035 +#ifdef CONFIG_NO_HZ
1036 +       unsigned long last_tick_seen;
1037 +       unsigned char in_nohz_recently;
1038 +#endif
1039 +       /* capture load from *all* tasks on this cpu: */
1040 +       struct load_weight load;
1041 +       unsigned long nr_load_updates;
1042 +       u64 nr_switches;
1043 +
1044 +       struct cfs_rq cfs;
1045 +       struct rt_rq rt;
1046 +
1047 +#ifdef CONFIG_FAIR_GROUP_SCHED
1048 +       /* list of leaf cfs_rq on this cpu: */
1049 +       struct list_head leaf_cfs_rq_list;
1050 +#endif
1051 +#ifdef CONFIG_RT_GROUP_SCHED
1052 +       struct list_head leaf_rt_rq_list;
1053 +#endif
1054 +
1055 +       /*
1056 +        * This is part of a global counter where only the total sum
1057 +        * over all CPUs matters. A task can increase this counter on
1058 +        * one CPU and if it got migrated afterwards it may decrease
1059 +        * it on another CPU. Always updated under the runqueue lock:
1060 +        */
1061 +       unsigned long nr_uninterruptible;
1062 +
1063 +       struct task_struct *curr, *idle;
1064 +       unsigned long next_balance;
1065 +       struct mm_struct *prev_mm;
1066 +
1067 +       u64 clock;
1068 +
1069 +       atomic_t nr_iowait;
1070 +
1071 +#ifdef CONFIG_SMP
1072 +       struct root_domain *rd;
1073 +       struct sched_domain *sd;
1074 +
1075 +       /* For active balancing */
1076 +       int active_balance;
1077 +       int push_cpu;
1078 +       /* cpu of this runqueue: */
1079 +       int cpu;
1080 +       int online;
1081 +
1082 +       unsigned long avg_load_per_task;
1083 +
1084 +       struct task_struct *migration_thread;
1085 +       struct list_head migration_queue;
1086 +#endif
1087 +
1088 +#ifdef CONFIG_SCHED_HRTICK
1089 +#ifdef CONFIG_SMP
1090 +       int hrtick_csd_pending;
1091 +       struct call_single_data hrtick_csd;
1092 +#endif
1093 +       struct hrtimer hrtick_timer;
1094 +#endif
1095 +
1096 +#ifdef CONFIG_SCHEDSTATS
1097 +       /* latency stats */
1098 +       struct sched_info rq_sched_info;
1099 +
1100 +       /* sys_sched_yield() stats */
1101 +       unsigned int yld_exp_empty;
1102 +       unsigned int yld_act_empty;
1103 +       unsigned int yld_both_empty;
1104 +       unsigned int yld_count;
1105 +
1106 +       /* schedule() stats */
1107 +       unsigned int sched_switch;
1108 +       unsigned int sched_count;
1109 +       unsigned int sched_goidle;
1110 +
1111 +       /* try_to_wake_up() stats */
1112 +       unsigned int ttwu_count;
1113 +       unsigned int ttwu_local;
1114 +
1115 +       /* BKL stats */
1116 +       unsigned int bkl_count;
1117 +#endif
1118 +};
1119 +
1120 +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
1121 +
1122 +static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
1123 +{
1124 +       rq->curr->sched_class->check_preempt_curr(rq, p);
1125 +}
1126 +
1127 +static inline int cpu_of(struct rq *rq)
1128 +{
1129 +#ifdef CONFIG_SMP
1130 +       return rq->cpu;
1131 +#else
1132 +       return 0;
1133 +#endif
1134 +}
1135 +
1136 +/*
1137 + * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
1138 + * See detach_destroy_domains: synchronize_sched for details.
1139 + *
1140 + * The domain tree of any CPU may only be accessed from within
1141 + * preempt-disabled sections.
1142 + */
1143 +#define for_each_domain(cpu, __sd) \
1144 +       for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
1145 +
1146 +#define cpu_rq(cpu)            (&per_cpu(runqueues, (cpu)))
1147 +#define this_rq()              (&__get_cpu_var(runqueues))
1148 +#define task_rq(p)             cpu_rq(task_cpu(p))
1149 +#define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
1150 +
1151 +static inline void update_rq_clock(struct rq *rq)
1152 +{
1153 +       rq->clock = sched_clock_cpu(cpu_of(rq));
1154 +}
1155 +
1156 +/*
1157 + * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
1158 + */
1159 +#ifdef CONFIG_SCHED_DEBUG
1160 +# define const_debug __read_mostly
1161 +#else
1162 +# define const_debug static const
1163 +#endif
1164 +
1165 +/**
1166 + * runqueue_is_locked
1167 + *
1168 + * Returns true if the current cpu runqueue is locked.
1169 + * This interface allows printk to be called with the runqueue lock
1170 + * held and know whether or not it is OK to wake up the klogd.
1171 + */
1172 +int runqueue_is_locked(void)
1173 +{
1174 +       int cpu = get_cpu();
1175 +       struct rq *rq = cpu_rq(cpu);
1176 +       int ret;
1177 +
1178 +       ret = spin_is_locked(&rq->lock);
1179 +       put_cpu();
1180 +       return ret;
1181 +}
1182 +
1183 +/*
1184 + * Debugging: various feature bits
1185 + */
1186 +
1187 +#define SCHED_FEAT(name, enabled)      \
1188 +       __SCHED_FEAT_##name ,
1189 +
1190 +enum {
1191 +#include "sched_features.h"
1192 +};
1193 +
1194 +#undef SCHED_FEAT
1195 +
1196 +#define SCHED_FEAT(name, enabled)      \
1197 +       (1UL << __SCHED_FEAT_##name) * enabled |
1198 +
1199 +const_debug unsigned int sysctl_sched_features =
1200 +#include "sched_features.h"
1201 +       0;
1202 +
1203 +#undef SCHED_FEAT
1204 +
1205 +#ifdef CONFIG_SCHED_DEBUG
1206 +#define SCHED_FEAT(name, enabled)      \
1207 +       #name ,
1208 +
1209 +static __read_mostly char *sched_feat_names[] = {
1210 +#include "sched_features.h"
1211 +       NULL
1212 +};
1213 +
1214 +#undef SCHED_FEAT
1215 +
1216 +static int sched_feat_open(struct inode *inode, struct file *filp)
1217 +{
1218 +       filp->private_data = inode->i_private;
1219 +       return 0;
1220 +}
1221 +
1222 +static ssize_t
1223 +sched_feat_read(struct file *filp, char __user *ubuf,
1224 +               size_t cnt, loff_t *ppos)
1225 +{
1226 +       char *buf;
1227 +       int r = 0;
1228 +       int len = 0;
1229 +       int i;
1230 +
1231 +       for (i = 0; sched_feat_names[i]; i++) {
1232 +               len += strlen(sched_feat_names[i]);
1233 +               len += 4;
1234 +       }
1235 +
1236 +       buf = kmalloc(len + 2, GFP_KERNEL);
1237 +       if (!buf)
1238 +               return -ENOMEM;
1239 +
1240 +       for (i = 0; sched_feat_names[i]; i++) {
1241 +               if (sysctl_sched_features & (1UL << i))
1242 +                       r += sprintf(buf + r, "%s ", sched_feat_names[i]);
1243 +               else
1244 +                       r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
1245 +       }
1246 +
1247 +       r += sprintf(buf + r, "\n");
1248 +       WARN_ON(r >= len + 2);
1249 +
1250 +       r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
1251 +
1252 +       kfree(buf);
1253 +
1254 +       return r;
1255 +}
1256 +
1257 +static ssize_t
1258 +sched_feat_write(struct file *filp, const char __user *ubuf,
1259 +               size_t cnt, loff_t *ppos)
1260 +{
1261 +       char buf[64];
1262 +       char *cmp = buf;
1263 +       int neg = 0;
1264 +       int i;
1265 +
1266 +       if (cnt > 63)
1267 +               cnt = 63;
1268 +
1269 +       if (copy_from_user(&buf, ubuf, cnt))
1270 +               return -EFAULT;
1271 +
1272 +       buf[cnt] = 0;
1273 +
1274 +       if (strncmp(buf, "NO_", 3) == 0) {
1275 +               neg = 1;
1276 +               cmp += 3;
1277 +       }
1278 +
1279 +       for (i = 0; sched_feat_names[i]; i++) {
1280 +               int len = strlen(sched_feat_names[i]);
1281 +
1282 +               if (strncmp(cmp, sched_feat_names[i], len) == 0) {
1283 +                       if (neg)
1284 +                               sysctl_sched_features &= ~(1UL << i);
1285 +                       else
1286 +                               sysctl_sched_features |= (1UL << i);
1287 +                       break;
1288 +               }
1289 +       }
1290 +
1291 +       if (!sched_feat_names[i])
1292 +               return -EINVAL;
1293 +
1294 +       filp->f_pos += cnt;
1295 +
1296 +       return cnt;
1297 +}
1298 +
1299 +static struct file_operations sched_feat_fops = {
1300 +       .open   = sched_feat_open,
1301 +       .read   = sched_feat_read,
1302 +       .write  = sched_feat_write,
1303 +};
1304 +
1305 +static __init int sched_init_debug(void)
1306 +{
1307 +       debugfs_create_file("sched_features", 0644, NULL, NULL,
1308 +                       &sched_feat_fops);
1309 +
1310 +       return 0;
1311 +}
1312 +late_initcall(sched_init_debug);
1313 +
1314 +#endif
1315 +
1316 +#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
1317 +
1318 +/*
1319 + * Number of tasks to iterate in a single balance run.
1320 + * Limited because this is done with IRQs disabled.
1321 + */
1322 +const_debug unsigned int sysctl_sched_nr_migrate = 32;
1323 +
1324 +/*
1325 + * ratelimit for updating the group shares.
1326 + * default: 0.25ms
1327 + */
1328 +unsigned int sysctl_sched_shares_ratelimit = 250000;
1329 +
1330 +/*
1331 + * period over which we measure -rt task cpu usage in us.
1332 + * default: 1s
1333 + */
1334 +unsigned int sysctl_sched_rt_period = 1000000;
1335 +
1336 +static __read_mostly int scheduler_running;
1337 +
1338 +/*
1339 + * part of the period that we allow rt tasks to run in us.
1340 + * default: 0.95s
1341 + */
1342 +int sysctl_sched_rt_runtime = 950000;
1343 +
1344 +static inline u64 global_rt_period(void)
1345 +{
1346 +       return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
1347 +}
1348 +
1349 +static inline u64 global_rt_runtime(void)
1350 +{
1351 +       if (sysctl_sched_rt_runtime < 0)
1352 +               return RUNTIME_INF;
1353 +
1354 +       return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
1355 +}
1356 +
1357 +#ifndef prepare_arch_switch
1358 +# define prepare_arch_switch(next)     do { } while (0)
1359 +#endif
1360 +#ifndef finish_arch_switch
1361 +# define finish_arch_switch(prev)      do { } while (0)
1362 +#endif
1363 +
1364 +static inline int task_current(struct rq *rq, struct task_struct *p)
1365 +{
1366 +       return rq->curr == p;
1367 +}
1368 +
1369 +#ifndef __ARCH_WANT_UNLOCKED_CTXSW
1370 +static inline int task_running(struct rq *rq, struct task_struct *p)
1371 +{
1372 +       return task_current(rq, p);
1373 +}
1374 +
1375 +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
1376 +{
1377 +}
1378 +
1379 +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1380 +{
1381 +#ifdef CONFIG_DEBUG_SPINLOCK
1382 +       /* this is a valid case when another task releases the spinlock */
1383 +       rq->lock.owner = current;
1384 +#endif
1385 +       /*
1386 +        * If we are tracking spinlock dependencies then we have to
1387 +        * fix up the runqueue lock - which gets 'carried over' from
1388 +        * prev into current:
1389 +        */
1390 +       spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
1391 +
1392 +       spin_unlock_irq(&rq->lock);
1393 +}
1394 +
1395 +#else /* __ARCH_WANT_UNLOCKED_CTXSW */
1396 +static inline int task_running(struct rq *rq, struct task_struct *p)
1397 +{
1398 +#ifdef CONFIG_SMP
1399 +       return p->oncpu;
1400 +#else
1401 +       return task_current(rq, p);
1402 +#endif
1403 +}
1404 +
1405 +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
1406 +{
1407 +#ifdef CONFIG_SMP
1408 +       /*
1409 +        * We can optimise this out completely for !SMP, because the
1410 +        * SMP rebalancing from interrupt is the only thing that cares
1411 +        * here.
1412 +        */
1413 +       next->oncpu = 1;
1414 +#endif
1415 +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1416 +       spin_unlock_irq(&rq->lock);
1417 +#else
1418 +       spin_unlock(&rq->lock);
1419 +#endif
1420 +}
1421 +
1422 +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1423 +{
1424 +#ifdef CONFIG_SMP
1425 +       /*
1426 +        * After ->oncpu is cleared, the task can be moved to a different CPU.
1427 +        * We must ensure this doesn't happen until the switch is completely
1428 +        * finished.
1429 +        */
1430 +       smp_wmb();
1431 +       prev->oncpu = 0;
1432 +#endif
1433 +#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1434 +       local_irq_enable();
1435 +#endif
1436 +}
1437 +#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
1438 +
1439 +/*
1440 + * __task_rq_lock - lock the runqueue a given task resides on.
1441 + * Must be called interrupts disabled.
1442 + */
1443 +static inline struct rq *__task_rq_lock(struct task_struct *p)
1444 +       __acquires(rq->lock)
1445 +{
1446 +       for (;;) {
1447 +               struct rq *rq = task_rq(p);
1448 +               spin_lock(&rq->lock);
1449 +               if (likely(rq == task_rq(p)))
1450 +                       return rq;
1451 +               spin_unlock(&rq->lock);
1452 +       }
1453 +}
1454 +
1455 +/*
1456 + * task_rq_lock - lock the runqueue a given task resides on and disable
1457 + * interrupts. Note the ordering: we can safely lookup the task_rq without
1458 + * explicitly disabling preemption.
1459 + */
1460 +static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
1461 +       __acquires(rq->lock)
1462 +{
1463 +       struct rq *rq;
1464 +
1465 +       for (;;) {
1466 +               local_irq_save(*flags);
1467 +               rq = task_rq(p);
1468 +               spin_lock(&rq->lock);
1469 +               if (likely(rq == task_rq(p)))
1470 +                       return rq;
1471 +               spin_unlock_irqrestore(&rq->lock, *flags);
1472 +       }
1473 +}
1474 +
1475 +static void __task_rq_unlock(struct rq *rq)
1476 +       __releases(rq->lock)
1477 +{
1478 +       spin_unlock(&rq->lock);
1479 +}
1480 +
1481 +static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
1482 +       __releases(rq->lock)
1483 +{
1484 +       spin_unlock_irqrestore(&rq->lock, *flags);
1485 +}
1486 +
1487 +/*
1488 + * this_rq_lock - lock this runqueue and disable interrupts.
1489 + */
1490 +static struct rq *this_rq_lock(void)
1491 +       __acquires(rq->lock)
1492 +{
1493 +       struct rq *rq;
1494 +
1495 +       local_irq_disable();
1496 +       rq = this_rq();
1497 +       spin_lock(&rq->lock);
1498 +
1499 +       return rq;
1500 +}
1501 +
1502 +#ifdef CONFIG_SCHED_HRTICK
1503 +/*
1504 + * Use HR-timers to deliver accurate preemption points.
1505 + *
1506 + * Its all a bit involved since we cannot program an hrt while holding the
1507 + * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a
1508 + * reschedule event.
1509 + *
1510 + * When we get rescheduled we reprogram the hrtick_timer outside of the
1511 + * rq->lock.
1512 + */
1513 +
1514 +/*
1515 + * Use hrtick when:
1516 + *  - enabled by features
1517 + *  - hrtimer is actually high res
1518 + */
1519 +static inline int hrtick_enabled(struct rq *rq)
1520 +{
1521 +       if (!sched_feat(HRTICK))
1522 +               return 0;
1523 +       if (!cpu_active(cpu_of(rq)))
1524 +               return 0;
1525 +       return hrtimer_is_hres_active(&rq->hrtick_timer);
1526 +}
1527 +
1528 +static void hrtick_clear(struct rq *rq)
1529 +{
1530 +       if (hrtimer_active(&rq->hrtick_timer))
1531 +               hrtimer_cancel(&rq->hrtick_timer);
1532 +}
1533 +
1534 +/*
1535 + * High-resolution timer tick.
1536 + * Runs from hardirq context with interrupts disabled.
1537 + */
1538 +static enum hrtimer_restart hrtick(struct hrtimer *timer)
1539 +{
1540 +       struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1541 +
1542 +       WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1543 +
1544 +       spin_lock(&rq->lock);
1545 +       update_rq_clock(rq);
1546 +       rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1547 +       spin_unlock(&rq->lock);
1548 +
1549 +       return HRTIMER_NORESTART;
1550 +}
1551 +
1552 +#ifdef CONFIG_SMP
1553 +/*
1554 + * called from hardirq (IPI) context
1555 + */
1556 +static void __hrtick_start(void *arg)
1557 +{
1558 +       struct rq *rq = arg;
1559 +
1560 +       spin_lock(&rq->lock);
1561 +       hrtimer_restart(&rq->hrtick_timer);
1562 +       rq->hrtick_csd_pending = 0;
1563 +       spin_unlock(&rq->lock);
1564 +}
1565 +
1566 +/*
1567 + * Called to set the hrtick timer state.
1568 + *
1569 + * called with rq->lock held and irqs disabled
1570 + */
1571 +static void hrtick_start(struct rq *rq, u64 delay)
1572 +{
1573 +       struct hrtimer *timer = &rq->hrtick_timer;
1574 +       ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1575 +
1576 +       timer->expires = time;
1577 +
1578 +       if (rq == this_rq()) {
1579 +               hrtimer_restart(timer);
1580 +       } else if (!rq->hrtick_csd_pending) {
1581 +               __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
1582 +               rq->hrtick_csd_pending = 1;
1583 +       }
1584 +}
1585 +
1586 +static int
1587 +hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1588 +{
1589 +       int cpu = (int)(long)hcpu;
1590 +
1591 +       switch (action) {
1592 +       case CPU_UP_CANCELED:
1593 +       case CPU_UP_CANCELED_FROZEN:
1594 +       case CPU_DOWN_PREPARE:
1595 +       case CPU_DOWN_PREPARE_FROZEN:
1596 +       case CPU_DEAD:
1597 +       case CPU_DEAD_FROZEN:
1598 +               hrtick_clear(cpu_rq(cpu));
1599 +               return NOTIFY_OK;
1600 +       }
1601 +
1602 +       return NOTIFY_DONE;
1603 +}
1604 +
1605 +static __init void init_hrtick(void)
1606 +{
1607 +       hotcpu_notifier(hotplug_hrtick, 0);
1608 +}
1609 +#else
1610 +/*
1611 + * Called to set the hrtick timer state.
1612 + *
1613 + * called with rq->lock held and irqs disabled
1614 + */
1615 +static void hrtick_start(struct rq *rq, u64 delay)
1616 +{
1617 +       hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
1618 +}
1619 +
1620 +static void init_hrtick(void)
1621 +{
1622 +}
1623 +#endif /* CONFIG_SMP */
1624 +
1625 +static void init_rq_hrtick(struct rq *rq)
1626 +{
1627 +#ifdef CONFIG_SMP
1628 +       rq->hrtick_csd_pending = 0;
1629 +
1630 +       rq->hrtick_csd.flags = 0;
1631 +       rq->hrtick_csd.func = __hrtick_start;
1632 +       rq->hrtick_csd.info = rq;
1633 +#endif
1634 +
1635 +       hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1636 +       rq->hrtick_timer.function = hrtick;
1637 +       rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
1638 +}
1639 +#else
1640 +static inline void hrtick_clear(struct rq *rq)
1641 +{
1642 +}
1643 +
1644 +static inline void init_rq_hrtick(struct rq *rq)
1645 +{
1646 +}
1647 +
1648 +static inline void init_hrtick(void)
1649 +{
1650 +}
1651 +#endif
1652 +
1653 +/*
1654 + * resched_task - mark a task 'to be rescheduled now'.
1655 + *
1656 + * On UP this means the setting of the need_resched flag, on SMP it
1657 + * might also involve a cross-CPU call to trigger the scheduler on
1658 + * the target CPU.
1659 + */
1660 +#ifdef CONFIG_SMP
1661 +
1662 +#ifndef tsk_is_polling
1663 +#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1664 +#endif
1665 +
1666 +static void resched_task(struct task_struct *p)
1667 +{
1668 +       int cpu;
1669 +
1670 +       assert_spin_locked(&task_rq(p)->lock);
1671 +
1672 +       if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
1673 +               return;
1674 +
1675 +       set_tsk_thread_flag(p, TIF_NEED_RESCHED);
1676 +
1677 +       cpu = task_cpu(p);
1678 +       if (cpu == smp_processor_id())
1679 +               return;
1680 +
1681 +       /* NEED_RESCHED must be visible before we test polling */
1682 +       smp_mb();
1683 +       if (!tsk_is_polling(p))
1684 +               smp_send_reschedule(cpu);
1685 +}
1686 +
1687 +static void resched_cpu(int cpu)
1688 +{
1689 +       struct rq *rq = cpu_rq(cpu);
1690 +       unsigned long flags;
1691 +
1692 +       if (!spin_trylock_irqsave(&rq->lock, flags))
1693 +               return;
1694 +       resched_task(cpu_curr(cpu));
1695 +       spin_unlock_irqrestore(&rq->lock, flags);
1696 +}
1697 +
1698 +#ifdef CONFIG_NO_HZ
1699 +/*
1700 + * When add_timer_on() enqueues a timer into the timer wheel of an
1701 + * idle CPU then this timer might expire before the next timer event
1702 + * which is scheduled to wake up that CPU. In case of a completely
1703 + * idle system the next event might even be infinite time into the
1704 + * future. wake_up_idle_cpu() ensures that the CPU is woken up and
1705 + * leaves the inner idle loop so the newly added timer is taken into
1706 + * account when the CPU goes back to idle and evaluates the timer
1707 + * wheel for the next timer event.
1708 + */
1709 +void wake_up_idle_cpu(int cpu)
1710 +{
1711 +       struct rq *rq = cpu_rq(cpu);
1712 +
1713 +       if (cpu == smp_processor_id())
1714 +               return;
1715 +
1716 +       /*
1717 +        * This is safe, as this function is called with the timer
1718 +        * wheel base lock of (cpu) held. When the CPU is on the way
1719 +        * to idle and has not yet set rq->curr to idle then it will
1720 +        * be serialized on the timer wheel base lock and take the new
1721 +        * timer into account automatically.
1722 +        */
1723 +       if (rq->curr != rq->idle)
1724 +               return;
1725 +
1726 +       /*
1727 +        * We can set TIF_RESCHED on the idle task of the other CPU
1728 +        * lockless. The worst case is that the other CPU runs the
1729 +        * idle task through an additional NOOP schedule()
1730 +        */
1731 +       set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
1732 +
1733 +       /* NEED_RESCHED must be visible before we test polling */
1734 +       smp_mb();
1735 +       if (!tsk_is_polling(rq->idle))
1736 +               smp_send_reschedule(cpu);
1737 +}
1738 +#endif /* CONFIG_NO_HZ */
1739 +
1740 +#else /* !CONFIG_SMP */
1741 +static void resched_task(struct task_struct *p)
1742 +{
1743 +       assert_spin_locked(&task_rq(p)->lock);
1744 +       set_tsk_need_resched(p);
1745 +}
1746 +#endif /* CONFIG_SMP */
1747 +
1748 +#if BITS_PER_LONG == 32
1749 +# define WMULT_CONST   (~0UL)
1750 +#else
1751 +# define WMULT_CONST   (1UL << 32)
1752 +#endif
1753 +
1754 +#define WMULT_SHIFT    32
1755 +
1756 +/*
1757 + * Shift right and round:
1758 + */
1759 +#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1760 +
1761 +/*
1762 + * delta *= weight / lw
1763 + */
1764 +static unsigned long
1765 +calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1766 +               struct load_weight *lw)
1767 +{
1768 +       u64 tmp;
1769 +
1770 +       if (!lw->inv_weight) {
1771 +               if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1772 +                       lw->inv_weight = 1;
1773 +               else
1774 +                       lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
1775 +                               / (lw->weight+1);
1776 +       }
1777 +
1778 +       tmp = (u64)delta_exec * weight;
1779 +       /*
1780 +        * Check whether we'd overflow the 64-bit multiplication:
1781 +        */
1782 +       if (unlikely(tmp > WMULT_CONST))
1783 +               tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1784 +                       WMULT_SHIFT/2);
1785 +       else
1786 +               tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1787 +
1788 +       return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1789 +}
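+
+/*
+ * Rough worked example of the scaling above (illustrative numbers only):
+ * for delta_exec = 1000000, weight = 1024 (one nice-0 task) and
+ * lw->weight = 2048 (two nice-0 tasks queued), inv_weight is roughly
+ * 2^32 / 2048, so
+ *
+ *   tmp = 1000000 * 1024
+ *   SRR(tmp * lw->inv_weight, WMULT_SHIFT) ~= 1000000 * 1024 / 2048
+ *                                           = 500000
+ *
+ * i.e. the task is charged half of the elapsed time, which is exactly
+ * delta *= weight / lw.
+ */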
1790 +
1791 +static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1792 +{
1793 +       lw->weight += inc;
1794 +       lw->inv_weight = 0;
1795 +}
1796 +
1797 +static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1798 +{
1799 +       lw->weight -= dec;
1800 +       lw->inv_weight = 0;
1801 +}
1802 +
1803 +/*
1804 + * To aid in avoiding the subversion of "niceness" due to uneven distribution
1805 + * of tasks with abnormal "nice" values across CPUs, the contribution that
1806 + * each task makes to its run queue's load is weighted according to its
1807 + * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1808 + * scaled version of the new time slice allocation that they receive on time
1809 + * slice expiry etc.
1810 + */
1811 +
1812 +#define WEIGHT_IDLEPRIO                2
1813 +#define WMULT_IDLEPRIO         (1 << 31)
1814 +
1815 +/*
1816 + * Nice levels are multiplicative, with a gentle 10% change for every
1817 + * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1818 + * nice 1, it will get ~10% less CPU time than another CPU-bound task
1819 + * that remained on nice 0.
1820 + *
1821 + * The "10% effect" is relative and cumulative: from _any_ nice level,
1822 + * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
1823 + * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1824 + * If a task goes up by ~10% and another task goes down by ~10% then
1825 + * the relative distance between them is ~25%.)
1826 + */
1827 +static const int prio_to_weight[40] = {
1828 + /* -20 */     88761,     71755,     56483,     46273,     36291,
1829 + /* -15 */     29154,     23254,     18705,     14949,     11916,
1830 + /* -10 */      9548,      7620,      6100,      4904,      3906,
1831 + /*  -5 */      3121,      2501,      1991,      1586,      1277,
1832 + /*   0 */      1024,       820,       655,       526,       423,
1833 + /*   5 */       335,       272,       215,       172,       137,
1834 + /*  10 */       110,        87,        70,        56,        45,
1835 + /*  15 */        36,        29,        23,        18,        15,
1836 +};
1837 +
1838 +/*
1839 + * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1840 + *
1841 + * In cases where the weight does not change often, we can use the
1842 + * precalculated inverse to speed up arithmetics by turning divisions
1843 + * into multiplications:
1844 + */
1845 +static const u32 prio_to_wmult[40] = {
1846 + /* -20 */     48388,     59856,     76040,     92818,    118348,
1847 + /* -15 */    147320,    184698,    229616,    287308,    360437,
1848 + /* -10 */    449829,    563644,    704093,    875809,   1099582,
1849 + /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
1850 + /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
1851 + /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
1852 + /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
1853 + /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1854 +};
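+
+/*
+ * Worked example of the ~10% rule above (illustrative): two CPU-bound
+ * tasks at nice 0 and nice 1 have weights 1024 and 820 (ratio ~1.25),
+ * so they receive 1024/1844 ~= 55% and 820/1844 ~= 45% of the CPU -
+ * each about 10% away from an even split, ~25% relative to each other.
+ * The inverse table is consistent: prio_to_wmult[20] = 4194304 =
+ * 2^32 / 1024.
+ */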
1855 +
1856 +static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1857 +
1858 +/*
1859 + * runqueue iterator, to support SMP load-balancing between different
1860 + * scheduling classes, without having to expose their internal data
1861 + * structures to the load-balancing proper:
1862 + */
1863 +struct rq_iterator {
1864 +       void *arg;
1865 +       struct task_struct *(*start)(void *);
1866 +       struct task_struct *(*next)(void *);
1867 +};
1868 +
1869 +#ifdef CONFIG_SMP
1870 +static unsigned long
1871 +balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1872 +             unsigned long max_load_move, struct sched_domain *sd,
1873 +             enum cpu_idle_type idle, int *all_pinned,
1874 +             int *this_best_prio, struct rq_iterator *iterator);
1875 +
1876 +static int
1877 +iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1878 +                  struct sched_domain *sd, enum cpu_idle_type idle,
1879 +                  struct rq_iterator *iterator);
1880 +#endif
1881 +
1882 +#ifdef CONFIG_CGROUP_CPUACCT
1883 +static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1884 +#else
1885 +static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1886 +#endif
1887 +
1888 +static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1889 +{
1890 +       update_load_add(&rq->load, load);
1891 +}
1892 +
1893 +static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1894 +{
1895 +       update_load_sub(&rq->load, load);
1896 +}
1897 +
1898 +#ifdef CONFIG_SMP
1899 +static unsigned long source_load(int cpu, int type);
1900 +static unsigned long target_load(int cpu, int type);
1901 +static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1902 +
1903 +static unsigned long cpu_avg_load_per_task(int cpu)
1904 +{
1905 +       struct rq *rq = cpu_rq(cpu);
1906 +
1907 +       if (rq->nr_running)
1908 +               rq->avg_load_per_task = rq->load.weight / rq->nr_running;
1909 +
1910 +       return rq->avg_load_per_task;
1911 +}
1912 +
1913 +#ifdef CONFIG_FAIR_GROUP_SCHED
1914 +
1915 +typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
1916 +
1917 +/*
1918 + * Iterate the full tree, calling @down when first entering a node and @up when
1919 + * leaving it for the final time.
1920 + */
1921 +static void
1922 +walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
1923 +{
1924 +       struct task_group *parent, *child;
1925 +
1926 +       rcu_read_lock();
1927 +       parent = &root_task_group;
1928 +down:
1929 +       (*down)(parent, cpu, sd);
1930 +       list_for_each_entry_rcu(child, &parent->children, siblings) {
1931 +               parent = child;
1932 +               goto down;
1933 +
1934 +up:
1935 +               continue;
1936 +       }
1937 +       (*up)(parent, cpu, sd);
1938 +
1939 +       child = parent;
1940 +       parent = parent->parent;
1941 +       if (parent)
1942 +               goto up;
1943 +       rcu_read_unlock();
1944 +}
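+
+/*
+ * The goto-based walk above is an iterative pre/post-order traversal.
+ * A recursive sketch of the same logic, for illustration only (not used
+ * anywhere, name made up):
+ *
+ *   static void walk_tg_tree_rec(tg_visitor down, tg_visitor up, int cpu,
+ *                                struct sched_domain *sd,
+ *                                struct task_group *tg)
+ *   {
+ *           struct task_group *child;
+ *
+ *           (*down)(tg, cpu, sd);
+ *           list_for_each_entry_rcu(child, &tg->children, siblings)
+ *                   walk_tg_tree_rec(down, up, cpu, sd, child);
+ *           (*up)(tg, cpu, sd);
+ *   }
+ *
+ * i.e. @down runs on the way into each group, @up on the way back out.
+ */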
1945 +
1946 +static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1947 +
1948 +/*
1949 + * Calculate and set the cpu's group shares.
1950 + */
1951 +static void
1952 +__update_group_shares_cpu(struct task_group *tg, int cpu,
1953 +                         unsigned long sd_shares, unsigned long sd_rq_weight)
1954 +{
1955 +       int boost = 0;
1956 +       unsigned long shares;
1957 +       unsigned long rq_weight;
1958 +
1959 +       if (!tg->se[cpu])
1960 +               return;
1961 +
1962 +       rq_weight = tg->cfs_rq[cpu]->load.weight;
1963 +
1964 +       /*
1965 +        * If there are currently no tasks on the cpu, pretend there is one of
1966 +        * average load so that when a new task gets to run here it will not
1967 +        * get delayed by group starvation.
1968 +        */
1969 +       if (!rq_weight) {
1970 +               boost = 1;
1971 +               rq_weight = NICE_0_LOAD;
1972 +       }
1973 +
1974 +       if (unlikely(rq_weight > sd_rq_weight))
1975 +               rq_weight = sd_rq_weight;
1976 +
1977 +       /*
1978 +        *           \Sum shares * rq_weight
1979 +        * shares =  -----------------------
1980 +        *               \Sum rq_weight
1981 +        *
1982 +        */
1983 +       shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
1984 +
1985 +       /*
1986 +        * record the actual number of shares, not the boosted amount.
1987 +        */
1988 +       tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1989 +       tg->cfs_rq[cpu]->rq_weight = rq_weight;
1990 +
1991 +       if (shares < MIN_SHARES)
1992 +               shares = MIN_SHARES;
1993 +       else if (shares > MAX_SHARES)
1994 +               shares = MAX_SHARES;
1995 +
1996 +       __set_se_shares(tg->se[cpu], shares);
1997 +}
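+
+/*
+ * Example with made-up numbers: if the group is entitled to
+ * sd_shares = 1024 across the domain, the domain carries
+ * sd_rq_weight = 3072 of the group's queued weight in total and this
+ * cpu contributes rq_weight = 1024, the cpu ends up with roughly
+ * 1024 * 1024 / 3072 ~= 341 shares - a third of the group's shares for
+ * a third of its queued weight (subject to the MIN_SHARES/MAX_SHARES
+ * clamp above).
+ */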
1998 +
1999 +/*
2000 + * Re-compute the task group's per-cpu shares over the given domain.
2001 + * This needs to be done in a bottom-up fashion because the rq weight of a
2002 + * parent group depends on the shares of its child groups.
2003 + */
2004 +static void
2005 +tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
2006 +{
2007 +       unsigned long rq_weight = 0;
2008 +       unsigned long shares = 0;
2009 +       int i;
2010 +
2011 +       for_each_cpu_mask(i, sd->span) {
2012 +               rq_weight += tg->cfs_rq[i]->load.weight;
2013 +               shares += tg->cfs_rq[i]->shares;
2014 +       }
2015 +
2016 +       if ((!shares && rq_weight) || shares > tg->shares)
2017 +               shares = tg->shares;
2018 +
2019 +       if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
2020 +               shares = tg->shares;
2021 +
2022 +       if (!rq_weight)
2023 +               rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
2024 +
2025 +       for_each_cpu_mask(i, sd->span) {
2026 +               struct rq *rq = cpu_rq(i);
2027 +               unsigned long flags;
2028 +
2029 +               spin_lock_irqsave(&rq->lock, flags);
2030 +               __update_group_shares_cpu(tg, i, shares, rq_weight);
2031 +               spin_unlock_irqrestore(&rq->lock, flags);
2032 +       }
2033 +}
2034 +
2035 +/*
2036 + * Compute the cpu's hierarchical load factor for each task group.
2037 + * This needs to be done in a top-down fashion because the load of a child
2038 + * group is a fraction of its parent's load.
2039 + */
2040 +static void
2041 +tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
2042 +{
2043 +       unsigned long load;
2044 +
2045 +       if (!tg->parent) {
2046 +               load = cpu_rq(cpu)->load.weight;
2047 +       } else {
2048 +               load = tg->parent->cfs_rq[cpu]->h_load;
2049 +               load *= tg->cfs_rq[cpu]->shares;
2050 +               load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
2051 +       }
2052 +
2053 +       tg->cfs_rq[cpu]->h_load = load;
2054 +}
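+
+/*
+ * Example (illustrative numbers): if the parent's h_load on this cpu is
+ * 2048 and the child's group entity accounts for 512 of the parent's
+ * 2048 queued weight, the child gets h_load = 2048 * 512 / 2048 ~= 512,
+ * i.e. a quarter of the cpu's hierarchical load, matching its quarter
+ * share of the parent's queue.
+ */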
2055 +
2056 +static void
2057 +tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
2058 +{
2059 +}
2060 +
2061 +static void update_shares(struct sched_domain *sd)
2062 +{
2063 +       u64 now = cpu_clock(raw_smp_processor_id());
2064 +       s64 elapsed = now - sd->last_update;
2065 +
2066 +       if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
2067 +               sd->last_update = now;
2068 +               walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
2069 +       }
2070 +}
2071 +
2072 +static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
2073 +{
2074 +       spin_unlock(&rq->lock);
2075 +       update_shares(sd);
2076 +       spin_lock(&rq->lock);
2077 +}
2078 +
2079 +static void update_h_load(int cpu)
2080 +{
2081 +       walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
2082 +}
2083 +
2084 +#else
2085 +
2086 +static inline void update_shares(struct sched_domain *sd)
2087 +{
2088 +}
2089 +
2090 +static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
2091 +{
2092 +}
2093 +
2094 +#endif
2095 +
2096 +#endif
2097 +
2098 +#ifdef CONFIG_FAIR_GROUP_SCHED
2099 +static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
2100 +{
2101 +#ifdef CONFIG_SMP
2102 +       cfs_rq->shares = shares;
2103 +#endif
2104 +}
2105 +#endif
2106 +
2107 +#include "sched_stats.h"
2108 +#include "sched_idletask.c"
2109 +#include "sched_fair.c"
2110 +#include "sched_rt.c"
2111 +#ifdef CONFIG_SCHED_DEBUG
2112 +# include "sched_debug.c"
2113 +#endif
2114 +
2115 +#define sched_class_highest (&rt_sched_class)
2116 +#define for_each_class(class) \
2117 +   for (class = sched_class_highest; class; class = class->next)
2118 +
2119 +static void inc_nr_running(struct rq *rq)
2120 +{
2121 +       rq->nr_running++;
2122 +}
2123 +
2124 +static void dec_nr_running(struct rq *rq)
2125 +{
2126 +       rq->nr_running--;
2127 +}
2128 +
2129 +static void set_load_weight(struct task_struct *p)
2130 +{
2131 +       if (task_has_rt_policy(p)) {
2132 +               p->se.load.weight = prio_to_weight[0] * 2;
2133 +               p->se.load.inv_weight = prio_to_wmult[0] >> 1;
2134 +               return;
2135 +       }
2136 +
2137 +       /*
2138 +        * SCHED_IDLE tasks get minimal weight:
2139 +        */
2140 +       if (p->policy == SCHED_IDLE) {
2141 +               p->se.load.weight = WEIGHT_IDLEPRIO;
2142 +               p->se.load.inv_weight = WMULT_IDLEPRIO;
2143 +               return;
2144 +       }
2145 +
2146 +       p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
2147 +       p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
2148 +}
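+
+/*
+ * For example, a SCHED_NORMAL task at nice 0 has static_prio 120 and
+ * indexes prio_to_weight[120 - MAX_RT_PRIO] = prio_to_weight[20] = 1024
+ * (NICE_0_LOAD); nice -5 maps to index 15 and weight 3121, nice 19 to
+ * index 39 and weight 15.
+ */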
2149 +
2150 +static void update_avg(u64 *avg, u64 sample)
2151 +{
2152 +       s64 diff = sample - *avg;
2153 +       *avg += diff >> 3;
2154 +}
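+
+/*
+ * update_avg() keeps a crude exponential average: every sample pulls the
+ * average 1/8th of the way towards itself. E.g. with *avg = 1000 and
+ * sample = 1800, diff = 800 and *avg becomes 1000 + (800 >> 3) = 1100.
+ */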
2155 +
2156 +static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
2157 +{
2158 +       /* BUG_ON(p->state & TASK_ONHOLD); */
2159 +       sched_info_queued(p);
2160 +       p->sched_class->enqueue_task(rq, p, wakeup);
2161 +       p->se.on_rq = 1;
2162 +}
2163 +
2164 +static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
2165 +{
2166 +       if (sleep && p->se.last_wakeup) {
2167 +               update_avg(&p->se.avg_overlap,
2168 +                          p->se.sum_exec_runtime - p->se.last_wakeup);
2169 +               p->se.last_wakeup = 0;
2170 +       }
2171 +
2172 +       sched_info_dequeued(p);
2173 +       p->sched_class->dequeue_task(rq, p, sleep);
2174 +       p->se.on_rq = 0;
2175 +}
2176 +
2177 +/*
2178 + * __normal_prio - return the priority that is based on the static prio
2179 + */
2180 +static inline int __normal_prio(struct task_struct *p)
2181 +{
2182 +       return p->static_prio;
2183 +}
2184 +
2185 +/*
2186 + * Calculate the expected normal priority: i.e. priority
2187 + * without taking RT-inheritance into account. Might be
2188 + * boosted by interactivity modifiers. Changes upon fork,
2189 + * setprio syscalls, and whenever the interactivity
2190 + * estimator recalculates.
2191 + */
2192 +static inline int normal_prio(struct task_struct *p)
2193 +{
2194 +       int prio;
2195 +
2196 +       if (task_has_rt_policy(p))
2197 +               prio = MAX_RT_PRIO-1 - p->rt_priority;
2198 +       else
2199 +               prio = __normal_prio(p);
2200 +       return prio;
2201 +}
2202 +
2203 +/*
2204 + * Calculate the current priority, i.e. the priority
2205 + * taken into account by the scheduler. This value might
2206 + * be boosted by RT tasks, or might be boosted by
2207 + * interactivity modifiers. Will be RT if the task got
2208 + * RT-boosted. If not then it returns p->normal_prio.
2209 + */
2210 +static int effective_prio(struct task_struct *p)
2211 +{
2212 +       p->normal_prio = normal_prio(p);
2213 +       /*
2214 +        * If we are RT tasks or we were boosted to RT priority,
2215 +        * keep the priority unchanged. Otherwise, update priority
2216 +        * to the normal priority:
2217 +        */
2218 +       if (!rt_prio(p->prio))
2219 +               return p->normal_prio;
2220 +       return p->prio;
2221 +}
2222 +
2223 +/*
2224 + * activate_task - move a task to the runqueue.
2225 + */
2226 +static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
2227 +{
2228 +       if (task_contributes_to_load(p))
2229 +               rq->nr_uninterruptible--;
2230 +
2231 +       enqueue_task(rq, p, wakeup);
2232 +       inc_nr_running(rq);
2233 +}
2234 +
2235 +/*
2236 + * deactivate_task - remove a task from the runqueue.
2237 + */
2238 +static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
2239 +{
2240 +       if (task_contributes_to_load(p))
2241 +               rq->nr_uninterruptible++;
2242 +
2243 +       dequeue_task(rq, p, sleep);
2244 +       dec_nr_running(rq);
2245 +}
2246 +
2247 +/**
2248 + * task_curr - is this task currently executing on a CPU?
2249 + * @p: the task in question.
2250 + */
2251 +inline int task_curr(const struct task_struct *p)
2252 +{
2253 +       return cpu_curr(task_cpu(p)) == p;
2254 +}
2255 +
2256 +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
2257 +{
2258 +       set_task_rq(p, cpu);
2259 +#ifdef CONFIG_SMP
2260 +       /*
2261 +        * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
2262 +        * successfully executed on another CPU. We must ensure that updates of
2263 +        * per-task data have been completed by this moment.
2264 +        */
2265 +       smp_wmb();
2266 +       task_thread_info(p)->cpu = cpu;
2267 +#endif
2268 +}
2269 +
2270 +static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2271 +                                      const struct sched_class *prev_class,
2272 +                                      int oldprio, int running)
2273 +{
2274 +       if (prev_class != p->sched_class) {
2275 +               if (prev_class->switched_from)
2276 +                       prev_class->switched_from(rq, p, running);
2277 +               p->sched_class->switched_to(rq, p, running);
2278 +       } else
2279 +               p->sched_class->prio_changed(rq, p, oldprio, running);
2280 +}
2281 +
2282 +#ifdef CONFIG_SMP
2283 +
2284 +/* Used instead of source_load when we know the type == 0 */
2285 +static unsigned long weighted_cpuload(const int cpu)
2286 +{
2287 +       return cpu_rq(cpu)->load.weight;
2288 +}
2289 +
2290 +/*
2291 + * Is this task likely cache-hot:
2292 + */
2293 +static int
2294 +task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2295 +{
2296 +       s64 delta;
2297 +
2298 +       /*
2299 +        * Buddy candidates are cache hot:
2300 +        */
2301 +       if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next))
2302 +               return 1;
2303 +
2304 +       if (p->sched_class != &fair_sched_class)
2305 +               return 0;
2306 +
2307 +       if (sysctl_sched_migration_cost == -1)
2308 +               return 1;
2309 +       if (sysctl_sched_migration_cost == 0)
2310 +               return 0;
2311 +
2312 +       delta = now - p->se.exec_start;
2313 +
2314 +       return delta < (s64)sysctl_sched_migration_cost;
2315 +}
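+
+/*
+ * For example (assuming the usual sysctl_sched_migration_cost of
+ * 500000 ns): a fair-class task that last started executing within the
+ * past 0.5ms is considered cache hot, so the load balancer prefers not
+ * to migrate it. Setting the sysctl to -1 treats every such task as
+ * hot, 0 as always cold.
+ */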
2316 +
2317 +
2318 +void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2319 +{
2320 +       int old_cpu = task_cpu(p);
2321 +       struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
2322 +       struct cfs_rq *old_cfsrq = task_cfs_rq(p),
2323 +                     *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
2324 +       u64 clock_offset;
2325 +
2326 +       clock_offset = old_rq->clock - new_rq->clock;
2327 +
2328 +#ifdef CONFIG_SCHEDSTATS
2329 +       if (p->se.wait_start)
2330 +               p->se.wait_start -= clock_offset;
2331 +       if (p->se.sleep_start)
2332 +               p->se.sleep_start -= clock_offset;
2333 +       if (p->se.block_start)
2334 +               p->se.block_start -= clock_offset;
2335 +       if (old_cpu != new_cpu) {
2336 +               schedstat_inc(p, se.nr_migrations);
2337 +               if (task_hot(p, old_rq->clock, NULL))
2338 +                       schedstat_inc(p, se.nr_forced2_migrations);
2339 +       }
2340 +#endif
2341 +       p->se.vruntime -= old_cfsrq->min_vruntime -
2342 +                                        new_cfsrq->min_vruntime;
2343 +
2344 +       __set_task_cpu(p, new_cpu);
2345 +}
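+
+/*
+ * The vruntime adjustment above preserves the task's position relative
+ * to its queue: e.g. if p->se.vruntime was 1000 above the old cfs_rq's
+ * min_vruntime, subtracting (old min_vruntime - new min_vruntime)
+ * leaves it 1000 above the new queue's min_vruntime as well.
+ */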
2346 +
2347 +struct migration_req {
2348 +       struct list_head list;
2349 +
2350 +       struct task_struct *task;
2351 +       int dest_cpu;
2352 +
2353 +       struct completion done;
2354 +};
2355 +
2356 +#include "sched_mon.h"
2357 +
2358 +
2359 +/*
2360 + * The task's runqueue lock must be held.
2361 + * Returns true if you have to wait for migration thread.
2362 + */
2363 +static int
2364 +migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2365 +{
2366 +       struct rq *rq = task_rq(p);
2367 +
2368 +       vxm_migrate_task(p, rq, dest_cpu);
2369 +       /*
2370 +        * If the task is not on a runqueue (and not running), then
2371 +        * it is sufficient to simply update the task's cpu field.
2372 +        */
2373 +       if (!p->se.on_rq && !task_running(rq, p)) {
2374 +               set_task_cpu(p, dest_cpu);
2375 +               return 0;
2376 +       }
2377 +
2378 +       init_completion(&req->done);
2379 +       req->task = p;
2380 +       req->dest_cpu = dest_cpu;
2381 +       list_add(&req->list, &rq->migration_queue);
2382 +
2383 +       return 1;
2384 +}
2385 +
2386 +/*
2387 + * wait_task_inactive - wait for a thread to unschedule.
2388 + *
2389 + * If @match_state is nonzero, it's the @p->state value just checked and
2390 + * not expected to change.  If it changes, i.e. @p might have woken up,
2391 + * then return zero.  When we succeed in waiting for @p to be off its CPU,
2392 + * we return a positive number (its total switch count).  If a second call
2393 + * a short while later returns the same number, the caller can be sure that
2394 + * @p has remained unscheduled the whole time.
2395 + *
2396 + * The caller must ensure that the task *will* unschedule sometime soon,
2397 + * else this function might spin for a *long* time. This function can't
2398 + * be called with interrupts off, or it may introduce deadlock with
2399 + * smp_call_function() if an IPI is sent by the same process we are
2400 + * waiting to become inactive.
2401 + */
2402 +unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2403 +{
2404 +       unsigned long flags;
2405 +       int running, on_rq;
2406 +       unsigned long ncsw;
2407 +       struct rq *rq;
2408 +
2409 +       for (;;) {
2410 +               /*
2411 +                * We do the initial early heuristics without holding
2412 +                * any task-queue locks at all. We'll only try to get
2413 +                * the runqueue lock when things look like they will
2414 +                * work out!
2415 +                */
2416 +               rq = task_rq(p);
2417 +
2418 +               /*
2419 +                * If the task is actively running on another CPU
2420 +                * still, just relax and busy-wait without holding
2421 +                * any locks.
2422 +                *
2423 +                * NOTE! Since we don't hold any locks, it's not
2424 +                * even sure that "rq" stays as the right runqueue!
2425 +                * But we don't care, since "task_running()" will
2426 +                * return false if the runqueue has changed and p
2427 +                * is actually now running somewhere else!
2428 +                */
2429 +               while (task_running(rq, p)) {
2430 +                       if (match_state && unlikely(p->state != match_state))
2431 +                               return 0;
2432 +                       cpu_relax();
2433 +               }
2434 +
2435 +               /*
2436 +                * Ok, time to look more closely! We need the rq
2437 +                * lock now, to be *sure*. If we're wrong, we'll
2438 +                * just go back and repeat.
2439 +                */
2440 +               rq = task_rq_lock(p, &flags);
2441 +               running = task_running(rq, p);
2442 +               on_rq = p->se.on_rq;
2443 +               ncsw = 0;
2444 +               if (!match_state || p->state == match_state) {
2445 +                       ncsw = p->nivcsw + p->nvcsw;
2446 +                       if (unlikely(!ncsw))
2447 +                               ncsw = 1;
2448 +               }
2449 +               task_rq_unlock(rq, &flags);
2450 +
2451 +               /*
2452 +                * If it changed from the expected state, bail out now.
2453 +                */
2454 +               if (unlikely(!ncsw))
2455 +                       break;
2456 +
2457 +               /*
2458 +                * Was it really running after all now that we
2459 +                * checked with the proper locks actually held?
2460 +                *
2461 +                * Oops. Go back and try again..
2462 +                */
2463 +               if (unlikely(running)) {
2464 +                       cpu_relax();
2465 +                       continue;
2466 +               }
2467 +
2468 +               /*
2469 +                * It's not enough that it's not actively running,
2470 +                * it must be off the runqueue _entirely_, and not
2471 +                * preempted!
2472 +                *
2473 +                * So if it was still runnable (but just not actively
2474 +                * running right now), it's preempted, and we should
2475 +                * yield - it could be a while.
2476 +                */
2477 +               if (unlikely(on_rq)) {
2478 +                       schedule_timeout_uninterruptible(1);
2479 +                       continue;
2480 +               }
2481 +
2482 +               /*
2483 +                * Ahh, all good. It wasn't running, and it wasn't
2484 +                * runnable, which means that it will never become
2485 +                * running in the future either. We're all done!
2486 +                */
2487 +               break;
2488 +       }
2489 +
2490 +       return ncsw;
2491 +}
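+
+/*
+ * Usage sketch (illustrative): a caller that needs @p to have stayed off
+ * the cpu over a whole interval can sample the switch count twice:
+ *
+ *   ncsw = wait_task_inactive(p, state);
+ *   ... inspect p while it cannot be running ...
+ *   if (!ncsw || wait_task_inactive(p, state) != ncsw)
+ *           goto retry;
+ *
+ * If either call returns 0 or the counts differ, @p woke up or was
+ * scheduled in the meantime and the caller has to start over.
+ */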
2492 +
2493 +/***
2494 + * kick_process - kick a running thread to enter/exit the kernel
2495 + * @p: the to-be-kicked thread
2496 + *
2497 + * Cause a process which is running on another CPU to enter
2498 + * kernel-mode, without any delay. (to get signals handled.)
2499 + *
2500 + * NOTE: this function doesn't have to take the runqueue lock,
2501 + * because all it wants to ensure is that the remote task enters
2502 + * the kernel. If the IPI races and the task has been migrated
2503 + * to another CPU then no harm is done and the purpose has been
2504 + * achieved as well.
2505 + */
2506 +void kick_process(struct task_struct *p)
2507 +{
2508 +       int cpu;
2509 +
2510 +       preempt_disable();
2511 +       cpu = task_cpu(p);
2512 +       if ((cpu != smp_processor_id()) && task_curr(p))
2513 +               smp_send_reschedule(cpu);
2514 +       preempt_enable();
2515 +}
2516 +
2517 +/*
2518 + * Return a low guess at the load of a migration-source cpu weighted
2519 + * according to the scheduling class and "nice" value.
2520 + *
2521 + * We want to under-estimate the load of migration sources, to
2522 + * balance conservatively.
2523 + */
2524 +static unsigned long source_load(int cpu, int type)
2525 +{
2526 +       struct rq *rq = cpu_rq(cpu);
2527 +       unsigned long total = weighted_cpuload(cpu);
2528 +
2529 +       if (type == 0 || !sched_feat(LB_BIAS))
2530 +               return total;
2531 +
2532 +       return min(rq->cpu_load[type-1], total);
2533 +}
2534 +
2535 +/*
2536 + * Return a high guess at the load of a migration-target cpu weighted
2537 + * according to the scheduling class and "nice" value.
2538 + */
2539 +static unsigned long target_load(int cpu, int type)
2540 +{
2541 +       struct rq *rq = cpu_rq(cpu);
2542 +       unsigned long total = weighted_cpuload(cpu);
2543 +
2544 +       if (type == 0 || !sched_feat(LB_BIAS))
2545 +               return total;
2546 +
2547 +       return max(rq->cpu_load[type-1], total);
2548 +}
2549 +
2550 +/*
2551 + * find_idlest_group finds and returns the least busy CPU group within the
2552 + * domain.
2553 + */
2554 +static struct sched_group *
2555 +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2556 +{
2557 +       struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
2558 +       unsigned long min_load = ULONG_MAX, this_load = 0;
2559 +       int load_idx = sd->forkexec_idx;
2560 +       int imbalance = 100 + (sd->imbalance_pct-100)/2;
2561 +
2562 +       do {
2563 +               unsigned long load, avg_load;
2564 +               int local_group;
2565 +               int i;
2566 +
2567 +               /* Skip over this group if it has no CPUs allowed */
2568 +               if (!cpus_intersects(group->cpumask, p->cpus_allowed))
2569 +                       continue;
2570 +
2571 +               local_group = cpu_isset(this_cpu, group->cpumask);
2572 +
2573 +               /* Tally up the load of all CPUs in the group */
2574 +               avg_load = 0;
2575 +
2576 +               for_each_cpu_mask_nr(i, group->cpumask) {
2577 +                       /* Bias balancing toward cpus of our domain */
2578 +                       if (local_group)
2579 +                               load = source_load(i, load_idx);
2580 +                       else
2581 +                               load = target_load(i, load_idx);
2582 +
2583 +                       avg_load += load;
2584 +               }
2585 +
2586 +               /* Adjust by relative CPU power of the group */
2587 +               avg_load = sg_div_cpu_power(group,
2588 +                               avg_load * SCHED_LOAD_SCALE);
2589 +
2590 +               if (local_group) {
2591 +                       this_load = avg_load;
2592 +                       this = group;
2593 +               } else if (avg_load < min_load) {
2594 +                       min_load = avg_load;
2595 +                       idlest = group;
2596 +               }
2597 +       } while (group = group->next, group != sd->groups);
2598 +
2599 +       if (!idlest || 100*this_load < imbalance*min_load)
2600 +               return NULL;
2601 +       return idlest;
2602 +}
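+
+/*
+ * Example of the imbalance check (typical numbers): with
+ * sd->imbalance_pct = 125, imbalance = 100 + 25/2 = 112, so a remote
+ * group is only returned when 100 * this_load >= 112 * min_load, i.e.
+ * when the local group carries at least ~12% more load than the idlest
+ * remote group; otherwise we return NULL and stay local.
+ */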
2603 +
2604 +/*
2605 + * find_idlest_cpu - find the idlest cpu among the cpus in group.
2606 + */
2607 +static int
2608 +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
2609 +               cpumask_t *tmp)
2610 +{
2611 +       unsigned long load, min_load = ULONG_MAX;
2612 +       int idlest = -1;
2613 +       int i;
2614 +
2615 +       /* Traverse only the allowed CPUs */
2616 +       cpus_and(*tmp, group->cpumask, p->cpus_allowed);
2617 +
2618 +       for_each_cpu_mask_nr(i, *tmp) {
2619 +               load = weighted_cpuload(i);
2620 +
2621 +               if (load < min_load || (load == min_load && i == this_cpu)) {
2622 +                       min_load = load;
2623 +                       idlest = i;
2624 +               }
2625 +       }
2626 +
2627 +       return idlest;
2628 +}
2629 +
2630 +/*
2631 + * sched_balance_self: balance the current task (running on cpu) in domains
2632 + * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
2633 + * SD_BALANCE_EXEC.
2634 + *
2635 + * Balance, ie. select the least loaded group.
2636 + *
2637 + * Returns the target CPU number, or the same CPU if no balancing is needed.
2638 + *
2639 + * preempt must be disabled.
2640 + */
2641 +static int sched_balance_self(int cpu, int flag)
2642 +{
2643 +       struct task_struct *t = current;
2644 +       struct sched_domain *tmp, *sd = NULL;
2645 +
2646 +       for_each_domain(cpu, tmp) {
2647 +               /*
2648 +                * If power savings logic is enabled for a domain, stop there.
2649 +                */
2650 +               if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2651 +                       break;
2652 +               if (tmp->flags & flag)
2653 +                       sd = tmp;
2654 +       }
2655 +
2656 +       if (sd)
2657 +               update_shares(sd);
2658 +
2659 +       while (sd) {
2660 +               cpumask_t span, tmpmask;
2661 +               struct sched_group *group;
2662 +               int new_cpu, weight;
2663 +
2664 +               if (!(sd->flags & flag)) {
2665 +                       sd = sd->child;
2666 +                       continue;
2667 +               }
2668 +
2669 +               span = sd->span;
2670 +               group = find_idlest_group(sd, t, cpu);
2671 +               if (!group) {
2672 +                       sd = sd->child;
2673 +                       continue;
2674 +               }
2675 +
2676 +               new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask);
2677 +               if (new_cpu == -1 || new_cpu == cpu) {
2678 +                       /* Now try balancing at a lower domain level of cpu */
2679 +                       sd = sd->child;
2680 +                       continue;
2681 +               }
2682 +
2683 +               /* Now try balancing at a lower domain level of new_cpu */
2684 +               cpu = new_cpu;
2685 +               sd = NULL;
2686 +               weight = cpus_weight(span);
2687 +               for_each_domain(cpu, tmp) {
2688 +                       if (weight <= cpus_weight(tmp->span))
2689 +                               break;
2690 +                       if (tmp->flags & flag)
2691 +                               sd = tmp;
2692 +               }
2693 +               /* while loop will break here if sd == NULL */
2694 +       }
2695 +
2696 +       return cpu;
2697 +}
2698 +
2699 +#endif /* CONFIG_SMP */
2700 +
2701 +/***
2702 + * try_to_wake_up - wake up a thread
2703 + * @p: the to-be-woken-up thread
2704 + * @state: the mask of task states that can be woken
2705 + * @sync: do a synchronous wakeup?
2706 + *
2707 + * Put it on the run-queue if it's not already there. The "current"
2708 + * thread is always on the run-queue (except when the actual
2709 + * re-schedule is in progress), and as such you're allowed to do
2710 + * the simpler "current->state = TASK_RUNNING" to mark yourself
2711 + * runnable without the overhead of this.
2712 + *
2713 + * returns failure only if the task is already active.
2714 + */
2715 +static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2716 +{
2717 +       int cpu, orig_cpu, this_cpu, success = 0;
2718 +       unsigned long flags;
2719 +       long old_state;
2720 +       struct rq *rq;
2721 +
2722 +       if (!sched_feat(SYNC_WAKEUPS))
2723 +               sync = 0;
2724 +
2725 +#ifdef CONFIG_SMP
2726 +       if (sched_feat(LB_WAKEUP_UPDATE)) {
2727 +               struct sched_domain *sd;
2728 +
2729 +               this_cpu = raw_smp_processor_id();
2730 +               cpu = task_cpu(p);
2731 +
2732 +               for_each_domain(this_cpu, sd) {
2733 +                       if (cpu_isset(cpu, sd->span)) {
2734 +                               update_shares(sd);
2735 +                               break;
2736 +                       }
2737 +               }
2738 +       }
2739 +#endif
2740 +
2741 +       smp_wmb();
2742 +       rq = task_rq_lock(p, &flags);
2743 +       old_state = p->state;
2744 +       if (!(old_state & state))
2745 +               goto out;
2746 +
2747 +       if (p->se.on_rq)
2748 +               goto out_running;
2749 +
2750 +       cpu = task_cpu(p);
2751 +       orig_cpu = cpu;
2752 +       this_cpu = smp_processor_id();
2753 +
2754 +#ifdef CONFIG_SMP
2755 +       if (unlikely(task_running(rq, p)))
2756 +               goto out_activate;
2757 +
2758 +       cpu = p->sched_class->select_task_rq(p, sync);
2759 +       if (cpu != orig_cpu) {
2760 +               set_task_cpu(p, cpu);
2761 +               task_rq_unlock(rq, &flags);
2762 +               /* might preempt at this point */
2763 +               rq = task_rq_lock(p, &flags);
2764 +               old_state = p->state;
2765 +
2766 +       /* we need to unhold suspended tasks
2767 +       if (old_state & TASK_ONHOLD) {
2768 +               vx_unhold_task(p, rq);
2769 +               old_state = p->state;
2770 +       } */
2771 +               if (!(old_state & state))
2772 +                       goto out;
2773 +               if (p->se.on_rq)
2774 +                       goto out_running;
2775 +
2776 +               this_cpu = smp_processor_id();
2777 +               cpu = task_cpu(p);
2778 +       }
2779 +
2780 +#ifdef CONFIG_SCHEDSTATS
2781 +       schedstat_inc(rq, ttwu_count);
2782 +       if (cpu == this_cpu)
2783 +               schedstat_inc(rq, ttwu_local);
2784 +       else {
2785 +               struct sched_domain *sd;
2786 +               for_each_domain(this_cpu, sd) {
2787 +                       if (cpu_isset(cpu, sd->span)) {
2788 +                               schedstat_inc(sd, ttwu_wake_remote);
2789 +                               break;
2790 +                       }
2791 +               }
2792 +       }
2793 +#endif /* CONFIG_SCHEDSTATS */
2794 +
2795 +out_activate:
2796 +#endif /* CONFIG_SMP */
2797 +       schedstat_inc(p, se.nr_wakeups);
2798 +       if (sync)
2799 +               schedstat_inc(p, se.nr_wakeups_sync);
2800 +       if (orig_cpu != cpu)
2801 +               schedstat_inc(p, se.nr_wakeups_migrate);
2802 +       if (cpu == this_cpu)
2803 +               schedstat_inc(p, se.nr_wakeups_local);
2804 +       else
2805 +               schedstat_inc(p, se.nr_wakeups_remote);
2806 +       update_rq_clock(rq);
2807 +       activate_task(rq, p, 1);
2808 +       success = 1;
2809 +
2810 +out_running:
2811 +       trace_mark(kernel_sched_wakeup,
2812 +               "pid %d state %ld ## rq %p task %p rq->curr %p",
2813 +               p->pid, p->state, rq, p, rq->curr);
2814 +       check_preempt_curr(rq, p);
2815 +
2816 +       p->state = TASK_RUNNING;
2817 +#ifdef CONFIG_SMP
2818 +       if (p->sched_class->task_wake_up)
2819 +               p->sched_class->task_wake_up(rq, p);
2820 +#endif
2821 +out:
2822 +       current->se.last_wakeup = current->se.sum_exec_runtime;
2823 +
2824 +       task_rq_unlock(rq, &flags);
2825 +
2826 +       return success;
2827 +}
2828 +
2829 +int wake_up_process(struct task_struct *p)
2830 +{
2831 +       return try_to_wake_up(p, TASK_ALL, 0);
2832 +}
2833 +EXPORT_SYMBOL(wake_up_process);
2834 +
2835 +int wake_up_state(struct task_struct *p, unsigned int state)
2836 +{
2837 +       return try_to_wake_up(p, state, 0);
2838 +}
2839 +
2840 +/*
2841 + * Perform scheduler related setup for a newly forked process p.
2842 + * p is forked by current.
2843 + *
2844 + * __sched_fork() is basic setup used by init_idle() too:
2845 + */
2846 +static void __sched_fork(struct task_struct *p)
2847 +{
2848 +       p->se.exec_start                = 0;
2849 +       p->se.sum_exec_runtime          = 0;
2850 +       p->se.prev_sum_exec_runtime     = 0;
2851 +       p->se.last_wakeup               = 0;
2852 +       p->se.avg_overlap               = 0;
2853 +
2854 +#ifdef CONFIG_SCHEDSTATS
2855 +       p->se.wait_start                = 0;
2856 +       p->se.sum_sleep_runtime         = 0;
2857 +       p->se.sleep_start               = 0;
2858 +       p->se.block_start               = 0;
2859 +       p->se.sleep_max                 = 0;
2860 +       p->se.block_max                 = 0;
2861 +       p->se.exec_max                  = 0;
2862 +       p->se.slice_max                 = 0;
2863 +       p->se.wait_max                  = 0;
2864 +#endif
2865 +
2866 +       INIT_LIST_HEAD(&p->rt.run_list);
2867 +       p->se.on_rq = 0;
2868 +       INIT_LIST_HEAD(&p->se.group_node);
2869 +
2870 +#ifdef CONFIG_PREEMPT_NOTIFIERS
2871 +       INIT_HLIST_HEAD(&p->preempt_notifiers);
2872 +#endif
2873 +
2874 +       /*
2875 +        * We mark the process as running here, but have not actually
2876 +        * inserted it onto the runqueue yet. This guarantees that
2877 +        * nobody will actually run it, and a signal or other external
2878 +        * event cannot wake it up and insert it on the runqueue either.
2879 +        */
2880 +       p->state = TASK_RUNNING;
2881 +}
2882 +
2883 +/*
2884 + * fork()/clone()-time setup:
2885 + */
2886 +void sched_fork(struct task_struct *p, int clone_flags)
2887 +{
2888 +       int cpu = get_cpu();
2889 +
2890 +       __sched_fork(p);
2891 +
2892 +#ifdef CONFIG_SMP
2893 +       cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
2894 +#endif
2895 +       set_task_cpu(p, cpu);
2896 +
2897 +       /*
2898 +        * Make sure we do not leak PI boosting priority to the child:
2899 +        */
2900 +       p->prio = current->normal_prio;
2901 +       if (!rt_prio(p->prio))
2902 +               p->sched_class = &fair_sched_class;
2903 +
2904 +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2905 +       if (likely(sched_info_on()))
2906 +               memset(&p->sched_info, 0, sizeof(p->sched_info));
2907 +#endif
2908 +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
2909 +       p->oncpu = 0;
2910 +#endif
2911 +#ifdef CONFIG_PREEMPT
2912 +       /* Want to start with kernel preemption disabled. */
2913 +       task_thread_info(p)->preempt_count = 1;
2914 +#endif
2915 +       put_cpu();
2916 +}
2917 +
2918 +/*
2919 + * wake_up_new_task - wake up a newly created task for the first time.
2920 + *
2921 + * This function will do some initial scheduler statistics housekeeping
2922 + * that must be done for every newly created context, then puts the task
2923 + * on the runqueue and wakes it.
2924 + */
2925 +void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2926 +{
2927 +       unsigned long flags;
2928 +       struct rq *rq;
2929 +
2930 +       rq = task_rq_lock(p, &flags);
2931 +       BUG_ON(p->state != TASK_RUNNING);
2932 +       update_rq_clock(rq);
2933 +
2934 +       p->prio = effective_prio(p);
2935 +
2936 +       if (!p->sched_class->task_new || !current->se.on_rq) {
2937 +               activate_task(rq, p, 0);
2938 +       } else {
2939 +               /*
2940 +                * Let the scheduling class do new task startup
2941 +                * management (if any):
2942 +                */
2943 +               p->sched_class->task_new(rq, p);
2944 +               inc_nr_running(rq);
2945 +       }
2946 +       trace_mark(kernel_sched_wakeup_new,
2947 +               "pid %d state %ld ## rq %p task %p rq->curr %p",
2948 +               p->pid, p->state, rq, p, rq->curr);
2949 +       check_preempt_curr(rq, p);
2950 +#ifdef CONFIG_SMP
2951 +       if (p->sched_class->task_wake_up)
2952 +               p->sched_class->task_wake_up(rq, p);
2953 +#endif
2954 +       task_rq_unlock(rq, &flags);
2955 +}
2956 +
2957 +#ifdef CONFIG_PREEMPT_NOTIFIERS
2958 +
2959 +/**
2960 + * preempt_notifier_register - tell me when current is being preempted & rescheduled
2961 + * @notifier: notifier struct to register
2962 + */
2963 +void preempt_notifier_register(struct preempt_notifier *notifier)
2964 +{
2965 +       hlist_add_head(&notifier->link, &current->preempt_notifiers);
2966 +}
2967 +EXPORT_SYMBOL_GPL(preempt_notifier_register);
2968 +
2969 +/**
2970 + * preempt_notifier_unregister - no longer interested in preemption notifications
2971 + * @notifier: notifier struct to unregister
2972 + *
2973 + * This is safe to call from within a preemption notifier.
2974 + */
2975 +void preempt_notifier_unregister(struct preempt_notifier *notifier)
2976 +{
2977 +       hlist_del(&notifier->link);
2978 +}
2979 +EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2980 +
2981 +static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2982 +{
2983 +       struct preempt_notifier *notifier;
2984 +       struct hlist_node *node;
2985 +
2986 +       hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2987 +               notifier->ops->sched_in(notifier, raw_smp_processor_id());
2988 +}
2989 +
2990 +static void
2991 +fire_sched_out_preempt_notifiers(struct task_struct *curr,
2992 +                                struct task_struct *next)
2993 +{
2994 +       struct preempt_notifier *notifier;
2995 +       struct hlist_node *node;
2996 +
2997 +       hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2998 +               notifier->ops->sched_out(notifier, next);
2999 +}
3000 +
3001 +#else /* !CONFIG_PREEMPT_NOTIFIERS */
3002 +
3003 +static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
3004 +{
3005 +}
3006 +
3007 +static void
3008 +fire_sched_out_preempt_notifiers(struct task_struct *curr,
3009 +                                struct task_struct *next)
3010 +{
3011 +}
3012 +
3013 +#endif /* CONFIG_PREEMPT_NOTIFIERS */
3014 +
3015 +/**
3016 + * prepare_task_switch - prepare to switch tasks
3017 + * @rq: the runqueue preparing to switch
3018 + * @prev: the current task that is being switched out
3019 + * @next: the task we are going to switch to.
3020 + *
3021 + * This is called with the rq lock held and interrupts off. It must
3022 + * be paired with a subsequent finish_task_switch after the context
3023 + * switch.
3024 + *
3025 + * prepare_task_switch sets up locking and calls architecture specific
3026 + * hooks.
3027 + */
3028 +static inline void
3029 +prepare_task_switch(struct rq *rq, struct task_struct *prev,
3030 +                   struct task_struct *next)
3031 +{
3032 +       fire_sched_out_preempt_notifiers(prev, next);
3033 +       prepare_lock_switch(rq, next);
3034 +       prepare_arch_switch(next);
3035 +}
3036 +
3037 +/**
3038 + * finish_task_switch - clean up after a task-switch
3039 + * @rq: runqueue associated with task-switch
3040 + * @prev: the thread we just switched away from.
3041 + *
3042 + * finish_task_switch must be called after the context switch, paired
3043 + * with a prepare_task_switch call before the context switch.
3044 + * finish_task_switch will reconcile locking set up by prepare_task_switch,
3045 + * and do any other architecture-specific cleanup actions.
3046 + *
3047 + * Note that we may have delayed dropping an mm in context_switch(). If
3048 + * so, we finish that here outside of the runqueue lock. (Doing it
3049 + * with the lock held can cause deadlocks; see schedule() for
3050 + * details.)
3051 + */
3052 +static void finish_task_switch(struct rq *rq, struct task_struct *prev)
3053 +       __releases(rq->lock)
3054 +{
3055 +       struct mm_struct *mm = rq->prev_mm;
3056 +       long prev_state;
3057 +
3058 +       rq->prev_mm = NULL;
3059 +
3060 +       /*
3061 +        * A task struct has one reference for the use as "current".
3062 +        * If a task dies, then it sets TASK_DEAD in tsk->state and calls
3063 +        * schedule one last time. The schedule call will never return, and
3064 +        * the scheduled task must drop that reference.
3065 +        * The test for TASK_DEAD must occur while the runqueue locks are
3066 +        * still held, otherwise prev could be scheduled on another cpu, die
3067 +        * there before we look at prev->state, and then the reference would
3068 +        * be dropped twice.
3069 +        *              Manfred Spraul <manfred@colorfullife.com>
3070 +        */
3071 +       prev_state = prev->state;
3072 +       finish_arch_switch(prev);
3073 +       finish_lock_switch(rq, prev);
3074 +#ifdef CONFIG_SMP
3075 +       if (current->sched_class->post_schedule)
3076 +               current->sched_class->post_schedule(rq);
3077 +#endif
3078 +
3079 +       fire_sched_in_preempt_notifiers(current);
3080 +       if (mm)
3081 +               mmdrop(mm);
3082 +       if (unlikely(prev_state == TASK_DEAD)) {
3083 +               /*
3084 +                * Remove function-return probe instances associated with this
3085 +                * task and put them back on the free list.
3086 +                */
3087 +               kprobe_flush_task(prev);
3088 +               put_task_struct(prev);
3089 +       }
3090 +}
3091 +
3092 +/**
3093 + * schedule_tail - first thing a freshly forked thread must call.
3094 + * @prev: the thread we just switched away from.
3095 + */
3096 +asmlinkage void schedule_tail(struct task_struct *prev)
3097 +       __releases(rq->lock)
3098 +{
3099 +       struct rq *rq = this_rq();
3100 +
3101 +       finish_task_switch(rq, prev);
3102 +#ifdef __ARCH_WANT_UNLOCKED_CTXSW
3103 +       /* In this case, finish_task_switch does not reenable preemption */
3104 +       preempt_enable();
3105 +#endif
3106 +       if (current->set_child_tid)
3107 +               put_user(task_pid_vnr(current), current->set_child_tid);
3108 +}
3109 +
3110 +/*
3111 + * context_switch - switch to the new MM and the new
3112 + * thread's register state.
3113 + */
3114 +static inline void
3115 +context_switch(struct rq *rq, struct task_struct *prev,
3116 +              struct task_struct *next)
3117 +{
3118 +       struct mm_struct *mm, *oldmm;
3119 +
3120 +       prepare_task_switch(rq, prev, next);
3121 +       trace_mark(kernel_sched_schedule,
3122 +               "prev_pid %d next_pid %d prev_state %ld "
3123 +               "## rq %p prev %p next %p",
3124 +               prev->pid, next->pid, prev->state,
3125 +               rq, prev, next);
3126 +       mm = next->mm;
3127 +       oldmm = prev->active_mm;
3128 +       /*
3129 +        * For paravirt, this is coupled with an exit in switch_to to
3130 +        * combine the page table reload and the switch backend into
3131 +        * one hypercall.
3132 +        */
3133 +       arch_enter_lazy_cpu_mode();
3134 +
3135 +       if (unlikely(!mm)) {
3136 +               next->active_mm = oldmm;
3137 +               atomic_inc(&oldmm->mm_count);
3138 +               enter_lazy_tlb(oldmm, next);
3139 +       } else
3140 +               switch_mm(oldmm, mm, next);
3141 +
3142 +       if (unlikely(!prev->mm)) {
3143 +               prev->active_mm = NULL;
3144 +               rq->prev_mm = oldmm;
3145 +       }
3146 +       /*
3147 +        * Since the runqueue lock will be released by the next
3148 +        * task (which is an invalid locking op but in the case
3149 +        * of the scheduler it's an obvious special-case), we
3150 +        * do an early lockdep release here:
3151 +        */
3152 +#ifndef __ARCH_WANT_UNLOCKED_CTXSW
3153 +       spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
3154 +#endif
3155 +
3156 +       /* Here we just switch the register state and the stack. */
3157 +       switch_to(prev, next, prev);
3158 +
3159 +       barrier();
3160 +       /*
3161 +        * this_rq must be evaluated again because prev may have moved
3162 +        * CPUs since it called schedule(), thus the 'rq' on its stack
3163 +        * frame will be invalid.
3164 +        */
3165 +       finish_task_switch(this_rq(), prev);
3166 +}
3167 +
3168 +/*
3169 + * nr_running, nr_uninterruptible and nr_context_switches:
3170 + *
3171 + * externally visible scheduler statistics: current number of runnable
3172 + * threads, current number of uninterruptible-sleeping threads, total
3173 + * number of context switches performed since bootup.
3174 + */
3175 +unsigned long nr_running(void)
3176 +{
3177 +       unsigned long i, sum = 0;
3178 +
3179 +       for_each_online_cpu(i)
3180 +               sum += cpu_rq(i)->nr_running;
3181 +
3182 +       return sum;
3183 +}
3184 +
3185 +unsigned long nr_uninterruptible(void)
3186 +{
3187 +       unsigned long i, sum = 0;
3188 +
3189 +       for_each_possible_cpu(i)
3190 +               sum += cpu_rq(i)->nr_uninterruptible;
3191 +
3192 +       /*
3193 +        * Since we read the counters lockless, it might be slightly
3194 +        * inaccurate. Do not allow it to go below zero though:
3195 +        */
3196 +       if (unlikely((long)sum < 0))
3197 +               sum = 0;
3198 +
3199 +       return sum;
3200 +}
3201 +
3202 +unsigned long long nr_context_switches(void)
3203 +{
3204 +       int i;
3205 +       unsigned long long sum = 0;
3206 +
3207 +       for_each_possible_cpu(i)
3208 +               sum += cpu_rq(i)->nr_switches;
3209 +
3210 +       return sum;
3211 +}
3212 +
3213 +unsigned long nr_iowait(void)
3214 +{
3215 +       unsigned long i, sum = 0;
3216 +
3217 +       for_each_possible_cpu(i)
3218 +               sum += atomic_read(&cpu_rq(i)->nr_iowait);
3219 +
3220 +       return sum;
3221 +}
3222 +
3223 +unsigned long nr_active(void)
3224 +{
3225 +       unsigned long i, running = 0, uninterruptible = 0;
3226 +
3227 +       for_each_online_cpu(i) {
3228 +               running += cpu_rq(i)->nr_running;
3229 +               uninterruptible += cpu_rq(i)->nr_uninterruptible;
3230 +       }
3231 +
3232 +       if (unlikely((long)uninterruptible < 0))
3233 +               uninterruptible = 0;
3234 +
3235 +       return running + uninterruptible;
3236 +}
3237 +
3238 +/*
3239 + * Update rq->cpu_load[] statistics. This function is usually called every
3240 + * scheduler tick (TICK_NSEC).
3241 + */
3242 +static void update_cpu_load(struct rq *this_rq)
3243 +{
3244 +       unsigned long this_load = this_rq->load.weight;
3245 +       int i, scale;
3246 +
3247 +       this_rq->nr_load_updates++;
3248 +
3249 +       /* Update our load: */
3250 +       for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3251 +               unsigned long old_load, new_load;
3252 +
3253 +               /* scale is effectively 1 << i now, and >> i divides by scale */
3254 +
3255 +               old_load = this_rq->cpu_load[i];
3256 +               new_load = this_load;
3257 +               /*
3258 +                * Round up the averaging division if load is increasing. This
3259 +                * prevents us from getting stuck on 9 if the load is 10, for
3260 +                * example.
3261 +                */
3262 +               if (new_load > old_load)
3263 +                       new_load += scale-1;
3264 +               this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
3265 +       }
3266 +}
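+
+/*
+ * Worked example of the decay above (illustrative): for i = 2, scale = 4
+ * and cpu_load[2] = (old * 3 + new) >> 2. If old = 0 and the
+ * instantaneous load jumps to 1024, cpu_load[2] moves
+ * 0 -> 256 -> 448 -> 592 -> ... over successive ticks, i.e. the higher
+ * indices track the load more slowly.
+ */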
3267 +
3268 +#ifdef CONFIG_SMP
3269 +
3270 +/*
3271 + * double_rq_lock - safely lock two runqueues
3272 + *
3273 + * Note this does not disable interrupts like task_rq_lock,
3274 + * you need to do so manually before calling.
3275 + */
3276 +static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3277 +       __acquires(rq1->lock)
3278 +       __acquires(rq2->lock)
3279 +{
3280 +       BUG_ON(!irqs_disabled());
3281 +       if (rq1 == rq2) {
3282 +               spin_lock(&rq1->lock);
3283 +               __acquire(rq2->lock);   /* Fake it out ;) */
3284 +       } else {
3285 +               if (rq1 < rq2) {
3286 +                       spin_lock(&rq1->lock);
3287 +                       spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3288 +               } else {
3289 +                       spin_lock(&rq2->lock);
3290 +                       spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3291 +               }
3292 +       }
3293 +       update_rq_clock(rq1);
3294 +       update_rq_clock(rq2);
3295 +}
3296 +
3297 +/*
3298 + * double_rq_unlock - safely unlock two runqueues
3299 + *
3300 + * Note this does not restore interrupts like task_rq_unlock,
3301 + * you need to do so manually after calling.
3302 + */
3303 +static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3304 +       __releases(rq1->lock)
3305 +       __releases(rq2->lock)
3306 +{
3307 +       spin_unlock(&rq1->lock);
3308 +       if (rq1 != rq2)
3309 +               spin_unlock(&rq2->lock);
3310 +       else
3311 +               __release(rq2->lock);
3312 +}
3313 +
3314 +/*
3315 + * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
3316 + */
3317 +static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
3318 +       __releases(this_rq->lock)
3319 +       __acquires(busiest->lock)
3320 +       __acquires(this_rq->lock)
3321 +{
3322 +       int ret = 0;
3323 +
3324 +       if (unlikely(!irqs_disabled())) {
3325 +               /* printk() doesn't work well under rq->lock */
3326 +               spin_unlock(&this_rq->lock);
3327 +               BUG_ON(1);
3328 +       }
3329 +       if (unlikely(!spin_trylock(&busiest->lock))) {
3330 +               if (busiest < this_rq) {
3331 +                       spin_unlock(&this_rq->lock);
3332 +                       spin_lock(&busiest->lock);
3333 +                       spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
3334 +                       ret = 1;
3335 +               } else
3336 +                       spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
3337 +       }
3338 +       return ret;
3339 +}
3340 +
3341 +static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
3342 +       __releases(busiest->lock)
3343 +{
3344 +       spin_unlock(&busiest->lock);
3345 +       lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
3346 +}
3347 +
3348 +/*
3349 + * If dest_cpu is allowed for this process, migrate the task to it.
3350 + * This is accomplished by queueing a migration request on the
3351 + * task's current runqueue and waking its migration thread, then
3352 + * waiting for the migration to complete.
3353 + */
3354 +static void sched_migrate_task(struct task_struct *p, int dest_cpu)
3355 +{
3356 +       struct migration_req req;
3357 +       unsigned long flags;
3358 +       struct rq *rq;
3359 +
3360 +       rq = task_rq_lock(p, &flags);
3361 +       if (!cpu_isset(dest_cpu, p->cpus_allowed)
3362 +           || unlikely(!cpu_active(dest_cpu)))
3363 +               goto out;
3364 +
3365 +       /* force the process onto the specified CPU */
3366 +       if (migrate_task(p, dest_cpu, &req)) {
3367 +               /* Need to wait for migration thread (might exit: take ref). */
3368 +               struct task_struct *mt = rq->migration_thread;
3369 +
3370 +               get_task_struct(mt);
3371 +               task_rq_unlock(rq, &flags);
3372 +               wake_up_process(mt);
3373 +               put_task_struct(mt);
3374 +               wait_for_completion(&req.done);
3375 +
3376 +               return;
3377 +       }
3378 +out:
3379 +       task_rq_unlock(rq, &flags);
3380 +}
3381 +
3382 +/*
3383 + * sched_exec - execve() is a valuable balancing opportunity, because at
3384 + * this point the task has the smallest effective memory and cache footprint.
3385 + */
3386 +void sched_exec(void)
3387 +{
3388 +       int new_cpu, this_cpu = get_cpu();
3389 +       new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
3390 +       put_cpu();
3391 +       if (new_cpu != this_cpu)
3392 +               sched_migrate_task(current, new_cpu);
3393 +}
3394 +
3395 +/*
3396 + * pull_task - move a task from a remote runqueue to the local runqueue.
3397 + * Both runqueues must be locked.
3398 + */
3399 +static void pull_task(struct rq *src_rq, struct task_struct *p,
3400 +                     struct rq *this_rq, int this_cpu)
3401 +{
3402 +       deactivate_task(src_rq, p, 0);
3403 +       set_task_cpu(p, this_cpu);
3404 +       activate_task(this_rq, p, 0);
3405 +       /*
3406 +        * Note that idle threads have a prio of MAX_PRIO, for this test
3407 +        * to be always true for them.
3408 +        */
3409 +       check_preempt_curr(this_rq, p);
3410 +}
3411 +
3412 +/*
3413 + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3414 + */
3415 +static
3416 +int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3417 +                    struct sched_domain *sd, enum cpu_idle_type idle,
3418 +                    int *all_pinned)
3419 +{
3420 +       /*
3421 +        * We do not migrate tasks that are:
3422 +        * 1) running (obviously), or
3423 +        * 2) cannot be migrated to this CPU due to cpus_allowed, or
3424 +        * 3) are cache-hot on their current CPU.
3425 +        */
3426 +       if (!cpu_isset(this_cpu, p->cpus_allowed)) {
3427 +               schedstat_inc(p, se.nr_failed_migrations_affine);
3428 +               return 0;
3429 +       }
3430 +       *all_pinned = 0;
3431 +
3432 +       if (task_running(rq, p)) {
3433 +               schedstat_inc(p, se.nr_failed_migrations_running);
3434 +               return 0;
3435 +       }
3436 +
3437 +       /*
3438 +        * Aggressive migration if:
3439 +        * 1) task is cache cold, or
3440 +        * 2) too many balance attempts have failed.
3441 +        */
3442 +
3443 +       if (!task_hot(p, rq->clock, sd) ||
3444 +                       sd->nr_balance_failed > sd->cache_nice_tries) {
3445 +#ifdef CONFIG_SCHEDSTATS
3446 +               if (task_hot(p, rq->clock, sd)) {
3447 +                       schedstat_inc(sd, lb_hot_gained[idle]);
3448 +                       schedstat_inc(p, se.nr_forced_migrations);
3449 +               }
3450 +#endif
3451 +               return 1;
3452 +       }
3453 +
3454 +       if (task_hot(p, rq->clock, sd)) {
3455 +               schedstat_inc(p, se.nr_failed_migrations_hot);
3456 +               return 0;
3457 +       }
3458 +       return 1;
3459 +}
3460 +
3461 +static unsigned long
3462 +balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3463 +             unsigned long max_load_move, struct sched_domain *sd,
3464 +             enum cpu_idle_type idle, int *all_pinned,
3465 +             int *this_best_prio, struct rq_iterator *iterator)
3466 +{
3467 +       int loops = 0, pulled = 0, pinned = 0;
3468 +       struct task_struct *p;
3469 +       long rem_load_move = max_load_move;
3470 +
3471 +       if (max_load_move == 0)
3472 +               goto out;
3473 +
3474 +       pinned = 1;
3475 +
3476 +       /*
3477 +        * Start the load-balancing iterator:
3478 +        */
3479 +       p = iterator->start(iterator->arg);
3480 +next:
3481 +       if (!p || loops++ > sysctl_sched_nr_migrate)
3482 +               goto out;
3483 +
3484 +       if ((p->se.load.weight >> 1) > rem_load_move ||
3485 +           !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3486 +               p = iterator->next(iterator->arg);
3487 +               goto next;
3488 +       }
3489 +
3490 +       pull_task(busiest, p, this_rq, this_cpu);
3491 +       pulled++;
3492 +       rem_load_move -= p->se.load.weight;
3493 +
3494 +       /*
3495 +        * We only want to steal up to the prescribed amount of weighted load.
3496 +        */
3497 +       if (rem_load_move > 0) {
3498 +               if (p->prio < *this_best_prio)
3499 +                       *this_best_prio = p->prio;
3500 +               p = iterator->next(iterator->arg);
3501 +               goto next;
3502 +       }
3503 +out:
3504 +       /*
3505 +        * Right now, this is one of only two places pull_task() is called,
3506 +        * so we can safely collect pull_task() stats here rather than
3507 +        * inside pull_task().
3508 +        */
3509 +       schedstat_add(sd, lb_gained[idle], pulled);
3510 +
3511 +       if (all_pinned)
3512 +               *all_pinned = pinned;
3513 +
3514 +       return max_load_move - rem_load_move;
3515 +}
3516 +
3517 +/*
3518 + * move_tasks tries to move up to max_load_move weighted load from busiest to
3519 + * this_rq, as part of a balancing operation within domain "sd".
3520 + * Returns 1 if successful and 0 otherwise.
3521 + *
3522 + * Called with both runqueues locked.
3523 + */
3524 +static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3525 +                     unsigned long max_load_move,
3526 +                     struct sched_domain *sd, enum cpu_idle_type idle,
3527 +                     int *all_pinned)
3528 +{
3529 +       const struct sched_class *class = sched_class_highest;
3530 +       unsigned long total_load_moved = 0;
3531 +       int this_best_prio = this_rq->curr->prio;
3532 +
3533 +       do {
3534 +               total_load_moved +=
3535 +                       class->load_balance(this_rq, this_cpu, busiest,
3536 +                               max_load_move - total_load_moved,
3537 +                               sd, idle, all_pinned, &this_best_prio);
3538 +               class = class->next;
3539 +
3540 +               if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3541 +                       break;
3542 +
3543 +       } while (class && max_load_move > total_load_moved);
3544 +
3545 +       return total_load_moved > 0;
3546 +}
3547 +
3548 +static int
3549 +iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3550 +                  struct sched_domain *sd, enum cpu_idle_type idle,
3551 +                  struct rq_iterator *iterator)
3552 +{
3553 +       struct task_struct *p = iterator->start(iterator->arg);
3554 +       int pinned = 0;
3555 +
3556 +       while (p) {
3557 +               if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3558 +                       pull_task(busiest, p, this_rq, this_cpu);
3559 +                       /*
3560 +                        * Right now, this is only the second place pull_task()
3561 +                        * is called, so we can safely collect pull_task()
3562 +                        * stats here rather than inside pull_task().
3563 +                        */
3564 +                       schedstat_inc(sd, lb_gained[idle]);
3565 +
3566 +                       return 1;
3567 +               }
3568 +               p = iterator->next(iterator->arg);
3569 +       }
3570 +
3571 +       return 0;
3572 +}
3573 +
3574 +/*
3575 + * move_one_task tries to move exactly one task from busiest to this_rq, as
3576 + * part of active balancing operations within "domain".
3577 + * Returns 1 if successful and 0 otherwise.
3578 + *
3579 + * Called with both runqueues locked.
3580 + */
3581 +static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3582 +                        struct sched_domain *sd, enum cpu_idle_type idle)
3583 +{
3584 +       const struct sched_class *class;
3585 +
3586 +       for (class = sched_class_highest; class; class = class->next)
3587 +               if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3588 +                       return 1;
3589 +
3590 +       return 0;
3591 +}
3592 +
3593 +/*
3594 + * find_busiest_group finds and returns the busiest CPU group within the
3595 + * domain. It calculates and returns the amount of weighted load which
3596 + * should be moved to restore balance via the imbalance parameter.
3597 + */
3598 +static struct sched_group *
3599 +find_busiest_group(struct sched_domain *sd, int this_cpu,
3600 +                  unsigned long *imbalance, enum cpu_idle_type idle,
3601 +                  int *sd_idle, const cpumask_t *cpus, int *balance)
3602 +{
3603 +       struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
3604 +       unsigned long max_load, avg_load, total_load, this_load, total_pwr;
3605 +       unsigned long max_pull;
3606 +       unsigned long busiest_load_per_task, busiest_nr_running;
3607 +       unsigned long this_load_per_task, this_nr_running;
3608 +       int load_idx, group_imb = 0;
3609 +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3610 +       int power_savings_balance = 1;
3611 +       unsigned long leader_nr_running = 0, min_load_per_task = 0;
3612 +       unsigned long min_nr_running = ULONG_MAX;
3613 +       struct sched_group *group_min = NULL, *group_leader = NULL;
3614 +#endif
3615 +
3616 +       max_load = this_load = total_load = total_pwr = 0;
3617 +       busiest_load_per_task = busiest_nr_running = 0;
3618 +       this_load_per_task = this_nr_running = 0;
3619 +
3620 +       if (idle == CPU_NOT_IDLE)
3621 +               load_idx = sd->busy_idx;
3622 +       else if (idle == CPU_NEWLY_IDLE)
3623 +               load_idx = sd->newidle_idx;
3624 +       else
3625 +               load_idx = sd->idle_idx;
3626 +
3627 +       do {
3628 +               unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
3629 +               int local_group;
3630 +               int i;
3631 +               int __group_imb = 0;
3632 +               unsigned int balance_cpu = -1, first_idle_cpu = 0;
3633 +               unsigned long sum_nr_running, sum_weighted_load;
3634 +               unsigned long sum_avg_load_per_task;
3635 +               unsigned long avg_load_per_task;
3636 +
3637 +               local_group = cpu_isset(this_cpu, group->cpumask);
3638 +
3639 +               if (local_group)
3640 +                       balance_cpu = first_cpu(group->cpumask);
3641 +
3642 +               /* Tally up the load of all CPUs in the group */
3643 +               sum_weighted_load = sum_nr_running = avg_load = 0;
3644 +               sum_avg_load_per_task = avg_load_per_task = 0;
3645 +
3646 +               max_cpu_load = 0;
3647 +               min_cpu_load = ~0UL;
3648 +
3649 +               for_each_cpu_mask_nr(i, group->cpumask) {
3650 +                       struct rq *rq;
3651 +
3652 +                       if (!cpu_isset(i, *cpus))
3653 +                               continue;
3654 +
3655 +                       rq = cpu_rq(i);
3656 +
3657 +                       if (*sd_idle && rq->nr_running)
3658 +                               *sd_idle = 0;
3659 +
3660 +                       /* Bias balancing toward cpus of our domain */
3661 +                       if (local_group) {
3662 +                               if (idle_cpu(i) && !first_idle_cpu) {
3663 +                                       first_idle_cpu = 1;
3664 +                                       balance_cpu = i;
3665 +                               }
3666 +
3667 +                               load = target_load(i, load_idx);
3668 +                       } else {
3669 +                               load = source_load(i, load_idx);
3670 +                               if (load > max_cpu_load)
3671 +                                       max_cpu_load = load;
3672 +                               if (min_cpu_load > load)
3673 +                                       min_cpu_load = load;
3674 +                       }
3675 +
3676 +                       avg_load += load;
3677 +                       sum_nr_running += rq->nr_running;
3678 +                       sum_weighted_load += weighted_cpuload(i);
3679 +
3680 +                       sum_avg_load_per_task += cpu_avg_load_per_task(i);
3681 +               }
3682 +
3683 +               /*
3684 +                * The first idle cpu or the first cpu (busiest) in this sched
3685 +                * group is eligible for doing load balancing at this and above
3686 +                * domains. In the newly idle case, we will allow all the cpus
3687 +                * to do the newly idle load balance.
3688 +                */
3689 +               if (idle != CPU_NEWLY_IDLE && local_group &&
3690 +                   balance_cpu != this_cpu && balance) {
3691 +                       *balance = 0;
3692 +                       goto ret;
3693 +               }
3694 +
3695 +               total_load += avg_load;
3696 +               total_pwr += group->__cpu_power;
3697 +
3698 +               /* Adjust by relative CPU power of the group */
3699 +               avg_load = sg_div_cpu_power(group,
3700 +                               avg_load * SCHED_LOAD_SCALE);
3701 +
3702 +
3703 +               /*
3704 +                * Consider the group unbalanced when the imbalance is larger
3705 +                * than the average weight of two tasks.
3706 +                *
3707 +                * APZ: with cgroup the avg task weight can vary wildly and
3708 +                *      might not be a suitable number - should we keep a
3709 +                *      normalized nr_running number somewhere that negates
3710 +                *      the hierarchy?
3711 +                */
3712 +               avg_load_per_task = sg_div_cpu_power(group,
3713 +                               sum_avg_load_per_task * SCHED_LOAD_SCALE);
3714 +
3715 +               if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3716 +                       __group_imb = 1;
3717 +
3718 +               group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
3719 +
3720 +               if (local_group) {
3721 +                       this_load = avg_load;
3722 +                       this = group;
3723 +                       this_nr_running = sum_nr_running;
3724 +                       this_load_per_task = sum_weighted_load;
3725 +               } else if (avg_load > max_load &&
3726 +                          (sum_nr_running > group_capacity || __group_imb)) {
3727 +                       max_load = avg_load;
3728 +                       busiest = group;
3729 +                       busiest_nr_running = sum_nr_running;
3730 +                       busiest_load_per_task = sum_weighted_load;
3731 +                       group_imb = __group_imb;
3732 +               }
3733 +
3734 +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3735 +               /*
3736 +                * Busy processors will not participate in power savings
3737 +                * balance.
3738 +                */
3739 +               if (idle == CPU_NOT_IDLE ||
3740 +                               !(sd->flags & SD_POWERSAVINGS_BALANCE))
3741 +                       goto group_next;
3742 +
3743 +               /*
3744 +                * If the local group is idle or completely loaded
3745 +                * no need to do power savings balance at this domain
3746 +                */
3747 +               if (local_group && (this_nr_running >= group_capacity ||
3748 +                                   !this_nr_running))
3749 +                       power_savings_balance = 0;
3750 +
3751 +               /*
3752 +                * If a group is already running at full capacity or idle,
3753 +                * don't include that group in power savings calculations
3754 +                */
3755 +               if (!power_savings_balance || sum_nr_running >= group_capacity
3756 +                   || !sum_nr_running)
3757 +                       goto group_next;
3758 +
3759 +               /*
3760 +                * Calculate the group which has the least non-idle load.
3761 +                * This is the group from where we need to pick up the load
3762 +                * for saving power
3763 +                */
3764 +               if ((sum_nr_running < min_nr_running) ||
3765 +                   (sum_nr_running == min_nr_running &&
3766 +                    first_cpu(group->cpumask) <
3767 +                    first_cpu(group_min->cpumask))) {
3768 +                       group_min = group;
3769 +                       min_nr_running = sum_nr_running;
3770 +                       min_load_per_task = sum_weighted_load /
3771 +                                               sum_nr_running;
3772 +               }
3773 +
3774 +               /*
3775 +                * Calculate the group which is near its capacity
3776 +                * but still has some space to pick up load from
3777 +                * other groups and save more power.
3778 +                */
3779 +               if (sum_nr_running <= group_capacity - 1) {
3780 +                       if (sum_nr_running > leader_nr_running ||
3781 +                           (sum_nr_running == leader_nr_running &&
3782 +                            first_cpu(group->cpumask) >
3783 +                             first_cpu(group_leader->cpumask))) {
3784 +                               group_leader = group;
3785 +                               leader_nr_running = sum_nr_running;
3786 +                       }
3787 +               }
3788 +group_next:
3789 +#endif
3790 +               group = group->next;
3791 +       } while (group != sd->groups);
3792 +
3793 +       if (!busiest || this_load >= max_load || busiest_nr_running == 0)
3794 +               goto out_balanced;
3795 +
3796 +       avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
3797 +
3798 +       if (this_load >= avg_load ||
3799 +                       100*max_load <= sd->imbalance_pct*this_load)
3800 +               goto out_balanced;
3801 +
3802 +       busiest_load_per_task /= busiest_nr_running;
3803 +       if (group_imb)
3804 +               busiest_load_per_task = min(busiest_load_per_task, avg_load);
3805 +
3806 +       /*
3807 +        * We're trying to get all the cpus to the average_load, so we don't
3808 +        * want to push ourselves above the average load, nor do we wish to
3809 +        * reduce the max loaded cpu below the average load, as either of these
3810 +        * actions would just result in more rebalancing later, and ping-pong
3811 +        * tasks around. Thus we look for the minimum possible imbalance.
3812 +        * Negative imbalances (*we* are more loaded than anyone else) will
3813 +        * be counted as no imbalance for these purposes -- we can't fix that
3814 +        * by pulling tasks to us. Be careful of negative numbers as they'll
3815 +        * appear as very large values with unsigned longs.
3816 +        */
3817 +       if (max_load <= busiest_load_per_task)
3818 +               goto out_balanced;
3819 +
3820 +       /*
3821 +        * In the presence of smp nice balancing, certain scenarios can have
3822 +        * max load less than avg load (as we skip the groups at or below
3823 +        * their cpu_power while calculating max_load).
3824 +        */
3825 +       if (max_load < avg_load) {
3826 +               *imbalance = 0;
3827 +               goto small_imbalance;
3828 +       }
3829 +
3830 +       /* Don't want to pull so many tasks that a group would go idle */
3831 +       max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
3832 +
3833 +       /* How much load to actually move to equalise the imbalance */
3834 +       *imbalance = min(max_pull * busiest->__cpu_power,
3835 +                               (avg_load - this_load) * this->__cpu_power)
3836 +                       / SCHED_LOAD_SCALE;
3837 +
3838 +       /*
3839 +        * if *imbalance is less than the average load per runnable task
3840 +        * there is no guarantee that any tasks will be moved so we'll have
3841 +        * a think about bumping its value to force at least one task to be
3842 +        * moved
3843 +        */
3844 +       if (*imbalance < busiest_load_per_task) {
3845 +               unsigned long tmp, pwr_now, pwr_move;
3846 +               unsigned int imbn;
3847 +
3848 +small_imbalance:
3849 +               pwr_move = pwr_now = 0;
3850 +               imbn = 2;
3851 +               if (this_nr_running) {
3852 +                       this_load_per_task /= this_nr_running;
3853 +                       if (busiest_load_per_task > this_load_per_task)
3854 +                               imbn = 1;
3855 +               } else
3856 +                       this_load_per_task = cpu_avg_load_per_task(this_cpu);
3857 +
3858 +               if (max_load - this_load + 2*busiest_load_per_task >=
3859 +                                       busiest_load_per_task * imbn) {
3860 +                       *imbalance = busiest_load_per_task;
3861 +                       return busiest;
3862 +               }
3863 +
3864 +               /*
3865 +                * OK, we don't have enough imbalance to justify moving tasks,
3866 +                * however we may be able to increase total CPU power used by
3867 +                * moving them.
3868 +                */
3869 +
3870 +               pwr_now += busiest->__cpu_power *
3871 +                               min(busiest_load_per_task, max_load);
3872 +               pwr_now += this->__cpu_power *
3873 +                               min(this_load_per_task, this_load);
3874 +               pwr_now /= SCHED_LOAD_SCALE;
3875 +
3876 +               /* Amount of load we'd subtract */
3877 +               tmp = sg_div_cpu_power(busiest,
3878 +                               busiest_load_per_task * SCHED_LOAD_SCALE);
3879 +               if (max_load > tmp)
3880 +                       pwr_move += busiest->__cpu_power *
3881 +                               min(busiest_load_per_task, max_load - tmp);
3882 +
3883 +               /* Amount of load we'd add */
3884 +               if (max_load * busiest->__cpu_power <
3885 +                               busiest_load_per_task * SCHED_LOAD_SCALE)
3886 +                       tmp = sg_div_cpu_power(this,
3887 +                                       max_load * busiest->__cpu_power);
3888 +               else
3889 +                       tmp = sg_div_cpu_power(this,
3890 +                               busiest_load_per_task * SCHED_LOAD_SCALE);
3891 +               pwr_move += this->__cpu_power *
3892 +                               min(this_load_per_task, this_load + tmp);
3893 +               pwr_move /= SCHED_LOAD_SCALE;
3894 +
3895 +               /* Move if we gain throughput */
3896 +               if (pwr_move > pwr_now)
3897 +                       *imbalance = busiest_load_per_task;
3898 +       }
3899 +
3900 +       return busiest;
3901 +
3902 +out_balanced:
3903 +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3904 +       if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3905 +               goto ret;
3906 +
3907 +       if (this == group_leader && group_leader != group_min) {
3908 +               *imbalance = min_load_per_task;
3909 +               return group_min;
3910 +       }
3911 +#endif
3912 +ret:
3913 +       *imbalance = 0;
3914 +       return NULL;
3915 +}
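
The imbalance computed above is the amount of weighted load to pull: at most enough to bring the busiest group down to the domain average without lifting the local group above it, scaled by group power. A worked example under the simplifying assumptions SCHED_LOAD_SCALE == 1024 and both groups at the default __cpu_power of 1024:

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        unsigned long busiest_power = 1024, this_power = 1024;
        unsigned long max_load = 3072;          /* busiest group: ~3 nice-0 tasks */
        unsigned long this_load = 1024;         /* local group:   ~1 nice-0 task  */
        unsigned long avg_load = 2048;          /* domain average                 */
        unsigned long busiest_load_per_task = 1024;
        unsigned long max_pull, imbalance;

        /* don't pull below the average, and don't pull the busiest group idle */
        max_pull = min_ul(max_load - avg_load, max_load - busiest_load_per_task);

        imbalance = min_ul(max_pull * busiest_power,
                           (avg_load - this_load) * this_power) / SCHED_LOAD_SCALE;

        printf("imbalance = %lu (roughly one nice-0 task)\n", imbalance);
        return 0;
}

Moving 1024 units of weight, about one nice-0 task, leaves both groups at the 2048 average.
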
3916 +
3917 +/*
3918 + * find_busiest_queue - find the busiest runqueue among the cpus in group.
3919 + */
3920 +static struct rq *
3921 +find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3922 +                  unsigned long imbalance, const cpumask_t *cpus)
3923 +{
3924 +       struct rq *busiest = NULL, *rq;
3925 +       unsigned long max_load = 0;
3926 +       int i;
3927 +
3928 +       for_each_cpu_mask_nr(i, group->cpumask) {
3929 +               unsigned long wl;
3930 +
3931 +               if (!cpu_isset(i, *cpus))
3932 +                       continue;
3933 +
3934 +               rq = cpu_rq(i);
3935 +               wl = weighted_cpuload(i);
3936 +
3937 +               if (rq->nr_running == 1 && wl > imbalance)
3938 +                       continue;
3939 +
3940 +               if (wl > max_load) {
3941 +                       max_load = wl;
3942 +                       busiest = rq;
3943 +               }
3944 +       }
3945 +
3946 +       return busiest;
3947 +}
3948 +
3949 +/*
3950 + * Max backoff if we encounter pinned tasks. Pretty arbitrary value; any
3951 + * value works so long as it is large enough.
3952 + */
3953 +#define MAX_PINNED_INTERVAL    512
3954 +
3955 +/*
3956 + * Check this_cpu to ensure it is balanced within domain. Attempt to move
3957 + * tasks if there is an imbalance.
3958 + */
3959 +static int load_balance(int this_cpu, struct rq *this_rq,
3960 +                       struct sched_domain *sd, enum cpu_idle_type idle,
3961 +                       int *balance, cpumask_t *cpus)
3962 +{
3963 +       int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
3964 +       struct sched_group *group;
3965 +       unsigned long imbalance;
3966 +       struct rq *busiest;
3967 +       unsigned long flags;
3968 +
3969 +       cpus_setall(*cpus);
3970 +
3971 +       /*
3972 +        * When power savings policy is enabled for the parent domain, idle
3973 +        * sibling can pick up load irrespective of busy siblings. In this case,
3974 +        * let the state of idle sibling percolate up as CPU_IDLE, instead of
3975 +        * portraying it as CPU_NOT_IDLE.
3976 +        */
3977 +       if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
3978 +           !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3979 +               sd_idle = 1;
3980 +
3981 +       schedstat_inc(sd, lb_count[idle]);
3982 +
3983 +redo:
3984 +       update_shares(sd);
3985 +       group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3986 +                                  cpus, balance);
3987 +
3988 +       if (*balance == 0)
3989 +               goto out_balanced;
3990 +
3991 +       if (!group) {
3992 +               schedstat_inc(sd, lb_nobusyg[idle]);
3993 +               goto out_balanced;
3994 +       }
3995 +
3996 +       busiest = find_busiest_queue(group, idle, imbalance, cpus);
3997 +       if (!busiest) {
3998 +               schedstat_inc(sd, lb_nobusyq[idle]);
3999 +               goto out_balanced;
4000 +       }
4001 +
4002 +       BUG_ON(busiest == this_rq);
4003 +
4004 +       schedstat_add(sd, lb_imbalance[idle], imbalance);
4005 +
4006 +       ld_moved = 0;
4007 +       if (busiest->nr_running > 1) {
4008 +               /*
4009 +                * Attempt to move tasks. If find_busiest_group has found
4010 +                * an imbalance but busiest->nr_running <= 1, the group is
4011 +                * still unbalanced. ld_moved simply stays zero, so it is
4012 +                * correctly treated as an imbalance.
4013 +                */
4014 +               local_irq_save(flags);
4015 +               double_rq_lock(this_rq, busiest);
4016 +               ld_moved = move_tasks(this_rq, this_cpu, busiest,
4017 +                                     imbalance, sd, idle, &all_pinned);
4018 +               double_rq_unlock(this_rq, busiest);
4019 +               local_irq_restore(flags);
4020 +
4021 +               /*
4022 +                * some other cpu did the load balance for us.
4023 +                */
4024 +               if (ld_moved && this_cpu != smp_processor_id())
4025 +                       resched_cpu(this_cpu);
4026 +
4027 +               /* All tasks on this runqueue were pinned by CPU affinity */
4028 +               if (unlikely(all_pinned)) {
4029 +                       cpu_clear(cpu_of(busiest), *cpus);
4030 +                       if (!cpus_empty(*cpus))
4031 +                               goto redo;
4032 +                       goto out_balanced;
4033 +               }
4034 +       }
4035 +
4036 +       if (!ld_moved) {
4037 +               schedstat_inc(sd, lb_failed[idle]);
4038 +               sd->nr_balance_failed++;
4039 +
4040 +               if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4041 +
4042 +                       spin_lock_irqsave(&busiest->lock, flags);
4043 +
4044 +                       /* don't kick the migration_thread, if the curr
4045 +                        * task on busiest cpu can't be moved to this_cpu
4046 +                        */
4047 +                       if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
4048 +                               spin_unlock_irqrestore(&busiest->lock, flags);
4049 +                               all_pinned = 1;
4050 +                               goto out_one_pinned;
4051 +                       }
4052 +
4053 +                       if (!busiest->active_balance) {
4054 +                               busiest->active_balance = 1;
4055 +                               busiest->push_cpu = this_cpu;
4056 +                               active_balance = 1;
4057 +                       }
4058 +                       spin_unlock_irqrestore(&busiest->lock, flags);
4059 +                       if (active_balance)
4060 +                               wake_up_process(busiest->migration_thread);
4061 +
4062 +                       /*
4063 +                        * We've kicked active balancing, reset the failure
4064 +                        * counter.
4065 +                        */
4066 +                       sd->nr_balance_failed = sd->cache_nice_tries+1;
4067 +               }
4068 +       } else
4069 +               sd->nr_balance_failed = 0;
4070 +
4071 +       if (likely(!active_balance)) {
4072 +               /* We were unbalanced, so reset the balancing interval */
4073 +               sd->balance_interval = sd->min_interval;
4074 +       } else {
4075 +               /*
4076 +                * If we've begun active balancing, start to back off. This
4077 +                * case may not be covered by the all_pinned logic if there
4078 +                * is only 1 task on the busy runqueue (because we don't call
4079 +                * move_tasks).
4080 +                */
4081 +               if (sd->balance_interval < sd->max_interval)
4082 +                       sd->balance_interval *= 2;
4083 +       }
4084 +
4085 +       if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4086 +           !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4087 +               ld_moved = -1;
4088 +
4089 +       goto out;
4090 +
4091 +out_balanced:
4092 +       schedstat_inc(sd, lb_balanced[idle]);
4093 +
4094 +       sd->nr_balance_failed = 0;
4095 +
4096 +out_one_pinned:
4097 +       /* tune up the balancing interval */
4098 +       if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
4099 +                       (sd->balance_interval < sd->max_interval))
4100 +               sd->balance_interval *= 2;
4101 +
4102 +       if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4103 +           !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4104 +               ld_moved = -1;
4105 +       else
4106 +               ld_moved = 0;
4107 +out:
4108 +       if (ld_moved)
4109 +               update_shares(sd);
4110 +       return ld_moved;
4111 +}
4112 +
4113 +/*
4114 + * Check this_cpu to ensure it is balanced within domain. Attempt to move
4115 + * tasks if there is an imbalance.
4116 + *
4117 + * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
4118 + * this_rq is locked.
4119 + */
4120 +static int
4121 +load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
4122 +                       cpumask_t *cpus)
4123 +{
4124 +       struct sched_group *group;
4125 +       struct rq *busiest = NULL;
4126 +       unsigned long imbalance;
4127 +       int ld_moved = 0;
4128 +       int sd_idle = 0;
4129 +       int all_pinned = 0;
4130 +
4131 +       cpus_setall(*cpus);
4132 +
4133 +       /*
4134 +        * When power savings policy is enabled for the parent domain, idle
4135 +        * sibling can pick up load irrespective of busy siblings. In this case,
4136 +        * let the state of idle sibling percolate up as IDLE, instead of
4137 +        * portraying it as CPU_NOT_IDLE.
4138 +        */
4139 +       if (sd->flags & SD_SHARE_CPUPOWER &&
4140 +           !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4141 +               sd_idle = 1;
4142 +
4143 +       schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
4144 +redo:
4145 +       update_shares_locked(this_rq, sd);
4146 +       group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
4147 +                                  &sd_idle, cpus, NULL);
4148 +       if (!group) {
4149 +               schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
4150 +               goto out_balanced;
4151 +       }
4152 +
4153 +       busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
4154 +       if (!busiest) {
4155 +               schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
4156 +               goto out_balanced;
4157 +       }
4158 +
4159 +       BUG_ON(busiest == this_rq);
4160 +
4161 +       schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
4162 +
4163 +       ld_moved = 0;
4164 +       if (busiest->nr_running > 1) {
4165 +               /* Attempt to move tasks */
4166 +               double_lock_balance(this_rq, busiest);
4167 +               /* this_rq->clock is already updated */
4168 +               update_rq_clock(busiest);
4169 +               ld_moved = move_tasks(this_rq, this_cpu, busiest,
4170 +                                       imbalance, sd, CPU_NEWLY_IDLE,
4171 +                                       &all_pinned);
4172 +               double_unlock_balance(this_rq, busiest);
4173 +
4174 +               if (unlikely(all_pinned)) {
4175 +                       cpu_clear(cpu_of(busiest), *cpus);
4176 +                       if (!cpus_empty(*cpus))
4177 +                               goto redo;
4178 +               }
4179 +       }
4180 +
4181 +       if (!ld_moved) {
4182 +               schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
4183 +               if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4184 +                   !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4185 +                       return -1;
4186 +       } else
4187 +               sd->nr_balance_failed = 0;
4188 +
4189 +       update_shares_locked(this_rq, sd);
4190 +       return ld_moved;
4191 +
4192 +out_balanced:
4193 +       schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
4194 +       if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4195 +           !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4196 +               return -1;
4197 +       sd->nr_balance_failed = 0;
4198 +
4199 +       return 0;
4200 +}
4201 +
4202 +/*
4203 + * idle_balance is called by schedule() if this_cpu is about to become
4204 + * idle. Attempts to pull tasks from other CPUs.
4205 + */
4206 +static void idle_balance(int this_cpu, struct rq *this_rq)
4207 +{
4208 +       struct sched_domain *sd;
4209 +       int pulled_task = -1;
4210 +       unsigned long next_balance = jiffies + HZ;
4211 +       cpumask_t tmpmask;
4212 +
4213 +       for_each_domain(this_cpu, sd) {
4214 +               unsigned long interval;
4215 +
4216 +               if (!(sd->flags & SD_LOAD_BALANCE))
4217 +                       continue;
4218 +
4219 +               if (sd->flags & SD_BALANCE_NEWIDLE)
4220 +                       /* If we've pulled tasks over stop searching: */
4221 +                       pulled_task = load_balance_newidle(this_cpu, this_rq,
4222 +                                                          sd, &tmpmask);
4223 +
4224 +               interval = msecs_to_jiffies(sd->balance_interval);
4225 +               if (time_after(next_balance, sd->last_balance + interval))
4226 +                       next_balance = sd->last_balance + interval;
4227 +               if (pulled_task)
4228 +                       break;
4229 +       }
4230 +       if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4231 +               /*
4232 +                * We are going idle. next_balance may be set based on
4233 +                * a busy processor. So reset next_balance.
4234 +                */
4235 +               this_rq->next_balance = next_balance;
4236 +       }
4237 +}
4238 +
4239 +/*
4240 + * active_load_balance is run by migration threads. It pushes running tasks
4241 + * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
4242 + * running on each physical CPU where possible, and avoids physical /
4243 + * logical imbalances.
4244 + *
4245 + * Called with busiest_rq locked.
4246 + */
4247 +static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4248 +{
4249 +       int target_cpu = busiest_rq->push_cpu;
4250 +       struct sched_domain *sd;
4251 +       struct rq *target_rq;
4252 +
4253 +       /* Is there any task to move? */
4254 +       if (busiest_rq->nr_running <= 1)
4255 +               return;
4256 +
4257 +       target_rq = cpu_rq(target_cpu);
4258 +
4259 +       /*
4260 +        * This condition is "impossible", if it occurs
4261 +        * we need to fix it. Originally reported by
4262 +        * Bjorn Helgaas on a 128-cpu setup.
4263 +        */
4264 +       BUG_ON(busiest_rq == target_rq);
4265 +
4266 +       /* move a task from busiest_rq to target_rq */
4267 +       double_lock_balance(busiest_rq, target_rq);
4268 +       update_rq_clock(busiest_rq);
4269 +       update_rq_clock(target_rq);
4270 +
4271 +       /* Search for an sd spanning us and the target CPU. */
4272 +       for_each_domain(target_cpu, sd) {
4273 +               if ((sd->flags & SD_LOAD_BALANCE) &&
4274 +                   cpu_isset(busiest_cpu, sd->span))
4275 +                               break;
4276 +       }
4277 +
4278 +       if (likely(sd)) {
4279 +               schedstat_inc(sd, alb_count);
4280 +
4281 +               if (move_one_task(target_rq, target_cpu, busiest_rq,
4282 +                                 sd, CPU_IDLE))
4283 +                       schedstat_inc(sd, alb_pushed);
4284 +               else
4285 +                       schedstat_inc(sd, alb_failed);
4286 +       }
4287 +       double_unlock_balance(busiest_rq, target_rq);
4288 +}
4289 +
4290 +#ifdef CONFIG_NO_HZ
4291 +static struct {
4292 +       atomic_t load_balancer;
4293 +       cpumask_t cpu_mask;
4294 +} nohz ____cacheline_aligned = {
4295 +       .load_balancer = ATOMIC_INIT(-1),
4296 +       .cpu_mask = CPU_MASK_NONE,
4297 +};
4298 +
4299 +/*
4300 + * This routine will try to nominate the ilb (idle load balancing)
4301 + * owner among the cpus whose ticks are stopped. ilb owner will do the idle
4302 + * load balancing on behalf of all those cpus. If all the cpus in the system
4303 + * go into this tickless mode, then there will be no ilb owner (as there is
4304 + * no need for one) and all the cpus will sleep till the next wakeup event
4305 + * arrives...
4306 + *
4307 + * For the ilb owner, the tick is not stopped, and that tick is used
4308 + * for idle load balancing. The ilb owner remains part of
4309 + * nohz.cpu_mask.
4310 + *
4311 + * While stopping the tick, this cpu becomes the ilb owner if there is
4312 + * no other owner, and it stays the owner until it becomes busy or all
4313 + * cpus in the system stop their ticks, at which point there is no
4314 + * need for an ilb owner.
4315 + *
4316 + * When the ilb owner becomes busy, it nominates another owner, during the
4317 + * next busy scheduler_tick()
4318 + */
4319 +int select_nohz_load_balancer(int stop_tick)
4320 +{
4321 +       int cpu = smp_processor_id();
4322 +
4323 +       if (stop_tick) {
4324 +               cpu_set(cpu, nohz.cpu_mask);
4325 +               cpu_rq(cpu)->in_nohz_recently = 1;
4326 +
4327 +               /*
4328 +                * If we are going offline and still the leader, give up!
4329 +                */
4330 +               if (!cpu_active(cpu) &&
4331 +                   atomic_read(&nohz.load_balancer) == cpu) {
4332 +                       if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4333 +                               BUG();
4334 +                       return 0;
4335 +               }
4336 +
4337 +               /* time for ilb owner also to sleep */
4338 +               if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
4339 +                       if (atomic_read(&nohz.load_balancer) == cpu)
4340 +                               atomic_set(&nohz.load_balancer, -1);
4341 +                       return 0;
4342 +               }
4343 +
4344 +               if (atomic_read(&nohz.load_balancer) == -1) {
4345 +                       /* make me the ilb owner */
4346 +                       if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4347 +                               return 1;
4348 +               } else if (atomic_read(&nohz.load_balancer) == cpu)
4349 +                       return 1;
4350 +       } else {
4351 +               if (!cpu_isset(cpu, nohz.cpu_mask))
4352 +                       return 0;
4353 +
4354 +               cpu_clear(cpu, nohz.cpu_mask);
4355 +
4356 +               if (atomic_read(&nohz.load_balancer) == cpu)
4357 +                       if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4358 +                               BUG();
4359 +       }
4360 +       return 0;
4361 +}
4362 +#endif
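
The election above hinges on atomic_cmpxchg(): a cpu becomes the idle-load-balance owner only if it atomically moves nohz.load_balancer from -1 to its own id, so of several cpus racing to stop their tick exactly one can win. A user-space analogue of that pattern (a GCC/Clang __sync builtin stands in for atomic_cmpxchg):

#include <stdio.h>

static int load_balancer = -1;          /* -1: nobody owns idle load balancing */

static int try_become_ilb_owner(int cpu)
{
        /* the builtin returns the previous value; we won only if it was -1 */
        return __sync_val_compare_and_swap(&load_balancer, -1, cpu) == -1;
}

int main(void)
{
        printf("cpu 2 wins: %d\n", try_become_ilb_owner(2));    /* 1 */
        printf("cpu 5 wins: %d\n", try_become_ilb_owner(5));    /* 0 */
        printf("owner: %d\n", load_balancer);                   /* 2 */
        return 0;
}
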
4363 +
4364 +static DEFINE_SPINLOCK(balancing);
4365 +
4366 +/*
4367 + * It checks each scheduling domain to see if it is due to be balanced,
4368 + * and initiates a balancing operation if so.
4369 + *
4370 + * Balancing parameters are set up in arch_init_sched_domains.
4371 + */
4372 +static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4373 +{
4374 +       int balance = 1;
4375 +       struct rq *rq = cpu_rq(cpu);
4376 +       unsigned long interval;
4377 +       struct sched_domain *sd;
4378 +       /* Earliest time when we have to do rebalance again */
4379 +       unsigned long next_balance = jiffies + 60*HZ;
4380 +       int update_next_balance = 0;
4381 +       int need_serialize;
4382 +       cpumask_t tmp;
4383 +
4384 +       for_each_domain(cpu, sd) {
4385 +               if (!(sd->flags & SD_LOAD_BALANCE))
4386 +                       continue;
4387 +
4388 +               interval = sd->balance_interval;
4389 +               if (idle != CPU_IDLE)
4390 +                       interval *= sd->busy_factor;
4391 +
4392 +               /* scale ms to jiffies */
4393 +               interval = msecs_to_jiffies(interval);
4394 +               if (unlikely(!interval))
4395 +                       interval = 1;
4396 +               if (interval > HZ*NR_CPUS/10)
4397 +                       interval = HZ*NR_CPUS/10;
4398 +
4399 +               need_serialize = sd->flags & SD_SERIALIZE;
4400 +
4401 +               if (need_serialize) {
4402 +                       if (!spin_trylock(&balancing))
4403 +                               goto out;
4404 +               }
4405 +
4406 +               if (time_after_eq(jiffies, sd->last_balance + interval)) {
4407 +                       if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) {
4408 +                               /*
4409 +                                * We've pulled tasks over so either we're no
4410 +                                * longer idle, or one of our SMT siblings is
4411 +                                * not idle.
4412 +                                */
4413 +                               idle = CPU_NOT_IDLE;
4414 +                       }
4415 +                       sd->last_balance = jiffies;
4416 +               }
4417 +               if (need_serialize)
4418 +                       spin_unlock(&balancing);
4419 +out:
4420 +               if (time_after(next_balance, sd->last_balance + interval)) {
4421 +                       next_balance = sd->last_balance + interval;
4422 +                       update_next_balance = 1;
4423 +               }
4424 +
4425 +               /*
4426 +                * Stop the load balance at this level. There is another
4427 +                * CPU in our sched group which is doing load balancing more
4428 +                * actively.
4429 +                */
4430 +               if (!balance)
4431 +                       break;
4432 +       }
4433 +
4434 +       /*
4435 +        * next_balance will be updated only when there is a need.
4436 +        * When the cpu is attached to the null domain, for example, it will
4437 +        * updated.
4438 +        */
4439 +       if (likely(update_next_balance))
4440 +               rq->next_balance = next_balance;
4441 +}
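
The effective rebalance period above is the domain's balance_interval in milliseconds, stretched by busy_factor while the cpu is busy, converted to jiffies and clamped to at least one tick and at most HZ*NR_CPUS/10. A standalone sketch with purely illustrative numbers (HZ, NR_CPUS and the per-domain parameters all depend on the configuration):

#include <stdio.h>

#define HZ      250                     /* illustrative; depends on config */
#define NR_CPUS 64

static unsigned long msecs_to_jiffies_sketch(unsigned long ms)
{
        return (ms * HZ + 999) / 1000;  /* round up, like the kernel helper */
}

int main(void)
{
        unsigned long balance_interval = 8;     /* ms, from the sched_domain */
        unsigned long busy_factor = 64;
        unsigned long interval;

        interval = balance_interval * busy_factor;      /* cpu is not idle */
        interval = msecs_to_jiffies_sketch(interval);
        if (!interval)
                interval = 1;
        if (interval > HZ * NR_CPUS / 10)
                interval = HZ * NR_CPUS / 10;

        printf("busy rebalance at most every %lu jiffies (~%lu ms)\n",
               interval, interval * 1000 / HZ);
        return 0;
}
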
4442 +
4443 +/*
4444 + * run_rebalance_domains is triggered when needed from the scheduler tick.
4445 + * In CONFIG_NO_HZ case, the idle load balance owner will do the
4446 + * rebalancing for all the cpus for whom scheduler ticks are stopped.
4447 + */
4448 +static void run_rebalance_domains(struct softirq_action *h)
4449 +{
4450 +       int this_cpu = smp_processor_id();
4451 +       struct rq *this_rq = cpu_rq(this_cpu);
4452 +       enum cpu_idle_type idle = this_rq->idle_at_tick ?
4453 +                                               CPU_IDLE : CPU_NOT_IDLE;
4454 +
4455 +       rebalance_domains(this_cpu, idle);
4456 +
4457 +#ifdef CONFIG_NO_HZ
4458 +       /*
4459 +        * If this cpu is the owner for idle load balancing, then do the
4460 +        * balancing on behalf of the other idle cpus whose ticks are
4461 +        * stopped.
4462 +        */
4463 +       if (this_rq->idle_at_tick &&
4464 +           atomic_read(&nohz.load_balancer) == this_cpu) {
4465 +               cpumask_t cpus = nohz.cpu_mask;
4466 +               struct rq *rq;
4467 +               int balance_cpu;
4468 +
4469 +               cpu_clear(this_cpu, cpus);
4470 +               for_each_cpu_mask_nr(balance_cpu, cpus) {
4471 +                       /*
4472 +                        * If this cpu gets work to do, stop the load balancing
4473 +                        * work being done for other cpus. Next load
4474 +                        * balancing owner will pick it up.
4475 +                        */
4476 +                       if (need_resched())
4477 +                               break;
4478 +
4479 +                       rebalance_domains(balance_cpu, CPU_IDLE);
4480 +
4481 +                       rq = cpu_rq(balance_cpu);
4482 +                       if (time_after(this_rq->next_balance, rq->next_balance))
4483 +                               this_rq->next_balance = rq->next_balance;
4484 +               }
4485 +       }
4486 +#endif
4487 +}
4488 +
4489 +/*
4490 + * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4491 + *
4492 + * In case of CONFIG_NO_HZ, this is the place where we nominate a new
4493 + * idle load balancing owner or decide to stop the periodic load balancing,
4494 + * if the whole system is idle.
4495 + */
4496 +static inline void trigger_load_balance(struct rq *rq, int cpu)
4497 +{
4498 +#ifdef CONFIG_NO_HZ
4499 +       /*
4500 +        * If we were in the nohz mode recently and busy at the current
4501 +        * scheduler tick, then check if we need to nominate new idle
4502 +        * load balancer.
4503 +        */
4504 +       if (rq->in_nohz_recently && !rq->idle_at_tick) {
4505 +               rq->in_nohz_recently = 0;
4506 +
4507 +               if (atomic_read(&nohz.load_balancer) == cpu) {
4508 +                       cpu_clear(cpu, nohz.cpu_mask);
4509 +                       atomic_set(&nohz.load_balancer, -1);
4510 +               }
4511 +
4512 +               if (atomic_read(&nohz.load_balancer) == -1) {
4513 +                       /*
4514 +                        * simple selection for now: Nominate the
4515 +                        * first cpu in the nohz list to be the next
4516 +                        * ilb owner.
4517 +                        *
4518 +                        * TBD: Traverse the sched domains and nominate
4519 +                        * the nearest cpu in the nohz.cpu_mask.
4520 +                        */
4521 +                       int ilb = first_cpu(nohz.cpu_mask);
4522 +
4523 +                       if (ilb < nr_cpu_ids)
4524 +                               resched_cpu(ilb);
4525 +               }
4526 +       }
4527 +
4528 +       /*
4529 +        * If this cpu is idle and doing idle load balancing for all the
4530 +        * cpus with ticks stopped, is it time for that to stop?
4531 +        */
4532 +       if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4533 +           cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
4534 +               resched_cpu(cpu);
4535 +               return;
4536 +       }
4537 +
4538 +       /*
4539 +        * If this cpu is idle and the idle load balancing is done by
4540 +        * someone else, then there is no need to raise the SCHED_SOFTIRQ
4541 +        */
4542 +       if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4543 +           cpu_isset(cpu, nohz.cpu_mask))
4544 +               return;
4545 +#endif
4546 +       if (time_after_eq(jiffies, rq->next_balance))
4547 +               raise_softirq(SCHED_SOFTIRQ);
4548 +}
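
The raise_softirq(SCHED_SOFTIRQ) above is what eventually runs run_rebalance_domains(); the handler itself is registered elsewhere in this file (in mainline 2.6.27, sched_init() does the hookup), roughly:

        open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
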
4549 +
4550 +#else  /* CONFIG_SMP */
4551 +
4552 +/*
4553 + * on UP we do not need to balance between CPUs:
4554 + */
4555 +static inline void idle_balance(int cpu, struct rq *rq)
4556 +{
4557 +}
4558 +
4559 +#endif
4560 +
4561 +DEFINE_PER_CPU(struct kernel_stat, kstat);
4562 +
4563 +EXPORT_PER_CPU_SYMBOL(kstat);
4564 +
4565 +/*
4566 + * Return p->sum_exec_runtime plus any more ns on the sched_clock
4567 + * that have not yet been banked in case the task is currently running.
4568 + */
4569 +unsigned long long task_sched_runtime(struct task_struct *p)
4570 +{
4571 +       unsigned long flags;
4572 +       u64 ns, delta_exec;
4573 +       struct rq *rq;
4574 +
4575 +       rq = task_rq_lock(p, &flags);
4576 +       ns = p->se.sum_exec_runtime;
4577 +       if (task_current(rq, p)) {
4578 +               update_rq_clock(rq);
4579 +               delta_exec = rq->clock - p->se.exec_start;
4580 +               if ((s64)delta_exec > 0)
4581 +                       ns += delta_exec;
4582 +       }
4583 +       task_rq_unlock(rq, &flags);
4584 +
4585 +       return ns;
4586 +}
4587 +
4588 +/*
4589 + * Account user cpu time to a process.
4590 + * @p: the process that the cpu time gets accounted to
4591 + * @cputime: the cpu time spent in user space since the last update
4592 + */
4593 +void account_user_time(struct task_struct *p, cputime_t cputime)
4594 +{
4595 +       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4596 +       struct vx_info *vxi = p->vx_info;  /* p is _always_ current */
4597 +       cputime64_t tmp;
4598 +       int nice = (TASK_NICE(p) > 0);
4599 +
4600 +       p->utime = cputime_add(p->utime, cputime);
4601 +       vx_account_user(vxi, cputime, nice);
4602 +
4603 +       /* Add user time to cpustat. */
4604 +       tmp = cputime_to_cputime64(cputime);
4605 +       if (nice)
4606 +               cpustat->nice = cputime64_add(cpustat->nice, tmp);
4607 +       else
4608 +               cpustat->user = cputime64_add(cpustat->user, tmp);
4609 +       /* Account for user time used */
4610 +       acct_update_integrals(p);
4611 +}
4612 +
4613 +/*
4614 + * Account guest cpu time to a process.
4615 + * @p: the process that the cpu time gets accounted to
4616 + * @cputime: the cpu time spent in virtual machine since the last update
4617 + */
4618 +static void account_guest_time(struct task_struct *p, cputime_t cputime)
4619 +{
4620 +       cputime64_t tmp;
4621 +       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4622 +
4623 +       tmp = cputime_to_cputime64(cputime);
4624 +
4625 +       p->utime = cputime_add(p->utime, cputime);
4626 +       p->gtime = cputime_add(p->gtime, cputime);
4627 +
4628 +       cpustat->user = cputime64_add(cpustat->user, tmp);
4629 +       cpustat->guest = cputime64_add(cpustat->guest, tmp);
4630 +}
4631 +
4632 +/*
4633 + * Account scaled user cpu time to a process.
4634 + * @p: the process that the cpu time gets accounted to
4635 + * @cputime: the cpu time spent in user space since the last update
4636 + */
4637 +void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
4638 +{
4639 +       p->utimescaled = cputime_add(p->utimescaled, cputime);
4640 +}
4641 +
4642 +/*
4643 + * Account system cpu time to a process.
4644 + * @p: the process that the cpu time gets accounted to
4645 + * @hardirq_offset: the offset to subtract from hardirq_count()
4646 + * @cputime: the cpu time spent in kernel space since the last update
4647 + */
4648 +void account_system_time(struct task_struct *p, int hardirq_offset,
4649 +                        cputime_t cputime)
4650 +{
4651 +       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4652 +       struct vx_info *vxi = p->vx_info;  /* p is _always_ current */
4653 +       struct rq *rq = this_rq();
4654 +       cputime64_t tmp;
4655 +
4656 +       if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
4657 +               account_guest_time(p, cputime);
4658 +               return;
4659 +       }
4660 +
4661 +       p->stime = cputime_add(p->stime, cputime);
4662 +       vx_account_system(vxi, cputime, (p == rq->idle));
4663 +
4664 +       /* Add system time to cpustat. */
4665 +       tmp = cputime_to_cputime64(cputime);
4666 +       if (hardirq_count() - hardirq_offset)
4667 +               cpustat->irq = cputime64_add(cpustat->irq, tmp);
4668 +       else if (softirq_count())
4669 +               cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
4670 +       else if (p != rq->idle)
4671 +               cpustat->system = cputime64_add(cpustat->system, tmp);
4672 +       else if (atomic_read(&rq->nr_iowait) > 0)
4673 +               cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
4674 +       else
4675 +               cpustat->idle = cputime64_add(cpustat->idle, tmp);
4676 +       /* Account for system time used */
4677 +       acct_update_integrals(p);
4678 +}
4679 +
4680 +/*
4681 + * Account scaled system cpu time to a process.
4682 + * @p: the process that the cpu time gets accounted to
4684 + * @cputime: the cpu time spent in kernel space since the last update
4685 + */
4686 +void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
4687 +{
4688 +       p->stimescaled = cputime_add(p->stimescaled, cputime);
4689 +}
4690 +
4691 +/*
4692 + * Account for involuntary wait time.
4693 + * @p: the process from which the cpu time has been stolen
4694 + * @steal: the cpu time spent in involuntary wait
4695 + */
4696 +void account_steal_time(struct task_struct *p, cputime_t steal)
4697 +{
4698 +       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4699 +       cputime64_t tmp = cputime_to_cputime64(steal);
4700 +       struct rq *rq = this_rq();
4701 +
4702 +       if (p == rq->idle) {
4703 +               p->stime = cputime_add(p->stime, steal);
4704 +               if (atomic_read(&rq->nr_iowait) > 0)
4705 +                       cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
4706 +               else
4707 +                       cpustat->idle = cputime64_add(cpustat->idle, tmp);
4708 +       } else
4709 +               cpustat->steal = cputime64_add(cpustat->steal, tmp);
4710 +}
4711 +
4712 +/*
4713 + * Use precise platform statistics if available:
4714 + */
4715 +#ifdef CONFIG_VIRT_CPU_ACCOUNTING
4716 +cputime_t task_utime(struct task_struct *p)
4717 +{
4718 +       return p->utime;
4719 +}
4720 +
4721 +cputime_t task_stime(struct task_struct *p)
4722 +{
4723 +       return p->stime;
4724 +}
4725 +#else
4726 +cputime_t task_utime(struct task_struct *p)
4727 +{
4728 +       clock_t utime = cputime_to_clock_t(p->utime),
4729 +               total = utime + cputime_to_clock_t(p->stime);
4730 +       u64 temp;
4731 +
4732 +       /*
4733 +        * Use CFS's precise accounting:
4734 +        */
4735 +       temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
4736 +
4737 +       if (total) {
4738 +               temp *= utime;
4739 +               do_div(temp, total);
4740 +       }
4741 +       utime = (clock_t)temp;
4742 +
4743 +       p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
4744 +       return p->prev_utime;
4745 +}
4746 +
4747 +cputime_t task_stime(struct task_struct *p)
4748 +{
4749 +       clock_t stime;
4750 +
4751 +       /*
4752 +        * Use CFS's precise accounting. (we subtract utime from
4753 +        * the total, to make sure the total observed by userspace
4754 +        * grows monotonically - apps rely on that):
4755 +        */
4756 +       stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
4757 +                       cputime_to_clock_t(task_utime(p));
4758 +
4759 +       if (stime >= 0)
4760 +               p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
4761 +
4762 +       return p->prev_stime;
4763 +}
4764 +#endif
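For illustration, the proportional split done in task_utime() above, with hypothetical numbers and plain 64-bit division standing in for the kernel's do_div():

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* Hypothetical tick-based samples for one task. */
        uint64_t utime = 30, stime = 10;        /* tick-granular accounting */
        uint64_t total = utime + stime;
        uint64_t sum_exec = 48;                 /* precise CFS runtime, in ticks */

        /* Same split as task_utime(): scale the precise runtime by the
         * utime/total ratio instead of trusting the raw tick counts. */
        uint64_t scaled = total ? sum_exec * utime / total : sum_exec;

        printf("reported utime: %llu ticks\n", (unsigned long long)scaled); /* 36 */
        return 0;
}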
4765 +
4766 +inline cputime_t task_gtime(struct task_struct *p)
4767 +{
4768 +       return p->gtime;
4769 +}
4770 +
4771 +/*
4772 + * This function gets called by the timer code, with HZ frequency.
4773 + * We call it with interrupts disabled.
4774 + *
4775 + * It also gets called by the fork code, when changing the parent's
4776 + * timeslices.
4777 + */
4778 +void scheduler_tick(void)
4779 +{
4780 +       int cpu = smp_processor_id();
4781 +       struct rq *rq = cpu_rq(cpu);
4782 +       struct task_struct *curr = rq->curr;
4783 +
4784 +       sched_clock_tick();
4785 +
4786 +       spin_lock(&rq->lock);
4787 +       update_rq_clock(rq);
4788 +       update_cpu_load(rq);
4789 +       curr->sched_class->task_tick(rq, curr, 0);
4790 +       spin_unlock(&rq->lock);
4791 +
4792 +#ifdef CONFIG_SMP
4793 +       rq->idle_at_tick = idle_cpu(cpu);
4794 +       trigger_load_balance(rq, cpu);
4795 +#endif
4796 +}
4797 +
4798 +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4799 +                               defined(CONFIG_PREEMPT_TRACER))
4800 +
4801 +static inline unsigned long get_parent_ip(unsigned long addr)
4802 +{
4803 +       if (in_lock_functions(addr)) {
4804 +               addr = CALLER_ADDR2;
4805 +               if (in_lock_functions(addr))
4806 +                       addr = CALLER_ADDR3;
4807 +       }
4808 +       return addr;
4809 +}
4810 +
4811 +void __kprobes add_preempt_count(int val)
4812 +{
4813 +#ifdef CONFIG_DEBUG_PREEMPT
4814 +       /*
4815 +        * Underflow?
4816 +        */
4817 +       if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
4818 +               return;
4819 +#endif
4820 +       preempt_count() += val;
4821 +#ifdef CONFIG_DEBUG_PREEMPT
4822 +       /*
4823 +        * Spinlock count overflowing soon?
4824 +        */
4825 +       DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
4826 +                               PREEMPT_MASK - 10);
4827 +#endif
4828 +       if (preempt_count() == val)
4829 +               trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4830 +}
4831 +EXPORT_SYMBOL(add_preempt_count);
4832 +
4833 +void __kprobes sub_preempt_count(int val)
4834 +{
4835 +#ifdef CONFIG_DEBUG_PREEMPT
4836 +       /*
4837 +        * Underflow?
4838 +        */
4839 +       if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
4840 +               return;
4841 +       /*
4842 +        * Is the spinlock portion underflowing?
4843 +        */
4844 +       if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4845 +                       !(preempt_count() & PREEMPT_MASK)))
4846 +               return;
4847 +#endif
4848 +
4849 +       if (preempt_count() == val)
4850 +               trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4851 +       preempt_count() -= val;
4852 +}
4853 +EXPORT_SYMBOL(sub_preempt_count);
4854 +
4855 +#endif
4856 +
4857 +/*
4858 + * Print scheduling while atomic bug:
4859 + */
4860 +static noinline void __schedule_bug(struct task_struct *prev)
4861 +{
4862 +       struct pt_regs *regs = get_irq_regs();
4863 +
4864 +       printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
4865 +               prev->comm, prev->pid, preempt_count());
4866 +
4867 +       debug_show_held_locks(prev);
4868 +       print_modules();
4869 +       if (irqs_disabled())
4870 +               print_irqtrace_events(prev);
4871 +
4872 +       if (regs)
4873 +               show_regs(regs);
4874 +       else
4875 +               dump_stack();
4876 +}
4877 +
4878 +/*
4879 + * Various schedule()-time debugging checks and statistics:
4880 + */
4881 +static inline void schedule_debug(struct task_struct *prev)
4882 +{
4883 +       /*
4884 +        * Test if we are atomic. Since do_exit() needs to call into
4885 +        * schedule() atomically, we ignore that path for now.
4886 +        * Otherwise, whine if we are scheduling when we should not be.
4887 +        */
4888 +       if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
4889 +               __schedule_bug(prev);
4890 +
4891 +       profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4892 +
4893 +       schedstat_inc(this_rq(), sched_count);
4894 +#ifdef CONFIG_SCHEDSTATS
4895 +       if (unlikely(prev->lock_depth >= 0)) {
4896 +               schedstat_inc(this_rq(), bkl_count);
4897 +               schedstat_inc(prev, sched_info.bkl_count);
4898 +       }
4899 +#endif
4900 +}
4901 +
4902 +/*
4903 + * Pick up the highest-prio task:
4904 + */
4905 +static inline struct task_struct *
4906 +pick_next_task(struct rq *rq, struct task_struct *prev)
4907 +{
4908 +       const struct sched_class *class;
4909 +       struct task_struct *p;
4910 +
4911 +       /*
4912 +        * Optimization: we know that if all tasks are in
4913 +        * the fair class we can call that function directly:
4914 +        */
4915 +       if (likely(rq->nr_running == rq->cfs.nr_running)) {
4916 +               p = fair_sched_class.pick_next_task(rq);
4917 +               if (likely(p))
4918 +                       return p;
4919 +       }
4920 +
4921 +       class = sched_class_highest;
4922 +       for ( ; ; ) {
4923 +               p = class->pick_next_task(rq);
4924 +               if (p)
4925 +                       return p;
4926 +               /*
4927 +                * Will never be NULL as the idle class always
4928 +                * returns a non-NULL p:
4929 +                */
4930 +               class = class->next;
4931 +       }
4932 +}
4933 +
4934 +void (*rec_event)(void *, unsigned int) = NULL;
4935 +EXPORT_SYMBOL(rec_event);
4936 +#ifdef CONFIG_CHOPSTIX
4937 +
4938 +struct event_spec {
4939 +       unsigned long pc;
4940 +       unsigned long dcookie;
4941 +       unsigned int count;
4942 +       unsigned int reason;
4943 +};
4944 +
4945 +/* C wrapper so the assembly syscall probes can call the rec_event hook. */
4946 +asmlinkage void rec_event_asm(struct event *event_signature_in, unsigned int count)
4947 +{
4948 +       struct pt_regs *regs = task_pt_regs(current);
4949 +       struct event_spec *es = event_signature_in->event_data;
4950 +       event_signature_in->task = current;
4951 +       es->pc = regs->ip;      /* user-space PC at the time of the event */
4952 +       event_signature_in->count = 1;
4953 +       (*rec_event)(event_signature_in, count);
4954 +}
4955 +#endif
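rec_event above is a NULL-initialized, exported hook that the probes test before recording anything; a minimal sketch of how a collector module might arm it (chopstix_handler and the init/exit names are hypothetical -- the real Chopstix collector is a separate module and may register differently):

#include <linux/module.h>

extern void (*rec_event)(void *, unsigned int);

/* Hypothetical collector: summarize each event into a lossy data structure. */
static void chopstix_handler(void *event_signature, unsigned int count)
{
        /* ... aggregate into per-CPU buckets, export to user space ... */
}

static int __init chopstix_collector_init(void)
{
        rec_event = chopstix_handler;   /* arms every probe site at once */
        return 0;
}

static void __exit chopstix_collector_exit(void)
{
        /* A real collector must also synchronize against in-flight callers
         * before the handler text goes away. */
        rec_event = NULL;
}

module_init(chopstix_collector_init);
module_exit(chopstix_collector_exit);
MODULE_LICENSE("GPL");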
4956 +
4957 +/*
4958 + * schedule() is the main scheduler function.
4959 + */
4960 +asmlinkage void __sched schedule(void)
4961 +{
4962 +       struct task_struct *prev, *next;
4963 +       unsigned long *switch_count;
4964 +       struct rq *rq;
4965 +       int cpu;
4966 +
4967 +need_resched:
4968 +       preempt_disable();
4969 +       cpu = smp_processor_id();
4970 +       rq = cpu_rq(cpu);
4971 +       rcu_qsctr_inc(cpu);
4972 +       prev = rq->curr;
4973 +       switch_count = &prev->nivcsw;
4974 +
4975 +       release_kernel_lock(prev);
4976 +need_resched_nonpreemptible:
4977 +
4978 +       schedule_debug(prev);
4979 +
4980 +       if (sched_feat(HRTICK))
4981 +               hrtick_clear(rq);
4982 +
4983 +       /*
4984 +        * Do the rq-clock update outside the rq lock:
4985 +        */
4986 +       local_irq_disable();
4987 +       update_rq_clock(rq);
4988 +       spin_lock(&rq->lock);
4989 +       clear_tsk_need_resched(prev);
4990 +
4991 +       if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
4992 +               if (unlikely(signal_pending_state(prev->state, prev)))
4993 +                       prev->state = TASK_RUNNING;
4994 +               else
4995 +                       deactivate_task(rq, prev, 1);
4996 +               switch_count = &prev->nvcsw;
4997 +       }
4998 +
4999 +#ifdef CONFIG_SMP
5000 +       if (prev->sched_class->pre_schedule)
5001 +               prev->sched_class->pre_schedule(rq, prev);
5002 +#endif
5003 +
5004 +       if (unlikely(!rq->nr_running))
5005 +               idle_balance(cpu, rq);
5006 +
5007 +       prev->sched_class->put_prev_task(rq, prev);
5008 +       next = pick_next_task(rq, prev);
5009 +
5010 +       if (likely(prev != next)) {
5011 +               sched_info_switch(prev, next);
5012 +
5013 +               rq->nr_switches++;
5014 +               rq->curr = next;
5015 +               ++*switch_count;
5016 +
5017 +               context_switch(rq, prev, next); /* unlocks the rq */
5018 +               /*
5019 +                * the context switch might have flipped the stack from under
5020 +                * us, hence refresh the local variables.
5021 +                */
5022 +               cpu = smp_processor_id();
5023 +               rq = cpu_rq(cpu);
5024 +       } else
5025 +               spin_unlock_irq(&rq->lock);
5026 +
5027 +       if (unlikely(reacquire_kernel_lock(current) < 0))
5028 +               goto need_resched_nonpreemptible;
5029 +
5030 +       preempt_enable_no_resched();
5031 +       if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
5032 +               goto need_resched;
5033 +}
5034 +EXPORT_SYMBOL(schedule);
5035 +
5036 +#ifdef CONFIG_PREEMPT
5037 +/*
5038 + * this is the entry point to schedule() from in-kernel preemption
5039 + * off of preempt_enable. Kernel preemptions off return from interrupt
5040 + * occur there and call schedule directly.
5041 + */
5042 +asmlinkage void __sched preempt_schedule(void)
5043 +{
5044 +       struct thread_info *ti = current_thread_info();
5045 +
5046 +       /*
5047 +        * If there is a non-zero preempt_count or interrupts are disabled,
5048 +        * we do not want to preempt the current task. Just return..
5049 +        */
5050 +       if (likely(ti->preempt_count || irqs_disabled()))
5051 +               return;
5052 +
5053 +       do {
5054 +               add_preempt_count(PREEMPT_ACTIVE);
5055 +               schedule();
5056 +               sub_preempt_count(PREEMPT_ACTIVE);
5057 +
5058 +               /*
5059 +                * Check again in case we missed a preemption opportunity
5060 +                * between schedule and now.
5061 +                */
5062 +               barrier();
5063 +       } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
5064 +}
5065 +EXPORT_SYMBOL(preempt_schedule);
5066 +
5067 +/*
5068 + * this is the entry point to schedule() from kernel preemption
5069 + * off of irq context.
5070 + * Note, that this is called and return with irqs disabled. This will
5071 + * protect us against recursive calling from irq.
5072 + */
5073 +asmlinkage void __sched preempt_schedule_irq(void)
5074 +{
5075 +       struct thread_info *ti = current_thread_info();
5076 +
5077 +       /* Catch callers which need to be fixed */
5078 +       BUG_ON(ti->preempt_count || !irqs_disabled());
5079 +
5080 +       do {
5081 +               add_preempt_count(PREEMPT_ACTIVE);
5082 +               local_irq_enable();
5083 +               schedule();
5084 +               local_irq_disable();
5085 +               sub_preempt_count(PREEMPT_ACTIVE);
5086 +
5087 +               /*
5088 +                * Check again in case we missed a preemption opportunity
5089 +                * between schedule and now.
5090 +                */
5091 +               barrier();
5092 +       } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
5093 +}
5094 +
5095 +#endif /* CONFIG_PREEMPT */
5096 +
5097 +int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
5098 +                         void *key)
5099 +{
5100 +       return try_to_wake_up(curr->private, mode, sync);
5101 +}
5102 +EXPORT_SYMBOL(default_wake_function);
5103 +
5104 +/*
5105 + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
5106 + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
5107 + * number) then we wake all the non-exclusive tasks and one exclusive task.
5108 + *
5109 + * There are circumstances in which we can try to wake a task which has already
5110 + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
5111 + * zero in this (rare) case, and we handle it by continuing to scan the queue.
5112 + */
5113 +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5114 +                            int nr_exclusive, int sync, void *key)
5115 +{
5116 +       wait_queue_t *curr, *next;
5117 +
5118 +       list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
5119 +               unsigned flags = curr->flags;
5120 +
5121 +               if (curr->func(curr, mode, sync, key) &&
5122 +                               (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
5123 +                       break;
5124 +       }
5125 +}
5126 +
5127 +/**
5128 + * __wake_up - wake up threads blocked on a waitqueue.
5129 + * @q: the waitqueue
5130 + * @mode: which threads
5131 + * @nr_exclusive: how many wake-one or wake-many threads to wake up
5132 + * @key: is directly passed to the wakeup function
5133 + */
5134 +void __wake_up(wait_queue_head_t *q, unsigned int mode,
5135 +                       int nr_exclusive, void *key)
5136 +{
5137 +       unsigned long flags;
5138 +
5139 +       spin_lock_irqsave(&q->lock, flags);
5140 +       __wake_up_common(q, mode, nr_exclusive, 0, key);
5141 +       spin_unlock_irqrestore(&q->lock, flags);
5142 +}
5143 +EXPORT_SYMBOL(__wake_up);
5144 +
5145 +/*
5146 + * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
5147 + */
5148 +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
5149 +{
5150 +       __wake_up_common(q, mode, 1, 0, NULL);
5151 +}
5152 +
5153 +/**
5154 + * __wake_up_sync - wake up threads blocked on a waitqueue.
5155 + * @q: the waitqueue
5156 + * @mode: which threads
5157 + * @nr_exclusive: how many wake-one or wake-many threads to wake up
5158 + *
5159 + * The sync wakeup differs that the waker knows that it will schedule
5160 + * away soon, so while the target thread will be woken up, it will not
5161 + * be migrated to another CPU - ie. the two threads are 'synchronized'
5162 + * with each other. This can prevent needless bouncing between CPUs.
5163 + *
5164 + * On UP it can prevent extra preemption.
5165 + */
5166 +void
5167 +__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
5168 +{
5169 +       unsigned long flags;
5170 +       int sync = 1;
5171 +
5172 +       if (unlikely(!q))
5173 +               return;
5174 +
5175 +       if (unlikely(!nr_exclusive))
5176 +               sync = 0;
5177 +
5178 +       spin_lock_irqsave(&q->lock, flags);
5179 +       __wake_up_common(q, mode, nr_exclusive, sync, NULL);
5180 +       spin_unlock_irqrestore(&q->lock, flags);
5181 +}
5182 +EXPORT_SYMBOL_GPL(__wake_up_sync);     /* For internal use only */
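For context, these wakeup paths are usually reached through the generic wait-queue helpers; a minimal sleeper/waker sketch (generic illustration, not code from this patch):

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(my_wq);
static int condition;

/* Sleeper: blocks until 'condition' becomes true or a signal arrives. */
static int consumer(void)
{
        return wait_event_interruptible(my_wq, condition != 0);
}

/* Waker: ends up in __wake_up() -> __wake_up_common() above. */
static void producer(void)
{
        condition = 1;
        wake_up(&my_wq);        /* wake_up() expands to __wake_up(q, TASK_NORMAL, 1, NULL) */
}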
5183 +
5184 +void complete(struct completion *x)
5185 +{
5186 +       unsigned long flags;
5187 +
5188 +       spin_lock_irqsave(&x->wait.lock, flags);
5189 +       x->done++;
5190 +       __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
5191 +       spin_unlock_irqrestore(&x->wait.lock, flags);
5192 +}
5193 +EXPORT_SYMBOL(complete);
5194 +
5195 +void complete_all(struct completion *x)
5196 +{
5197 +       unsigned long flags;
5198 +
5199 +       spin_lock_irqsave(&x->wait.lock, flags);
5200 +       x->done += UINT_MAX/2;
5201 +       __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
5202 +       spin_unlock_irqrestore(&x->wait.lock, flags);
5203 +}
5204 +EXPORT_SYMBOL(complete_all);
5205 +
5206 +static inline long __sched
5207 +do_wait_for_common(struct completion *x, long timeout, int state)
5208 +{
5209 +       if (!x->done) {
5210 +               DECLARE_WAITQUEUE(wait, current);
5211 +
5212 +               wait.flags |= WQ_FLAG_EXCLUSIVE;
5213 +               __add_wait_queue_tail(&x->wait, &wait);
5214 +               do {
5215 +                       if ((state == TASK_INTERRUPTIBLE &&
5216 +                            signal_pending(current)) ||
5217 +                           (state == TASK_KILLABLE &&
5218 +                            fatal_signal_pending(current))) {
5219 +                               timeout = -ERESTARTSYS;
5220 +                               break;
5221 +                       }
5222 +                       __set_current_state(state);
5223 +                       spin_unlock_irq(&x->wait.lock);
5224 +                       timeout = schedule_timeout(timeout);
5225 +                       spin_lock_irq(&x->wait.lock);
5226 +               } while (!x->done && timeout);
5227 +               __remove_wait_queue(&x->wait, &wait);
5228 +               if (!x->done)
5229 +                       return timeout;
5230 +       }
5231 +       x->done--;
5232 +       return timeout ?: 1;
5233 +}
5234 +
5235 +static long __sched
5236 +wait_for_common(struct completion *x, long timeout, int state)
5237 +{
5238 +       might_sleep();
5239 +
5240 +       spin_lock_irq(&x->wait.lock);
5241 +       timeout = do_wait_for_common(x, timeout, state);
5242 +       spin_unlock_irq(&x->wait.lock);
5243 +       return timeout;
5244 +}
5245 +
5246 +void __sched wait_for_completion(struct completion *x)
5247 +{
5248 +       wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
5249 +}
5250 +EXPORT_SYMBOL(wait_for_completion);
5251 +
5252 +unsigned long __sched
5253 +wait_for_completion_timeout(struct completion *x, unsigned long timeout)
5254 +{
5255 +       return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
5256 +}
5257 +EXPORT_SYMBOL(wait_for_completion_timeout);
5258 +
5259 +int __sched wait_for_completion_interruptible(struct completion *x)
5260 +{
5261 +       long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
5262 +       if (t == -ERESTARTSYS)
5263 +               return t;
5264 +       return 0;
5265 +}
5266 +EXPORT_SYMBOL(wait_for_completion_interruptible);
5267 +
5268 +unsigned long __sched
5269 +wait_for_completion_interruptible_timeout(struct completion *x,
5270 +                                         unsigned long timeout)
5271 +{
5272 +       return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
5273 +}
5274 +EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
5275 +
5276 +int __sched wait_for_completion_killable(struct completion *x)
5277 +{
5278 +       long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
5279 +       if (t == -ERESTARTSYS)
5280 +               return t;
5281 +       return 0;
5282 +}
5283 +EXPORT_SYMBOL(wait_for_completion_killable);
5284 +
5285 +/**
5286 + *     try_wait_for_completion - try to decrement a completion without blocking
5287 + *     @x:     completion structure
5288 + *
5289 + *     Returns: 0 if a decrement cannot be done without blocking
5290 + *              1 if a decrement succeeded.
5291 + *
5292 + *     If a completion is being used as a counting completion,
5293 + *     attempt to decrement the counter without blocking. This
5294 + *     enables us to avoid waiting if the resource the completion
5295 + *     is protecting is not available.
5296 + */
5297 +bool try_wait_for_completion(struct completion *x)
5298 +{
5299 +       int ret = 1;
5300 +
5301 +       spin_lock_irq(&x->wait.lock);
5302 +       if (!x->done)
5303 +               ret = 0;
5304 +       else
5305 +               x->done--;
5306 +       spin_unlock_irq(&x->wait.lock);
5307 +       return ret;
5308 +}
5309 +EXPORT_SYMBOL(try_wait_for_completion);
5310 +
5311 +/**
5312 + *     completion_done - Test to see if a completion has any waiters
5313 + *     @x:     completion structure
5314 + *
5315 + *     Returns: 0 if there are waiters (wait_for_completion() in progress)
5316 + *              1 if there are no waiters.
5317 + *
5318 + */
5319 +bool completion_done(struct completion *x)
5320 +{
5321 +       int ret = 1;
5322 +
5323 +       spin_lock_irq(&x->wait.lock);
5324 +       if (!x->done)
5325 +               ret = 0;
5326 +       spin_unlock_irq(&x->wait.lock);
5327 +       return ret;
5328 +}
5329 +EXPORT_SYMBOL(completion_done);
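A typical pairing of the completion primitives above (generic in-kernel illustration, not code from this patch):

#include <linux/completion.h>
#include <linux/kthread.h>
#include <linux/err.h>

static DECLARE_COMPLETION(setup_done);

static int worker_fn(void *data)
{
        /* ... one-time initialization work ... */
        complete(&setup_done);          /* bumps ->done and wakes one waiter */
        return 0;
}

static int start_and_wait(void)
{
        struct task_struct *t = kthread_run(worker_fn, NULL, "setup-worker");

        if (IS_ERR(t))
                return PTR_ERR(t);
        /* Sleeps in do_wait_for_common() until ->done is non-zero,
         * or returns -ERESTARTSYS if interrupted by a signal. */
        return wait_for_completion_interruptible(&setup_done);
}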
5330 +
5331 +static long __sched
5332 +sleep_on_common(wait_queue_head_t *q, int state, long timeout)
5333 +{
5334 +       unsigned long flags;
5335 +       wait_queue_t wait;
5336 +
5337 +       init_waitqueue_entry(&wait, current);
5338 +
5339 +       __set_current_state(state);
5340 +
5341 +       spin_lock_irqsave(&q->lock, flags);
5342 +       __add_wait_queue(q, &wait);
5343 +       spin_unlock(&q->lock);
5344 +       timeout = schedule_timeout(timeout);
5345 +       spin_lock_irq(&q->lock);
5346 +       __remove_wait_queue(q, &wait);
5347 +       spin_unlock_irqrestore(&q->lock, flags);
5348 +
5349 +       return timeout;
5350 +}
5351 +
5352 +void __sched interruptible_sleep_on(wait_queue_head_t *q)
5353 +{
5354 +       sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
5355 +}
5356 +EXPORT_SYMBOL(interruptible_sleep_on);
5357 +
5358 +long __sched
5359 +interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
5360 +{
5361 +       return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
5362 +}
5363 +EXPORT_SYMBOL(interruptible_sleep_on_timeout);
5364 +
5365 +void __sched sleep_on(wait_queue_head_t *q)
5366 +{
5367 +       sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
5368 +}
5369 +EXPORT_SYMBOL(sleep_on);
5370 +
5371 +long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
5372 +{
5373 +       return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
5374 +}
5375 +EXPORT_SYMBOL(sleep_on_timeout);
5376 +
5377 +#ifdef CONFIG_RT_MUTEXES
5378 +
5379 +/*
5380 + * rt_mutex_setprio - set the current priority of a task
5381 + * @p: task
5382 + * @prio: prio value (kernel-internal form)
5383 + *
5384 + * This function changes the 'effective' priority of a task. It does
5385 + * not touch ->normal_prio like __setscheduler().
5386 + *
5387 + * Used by the rt_mutex code to implement priority inheritance logic.
5388 + */
5389 +void rt_mutex_setprio(struct task_struct *p, int prio)
5390 +{
5391 +       unsigned long flags;
5392 +       int oldprio, on_rq, running;
5393 +       struct rq *rq;
5394 +       const struct sched_class *prev_class = p->sched_class;
5395 +
5396 +       BUG_ON(prio < 0 || prio > MAX_PRIO);
5397 +
5398 +       rq = task_rq_lock(p, &flags);
5399 +       update_rq_clock(rq);
5400 +
5401 +       oldprio = p->prio;
5402 +       on_rq = p->se.on_rq;
5403 +       running = task_current(rq, p);
5404 +       if (on_rq)
5405 +               dequeue_task(rq, p, 0);
5406 +       if (running)
5407 +               p->sched_class->put_prev_task(rq, p);
5408 +
5409 +       if (rt_prio(prio))
5410 +               p->sched_class = &rt_sched_class;
5411 +       else
5412 +               p->sched_class = &fair_sched_class;
5413 +
5414 +       p->prio = prio;
5415 +
5416 +       if (running)
5417 +               p->sched_class->set_curr_task(rq);
5418 +       if (on_rq) {
5419 +               enqueue_task(rq, p, 0);
5420 +
5421 +               check_class_changed(rq, p, prev_class, oldprio, running);
5422 +       }
5423 +       task_rq_unlock(rq, &flags);
5424 +}
5425 +
5426 +#endif
5427 +
5428 +void set_user_nice(struct task_struct *p, long nice)
5429 +{
5430 +       int old_prio, delta, on_rq;
5431 +       unsigned long flags;
5432 +       struct rq *rq;
5433 +
5434 +       if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
5435 +               return;
5436 +       /*
5437 +        * We have to be careful, if called from sys_setpriority(),
5438 +        * the task might be in the middle of scheduling on another CPU.
5439 +        */
5440 +       rq = task_rq_lock(p, &flags);
5441 +       update_rq_clock(rq);
5442 +       /*
5443 +        * The RT priorities are set via sched_setscheduler(), but we still
5444 +        * allow the 'normal' nice value to be set - but as expected
5445 +        * it won't have any effect on scheduling until the task is
5446 +        * SCHED_FIFO/SCHED_RR:
5447 +        */
5448 +       if (task_has_rt_policy(p)) {
5449 +               p->static_prio = NICE_TO_PRIO(nice);
5450 +               goto out_unlock;
5451 +       }
5452 +       on_rq = p->se.on_rq;
5453 +       if (on_rq)
5454 +               dequeue_task(rq, p, 0);
5455 +
5456 +       p->static_prio = NICE_TO_PRIO(nice);
5457 +       set_load_weight(p);
5458 +       old_prio = p->prio;
5459 +       p->prio = effective_prio(p);
5460 +       delta = p->prio - old_prio;
5461 +
5462 +       if (on_rq) {
5463 +               enqueue_task(rq, p, 0);
5464 +               /*
5465 +                * If the task increased its priority or is running and
5466 +                * lowered its priority, then reschedule its CPU:
5467 +                */
5468 +               if (delta < 0 || (delta > 0 && task_running(rq, p)))
5469 +                       resched_task(rq->curr);
5470 +       }
5471 +out_unlock:
5472 +       task_rq_unlock(rq, &flags);
5473 +}
5474 +EXPORT_SYMBOL(set_user_nice);
5475 +
5476 +/*
5477 + * can_nice - check if a task can reduce its nice value
5478 + * @p: task
5479 + * @nice: nice value
5480 + */
5481 +int can_nice(const struct task_struct *p, const int nice)
5482 +{
5483 +       /* convert nice value [19,-20] to rlimit style value [1,40] */
5484 +       int nice_rlim = 20 - nice;
5485 +
5486 +       return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
5487 +               capable(CAP_SYS_NICE));
5488 +}
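Worked example of the conversion above: nice 19 maps to nice_rlim 20 - 19 = 1 and nice -20 maps to 20 - (-20) = 40, so an RLIMIT_NICE soft limit of 40 permits any nice value while a limit of 1 only permits the weakest priority.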
5489 +
5490 +#ifdef __ARCH_WANT_SYS_NICE
5491 +
5492 +/*
5493 + * sys_nice - change the priority of the current process.
5494 + * @increment: priority increment
5495 + *
5496 + * sys_setpriority is a more generic, but much slower function that
5497 + * does similar things.
5498 + */
5499 +SYSCALL_DEFINE1(nice, int, increment)
5500 +{
5501 +       long nice, retval;
5502 +
5503 +       /*
5504 +        * Setpriority might change our priority at the same moment.
5505 +        * We don't have to worry. Conceptually one call occurs first
5506 +        * and we have a single winner.
5507 +        */
5508 +       if (increment < -40)
5509 +               increment = -40;
5510 +       if (increment > 40)
5511 +               increment = 40;
5512 +
5513 +       nice = PRIO_TO_NICE(current->static_prio) + increment;
5514 +       if (nice < -20)
5515 +               nice = -20;
5516 +       if (nice > 19)
5517 +               nice = 19;
5518 +
5519 +       if (increment < 0 && !can_nice(current, nice))
5520 +               return vx_flags(VXF_IGNEG_NICE, 0) ? 0 : -EPERM;
5521 +
5522 +       retval = security_task_setnice(current, nice);
5523 +       if (retval)
5524 +               return retval;
5525 +
5526 +       set_user_nice(current, nice);
5527 +       return 0;
5528 +}
5529 +
5530 +#endif
5531 +
5532 +/**
5533 + * task_prio - return the priority value of a given task.
5534 + * @p: the task in question.
5535 + *
5536 + * This is the priority value as seen by users in /proc.
5537 + * RT tasks are offset by -200. Normal tasks are centered
5538 + * around 0, value goes from -16 to +15.
5539 + */
5540 +int task_prio(const struct task_struct *p)
5541 +{
5542 +       return p->prio - MAX_RT_PRIO;
5543 +}
5544 +
5545 +/**
5546 + * task_nice - return the nice value of a given task.
5547 + * @p: the task in question.
5548 + */
5549 +int task_nice(const struct task_struct *p)
5550 +{
5551 +       return TASK_NICE(p);
5552 +}
5553 +EXPORT_SYMBOL(task_nice);
5554 +
5555 +/**
5556 + * idle_cpu - is a given cpu idle currently?
5557 + * @cpu: the processor in question.
5558 + */
5559 +int idle_cpu(int cpu)
5560 +{
5561 +       return cpu_curr(cpu) == cpu_rq(cpu)->idle;
5562 +}
5563 +
5564 +/**
5565 + * idle_task - return the idle task for a given cpu.
5566 + * @cpu: the processor in question.
5567 + */
5568 +struct task_struct *idle_task(int cpu)
5569 +{
5570 +       return cpu_rq(cpu)->idle;
5571 +}
5572 +
5573 +/**
5574 + * find_process_by_pid - find a process with a matching PID value.
5575 + * @pid: the pid in question.
5576 + */
5577 +static struct task_struct *find_process_by_pid(pid_t pid)
5578 +{
5579 +       return pid ? find_task_by_vpid(pid) : current;
5580 +}
5581 +
5582 +/* Actually do priority change: must hold rq lock. */
5583 +static void
5584 +__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
5585 +{
5586 +       BUG_ON(p->se.on_rq);
5587 +
5588 +       p->policy = policy;
5589 +       switch (p->policy) {
5590 +       case SCHED_NORMAL:
5591 +       case SCHED_BATCH:
5592 +       case SCHED_IDLE:
5593 +               p->sched_class = &fair_sched_class;
5594 +               break;
5595 +       case SCHED_FIFO:
5596 +       case SCHED_RR:
5597 +               p->sched_class = &rt_sched_class;
5598 +               break;
5599 +       }
5600 +
5601 +       p->rt_priority = prio;
5602 +       p->normal_prio = normal_prio(p);
5603 +       /* we are holding p->pi_lock already */
5604 +       p->prio = rt_mutex_getprio(p);
5605 +       set_load_weight(p);
5606 +}
5607 +
5608 +static int __sched_setscheduler(struct task_struct *p, int policy,
5609 +                               struct sched_param *param, bool user)
5610 +{
5611 +       int retval, oldprio, oldpolicy = -1, on_rq, running;
5612 +       unsigned long flags;
5613 +       const struct sched_class *prev_class = p->sched_class;
5614 +       struct rq *rq;
5615 +
5616 +       /* may grab non-irq protected spin_locks */
5617 +       BUG_ON(in_interrupt());
5618 +recheck:
5619 +       /* double check policy once rq lock held */
5620 +       if (policy < 0)
5621 +               policy = oldpolicy = p->policy;
5622 +       else if (policy != SCHED_FIFO && policy != SCHED_RR &&
5623 +                       policy != SCHED_NORMAL && policy != SCHED_BATCH &&
5624 +                       policy != SCHED_IDLE)
5625 +               return -EINVAL;
5626 +       /*
5627 +        * Valid priorities for SCHED_FIFO and SCHED_RR are
5628 +        * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
5629 +        * SCHED_BATCH and SCHED_IDLE is 0.
5630 +        */
5631 +       if (param->sched_priority < 0 ||
5632 +           (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
5633 +           (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
5634 +               return -EINVAL;
5635 +       if (rt_policy(policy) != (param->sched_priority != 0))
5636 +               return -EINVAL;
5637 +
5638 +       /*
5639 +        * Allow unprivileged RT tasks to decrease priority:
5640 +        */
5641 +       if (user && !capable(CAP_SYS_NICE)) {
5642 +               if (rt_policy(policy)) {
5643 +                       unsigned long rlim_rtprio;
5644 +
5645 +                       if (!lock_task_sighand(p, &flags))
5646 +                               return -ESRCH;
5647 +                       rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
5648 +                       unlock_task_sighand(p, &flags);
5649 +
5650 +                       /* can't set/change the rt policy */
5651 +                       if (policy != p->policy && !rlim_rtprio)
5652 +                               return -EPERM;
5653 +
5654 +                       /* can't increase priority */
5655 +                       if (param->sched_priority > p->rt_priority &&
5656 +                           param->sched_priority > rlim_rtprio)
5657 +                               return -EPERM;
5658 +               }
5659 +               /*
5660 +                * Like positive nice levels, don't allow tasks to
5661 +                * move out of SCHED_IDLE either:
5662 +                */
5663 +               if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
5664 +                       return -EPERM;
5665 +
5666 +               /* can't change other user's priorities */
5667 +               if ((current->euid != p->euid) &&
5668 +                   (current->euid != p->uid))
5669 +                       return -EPERM;
5670 +       }
5671 +
5672 +       if (user) {
5673 +#ifdef CONFIG_RT_GROUP_SCHED
5674 +               /*
5675 +                * Do not allow realtime tasks into groups that have no runtime
5676 +                * assigned.
5677 +                */
5678 +               if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
5679 +                       return -EPERM;
5680 +#endif
5681 +
5682 +               retval = security_task_setscheduler(p, policy, param);
5683 +               if (retval)
5684 +                       return retval;
5685 +       }
5686 +
5687 +       /*
5688 +        * make sure no PI-waiters arrive (or leave) while we are
5689 +        * changing the priority of the task:
5690 +        */
5691 +       spin_lock_irqsave(&p->pi_lock, flags);
5692 +       /*
5693 +        * To be able to change p->policy safely, the appropriate
5694 +        * runqueue lock must be held.
5695 +        */
5696 +       rq = __task_rq_lock(p);
5697 +       /* recheck policy now with rq lock held */
5698 +       if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
5699 +               policy = oldpolicy = -1;
5700 +               __task_rq_unlock(rq);
5701 +               spin_unlock_irqrestore(&p->pi_lock, flags);
5702 +               goto recheck;
5703 +       }
5704 +       update_rq_clock(rq);
5705 +       on_rq = p->se.on_rq;
5706 +       running = task_current(rq, p);
5707 +       if (on_rq)
5708 +               deactivate_task(rq, p, 0);
5709 +       if (running)
5710 +               p->sched_class->put_prev_task(rq, p);
5711 +
5712 +       oldprio = p->prio;
5713 +       __setscheduler(rq, p, policy, param->sched_priority);
5714 +
5715 +       if (running)
5716 +               p->sched_class->set_curr_task(rq);
5717 +       if (on_rq) {
5718 +               activate_task(rq, p, 0);
5719 +
5720 +               check_class_changed(rq, p, prev_class, oldprio, running);
5721 +       }
5722 +       __task_rq_unlock(rq);
5723 +       spin_unlock_irqrestore(&p->pi_lock, flags);
5724 +
5725 +       rt_mutex_adjust_pi(p);
5726 +
5727 +       return 0;
5728 +}
5729 +
5730 +/**
5731 + * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
5732 + * @p: the task in question.
5733 + * @policy: new policy.
5734 + * @param: structure containing the new RT priority.
5735 + *
5736 + * NOTE that the task may be already dead.
5737 + */
5738 +int sched_setscheduler(struct task_struct *p, int policy,
5739 +                      struct sched_param *param)
5740 +{
5741 +       return __sched_setscheduler(p, policy, param, true);
5742 +}
5743 +EXPORT_SYMBOL_GPL(sched_setscheduler);
5744 +
5745 +/**
5746 + * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
5747 + * @p: the task in question.
5748 + * @policy: new policy.
5749 + * @param: structure containing the new RT priority.
5750 + *
5751 + * Just like sched_setscheduler, only don't bother checking if the
5752 + * current context has permission.  For example, this is needed in
5753 + * stop_machine(): we create temporary high priority worker threads,
5754 + * but our caller might not have that capability.
5755 + */
5756 +int sched_setscheduler_nocheck(struct task_struct *p, int policy,
5757 +                              struct sched_param *param)
5758 +{
5759 +       return __sched_setscheduler(p, policy, param, false);
5760 +}
5761 +
5762 +static int
5763 +do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
5764 +{
5765 +       struct sched_param lparam;
5766 +       struct task_struct *p;
5767 +       int retval;
5768 +
5769 +       if (!param || pid < 0)
5770 +               return -EINVAL;
5771 +       if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
5772 +               return -EFAULT;
5773 +
5774 +       rcu_read_lock();
5775 +       retval = -ESRCH;
5776 +       p = find_process_by_pid(pid);
5777 +       if (p != NULL)
5778 +               retval = sched_setscheduler(p, policy, &lparam);
5779 +       rcu_read_unlock();
5780 +
5781 +       return retval;
5782 +}
5783 +
5784 +/**
5785 + * sys_sched_setscheduler - set/change the scheduler policy and RT priority
5786 + * @pid: the pid in question.
5787 + * @policy: new policy.
5788 + * @param: structure containing the new RT priority.
5789 + */
5790 +SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
5791 +               struct sched_param __user *, param)
5792 +{
5793 +       /* negative values for policy are not valid */
5794 +       if (policy < 0)
5795 +               return -EINVAL;
5796 +
5797 +       return do_sched_setscheduler(pid, policy, param);
5798 +}
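From user space this syscall is normally reached through the POSIX wrapper of the same name; a minimal example (standard glibc API, not part of this patch):

#include <sched.h>
#include <stdio.h>

int main(void)
{
        struct sched_param sp = { .sched_priority = 10 };

        /* Requires CAP_SYS_NICE or a sufficient RLIMIT_RTPRIO,
         * as enforced in __sched_setscheduler() above. */
        if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
                perror("sched_setscheduler");
                return 1;
        }
        printf("now SCHED_FIFO, priority %d\n", sp.sched_priority);
        return 0;
}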
5799 +
5800 +/**
5801 + * sys_sched_setparam - set/change the RT priority of a thread
5802 + * @pid: the pid in question.
5803 + * @param: structure containing the new RT priority.
5804 + */
5805 +SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
5806 +{
5807 +       return do_sched_setscheduler(pid, -1, param);
5808 +}
5809 +
5810 +/**
5811 + * sys_sched_getscheduler - get the policy (scheduling class) of a thread
5812 + * @pid: the pid in question.
5813 + */
5814 +SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
5815 +{
5816 +       struct task_struct *p;
5817 +       int retval;
5818 +
5819 +       if (pid < 0)
5820 +               return -EINVAL;
5821 +
5822 +       retval = -ESRCH;
5823 +       read_lock(&tasklist_lock);
5824 +       p = find_process_by_pid(pid);
5825 +       if (p) {
5826 +               retval = security_task_getscheduler(p);
5827 +               if (!retval)
5828 +                       retval = p->policy;
5829 +       }
5830 +       read_unlock(&tasklist_lock);
5831 +       return retval;
5832 +}
5833 +
5834 +/**
5835 + * sys_sched_getparam - get the RT priority of a thread
5836 + * @pid: the pid in question.
5837 + * @param: structure containing the RT priority.
5838 + */
5839 +SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
5840 +{
5841 +       struct sched_param lp;
5842 +       struct task_struct *p;
5843 +       int retval;
5844 +
5845 +       if (!param || pid < 0)
5846 +               return -EINVAL;
5847 +
5848 +       read_lock(&tasklist_lock);
5849 +       p = find_process_by_pid(pid);
5850 +       retval = -ESRCH;
5851 +       if (!p)
5852 +               goto out_unlock;
5853 +
5854 +       retval = security_task_getscheduler(p);
5855 +       if (retval)
5856 +               goto out_unlock;
5857 +
5858 +       lp.sched_priority = p->rt_priority;
5859 +       read_unlock(&tasklist_lock);
5860 +
5861 +       /*
5862 +        * This one might sleep, we cannot do it with a spinlock held ...
5863 +        */
5864 +       retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
5865 +
5866 +       return retval;
5867 +
5868 +out_unlock:
5869 +       read_unlock(&tasklist_lock);
5870 +       return retval;
5871 +}
5872 +
5873 +long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
5874 +{
5875 +       cpumask_t cpus_allowed;
5876 +       cpumask_t new_mask = *in_mask;
5877 +       struct task_struct *p;
5878 +       int retval;
5879 +
5880 +       get_online_cpus();
5881 +       read_lock(&tasklist_lock);
5882 +
5883 +       p = find_process_by_pid(pid);
5884 +       if (!p) {
5885 +               read_unlock(&tasklist_lock);
5886 +               put_online_cpus();
5887 +               return -ESRCH;
5888 +       }
5889 +
5890 +       /*
5891 +        * It is not safe to call set_cpus_allowed with the
5892 +        * tasklist_lock held. We will bump the task_struct's
5893 +        * usage count and then drop tasklist_lock.
5894 +        */
5895 +       get_task_struct(p);
5896 +       read_unlock(&tasklist_lock);
5897 +
5898 +
5899 +       retval = -EPERM;
5900 +       if ((current->euid != p->euid) && (current->euid != p->uid) &&
5901 +                       !capable(CAP_SYS_NICE))
5902 +               goto out_unlock;
5903 +
5904 +       retval = security_task_setscheduler(p, 0, NULL);
5905 +       if (retval)
5906 +               goto out_unlock;
5907 +
5908 +       cpuset_cpus_allowed(p, &cpus_allowed);
5909 +       cpus_and(new_mask, new_mask, cpus_allowed);
5910 + again:
5911 +       retval = set_cpus_allowed_ptr(p, &new_mask);
5912 +
5913 +       if (!retval) {
5914 +               cpuset_cpus_allowed(p, &cpus_allowed);
5915 +               if (!cpus_subset(new_mask, cpus_allowed)) {
5916 +                       /*
5917 +                        * We must have raced with a concurrent cpuset
5918 +                        * update. Just reset the cpus_allowed to the
5919 +                        * cpuset's cpus_allowed
5920 +                        */
5921 +                       new_mask = cpus_allowed;
5922 +                       goto again;
5923 +               }
5924 +       }
5925 +out_unlock:
5926 +       put_task_struct(p);
5927 +       put_online_cpus();
5928 +       return retval;
5929 +}
5930 +
5931 +static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5932 +                            cpumask_t *new_mask)
5933 +{
5934 +       if (len < sizeof(cpumask_t)) {
5935 +               memset(new_mask, 0, sizeof(cpumask_t));
5936 +       } else if (len > sizeof(cpumask_t)) {
5937 +               len = sizeof(cpumask_t);
5938 +       }
5939 +       return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5940 +}
5941 +
5942 +/**
5943 + * sys_sched_setaffinity - set the cpu affinity of a process
5944 + * @pid: pid of the process
5945 + * @len: length in bytes of the bitmask pointed to by user_mask_ptr
5946 + * @user_mask_ptr: user-space pointer to the new cpu mask
5947 + */
5948 +SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
5949 +               unsigned long __user *, user_mask_ptr)
5950 +{
5951 +       cpumask_t new_mask;
5952 +       int retval;
5953 +
5954 +       retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
5955 +       if (retval)
5956 +               return retval;
5957 +
5958 +       return sched_setaffinity(pid, &new_mask);
5959 +}
5960 +
5961 +long sched_getaffinity(pid_t pid, cpumask_t *mask)
5962 +{
5963 +       struct task_struct *p;
5964 +       int retval;
5965 +
5966 +       get_online_cpus();
5967 +       read_lock(&tasklist_lock);
5968 +
5969 +       retval = -ESRCH;
5970 +       p = find_process_by_pid(pid);
5971 +       if (!p)
5972 +               goto out_unlock;
5973 +
5974 +       retval = security_task_getscheduler(p);
5975 +       if (retval)
5976 +               goto out_unlock;
5977 +
5978 +       cpus_and(*mask, p->cpus_allowed, cpu_online_map);
5979 +
5980 +out_unlock:
5981 +       read_unlock(&tasklist_lock);
5982 +       put_online_cpus();
5983 +
5984 +       return retval;
5985 +}
5986 +
5987 +/**
5988 + * sys_sched_getaffinity - get the cpu affinity of a process
5989 + * @pid: pid of the process
5990 + * @len: length in bytes of the bitmask pointed to by user_mask_ptr
5991 + * @user_mask_ptr: user-space pointer to hold the current cpu mask
5992 + */
5993 +SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
5994 +               unsigned long __user *, user_mask_ptr)
5995 +{
5996 +       int ret;
5997 +       cpumask_t mask;
5998 +
5999 +       if (len < sizeof(cpumask_t))
6000 +               return -EINVAL;
6001 +
6002 +       ret = sched_getaffinity(pid, &mask);
6003 +       if (ret < 0)
6004 +               return ret;
6005 +
6006 +       if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
6007 +               return -EFAULT;
6008 +
6009 +       return sizeof(cpumask_t);
6010 +}
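The user-space counterpart of the two affinity syscalls above, using the glibc cpu_set_t helpers (standard API, not part of this patch):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t set;

        CPU_ZERO(&set);
        CPU_SET(0, &set);                       /* pin to CPU 0 */
        if (sched_setaffinity(0, sizeof(set), &set) == -1)
                perror("sched_setaffinity");

        if (sched_getaffinity(0, sizeof(set), &set) == 0)
                printf("bound to CPU 0: %d\n", CPU_ISSET(0, &set) ? 1 : 0);
        return 0;
}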
6011 +
6012 +/**
6013 + * sys_sched_yield - yield the current processor to other threads.
6014 + *
6015 + * This function yields the current CPU to other tasks. If there are no
6016 + * other threads running on this CPU then this function will return.
6017 + */
6018 +SYSCALL_DEFINE0(sched_yield)
6019 +{
6020 +       struct rq *rq = this_rq_lock();
6021 +
6022 +       schedstat_inc(rq, yld_count);
6023 +       current->sched_class->yield_task(rq);
6024 +
6025 +       /*
6026 +        * Since we are going to call schedule() anyway, there's
6027 +        * no need to preempt or enable interrupts:
6028 +        */
6029 +       __release(rq->lock);
6030 +       spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
6031 +       _raw_spin_unlock(&rq->lock);
6032 +       preempt_enable_no_resched();
6033 +
6034 +       schedule();
6035 +
6036 +       return 0;
6037 +}
6038 +
6039 +static void __cond_resched(void)
6040 +{
6041 +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
6042 +       __might_sleep(__FILE__, __LINE__);
6043 +#endif
6044 +       /*
6045 +        * The BKS might be reacquired before we have dropped
6046 +        * PREEMPT_ACTIVE, which could trigger a second
6047 +        * cond_resched() call.
6048 +        */
6049 +       do {
6050 +               add_preempt_count(PREEMPT_ACTIVE);
6051 +               schedule();
6052 +               sub_preempt_count(PREEMPT_ACTIVE);
6053 +       } while (need_resched());
6054 +}
6055 +
6056 +int __sched _cond_resched(void)
6057 +{
6058 +       if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
6059 +                                       system_state == SYSTEM_RUNNING) {
6060 +               __cond_resched();
6061 +               return 1;
6062 +       }
6063 +       return 0;
6064 +}
6065 +EXPORT_SYMBOL(_cond_resched);
6066 +
6067 +/*
6068 + * cond_resched_lock() - if a reschedule is pending, drop the given lock,
6069 + * call schedule, and on return reacquire the lock.
6070 + *
6071 + * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
6072 + * operations here to prevent schedule() from being called twice (once via
6073 + * spin_unlock(), once by hand).
6074 + */
6075 +int cond_resched_lock(spinlock_t *lock)
6076 +{
6077 +       int resched = need_resched() && system_state == SYSTEM_RUNNING;
6078 +       int ret = 0;
6079 +
6080 +       if (spin_needbreak(lock) || resched) {
6081 +               spin_unlock(lock);
6082 +               if (resched && need_resched())
6083 +                       __cond_resched();
6084 +               else
6085 +                       cpu_relax();
6086 +               ret = 1;
6087 +               spin_lock(lock);
6088 +       }
6089 +       return ret;
6090 +}
6091 +EXPORT_SYMBOL(cond_resched_lock);
6092 +
6093 +int __sched cond_resched_softirq(void)
6094 +{
6095 +       BUG_ON(!in_softirq());
6096 +
6097 +       if (need_resched() && system_state == SYSTEM_RUNNING) {
6098 +               local_bh_enable();
6099 +               __cond_resched();
6100 +               local_bh_disable();
6101 +               return 1;
6102 +       }
6103 +       return 0;
6104 +}
6105 +EXPORT_SYMBOL(cond_resched_softirq);
6106 +
6107 +/**
6108 + * yield - yield the current processor to other threads.
6109 + *
6110 + * This is a shortcut for kernel-space yielding - it marks the
6111 + * thread runnable and calls sys_sched_yield().
6112 + */
6113 +void __sched yield(void)
6114 +{
6115 +       set_current_state(TASK_RUNNING);
6116 +       sys_sched_yield();
6117 +}
6118 +EXPORT_SYMBOL(yield);
6119 +
6120 +/*
6121 + * This task is about to go to sleep on IO. Increment rq->nr_iowait so
6122 + * that process accounting knows that this is a task in IO wait state.
6123 + *
6124 + * But don't do that if it is a deliberate, throttling IO wait (this task
6125 + * has set its backing_dev_info: the queue against which it should throttle)
6126 + */
6127 +void __sched io_schedule(void)
6128 +{
6129 +       struct rq *rq = &__raw_get_cpu_var(runqueues);
6130 +
6131 +       delayacct_blkio_start();
6132 +       atomic_inc(&rq->nr_iowait);
6133 +       schedule();
6134 +       atomic_dec(&rq->nr_iowait);
6135 +       delayacct_blkio_end();
6136 +}
6137 +EXPORT_SYMBOL(io_schedule);
6138 +
6139 +long __sched io_schedule_timeout(long timeout)
6140 +{
6141 +       struct rq *rq = &__raw_get_cpu_var(runqueues);
6142 +       long ret;
6143 +
6144 +       delayacct_blkio_start();
6145 +       atomic_inc(&rq->nr_iowait);
6146 +       ret = schedule_timeout(timeout);
6147 +       atomic_dec(&rq->nr_iowait);
6148 +       delayacct_blkio_end();
6149 +       return ret;
6150 +}
6151 +
6152 +/**
6153 + * sys_sched_get_priority_max - return maximum RT priority.
6154 + * @policy: scheduling class.
6155 + *
6156 + * this syscall returns the maximum rt_priority that can be used
6157 + * by a given scheduling class.
6158 + */
6159 +SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
6160 +{
6161 +       int ret = -EINVAL;
6162 +
6163 +       switch (policy) {
6164 +       case SCHED_FIFO:
6165 +       case SCHED_RR:
6166 +               ret = MAX_USER_RT_PRIO-1;
6167 +               break;
6168 +       case SCHED_NORMAL:
6169 +       case SCHED_BATCH:
6170 +       case SCHED_IDLE:
6171 +               ret = 0;
6172 +               break;
6173 +       }
6174 +       return ret;
6175 +}
6176 +
6177 +/**
6178 + * sys_sched_get_priority_min - return minimum RT priority.
6179 + * @policy: scheduling class.
6180 + *
6181 + * this syscall returns the minimum rt_priority that can be used
6182 + * by a given scheduling class.
6183 + */
6184 +SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
6185 +{
6186 +       int ret = -EINVAL;
6187 +
6188 +       switch (policy) {
6189 +       case SCHED_FIFO:
6190 +       case SCHED_RR:
6191 +               ret = 1;
6192 +               break;
6193 +       case SCHED_NORMAL:
6194 +       case SCHED_BATCH:
6195 +       case SCHED_IDLE:
6196 +               ret = 0;
6197 +       }
6198 +       return ret;
6199 +}
6200 +
6201 +/**
6202 + * sys_sched_rr_get_interval - return the default timeslice of a process.
6203 + * @pid: pid of the process.
6204 + * @interval: userspace pointer to the timeslice value.
6205 + *
6206 + * this syscall writes the default timeslice value of a given process
6207 + * into the user-space timespec buffer. A value of '0' means infinity.
6208 + */
6209 +SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6210 +               struct timespec __user *, interval)
6211 +{
6212 +       struct task_struct *p;
6213 +       unsigned int time_slice;
6214 +       int retval;
6215 +       struct timespec t;
6216 +
6217 +       if (pid < 0)
6218 +               return -EINVAL;
6219 +
6220 +       retval = -ESRCH;
6221 +       read_lock(&tasklist_lock);
6222 +       p = find_process_by_pid(pid);
6223 +       if (!p)
6224 +               goto out_unlock;
6225 +
6226 +       retval = security_task_getscheduler(p);
6227 +       if (retval)
6228 +               goto out_unlock;
6229 +
6230 +       /*
6231 +        * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER
6232 +        * tasks that are on an otherwise idle runqueue:
6233 +        */
6234 +       time_slice = 0;
6235 +       if (p->policy == SCHED_RR) {
6236 +               time_slice = DEF_TIMESLICE;
6237 +       } else if (p->policy != SCHED_FIFO) {
6238 +               struct sched_entity *se = &p->se;
6239 +               unsigned long flags;
6240 +               struct rq *rq;
6241 +
6242 +               rq = task_rq_lock(p, &flags);
6243 +               if (rq->cfs.load.weight)
6244 +                       time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
6245 +               task_rq_unlock(rq, &flags);
6246 +       }
6247 +       read_unlock(&tasklist_lock);
6248 +       jiffies_to_timespec(time_slice, &t);
6249 +       retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
6250 +       return retval;
6251 +
6252 +out_unlock:
6253 +       read_unlock(&tasklist_lock);
6254 +       return retval;
6255 +}
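A minimal user-space check of the timeslice reported above (standard glibc wrapper, illustrative only):

#include <sched.h>
#include <time.h>
#include <stdio.h>

int main(void)
{
        struct timespec ts;

        if (sched_rr_get_interval(0, &ts) == 0)
                printf("RR timeslice: %ld.%09ld s\n",
                       (long)ts.tv_sec, (long)ts.tv_nsec);
        return 0;
}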
6256 +
6257 +static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
6258 +
6259 +void sched_show_task(struct task_struct *p)
6260 +{
6261 +       unsigned long free = 0;
6262 +       unsigned state;
6263 +
6264 +       state = p->state ? __ffs(p->state) + 1 : 0;
6265 +       printk(KERN_INFO "%-13.13s %c", p->comm,
6266 +               state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
6267 +#if BITS_PER_LONG == 32
6268 +       if (state == TASK_RUNNING)
6269 +               printk(KERN_CONT " running  ");
6270 +       else
6271 +               printk(KERN_CONT " %08lx ", thread_saved_pc(p));
6272 +#else
6273 +       if (state == TASK_RUNNING)
6274 +               printk(KERN_CONT "  running task    ");
6275 +       else
6276 +               printk(KERN_CONT " %016lx ", thread_saved_pc(p));
6277 +#endif
6278 +#ifdef CONFIG_DEBUG_STACK_USAGE
6279 +       {
6280 +               unsigned long *n = end_of_stack(p);
6281 +               while (!*n)
6282 +                       n++;
6283 +               free = (unsigned long)n - (unsigned long)end_of_stack(p);
6284 +       }
6285 +#endif
6286 +       printk(KERN_CONT "%5lu %5d %6d\n", free,
6287 +               task_pid_nr(p), task_pid_nr(p->real_parent));
6288 +
6289 +       show_stack(p, NULL);
6290 +}
6291 +
6292 +void show_state_filter(unsigned long state_filter)
6293 +{
6294 +       struct task_struct *g, *p;
6295 +
6296 +#if BITS_PER_LONG == 32
6297 +       printk(KERN_INFO
6298 +               "  task                PC stack   pid father\n");
6299 +#else
6300 +       printk(KERN_INFO
6301 +               "  task                        PC stack   pid father\n");
6302 +#endif
6303 +       read_lock(&tasklist_lock);
6304 +       do_each_thread(g, p) {
6305 +               /*
6306 +                * reset the NMI-timeout, listing all files on a slow
6307 +                * console might take a lot of time:
6308 +                */
6309 +               touch_nmi_watchdog();
6310 +               if (!state_filter || (p->state & state_filter))
6311 +                       sched_show_task(p);
6312 +       } while_each_thread(g, p);
6313 +
6314 +       touch_all_softlockup_watchdogs();
6315 +
6316 +#ifdef CONFIG_SCHED_DEBUG
6317 +       sysrq_sched_debug_show();
6318 +#endif
6319 +       read_unlock(&tasklist_lock);
6320 +       /*
6321 +        * Only show locks if all tasks are dumped:
6322 +        */
6323 +       if (state_filter == -1)
6324 +               debug_show_all_locks();
6325 +}
6326 +
6327 +void __cpuinit init_idle_bootup_task(struct task_struct *idle)
6328 +{
6329 +       idle->sched_class = &idle_sched_class;
6330 +}
6331 +
6332 +/**
6333 + * init_idle - set up an idle thread for a given CPU
6334 + * @idle: task in question
6335 + * @cpu: cpu the idle task belongs to
6336 + *
6337 + * NOTE: this function does not set the idle thread's NEED_RESCHED
6338 + * flag, to make booting more robust.
6339 + */
6340 +void __cpuinit init_idle(struct task_struct *idle, int cpu)
6341 +{
6342 +       struct rq *rq = cpu_rq(cpu);
6343 +       unsigned long flags;
6344 +
6345 +       __sched_fork(idle);
6346 +       idle->se.exec_start = sched_clock();
6347 +
6348 +       idle->prio = idle->normal_prio = MAX_PRIO;
6349 +       idle->cpus_allowed = cpumask_of_cpu(cpu);
6350 +       __set_task_cpu(idle, cpu);
6351 +
6352 +       spin_lock_irqsave(&rq->lock, flags);
6353 +       rq->curr = rq->idle = idle;
6354 +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
6355 +       idle->oncpu = 1;
6356 +#endif
6357 +       spin_unlock_irqrestore(&rq->lock, flags);
6358 +
6359 +       /* Set the preempt count _outside_ the spinlocks! */
6360 +#if defined(CONFIG_PREEMPT)
6361 +       task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
6362 +#else
6363 +       task_thread_info(idle)->preempt_count = 0;
6364 +#endif
6365 +       /*
6366 +        * The idle tasks have their own, simple scheduling class:
6367 +        */
6368 +       idle->sched_class = &idle_sched_class;
6369 +}
6370 +
6371 +/*
6372 + * In a system that switches off the HZ timer nohz_cpu_mask
6373 + * indicates which cpus entered this state. This is used
6374 + * in the rcu update to wait only for active cpus. For system
6375 + * which do not switch off the HZ timer nohz_cpu_mask should
6376 + * always be CPU_MASK_NONE.
6377 + */
6378 +cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
6379 +
6380 +/*
6381 + * Increase the granularity value when there are more CPUs,
6382 + * because with more CPUs the 'effective latency' as visible
6383 + * to users decreases. But the relationship is not linear,
6384 + * so pick a second-best guess by going with the log2 of the
6385 + * number of CPUs.
6386 + *
6387 + * This idea comes from the SD scheduler of Con Kolivas:
6388 + */
6389 +static inline void sched_init_granularity(void)
6390 +{
6391 +       unsigned int factor = 1 + ilog2(num_online_cpus());
6392 +       const unsigned long limit = 200000000;
6393 +
6394 +       sysctl_sched_min_granularity *= factor;
6395 +       if (sysctl_sched_min_granularity > limit)
6396 +               sysctl_sched_min_granularity = limit;
6397 +
6398 +       sysctl_sched_latency *= factor;
6399 +       if (sysctl_sched_latency > limit)
6400 +               sysctl_sched_latency = limit;
6401 +
6402 +       sysctl_sched_wakeup_granularity *= factor;
6403 +
6404 +       sysctl_sched_shares_ratelimit *= factor;
6405 +}
6406 +
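Illustrative only: how the 1 + ilog2(num_online_cpus()) factor used above scales with CPU count (the ilog2 here is a standalone reimplementation for the example, not the kernel helper):

#include <stdio.h>

static unsigned int ilog2_u(unsigned int n)	/* floor(log2(n)), n >= 1 */
{
	unsigned int r = 0;

	while (n >>= 1)
		r++;
	return r;
}

int main(void)
{
	unsigned int cpus;

	for (cpus = 1; cpus <= 64; cpus *= 2)
		printf("%2u CPUs -> factor %u\n", cpus, 1 + ilog2_u(cpus));
	return 0;	/* factors 1..7 for 1..64 CPUs */
}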
6407 +#ifdef CONFIG_SMP
6408 +/*
6409 + * This is how migration works:
6410 + *
6411 + * 1) we queue a struct migration_req structure in the source CPU's
6412 + *    runqueue and wake up that CPU's migration thread.
6413 + * 2) we down() the locked semaphore => thread blocks.
6414 + * 3) migration thread wakes up (implicitly it forces the migrated
6415 + *    thread off the CPU)
6416 + * 4) it gets the migration request and checks whether the migrated
6417 + *    task is still in the wrong runqueue.
6418 + * 5) if it's in the wrong runqueue then the migration thread removes
6419 + *    it and puts it into the right queue.
6420 + * 6) migration thread up()s the semaphore.
6421 + * 7) we wake up and the migration is done.
6422 + */
6423 +
6424 +/*
6425 + * Change a given task's CPU affinity. Migrate the thread to a
6426 + * proper CPU and schedule it away if the CPU it's executing on
6427 + * is removed from the allowed bitmask.
6428 + *
6429 + * NOTE: the caller must have a valid reference to the task, the
6430 + * task must not exit() & deallocate itself prematurely. The
6431 + * call is not atomic; no spinlocks may be held.
6432 + */
6433 +int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
6434 +{
6435 +       struct migration_req req;
6436 +       unsigned long flags;
6437 +       struct rq *rq;
6438 +       int ret = 0;
6439 +
6440 +       rq = task_rq_lock(p, &flags);
6441 +       if (!cpus_intersects(*new_mask, cpu_online_map)) {
6442 +               ret = -EINVAL;
6443 +               goto out;
6444 +       }
6445 +
6446 +       if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
6447 +                    !cpus_equal(p->cpus_allowed, *new_mask))) {
6448 +               ret = -EINVAL;
6449 +               goto out;
6450 +       }
6451 +
6452 +       if (p->sched_class->set_cpus_allowed)
6453 +               p->sched_class->set_cpus_allowed(p, new_mask);
6454 +       else {
6455 +               p->cpus_allowed = *new_mask;
6456 +               p->rt.nr_cpus_allowed = cpus_weight(*new_mask);
6457 +       }
6458 +
6459 +       /* Can the task run on the task's current CPU? If so, we're done */
6460 +       if (cpu_isset(task_cpu(p), *new_mask))
6461 +               goto out;
6462 +
6463 +       if (migrate_task(p, any_online_cpu(*new_mask), &req)) {
6464 +               /* Need help from migration thread: drop lock and wait. */
6465 +               task_rq_unlock(rq, &flags);
6466 +               wake_up_process(rq->migration_thread);
6467 +               wait_for_completion(&req.done);
6468 +               tlb_migrate_finish(p->mm);
6469 +               return 0;
6470 +       }
6471 +out:
6472 +       task_rq_unlock(rq, &flags);
6473 +
6474 +       return ret;
6475 +}
6476 +EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
6477 +
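From user space this path is normally reached through sched_setaffinity(2), which ends up calling set_cpus_allowed_ptr() for the target task. A minimal sketch (assuming glibc's CPU-affinity API) that pins the caller to CPU 0:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);		/* allow CPU 0 only */
	if (sched_setaffinity(0, sizeof(set), &set) != 0) {
		perror("sched_setaffinity");
		return 1;
	}
	/* if we were running elsewhere, the migration thread moves us */
	return 0;
}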
6478 +/*
6479 + * Move (not current) task off this cpu, onto dest cpu. We're doing
6480 + * this because either it can't run here any more (set_cpus_allowed()
6481 + * away from this CPU, or CPU going down), or because we're
6482 + * attempting to rebalance this task on exec (sched_exec).
6483 + *
6484 + * So we race with normal scheduler movements, but that's OK, as long
6485 + * as the task is no longer on this CPU.
6486 + *
6487 + * Returns non-zero if task was successfully migrated.
6488 + */
6489 +static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
6490 +{
6491 +       struct rq *rq_dest, *rq_src;
6492 +       int ret = 0, on_rq;
6493 +
6494 +       if (unlikely(!cpu_active(dest_cpu)))
6495 +               return ret;
6496 +
6497 +       rq_src = cpu_rq(src_cpu);
6498 +       rq_dest = cpu_rq(dest_cpu);
6499 +
6500 +       double_rq_lock(rq_src, rq_dest);
6501 +       /* Already moved. */
6502 +       if (task_cpu(p) != src_cpu)
6503 +               goto done;
6504 +       /* Affinity changed (again). */
6505 +       if (!cpu_isset(dest_cpu, p->cpus_allowed))
6506 +               goto fail;
6507 +
6508 +       on_rq = p->se.on_rq;
6509 +       if (on_rq)
6510 +               deactivate_task(rq_src, p, 0);
6511 +
6512 +       set_task_cpu(p, dest_cpu);
6513 +       if (on_rq) {
6514 +               activate_task(rq_dest, p, 0);
6515 +               check_preempt_curr(rq_dest, p);
6516 +       }
6517 +done:
6518 +       ret = 1;
6519 +fail:
6520 +       double_rq_unlock(rq_src, rq_dest);
6521 +       return ret;
6522 +}
6523 +
6524 +/*
6525 + * migration_thread - this is a highprio system thread that performs
6526 + * thread migration by bumping thread off CPU then 'pushing' onto
6527 + * another runqueue.
6528 + */
6529 +static int migration_thread(void *data)
6530 +{
6531 +       int cpu = (long)data;
6532 +       struct rq *rq;
6533 +
6534 +       rq = cpu_rq(cpu);
6535 +       BUG_ON(rq->migration_thread != current);
6536 +
6537 +       set_current_state(TASK_INTERRUPTIBLE);
6538 +       while (!kthread_should_stop()) {
6539 +               struct migration_req *req;
6540 +               struct list_head *head;
6541 +
6542 +               spin_lock_irq(&rq->lock);
6543 +
6544 +               if (cpu_is_offline(cpu)) {
6545 +                       spin_unlock_irq(&rq->lock);
6546 +                       goto wait_to_die;
6547 +               }
6548 +
6549 +               if (rq->active_balance) {
6550 +                       active_load_balance(rq, cpu);
6551 +                       rq->active_balance = 0;
6552 +               }
6553 +
6554 +               head = &rq->migration_queue;
6555 +
6556 +               if (list_empty(head)) {
6557 +                       spin_unlock_irq(&rq->lock);
6558 +                       schedule();
6559 +                       set_current_state(TASK_INTERRUPTIBLE);
6560 +                       continue;
6561 +               }
6562 +               req = list_entry(head->next, struct migration_req, list);
6563 +               list_del_init(head->next);
6564 +
6565 +               spin_unlock(&rq->lock);
6566 +               __migrate_task(req->task, cpu, req->dest_cpu);
6567 +               local_irq_enable();
6568 +
6569 +               complete(&req->done);
6570 +       }
6571 +       __set_current_state(TASK_RUNNING);
6572 +       return 0;
6573 +
6574 +wait_to_die:
6575 +       /* Wait for kthread_stop */
6576 +       set_current_state(TASK_INTERRUPTIBLE);
6577 +       while (!kthread_should_stop()) {
6578 +               schedule();
6579 +               set_current_state(TASK_INTERRUPTIBLE);
6580 +       }
6581 +       __set_current_state(TASK_RUNNING);
6582 +       return 0;
6583 +}
6584 +
6585 +#ifdef CONFIG_HOTPLUG_CPU
6586 +
6587 +static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
6588 +{
6589 +       int ret;
6590 +
6591 +       local_irq_disable();
6592 +       ret = __migrate_task(p, src_cpu, dest_cpu);
6593 +       local_irq_enable();
6594 +       return ret;
6595 +}
6596 +
6597 +/*
6598 + * Figure out where task on dead CPU should go, use force if necessary.
6599 + * NOTE: interrupts should be disabled by the caller
6600 + */
6601 +static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
6602 +{
6603 +       unsigned long flags;
6604 +       cpumask_t mask;
6605 +       struct rq *rq;
6606 +       int dest_cpu;
6607 +
6608 +       do {
6609 +               /* On same node? */
6610 +               mask = node_to_cpumask(cpu_to_node(dead_cpu));
6611 +               cpus_and(mask, mask, p->cpus_allowed);
6612 +               dest_cpu = any_online_cpu(mask);
6613 +
6614 +               /* On any allowed CPU? */
6615 +               if (dest_cpu >= nr_cpu_ids)
6616 +                       dest_cpu = any_online_cpu(p->cpus_allowed);
6617 +
6618 +               /* No more Mr. Nice Guy. */
6619 +               if (dest_cpu >= nr_cpu_ids) {
6620 +                       cpumask_t cpus_allowed;
6621 +
6622 +                       cpuset_cpus_allowed_locked(p, &cpus_allowed);
6623 +                       /*
6624 +                        * Try to stay on the same cpuset, where the
6625 +                        * current cpuset may be a subset of all cpus.
6626 +                        * The cpuset_cpus_allowed_locked() variant of
6627 +                        * cpuset_cpus_allowed() will not block. It must be
6628 +                        * called within calls to cpuset_lock/cpuset_unlock.
6629 +                        */
6630 +                       rq = task_rq_lock(p, &flags);
6631 +                       p->cpus_allowed = cpus_allowed;
6632 +                       dest_cpu = any_online_cpu(p->cpus_allowed);
6633 +                       task_rq_unlock(rq, &flags);
6634 +
6635 +                       /*
6636 +                        * Don't tell them about moving exiting tasks or
6637 +                        * kernel threads (both mm NULL), since they never
6638 +                        * leave kernel.
6639 +                        */
6640 +                       if (p->mm && printk_ratelimit()) {
6641 +                               printk(KERN_INFO "process %d (%s) no "
6642 +                                      "longer affine to cpu%d\n",
6643 +                                       task_pid_nr(p), p->comm, dead_cpu);
6644 +                       }
6645 +               }
6646 +       } while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
6647 +}
6648 +
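These hotplug paths run when a CPU is taken down; a hedged user-space sketch of initiating that via sysfs (assumes CONFIG_HOTPLUG_CPU, root, and that cpu1 exists):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/devices/system/cpu/cpu1/online", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fputc('0', f);	/* offline cpu1: CPU_DEAD handling migrates its tasks */
	fclose(f);
	return 0;
}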
6649 +/*
6650 + * While a dead CPU has no uninterruptible tasks queued at this point,
6651 + * it might still have a nonzero ->nr_uninterruptible counter, because
6652 + * for performance reasons the counter is not strictly tracking tasks to
6653 + * their home CPUs. So we just add the counter to another CPU's counter,
6654 + * to keep the global sum constant after CPU-down:
6655 + */
6656 +static void migrate_nr_uninterruptible(struct rq *rq_src)
6657 +{
6658 +       struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR));
6659 +       unsigned long flags;
6660 +
6661 +       local_irq_save(flags);
6662 +       double_rq_lock(rq_src, rq_dest);
6663 +       rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
6664 +       rq_src->nr_uninterruptible = 0;
6665 +       double_rq_unlock(rq_src, rq_dest);
6666 +       local_irq_restore(flags);
6667 +}
6668 +
6669 +/* Run through task list and migrate tasks from the dead cpu. */
6670 +static void migrate_live_tasks(int src_cpu)
6671 +{
6672 +       struct task_struct *p, *t;
6673 +
6674 +       read_lock(&tasklist_lock);
6675 +
6676 +       do_each_thread(t, p) {
6677 +               if (p == current)
6678 +                       continue;
6679 +
6680 +               if (task_cpu(p) == src_cpu)
6681 +                       move_task_off_dead_cpu(src_cpu, p);
6682 +       } while_each_thread(t, p);
6683 +
6684 +       read_unlock(&tasklist_lock);
6685 +}
6686 +
6687 +/*
6688 + * Schedules idle task to be the next runnable task on current CPU.
6689 + * It does so by boosting its priority to highest possible.
6690 + * Used by CPU offline code.
6691 + */
6692 +void sched_idle_next(void)
6693 +{
6694 +       int this_cpu = smp_processor_id();
6695 +       struct rq *rq = cpu_rq(this_cpu);
6696 +       struct task_struct *p = rq->idle;
6697 +       unsigned long flags;
6698 +
6699 +       /* cpu has to be offline */
6700 +       BUG_ON(cpu_online(this_cpu));
6701 +
6702 +       /*
6703 +        * Strictly not necessary, since the rest of the CPUs are stopped by now
6704 +        * and interrupts are disabled on the current cpu.
6705 +        */
6706 +       spin_lock_irqsave(&rq->lock, flags);
6707 +
6708 +       __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
6709 +
6710 +       update_rq_clock(rq);
6711 +       activate_task(rq, p, 0);
6712 +
6713 +       spin_unlock_irqrestore(&rq->lock, flags);
6714 +}
6715 +
6716 +/*
6717 + * Ensures that the idle task is using init_mm right before its cpu goes
6718 + * offline.
6719 + */
6720 +void idle_task_exit(void)
6721 +{
6722 +       struct mm_struct *mm = current->active_mm;
6723 +
6724 +       BUG_ON(cpu_online(smp_processor_id()));
6725 +
6726 +       if (mm != &init_mm)
6727 +               switch_mm(mm, &init_mm, current);
6728 +       mmdrop(mm);
6729 +}
6730 +
6731 +/* called under rq->lock with disabled interrupts */
6732 +static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
6733 +{
6734 +       struct rq *rq = cpu_rq(dead_cpu);
6735 +
6736 +       /* Must be exiting, otherwise would be on tasklist. */
6737 +       BUG_ON(!p->exit_state);
6738 +
6739 +       /* Cannot have done final schedule yet: would have vanished. */
6740 +       BUG_ON(p->state == TASK_DEAD);
6741 +
6742 +       get_task_struct(p);
6743 +
6744 +       /*
6745 +        * Drop lock around migration; if someone else moves it,
6746 +        * that's OK. No task can be added to this CPU, so iteration is
6747 +        * fine.
6748 +        */
6749 +       spin_unlock_irq(&rq->lock);
6750 +       move_task_off_dead_cpu(dead_cpu, p);
6751 +       spin_lock_irq(&rq->lock);
6752 +
6753 +       put_task_struct(p);
6754 +}
6755 +
6756 +/* release_task() removes task from tasklist, so we won't find dead tasks. */
6757 +static void migrate_dead_tasks(unsigned int dead_cpu)
6758 +{
6759 +       struct rq *rq = cpu_rq(dead_cpu);
6760 +       struct task_struct *next;
6761 +
6762 +       for ( ; ; ) {
6763 +               if (!rq->nr_running)
6764 +                       break;
6765 +               update_rq_clock(rq);
6766 +               next = pick_next_task(rq, rq->curr);
6767 +               if (!next)
6768 +                       break;
6769 +               next->sched_class->put_prev_task(rq, next);
6770 +               migrate_dead(dead_cpu, next);
6771 +
6772 +       }
6773 +}
6774 +#endif /* CONFIG_HOTPLUG_CPU */
6775 +
6776 +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
6777 +
6778 +static struct ctl_table sd_ctl_dir[] = {
6779 +       {
6780 +               .procname       = "sched_domain",
6781 +               .mode           = 0555,
6782 +       },
6783 +       {0, },
6784 +};
6785 +
6786 +static struct ctl_table sd_ctl_root[] = {
6787 +       {
6788 +               .ctl_name       = CTL_KERN,
6789 +               .procname       = "kernel",
6790 +               .mode           = 0555,
6791 +               .child          = sd_ctl_dir,
6792 +       },
6793 +       {0, },
6794 +};
6795 +
6796 +static struct ctl_table *sd_alloc_ctl_entry(int n)
6797 +{
6798 +       struct ctl_table *entry =
6799 +               kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
6800 +
6801 +       return entry;
6802 +}
6803 +
6804 +static void sd_free_ctl_entry(struct ctl_table **tablep)
6805 +{
6806 +       struct ctl_table *entry;
6807 +
6808 +       /*
6809 +        * In the intermediate directories, both the child directory and
6810 +        * procname are dynamically allocated and could fail but the mode
6811 +        * will always be set. In the lowest directory the names are
6812 +        * static strings and all have proc handlers.
6813 +        */
6814 +       for (entry = *tablep; entry->mode; entry++) {
6815 +               if (entry->child)
6816 +                       sd_free_ctl_entry(&entry->child);
6817 +               if (entry->proc_handler == NULL)
6818 +                       kfree(entry->procname);
6819 +       }
6820 +
6821 +       kfree(*tablep);
6822 +       *tablep = NULL;
6823 +}
6824 +
6825 +static void
6826 +set_table_entry(struct ctl_table *entry,
6827 +               const char *procname, void *data, int maxlen,
6828 +               mode_t mode, proc_handler *proc_handler)
6829 +{
6830 +       entry->procname = procname;
6831 +       entry->data = data;
6832 +       entry->maxlen = maxlen;
6833 +       entry->mode = mode;
6834 +       entry->proc_handler = proc_handler;
6835 +}
6836 +
6837 +static struct ctl_table *
6838 +sd_alloc_ctl_domain_table(struct sched_domain *sd)
6839 +{
6840 +       struct ctl_table *table = sd_alloc_ctl_entry(12);
6841 +
6842 +       if (table == NULL)
6843 +               return NULL;
6844 +
6845 +       set_table_entry(&table[0], "min_interval", &sd->min_interval,
6846 +               sizeof(long), 0644, proc_doulongvec_minmax);
6847 +       set_table_entry(&table[1], "max_interval", &sd->max_interval,
6848 +               sizeof(long), 0644, proc_doulongvec_minmax);
6849 +       set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
6850 +               sizeof(int), 0644, proc_dointvec_minmax);
6851 +       set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
6852 +               sizeof(int), 0644, proc_dointvec_minmax);
6853 +       set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
6854 +               sizeof(int), 0644, proc_dointvec_minmax);
6855 +       set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
6856 +               sizeof(int), 0644, proc_dointvec_minmax);
6857 +       set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
6858 +               sizeof(int), 0644, proc_dointvec_minmax);
6859 +       set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
6860 +               sizeof(int), 0644, proc_dointvec_minmax);
6861 +       set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
6862 +               sizeof(int), 0644, proc_dointvec_minmax);
6863 +       set_table_entry(&table[9], "cache_nice_tries",
6864 +               &sd->cache_nice_tries,
6865 +               sizeof(int), 0644, proc_dointvec_minmax);
6866 +       set_table_entry(&table[10], "flags", &sd->flags,
6867 +               sizeof(int), 0644, proc_dointvec_minmax);
6868 +       /* &table[11] is terminator */
6869 +
6870 +       return table;
6871 +}
6872 +
6873 +static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
6874 +{
6875 +       struct ctl_table *entry, *table;
6876 +       struct sched_domain *sd;
6877 +       int domain_num = 0, i;
6878 +       char buf[32];
6879 +
6880 +       for_each_domain(cpu, sd)
6881 +               domain_num++;
6882 +       entry = table = sd_alloc_ctl_entry(domain_num + 1);
6883 +       if (table == NULL)
6884 +               return NULL;
6885 +
6886 +       i = 0;
6887 +       for_each_domain(cpu, sd) {
6888 +               snprintf(buf, 32, "domain%d", i);
6889 +               entry->procname = kstrdup(buf, GFP_KERNEL);
6890 +               entry->mode = 0555;
6891 +               entry->child = sd_alloc_ctl_domain_table(sd);
6892 +               entry++;
6893 +               i++;
6894 +       }
6895 +       return table;
6896 +}
6897 +
6898 +static struct ctl_table_header *sd_sysctl_header;
6899 +static void register_sched_domain_sysctl(void)
6900 +{
6901 +       int i, cpu_num = num_online_cpus();
6902 +       struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
6903 +       char buf[32];
6904 +
6905 +       WARN_ON(sd_ctl_dir[0].child);
6906 +       sd_ctl_dir[0].child = entry;
6907 +
6908 +       if (entry == NULL)
6909 +               return;
6910 +
6911 +       for_each_online_cpu(i) {
6912 +               snprintf(buf, 32, "cpu%d", i);
6913 +               entry->procname = kstrdup(buf, GFP_KERNEL);
6914 +               entry->mode = 0555;
6915 +               entry->child = sd_alloc_ctl_cpu_table(i);
6916 +               entry++;
6917 +       }
6918 +
6919 +       WARN_ON(sd_sysctl_header);
6920 +       sd_sysctl_header = register_sysctl_table(sd_ctl_root);
6921 +}
6922 +
6923 +/* may be called multiple times per register */
6924 +static void unregister_sched_domain_sysctl(void)
6925 +{
6926 +       if (sd_sysctl_header)
6927 +               unregister_sysctl_table(sd_sysctl_header);
6928 +       sd_sysctl_header = NULL;
6929 +       if (sd_ctl_dir[0].child)
6930 +               sd_free_ctl_entry(&sd_ctl_dir[0].child);
6931 +}
6932 +#else
6933 +static void register_sched_domain_sysctl(void)
6934 +{
6935 +}
6936 +static void unregister_sched_domain_sysctl(void)
6937 +{
6938 +}
6939 +#endif
6940 +
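The table built above appears under /proc/sys/kernel/sched_domain/ when CONFIG_SCHED_DEBUG and CONFIG_SYSCTL are set. A small sketch reading one of the per-domain knobs (the path assumes cpu0 has at least one sched domain):

#include <stdio.h>

int main(void)
{
	char buf[64];
	FILE *f = fopen("/proc/sys/kernel/sched_domain/cpu0/domain0/min_interval", "r");

	if (!f) {
		perror("fopen");	/* no SMP domains or no debug sysctls */
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("cpu0/domain0 min_interval: %s", buf);
	fclose(f);
	return 0;
}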
6941 +static void set_rq_online(struct rq *rq)
6942 +{
6943 +       if (!rq->online) {
6944 +               const struct sched_class *class;
6945 +
6946 +               cpu_set(rq->cpu, rq->rd->online);
6947 +               rq->online = 1;
6948 +
6949 +               for_each_class(class) {
6950 +                       if (class->rq_online)
6951 +                               class->rq_online(rq);
6952 +               }
6953 +       }
6954 +}
6955 +
6956 +static void set_rq_offline(struct rq *rq)
6957 +{
6958 +       if (rq->online) {
6959 +               const struct sched_class *class;
6960 +
6961 +               for_each_class(class) {
6962 +                       if (class->rq_offline)
6963 +                               class->rq_offline(rq);
6964 +               }
6965 +
6966 +               cpu_clear(rq->cpu, rq->rd->online);
6967 +               rq->online = 0;
6968 +       }
6969 +}
6970 +
6971 +/*
6972 + * migration_call - callback that gets triggered when a CPU is added.
6973 + * Here we can start up the necessary migration thread for the new CPU.
6974 + */
6975 +static int __cpuinit
6976 +migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6977 +{
6978 +       struct task_struct *p;
6979 +       int cpu = (long)hcpu;
6980 +       unsigned long flags;
6981 +       struct rq *rq;
6982 +
6983 +       switch (action) {
6984 +
6985 +       case CPU_UP_PREPARE:
6986 +       case CPU_UP_PREPARE_FROZEN:
6987 +               p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
6988 +               if (IS_ERR(p))
6989 +                       return NOTIFY_BAD;
6990 +               kthread_bind(p, cpu);
6991 +               /* Must be high prio: stop_machine expects to yield to it. */
6992 +               rq = task_rq_lock(p, &flags);
6993 +               __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
6994 +               task_rq_unlock(rq, &flags);
6995 +               cpu_rq(cpu)->migration_thread = p;
6996 +               break;
6997 +
6998 +       case CPU_ONLINE:
6999 +       case CPU_ONLINE_FROZEN:
7000 +               /* Strictly unnecessary, as first user will wake it. */
7001 +               wake_up_process(cpu_rq(cpu)->migration_thread);
7002 +
7003 +               /* Update our root-domain */
7004 +               rq = cpu_rq(cpu);
7005 +               spin_lock_irqsave(&rq->lock, flags);
7006 +               if (rq->rd) {
7007 +                       BUG_ON(!cpu_isset(cpu, rq->rd->span));
7008 +
7009 +                       set_rq_online(rq);
7010 +               }
7011 +               spin_unlock_irqrestore(&rq->lock, flags);
7012 +               break;
7013 +
7014 +#ifdef CONFIG_HOTPLUG_CPU
7015 +       case CPU_UP_CANCELED:
7016 +       case CPU_UP_CANCELED_FROZEN:
7017 +               if (!cpu_rq(cpu)->migration_thread)
7018 +                       break;
7019 +               /* Unbind it from offline cpu so it can run. Fall thru. */
7020 +               kthread_bind(cpu_rq(cpu)->migration_thread,
7021 +                            any_online_cpu(cpu_online_map));
7022 +               kthread_stop(cpu_rq(cpu)->migration_thread);
7023 +               cpu_rq(cpu)->migration_thread = NULL;
7024 +               break;
7025 +
7026 +       case CPU_DEAD:
7027 +       case CPU_DEAD_FROZEN:
7028 +               cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
7029 +               migrate_live_tasks(cpu);
7030 +               rq = cpu_rq(cpu);
7031 +               kthread_stop(rq->migration_thread);
7032 +               rq->migration_thread = NULL;
7033 +               /* Idle task back to normal (off runqueue, low prio) */
7034 +               spin_lock_irq(&rq->lock);
7035 +               update_rq_clock(rq);
7036 +               deactivate_task(rq, rq->idle, 0);
7037 +               rq->idle->static_prio = MAX_PRIO;
7038 +               __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
7039 +               rq->idle->sched_class = &idle_sched_class;
7040 +               migrate_dead_tasks(cpu);
7041 +               spin_unlock_irq(&rq->lock);
7042 +               cpuset_unlock();
7043 +               migrate_nr_uninterruptible(rq);
7044 +               BUG_ON(rq->nr_running != 0);
7045 +
7046 +               /*
7047 +                * No need to migrate the tasks: it was best-effort if
7048 +                * they didn't take sched_hotcpu_mutex. Just wake up
7049 +                * the requestors.
7050 +                */
7051 +               spin_lock_irq(&rq->lock);
7052 +               while (!list_empty(&rq->migration_queue)) {
7053 +                       struct migration_req *req;
7054 +
7055 +                       req = list_entry(rq->migration_queue.next,
7056 +                                        struct migration_req, list);
7057 +                       list_del_init(&req->list);
7058 +                       spin_unlock_irq(&rq->lock);
7059 +                       complete(&req->done);
7060 +                       spin_lock_irq(&rq->lock);
7061 +               }
7062 +               spin_unlock_irq(&rq->lock);
7063 +               break;
7064 +
7065 +       case CPU_DYING:
7066 +       case CPU_DYING_FROZEN:
7067 +               /* Update our root-domain */
7068 +               rq = cpu_rq(cpu);
7069 +               spin_lock_irqsave(&rq->lock, flags);
7070 +               if (rq->rd) {
7071 +                       BUG_ON(!cpu_isset(cpu, rq->rd->span));
7072 +                       set_rq_offline(rq);
7073 +               }
7074 +               spin_unlock_irqrestore(&rq->lock, flags);
7075 +               break;
7076 +#endif
7077 +       }
7078 +       return NOTIFY_OK;
7079 +}
7080 +
7081 +/* Register at highest priority so that task migration (migrate_all_tasks)
7082 + * happens before everything else.
7083 + */
7084 +static struct notifier_block __cpuinitdata migration_notifier = {
7085 +       .notifier_call = migration_call,
7086 +       .priority = 10
7087 +};
7088 +
7089 +static int __init migration_init(void)
7090 +{
7091 +       void *cpu = (void *)(long)smp_processor_id();
7092 +       int err;
7093 +
7094 +       /* Start one for the boot CPU: */
7095 +       err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
7096 +       BUG_ON(err == NOTIFY_BAD);
7097 +       migration_call(&migration_notifier, CPU_ONLINE, cpu);
7098 +       register_cpu_notifier(&migration_notifier);
7099 +
7100 +       return err;
7101 +}
7102 +early_initcall(migration_init);
7103 +#endif
7104 +
7105 +#ifdef CONFIG_SMP
7106 +
7107 +#ifdef CONFIG_SCHED_DEBUG
7108 +
7109 +static inline const char *sd_level_to_string(enum sched_domain_level lvl)
7110 +{
7111 +       switch (lvl) {
7112 +       case SD_LV_NONE:
7113 +                       return "NONE";
7114 +       case SD_LV_SIBLING:
7115 +                       return "SIBLING";
7116 +       case SD_LV_MC:
7117 +                       return "MC";
7118 +       case SD_LV_CPU:
7119 +                       return "CPU";
7120 +       case SD_LV_NODE:
7121 +                       return "NODE";
7122 +       case SD_LV_ALLNODES:
7123 +                       return "ALLNODES";
7124 +       case SD_LV_MAX:
7125 +                       return "MAX";
7126 +
7127 +       }
7128 +       return "MAX";
7129 +}
7130 +
7131 +static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7132 +                                 cpumask_t *groupmask)
7133 +{
7134 +       struct sched_group *group = sd->groups;
7135 +       char str[256];
7136 +
7137 +       cpulist_scnprintf(str, sizeof(str), sd->span);
7138 +       cpus_clear(*groupmask);
7139 +
7140 +       printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
7141 +
7142 +       if (!(sd->flags & SD_LOAD_BALANCE)) {
7143 +               printk("does not load-balance\n");
7144 +               if (sd->parent)
7145 +                       printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
7146 +                                       " has parent\n");
7147 +               return -1;
7148 +       }
7149 +
7150 +       printk(KERN_CONT "span %s level %s\n",
7151 +               str, sd_level_to_string(sd->level));
7152 +
7153 +       if (!cpu_isset(cpu, sd->span)) {
7154 +               printk(KERN_ERR "ERROR: domain->span does not contain "
7155 +                               "CPU%d\n", cpu);
7156 +       }
7157 +       if (!cpu_isset(cpu, group->cpumask)) {
7158 +               printk(KERN_ERR "ERROR: domain->groups does not contain"
7159 +                               " CPU%d\n", cpu);
7160 +       }
7161 +
7162 +       printk(KERN_DEBUG "%*s groups:", level + 1, "");
7163 +       do {
7164 +               if (!group) {
7165 +                       printk("\n");
7166 +                       printk(KERN_ERR "ERROR: group is NULL\n");
7167 +                       break;
7168 +               }
7169 +
7170 +               if (!group->__cpu_power) {
7171 +                       printk(KERN_CONT "\n");
7172 +                       printk(KERN_ERR "ERROR: domain->cpu_power not "
7173 +                                       "set\n");
7174 +                       break;
7175 +               }
7176 +
7177 +               if (!cpus_weight(group->cpumask)) {
7178 +                       printk(KERN_CONT "\n");
7179 +                       printk(KERN_ERR "ERROR: empty group\n");
7180 +                       break;
7181 +               }
7182 +
7183 +               if (cpus_intersects(*groupmask, group->cpumask)) {
7184 +                       printk(KERN_CONT "\n");
7185 +                       printk(KERN_ERR "ERROR: repeated CPUs\n");
7186 +                       break;
7187 +               }
7188 +
7189 +               cpus_or(*groupmask, *groupmask, group->cpumask);
7190 +
7191 +               cpulist_scnprintf(str, sizeof(str), group->cpumask);
7192 +               printk(KERN_CONT " %s", str);
7193 +
7194 +               group = group->next;
7195 +       } while (group != sd->groups);
7196 +       printk(KERN_CONT "\n");
7197 +
7198 +       if (!cpus_equal(sd->span, *groupmask))
7199 +               printk(KERN_ERR "ERROR: groups don't span domain->span\n");
7200 +
7201 +       if (sd->parent && !cpus_subset(*groupmask, sd->parent->span))
7202 +               printk(KERN_ERR "ERROR: parent span is not a superset "
7203 +                       "of domain->span\n");
7204 +       return 0;
7205 +}
7206 +
7207 +static void sched_domain_debug(struct sched_domain *sd, int cpu)
7208 +{
7209 +       cpumask_t *groupmask;
7210 +       int level = 0;
7211 +
7212 +       if (!sd) {
7213 +               printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
7214 +               return;
7215 +       }
7216 +
7217 +       printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
7218 +
7219 +       groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
7220 +       if (!groupmask) {
7221 +               printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
7222 +               return;
7223 +       }
7224 +
7225 +       for (;;) {
7226 +               if (sched_domain_debug_one(sd, cpu, level, groupmask))
7227 +                       break;
7228 +               level++;
7229 +               sd = sd->parent;
7230 +               if (!sd)
7231 +                       break;
7232 +       }
7233 +       kfree(groupmask);
7234 +}
7235 +#else /* !CONFIG_SCHED_DEBUG */
7236 +# define sched_domain_debug(sd, cpu) do { } while (0)
7237 +#endif /* CONFIG_SCHED_DEBUG */
7238 +
7239 +static int sd_degenerate(struct sched_domain *sd)
7240 +{
7241 +       if (cpus_weight(sd->span) == 1)
7242 +               return 1;
7243 +
7244 +       /* Following flags need at least 2 groups */
7245 +       if (sd->flags & (SD_LOAD_BALANCE |
7246 +                        SD_BALANCE_NEWIDLE |
7247 +                        SD_BALANCE_FORK |
7248 +                        SD_BALANCE_EXEC |
7249 +                        SD_SHARE_CPUPOWER |
7250 +                        SD_SHARE_PKG_RESOURCES)) {
7251 +               if (sd->groups != sd->groups->next)
7252 +                       return 0;
7253 +       }
7254 +
7255 +       /* Following flags don't use groups */
7256 +       if (sd->flags & (SD_WAKE_IDLE |
7257 +                        SD_WAKE_AFFINE |
7258 +                        SD_WAKE_BALANCE))
7259 +               return 0;
7260 +
7261 +       return 1;
7262 +}
7263 +
7264 +static int
7265 +sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
7266 +{
7267 +       unsigned long cflags = sd->flags, pflags = parent->flags;
7268 +
7269 +       if (sd_degenerate(parent))
7270 +               return 1;
7271 +
7272 +       if (!cpus_equal(sd->span, parent->span))
7273 +               return 0;
7274 +
7275 +       /* Does parent contain flags not in child? */
7276 +       /* WAKE_BALANCE is a subset of WAKE_AFFINE */
7277 +       if (cflags & SD_WAKE_AFFINE)
7278 +               pflags &= ~SD_WAKE_BALANCE;
7279 +       /* Flags needing groups don't count if only 1 group in parent */
7280 +       if (parent->groups == parent->groups->next) {
7281 +               pflags &= ~(SD_LOAD_BALANCE |
7282 +                               SD_BALANCE_NEWIDLE |
7283 +                               SD_BALANCE_FORK |
7284 +                               SD_BALANCE_EXEC |
7285 +                               SD_SHARE_CPUPOWER |
7286 +                               SD_SHARE_PKG_RESOURCES);
7287 +       }
7288 +       if (~cflags & pflags)
7289 +               return 0;
7290 +
7291 +       return 1;
7292 +}
7293 +
7294 +static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7295 +{
7296 +       unsigned long flags;
7297 +
7298 +       spin_lock_irqsave(&rq->lock, flags);
7299 +
7300 +       if (rq->rd) {
7301 +               struct root_domain *old_rd = rq->rd;
7302 +
7303 +               if (cpu_isset(rq->cpu, old_rd->online))
7304 +                       set_rq_offline(rq);
7305 +
7306 +               cpu_clear(rq->cpu, old_rd->span);
7307 +
7308 +               if (atomic_dec_and_test(&old_rd->refcount))
7309 +                       kfree(old_rd);
7310 +       }
7311 +
7312 +       atomic_inc(&rd->refcount);
7313 +       rq->rd = rd;
7314 +
7315 +       cpu_set(rq->cpu, rd->span);
7316 +       if (cpu_isset(rq->cpu, cpu_online_map))
7317 +               set_rq_online(rq);
7318 +
7319 +       spin_unlock_irqrestore(&rq->lock, flags);
7320 +}
7321 +
7322 +static void init_rootdomain(struct root_domain *rd)
7323 +{
7324 +       memset(rd, 0, sizeof(*rd));
7325 +
7326 +       cpus_clear(rd->span);
7327 +       cpus_clear(rd->online);
7328 +
7329 +       cpupri_init(&rd->cpupri);
7330 +}
7331 +
7332 +static void init_defrootdomain(void)
7333 +{
7334 +       init_rootdomain(&def_root_domain);
7335 +       atomic_set(&def_root_domain.refcount, 1);
7336 +}
7337 +
7338 +static struct root_domain *alloc_rootdomain(void)
7339 +{
7340 +       struct root_domain *rd;
7341 +
7342 +       rd = kmalloc(sizeof(*rd), GFP_KERNEL);
7343 +       if (!rd)
7344 +               return NULL;
7345 +
7346 +       init_rootdomain(rd);
7347 +
7348 +       return rd;
7349 +}
7350 +
7351 +/*
7352 + * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
7353 + * hold the hotplug lock.
7354 + */
7355 +static void
7356 +cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
7357 +{
7358 +       struct rq *rq = cpu_rq(cpu);
7359 +       struct sched_domain *tmp;
7360 +
7361 +       /* Remove the sched domains which do not contribute to scheduling. */
7362 +       for (tmp = sd; tmp; ) {
7363 +               struct sched_domain *parent = tmp->parent;
7364 +               if (!parent)
7365 +                       break;
7366 +
7367 +               if (sd_parent_degenerate(tmp, parent)) {
7368 +                       tmp->parent = parent->parent;
7369 +                       if (parent->parent)
7370 +                               parent->parent->child = tmp;
7371 +               } else
7372 +                       tmp = tmp->parent;
7373 +       }
7374 +
7375 +       if (sd && sd_degenerate(sd)) {
7376 +               sd = sd->parent;
7377 +               if (sd)
7378 +                       sd->child = NULL;
7379 +       }
7380 +
7381 +       sched_domain_debug(sd, cpu);
7382 +
7383 +       rq_attach_root(rq, rd);
7384 +       rcu_assign_pointer(rq->sd, sd);
7385 +}
7386 +
7387 +/* cpus with isolated domains */
7388 +static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
7389 +
7390 +/* Setup the mask of cpus configured for isolated domains */
7391 +static int __init isolated_cpu_setup(char *str)
7392 +{
7393 +       static int __initdata ints[NR_CPUS];
7394 +       int i;
7395 +
7396 +       str = get_options(str, ARRAY_SIZE(ints), ints);
7397 +       cpus_clear(cpu_isolated_map);
7398 +       for (i = 1; i <= ints[0]; i++)
7399 +               if (ints[i] < NR_CPUS)
7400 +                       cpu_set(ints[i], cpu_isolated_map);
7401 +       return 1;
7402 +}
7403 +
7404 +__setup("isolcpus=", isolated_cpu_setup);
7405 +
7406 +/*
7407 + * init_sched_build_groups takes the cpumask we wish to span, and a pointer
7408 + * to a function which identifies what group (along with sched group) a CPU
7409 + * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
7410 + * (due to the fact that we keep track of groups covered with a cpumask_t).
7411 + *
7412 + * init_sched_build_groups will build a circular linked list of the groups
7413 + * covered by the given span, and will set each group's ->cpumask correctly,
7414 + * and ->cpu_power to 0.
7415 + */
7416 +static void
7417 +init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
7418 +                       int (*group_fn)(int cpu, const cpumask_t *cpu_map,
7419 +                                       struct sched_group **sg,
7420 +                                       cpumask_t *tmpmask),
7421 +                       cpumask_t *covered, cpumask_t *tmpmask)
7422 +{
7423 +       struct sched_group *first = NULL, *last = NULL;
7424 +       int i;
7425 +
7426 +       cpus_clear(*covered);
7427 +
7428 +       for_each_cpu_mask_nr(i, *span) {
7429 +               struct sched_group *sg;
7430 +               int group = group_fn(i, cpu_map, &sg, tmpmask);
7431 +               int j;
7432 +
7433 +               if (cpu_isset(i, *covered))
7434 +                       continue;
7435 +
7436 +               cpus_clear(sg->cpumask);
7437 +               sg->__cpu_power = 0;
7438 +
7439 +               for_each_cpu_mask_nr(j, *span) {
7440 +                       if (group_fn(j, cpu_map, NULL, tmpmask) != group)
7441 +                               continue;
7442 +
7443 +                       cpu_set(j, *covered);
7444 +                       cpu_set(j, sg->cpumask);
7445 +               }
7446 +               if (!first)
7447 +                       first = sg;
7448 +               if (last)
7449 +                       last->next = sg;
7450 +               last = sg;
7451 +       }
7452 +       last->next = first;
7453 +}
7454 +
7455 +#define SD_NODES_PER_DOMAIN 16
7456 +
7457 +#ifdef CONFIG_NUMA
7458 +
7459 +/**
7460 + * find_next_best_node - find the next node to include in a sched_domain
7461 + * @node: node whose sched_domain we're building
7462 + * @used_nodes: nodes already in the sched_domain
7463 + *
7464 + * Find the next node to include in a given scheduling domain. Simply
7465 + * finds the closest node not already in the @used_nodes map.
7466 + *
7467 + * Should use nodemask_t.
7468 + */
7469 +static int find_next_best_node(int node, nodemask_t *used_nodes)
7470 +{
7471 +       int i, n, val, min_val, best_node = 0;
7472 +
7473 +       min_val = INT_MAX;
7474 +
7475 +       for (i = 0; i < nr_node_ids; i++) {
7476 +               /* Start at @node */
7477 +               n = (node + i) % nr_node_ids;
7478 +
7479 +               if (!nr_cpus_node(n))
7480 +                       continue;
7481 +
7482 +               /* Skip already used nodes */
7483 +               if (node_isset(n, *used_nodes))
7484 +                       continue;
7485 +
7486 +               /* Simple min distance search */
7487 +               val = node_distance(node, n);
7488 +
7489 +               if (val < min_val) {
7490 +                       min_val = val;
7491 +                       best_node = n;
7492 +               }
7493 +       }
7494 +
7495 +       node_set(best_node, *used_nodes);
7496 +       return best_node;
7497 +}
7498 +
7499 +/**
7500 + * sched_domain_node_span - get a cpumask for a node's sched_domain
7501 + * @node: node whose cpumask we're constructing
7502 + * @span: resulting cpumask
7503 + *
7504 + * Given a node, construct a good cpumask for its sched_domain to span. It
7505 + * should be one that prevents unnecessary balancing, but also spreads tasks
7506 + * out optimally.
7507 + */
7508 +static void sched_domain_node_span(int node, cpumask_t *span)
7509 +{
7510 +       nodemask_t used_nodes;
7511 +       node_to_cpumask_ptr(nodemask, node);
7512 +       int i;
7513 +
7514 +       cpus_clear(*span);
7515 +       nodes_clear(used_nodes);
7516 +
7517 +       cpus_or(*span, *span, *nodemask);
7518 +       node_set(node, used_nodes);
7519 +
7520 +       for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
7521 +               int next_node = find_next_best_node(node, &used_nodes);
7522 +
7523 +               node_to_cpumask_ptr_next(nodemask, next_node);
7524 +               cpus_or(*span, *span, *nodemask);
7525 +       }
7526 +}
7527 +#endif /* CONFIG_NUMA */
7528 +
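A simplified, standalone sketch of the greedy node selection used by find_next_best_node()/sched_domain_node_span() above, run over a hypothetical 4-node distance table (the real code also skips CPU-less nodes and starts its scan at @node):

#include <stdio.h>
#include <limits.h>

#define NR_NODES 4

/* hypothetical SLIT-style node_distance() table: 10 = local */
static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 30, 30 },
	{ 20, 10, 30, 30 },
	{ 30, 30, 10, 20 },
	{ 30, 30, 20, 10 },
};

/* nearest node to 'node' that is not yet in the span */
static int next_best(int node, int used[])
{
	int n, best = -1, min = INT_MAX;

	for (n = 0; n < NR_NODES; n++) {
		if (used[n] || dist[node][n] >= min)
			continue;
		min = dist[node][n];
		best = n;
	}
	if (best >= 0)
		used[best] = 1;
	return best;
}

int main(void)
{
	int used[NR_NODES] = { 1, 0, 0, 0 };	/* span starts with node 0 */
	int i;

	printf("span order from node 0: 0");
	for (i = 1; i < NR_NODES; i++)
		printf(" %d", next_best(0, used));
	printf("\n");				/* -> 0 1 2 3 */
	return 0;
}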
7529 +int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
7530 +
7531 +/*
7532 + * SMT sched-domains:
7533 + */
7534 +#ifdef CONFIG_SCHED_SMT
7535 +static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
7536 +static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
7537 +
7538 +static int
7539 +cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7540 +                cpumask_t *unused)
7541 +{
7542 +       if (sg)
7543 +               *sg = &per_cpu(sched_group_cpus, cpu);
7544 +       return cpu;
7545 +}
7546 +#endif /* CONFIG_SCHED_SMT */
7547 +
7548 +/*
7549 + * multi-core sched-domains:
7550 + */
7551 +#ifdef CONFIG_SCHED_MC
7552 +static DEFINE_PER_CPU(struct sched_domain, core_domains);
7553 +static DEFINE_PER_CPU(struct sched_group, sched_group_core);
7554 +#endif /* CONFIG_SCHED_MC */
7555 +
7556 +#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
7557 +static int
7558 +cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7559 +                 cpumask_t *mask)
7560 +{
7561 +       int group;
7562 +
7563 +       *mask = per_cpu(cpu_sibling_map, cpu);
7564 +       cpus_and(*mask, *mask, *cpu_map);
7565 +       group = first_cpu(*mask);
7566 +       if (sg)
7567 +               *sg = &per_cpu(sched_group_core, group);
7568 +       return group;
7569 +}
7570 +#elif defined(CONFIG_SCHED_MC)
7571 +static int
7572 +cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7573 +                 cpumask_t *unused)
7574 +{
7575 +       if (sg)
7576 +               *sg = &per_cpu(sched_group_core, cpu);
7577 +       return cpu;
7578 +}
7579 +#endif
7580 +
7581 +static DEFINE_PER_CPU(struct sched_domain, phys_domains);
7582 +static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
7583 +
7584 +static int
7585 +cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7586 +                 cpumask_t *mask)
7587 +{
7588 +       int group;
7589 +#ifdef CONFIG_SCHED_MC
7590 +       *mask = cpu_coregroup_map(cpu);
7591 +       cpus_and(*mask, *mask, *cpu_map);
7592 +       group = first_cpu(*mask);
7593 +#elif defined(CONFIG_SCHED_SMT)
7594 +       *mask = per_cpu(cpu_sibling_map, cpu);
7595 +       cpus_and(*mask, *mask, *cpu_map);
7596 +       group = first_cpu(*mask);
7597 +#else
7598 +       group = cpu;
7599 +#endif
7600 +       if (sg)
7601 +               *sg = &per_cpu(sched_group_phys, group);
7602 +       return group;
7603 +}
7604 +
7605 +#ifdef CONFIG_NUMA
7606 +/*
7607 + * The init_sched_build_groups can't handle what we want to do with node
7608 + * groups, so roll our own. Now each node has its own list of groups which
7609 + * gets dynamically allocated.
7610 + */
7611 +static DEFINE_PER_CPU(struct sched_domain, node_domains);
7612 +static struct sched_group ***sched_group_nodes_bycpu;
7613 +
7614 +static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
7615 +static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
7616 +
7617 +static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
7618 +                                struct sched_group **sg, cpumask_t *nodemask)
7619 +{
7620 +       int group;
7621 +
7622 +       *nodemask = node_to_cpumask(cpu_to_node(cpu));
7623 +       cpus_and(*nodemask, *nodemask, *cpu_map);
7624 +       group = first_cpu(*nodemask);
7625 +
7626 +       if (sg)
7627 +               *sg = &per_cpu(sched_group_allnodes, group);
7628 +       return group;
7629 +}
7630 +
7631 +static void init_numa_sched_groups_power(struct sched_group *group_head)
7632 +{
7633 +       struct sched_group *sg = group_head;
7634 +       int j;
7635 +
7636 +       if (!sg)
7637 +               return;
7638 +       do {
7639 +               for_each_cpu_mask_nr(j, sg->cpumask) {
7640 +                       struct sched_domain *sd;
7641 +
7642 +                       sd = &per_cpu(phys_domains, j);
7643 +                       if (j != first_cpu(sd->groups->cpumask)) {
7644 +                               /*
7645 +                                * Only add "power" once for each
7646 +                                * physical package.
7647 +                                */
7648 +                               continue;
7649 +                       }
7650 +
7651 +                       sg_inc_cpu_power(sg, sd->groups->__cpu_power);
7652 +               }
7653 +               sg = sg->next;
7654 +       } while (sg != group_head);
7655 +}
7656 +#endif /* CONFIG_NUMA */
7657 +
7658 +#ifdef CONFIG_NUMA
7659 +/* Free memory allocated for various sched_group structures */
7660 +static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
7661 +{
7662 +       int cpu, i;
7663 +
7664 +       for_each_cpu_mask_nr(cpu, *cpu_map) {
7665 +               struct sched_group **sched_group_nodes
7666 +                       = sched_group_nodes_bycpu[cpu];
7667 +
7668 +               if (!sched_group_nodes)
7669 +                       continue;
7670 +
7671 +               for (i = 0; i < nr_node_ids; i++) {
7672 +                       struct sched_group *oldsg, *sg = sched_group_nodes[i];
7673 +
7674 +                       *nodemask = node_to_cpumask(i);
7675 +                       cpus_and(*nodemask, *nodemask, *cpu_map);
7676 +                       if (cpus_empty(*nodemask))
7677 +                               continue;
7678 +
7679 +                       if (sg == NULL)
7680 +                               continue;
7681 +                       sg = sg->next;
7682 +next_sg:
7683 +                       oldsg = sg;
7684 +                       sg = sg->next;
7685 +                       kfree(oldsg);
7686 +                       if (oldsg != sched_group_nodes[i])
7687 +                               goto next_sg;
7688 +               }
7689 +               kfree(sched_group_nodes);
7690 +               sched_group_nodes_bycpu[cpu] = NULL;
7691 +       }
7692 +}
7693 +#else /* !CONFIG_NUMA */
7694 +static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
7695 +{
7696 +}
7697 +#endif /* CONFIG_NUMA */
7698 +
7699 +/*
7700 + * Initialize sched groups cpu_power.
7701 + *
7702 + * cpu_power indicates the capacity of sched group, which is used while
7703 + * distributing the load between different sched groups in a sched domain.
7704 + * Typically cpu_power for all the groups in a sched domain will be the same
7705 + * unless there are asymmetries in the topology. If there are asymmetries, the
7706 + * group having more cpu_power will pick up more load compared to the group
7707 + * having less cpu_power.
7708 + *
7709 + * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
7710 + * the maximum number of tasks a group can handle in the presence of other idle
7711 + * or lightly loaded groups in the same sched domain.
7712 + */
7713 +static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7714 +{
7715 +       struct sched_domain *child;
7716 +       struct sched_group *group;
7717 +
7718 +       WARN_ON(!sd || !sd->groups);
7719 +
7720 +       if (cpu != first_cpu(sd->groups->cpumask))
7721 +               return;
7722 +
7723 +       child = sd->child;
7724 +
7725 +       sd->groups->__cpu_power = 0;
7726 +
7727 +       /*
7728 +        * For perf policy, if the groups in child domain share resources
7729 +        * (for example cores sharing some portions of the cache hierarchy
7730 +        * or SMT), then set this domain groups cpu_power such that each group
7731 +        * can handle only one task, when there are other idle groups in the
7732 +        * same sched domain.
7733 +        */
7734 +       if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
7735 +                      (child->flags &
7736 +                       (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
7737 +               sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
7738 +               return;
7739 +       }
7740 +
7741 +       /*
7742 +        * add cpu_power of each child group to this groups cpu_power
7743 +        */
7744 +       group = child->groups;
7745 +       do {
7746 +               sg_inc_cpu_power(sd->groups, group->__cpu_power);
7747 +               group = group->next;
7748 +       } while (group != child->groups);
7749 +}
7750 +
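A worked example of the accumulation above (SCHED_LOAD_SCALE assumed to be 1024, as in this kernel): a CPU-level group whose MC child has two core groups of power 1024 each ends up with cpu_power 2048.

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL		/* assumed value */

int main(void)
{
	unsigned long child_power[] = { SCHED_LOAD_SCALE, SCHED_LOAD_SCALE };
	unsigned long power = 0;
	unsigned int i;

	/* mirrors the "add cpu_power of each child group" loop above */
	for (i = 0; i < sizeof(child_power) / sizeof(child_power[0]); i++)
		power += child_power[i];
	printf("parent group cpu_power = %lu\n", power);	/* 2048 */
	return 0;
}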
7751 +/*
7752 + * Initializers for schedule domains
7753 + * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
7754 + */
7755 +
7756 +#define        SD_INIT(sd, type)       sd_init_##type(sd)
7757 +#define SD_INIT_FUNC(type)     \
7758 +static noinline void sd_init_##type(struct sched_domain *sd)   \
7759 +{                                                              \
7760 +       memset(sd, 0, sizeof(*sd));                             \
7761 +       *sd = SD_##type##_INIT;                                 \
7762 +       sd->level = SD_LV_##type;                               \
7763 +}
7764 +
7765 +SD_INIT_FUNC(CPU)
7766 +#ifdef CONFIG_NUMA
7767 + SD_INIT_FUNC(ALLNODES)
7768 + SD_INIT_FUNC(NODE)
7769 +#endif
7770 +#ifdef CONFIG_SCHED_SMT
7771 + SD_INIT_FUNC(SIBLING)
7772 +#endif
7773 +#ifdef CONFIG_SCHED_MC
7774 + SD_INIT_FUNC(MC)
7775 +#endif
7776 +
7777 +/*
7778 + * To minimize stack usage, kmalloc room for cpumasks and share the
7779 + * space as the usage in build_sched_domains() dictates.  Used only
7780 + * if the amount of space is significant.
7781 + */
7782 +struct allmasks {
7783 +       cpumask_t tmpmask;                      /* make this one first */
7784 +       union {
7785 +               cpumask_t nodemask;
7786 +               cpumask_t this_sibling_map;
7787 +               cpumask_t this_core_map;
7788 +       };
7789 +       cpumask_t send_covered;
7790 +
7791 +#ifdef CONFIG_NUMA
7792 +       cpumask_t domainspan;
7793 +       cpumask_t covered;
7794 +       cpumask_t notcovered;
7795 +#endif
7796 +};
7797 +
7798 +#if    NR_CPUS > 128
7799 +#define        SCHED_CPUMASK_ALLOC             1
7800 +#define        SCHED_CPUMASK_FREE(v)           kfree(v)
7801 +#define        SCHED_CPUMASK_DECLARE(v)        struct allmasks *v
7802 +#else
7803 +#define        SCHED_CPUMASK_ALLOC             0
7804 +#define        SCHED_CPUMASK_FREE(v)
7805 +#define        SCHED_CPUMASK_DECLARE(v)        struct allmasks _v, *v = &_v
7806 +#endif
7807 +
7808 +#define        SCHED_CPUMASK_VAR(v, a)         cpumask_t *v = (cpumask_t *) \
7809 +                       ((unsigned long)(a) + offsetof(struct allmasks, v))
7810 +
7811 +static int default_relax_domain_level = -1;
7812 +
7813 +static int __init setup_relax_domain_level(char *str)
7814 +{
7815 +       unsigned long val;
7816 +
7817 +       val = simple_strtoul(str, NULL, 0);
7818 +       if (val < SD_LV_MAX)
7819 +               default_relax_domain_level = val;
7820 +
7821 +       return 1;
7822 +}
7823 +__setup("relax_domain_level=", setup_relax_domain_level);
7824 +
7825 +static void set_domain_attribute(struct sched_domain *sd,
7826 +                                struct sched_domain_attr *attr)
7827 +{
7828 +       int request;
7829 +
7830 +       if (!attr || attr->relax_domain_level < 0) {
7831 +               if (default_relax_domain_level < 0)
7832 +                       return;
7833 +               else
7834 +                       request = default_relax_domain_level;
7835 +       } else
7836 +               request = attr->relax_domain_level;
7837 +       if (request < sd->level) {
7838 +               /* turn off idle balance on this domain */
7839 +               sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE);
7840 +       } else {
7841 +               /* turn on idle balance on this domain */
7842 +               sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE);
7843 +       }
7844 +}
7845 +
7846 +/*
7847 + * Build sched domains for a given set of cpus and attach the sched domains
7848 + * to the individual cpus
7849 + */
7850 +static int __build_sched_domains(const cpumask_t *cpu_map,
7851 +                                struct sched_domain_attr *attr)
7852 +{
7853 +       int i;
7854 +       struct root_domain *rd;
7855 +       SCHED_CPUMASK_DECLARE(allmasks);
7856 +       cpumask_t *tmpmask;
7857 +#ifdef CONFIG_NUMA
7858 +       struct sched_group **sched_group_nodes = NULL;
7859 +       int sd_allnodes = 0;
7860 +
7861 +       /*
7862 +        * Allocate the per-node list of sched groups
7863 +        */
7864 +       sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
7865 +                                   GFP_KERNEL);
7866 +       if (!sched_group_nodes) {
7867 +               printk(KERN_WARNING "Can not alloc sched group node list\n");
7868 +               return -ENOMEM;
7869 +       }
7870 +#endif
7871 +
7872 +       rd = alloc_rootdomain();
7873 +       if (!rd) {
7874 +               printk(KERN_WARNING "Cannot alloc root domain\n");
7875 +#ifdef CONFIG_NUMA
7876 +               kfree(sched_group_nodes);
7877 +#endif
7878 +               return -ENOMEM;
7879 +       }
7880 +
7881 +#if SCHED_CPUMASK_ALLOC
7882 +       /* get space for all scratch cpumask variables */
7883 +       allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
7884 +       if (!allmasks) {
7885 +               printk(KERN_WARNING "Cannot alloc cpumask array\n");
7886 +               kfree(rd);
7887 +#ifdef CONFIG_NUMA
7888 +               kfree(sched_group_nodes);
7889 +#endif
7890 +               return -ENOMEM;
7891 +       }
7892 +#endif
7893 +       tmpmask = (cpumask_t *)allmasks;
7894 +
7895 +
7896 +#ifdef CONFIG_NUMA
7897 +       sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
7898 +#endif
7899 +
7900 +       /*
7901 +        * Set up domains for cpus specified by the cpu_map.
7902 +        */
7903 +       for_each_cpu_mask_nr(i, *cpu_map) {
7904 +               struct sched_domain *sd = NULL, *p;
7905 +               SCHED_CPUMASK_VAR(nodemask, allmasks);
7906 +
7907 +               *nodemask = node_to_cpumask(cpu_to_node(i));
7908 +               cpus_and(*nodemask, *nodemask, *cpu_map);
7909 +
7910 +#ifdef CONFIG_NUMA
7911 +               if (cpus_weight(*cpu_map) >
7912 +                               SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) {
7913 +                       sd = &per_cpu(allnodes_domains, i);
7914 +                       SD_INIT(sd, ALLNODES);
7915 +                       set_domain_attribute(sd, attr);
7916 +                       sd->span = *cpu_map;
7917 +                       cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
7918 +                       p = sd;
7919 +                       sd_allnodes = 1;
7920 +               } else
7921 +                       p = NULL;
7922 +
7923 +               sd = &per_cpu(node_domains, i);
7924 +               SD_INIT(sd, NODE);
7925 +               set_domain_attribute(sd, attr);
7926 +               sched_domain_node_span(cpu_to_node(i), &sd->span);
7927 +               sd->parent = p;
7928 +               if (p)
7929 +                       p->child = sd;
7930 +               cpus_and(sd->span, sd->span, *cpu_map);
7931 +#endif
7932 +
7933 +               p = sd;
7934 +               sd = &per_cpu(phys_domains, i);
7935 +               SD_INIT(sd, CPU);
7936 +               set_domain_attribute(sd, attr);
7937 +               sd->span = *nodemask;
7938 +               sd->parent = p;
7939 +               if (p)
7940 +                       p->child = sd;
7941 +               cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask);
7942 +
7943 +#ifdef CONFIG_SCHED_MC
7944 +               p = sd;
7945 +               sd = &per_cpu(core_domains, i);
7946 +               SD_INIT(sd, MC);
7947 +               set_domain_attribute(sd, attr);
7948 +               sd->span = cpu_coregroup_map(i);
7949 +               cpus_and(sd->span, sd->span, *cpu_map);
7950 +               sd->parent = p;
7951 +               p->child = sd;
7952 +               cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
7953 +#endif
7954 +
7955 +#ifdef CONFIG_SCHED_SMT
7956 +               p = sd;
7957 +               sd = &per_cpu(cpu_domains, i);
7958 +               SD_INIT(sd, SIBLING);
7959 +               set_domain_attribute(sd, attr);
7960 +               sd->span = per_cpu(cpu_sibling_map, i);
7961 +               cpus_and(sd->span, sd->span, *cpu_map);
7962 +               sd->parent = p;
7963 +               p->child = sd;
7964 +               cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
7965 +#endif
7966 +       }
7967 +
7968 +#ifdef CONFIG_SCHED_SMT
7969 +       /* Set up CPU (sibling) groups */
7970 +       for_each_cpu_mask_nr(i, *cpu_map) {
7971 +               SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
7972 +               SCHED_CPUMASK_VAR(send_covered, allmasks);
7973 +
7974 +               *this_sibling_map = per_cpu(cpu_sibling_map, i);
7975 +               cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
7976 +               if (i != first_cpu(*this_sibling_map))
7977 +                       continue;
7978 +
7979 +               init_sched_build_groups(this_sibling_map, cpu_map,
7980 +                                       &cpu_to_cpu_group,
7981 +                                       send_covered, tmpmask);
7982 +       }
7983 +#endif
7984 +
7985 +#ifdef CONFIG_SCHED_MC
7986 +       /* Set up multi-core groups */
7987 +       for_each_cpu_mask_nr(i, *cpu_map) {
7988 +               SCHED_CPUMASK_VAR(this_core_map, allmasks);
7989 +               SCHED_CPUMASK_VAR(send_covered, allmasks);
7990 +
7991 +               *this_core_map = cpu_coregroup_map(i);
7992 +               cpus_and(*this_core_map, *this_core_map, *cpu_map);
7993 +               if (i != first_cpu(*this_core_map))
7994 +                       continue;
7995 +
7996 +               init_sched_build_groups(this_core_map, cpu_map,
7997 +                                       &cpu_to_core_group,
7998 +                                       send_covered, tmpmask);
7999 +       }
8000 +#endif
8001 +
8002 +       /* Set up physical groups */
8003 +       for (i = 0; i < nr_node_ids; i++) {
8004 +               SCHED_CPUMASK_VAR(nodemask, allmasks);
8005 +               SCHED_CPUMASK_VAR(send_covered, allmasks);
8006 +
8007 +               *nodemask = node_to_cpumask(i);
8008 +               cpus_and(*nodemask, *nodemask, *cpu_map);
8009 +               if (cpus_empty(*nodemask))
8010 +                       continue;
8011 +
8012 +               init_sched_build_groups(nodemask, cpu_map,
8013 +                                       &cpu_to_phys_group,
8014 +                                       send_covered, tmpmask);
8015 +       }
8016 +
8017 +#ifdef CONFIG_NUMA
8018 +       /* Set up node groups */
8019 +       if (sd_allnodes) {
8020 +               SCHED_CPUMASK_VAR(send_covered, allmasks);
8021 +
8022 +               init_sched_build_groups(cpu_map, cpu_map,
8023 +                                       &cpu_to_allnodes_group,
8024 +                                       send_covered, tmpmask);
8025 +       }
8026 +
8027 +       for (i = 0; i < nr_node_ids; i++) {
8028 +               /* Set up node groups */
8029 +               struct sched_group *sg, *prev;
8030 +               SCHED_CPUMASK_VAR(nodemask, allmasks);
8031 +               SCHED_CPUMASK_VAR(domainspan, allmasks);
8032 +               SCHED_CPUMASK_VAR(covered, allmasks);
8033 +               int j;
8034 +
8035 +               *nodemask = node_to_cpumask(i);
8036 +               cpus_clear(*covered);
8037 +
8038 +               cpus_and(*nodemask, *nodemask, *cpu_map);
8039 +               if (cpus_empty(*nodemask)) {
8040 +                       sched_group_nodes[i] = NULL;
8041 +                       continue;
8042 +               }
8043 +
8044 +               sched_domain_node_span(i, domainspan);
8045 +               cpus_and(*domainspan, *domainspan, *cpu_map);
8046 +
8047 +               sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
8048 +               if (!sg) {
8049 +                       printk(KERN_WARNING "Can not alloc domain group for "
8050 +                               "node %d\n", i);
8051 +                       goto error;
8052 +               }
8053 +               sched_group_nodes[i] = sg;
8054 +               for_each_cpu_mask_nr(j, *nodemask) {
8055 +                       struct sched_domain *sd;
8056 +
8057 +                       sd = &per_cpu(node_domains, j);
8058 +                       sd->groups = sg;
8059 +               }
8060 +               sg->__cpu_power = 0;
8061 +               sg->cpumask = *nodemask;
8062 +               sg->next = sg;
8063 +               cpus_or(*covered, *covered, *nodemask);
8064 +               prev = sg;
8065 +
8066 +               for (j = 0; j < nr_node_ids; j++) {
8067 +                       SCHED_CPUMASK_VAR(notcovered, allmasks);
8068 +                       int n = (i + j) % nr_node_ids;
8069 +                       node_to_cpumask_ptr(pnodemask, n);
8070 +
8071 +                       cpus_complement(*notcovered, *covered);
8072 +                       cpus_and(*tmpmask, *notcovered, *cpu_map);
8073 +                       cpus_and(*tmpmask, *tmpmask, *domainspan);
8074 +                       if (cpus_empty(*tmpmask))
8075 +                               break;
8076 +
8077 +                       cpus_and(*tmpmask, *tmpmask, *pnodemask);
8078 +                       if (cpus_empty(*tmpmask))
8079 +                               continue;
8080 +
8081 +                       sg = kmalloc_node(sizeof(struct sched_group),
8082 +                                         GFP_KERNEL, i);
8083 +                       if (!sg) {
8084 +                               printk(KERN_WARNING
8085 +                               "Can not alloc domain group for node %d\n", j);
8086 +                               goto error;
8087 +                       }
8088 +                       sg->__cpu_power = 0;
8089 +                       sg->cpumask = *tmpmask;
8090 +                       sg->next = prev->next;
8091 +                       cpus_or(*covered, *covered, *tmpmask);
8092 +                       prev->next = sg;
8093 +                       prev = sg;
8094 +               }
8095 +       }
8096 +#endif
8097 +
8098 +       /* Calculate CPU power for physical packages and nodes */
8099 +#ifdef CONFIG_SCHED_SMT
8100 +       for_each_cpu_mask_nr(i, *cpu_map) {
8101 +               struct sched_domain *sd = &per_cpu(cpu_domains, i);
8102 +
8103 +               init_sched_groups_power(i, sd);
8104 +       }
8105 +#endif
8106 +#ifdef CONFIG_SCHED_MC
8107 +       for_each_cpu_mask_nr(i, *cpu_map) {
8108 +               struct sched_domain *sd = &per_cpu(core_domains, i);
8109 +
8110 +               init_sched_groups_power(i, sd);
8111 +       }
8112 +#endif
8113 +
8114 +       for_each_cpu_mask_nr(i, *cpu_map) {
8115 +               struct sched_domain *sd = &per_cpu(phys_domains, i);
8116 +
8117 +               init_sched_groups_power(i, sd);
8118 +       }
8119 +
8120 +#ifdef CONFIG_NUMA
8121 +       for (i = 0; i < nr_node_ids; i++)
8122 +               init_numa_sched_groups_power(sched_group_nodes[i]);
8123 +
8124 +       if (sd_allnodes) {
8125 +               struct sched_group *sg;
8126 +
8127 +               cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg,
8128 +                                                               tmpmask);
8129 +               init_numa_sched_groups_power(sg);
8130 +       }
8131 +#endif
8132 +
8133 +       /* Attach the domains */
8134 +       for_each_cpu_mask_nr(i, *cpu_map) {
8135 +               struct sched_domain *sd;
8136 +#ifdef CONFIG_SCHED_SMT
8137 +               sd = &per_cpu(cpu_domains, i);
8138 +#elif defined(CONFIG_SCHED_MC)
8139 +               sd = &per_cpu(core_domains, i);
8140 +#else
8141 +               sd = &per_cpu(phys_domains, i);
8142 +#endif
8143 +               cpu_attach_domain(sd, rd, i);
8144 +       }
8145 +
8146 +       SCHED_CPUMASK_FREE((void *)allmasks);
8147 +       return 0;
8148 +
8149 +#ifdef CONFIG_NUMA
8150 +error:
8151 +       free_sched_groups(cpu_map, tmpmask);
8152 +       SCHED_CPUMASK_FREE((void *)allmasks);
8153 +       return -ENOMEM;
8154 +#endif
8155 +}
8156 +
8157 +static int build_sched_domains(const cpumask_t *cpu_map)
8158 +{
8159 +       return __build_sched_domains(cpu_map, NULL);
8160 +}
8161 +
8162 +static cpumask_t *doms_cur;    /* current sched domains */
8163 +static int ndoms_cur;          /* number of sched domains in 'doms_cur' */
8164 +static struct sched_domain_attr *dattr_cur;
8165 +                               /* attributes of custom domains in 'doms_cur' */
8166 +
8167 +/*
8168 + * Special case: If a kmalloc of a doms_cur partition (array of
8169 + * cpumask_t) fails, then fall back to a single sched domain,
8170 + * as determined by the single cpumask_t fallback_doms.
8171 + */
8172 +static cpumask_t fallback_doms;
8173 +
8174 +void __attribute__((weak)) arch_update_cpu_topology(void)
8175 +{
8176 +}
8177 +
8178 +/*
8179 + * Set up scheduler domains and groups. Callers must hold the hotplug lock.
8180 + * For now this just excludes isolated cpus, but could be used to
8181 + * exclude other special cases in the future.
8182 + */
8183 +static int arch_init_sched_domains(const cpumask_t *cpu_map)
8184 +{
8185 +       int err;
8186 +
8187 +       arch_update_cpu_topology();
8188 +       ndoms_cur = 1;
8189 +       doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
8190 +       if (!doms_cur)
8191 +               doms_cur = &fallback_doms;
8192 +       cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
8193 +       dattr_cur = NULL;
8194 +       err = build_sched_domains(doms_cur);
8195 +       register_sched_domain_sysctl();
8196 +
8197 +       return err;
8198 +}
8199 +
8200 +static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
8201 +                                      cpumask_t *tmpmask)
8202 +{
8203 +       free_sched_groups(cpu_map, tmpmask);
8204 +}
8205 +
8206 +/*
8207 + * Detach sched domains from a group of cpus specified in cpu_map.
8208 + * These cpus will now be attached to the NULL domain.
8209 + */
8210 +static void detach_destroy_domains(const cpumask_t *cpu_map)
8211 +{
8212 +       cpumask_t tmpmask;
8213 +       int i;
8214 +
8215 +       unregister_sched_domain_sysctl();
8216 +
8217 +       for_each_cpu_mask_nr(i, *cpu_map)
8218 +               cpu_attach_domain(NULL, &def_root_domain, i);
8219 +       synchronize_sched();
8220 +       arch_destroy_sched_domains(cpu_map, &tmpmask);
8221 +}
8222 +
8223 +/* handle null as "default" */
8224 +static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
8225 +                       struct sched_domain_attr *new, int idx_new)
8226 +{
8227 +       struct sched_domain_attr tmp;
8228 +
8229 +       /* fast path */
8230 +       if (!new && !cur)
8231 +               return 1;
8232 +
8233 +       tmp = SD_ATTR_INIT;
8234 +       return !memcmp(cur ? (cur + idx_cur) : &tmp,
8235 +                       new ? (new + idx_new) : &tmp,
8236 +                       sizeof(struct sched_domain_attr));
8237 +}
8238 +
8239 +/*
8240 + * Partition sched domains as specified by the 'ndoms_new'
8241 + * cpumasks in the array doms_new[] of cpumasks. This compares
8242 + * doms_new[] to the current sched domain partitioning, doms_cur[].
8243 + * It destroys each deleted domain and builds each new domain.
8244 + *
8245 + * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
8246 + * The masks don't intersect (don't overlap); we should set up one
8247 + * sched domain for each mask. CPUs not in any of the cpumasks will
8248 + * not be load balanced. If the same cpumask appears both in the
8249 + * current 'doms_cur' domains and in the new 'doms_new', we can leave
8250 + * it as it is.
8251 + *
8252 + * The passed-in 'doms_new' should be kmalloc'd. This routine takes
8253 + * ownership of it and will kfree it when done with it. If the caller
8254 + * failed the kmalloc call, then it can pass in doms_new == NULL &&
8255 + * ndoms_new == 1, and partition_sched_domains() will fall back to
8256 + * the single partition 'fallback_doms'; this also forces the domains
8257 + * to be rebuilt.
8258 + *
8259 + * If doms_new == NULL it will be replaced with cpu_online_map.
8260 + * ndoms_new == 0 is a special case for destroying existing domains,
8261 + * and it will not create the default domain.
8262 + *
8263 + * Call with hotplug lock held
8264 + */
8265 +void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
8266 +                            struct sched_domain_attr *dattr_new)
8267 +{
8268 +       int i, j, n;
8269 +
8270 +       mutex_lock(&sched_domains_mutex);
8271 +
8272 +       /* always unregister in case we don't destroy any domains */
8273 +       unregister_sched_domain_sysctl();
8274 +
8275 +       n = doms_new ? ndoms_new : 0;
8276 +
8277 +       /* Destroy deleted domains */
8278 +       for (i = 0; i < ndoms_cur; i++) {
8279 +               for (j = 0; j < n; j++) {
8280 +                       if (cpus_equal(doms_cur[i], doms_new[j])
8281 +                           && dattrs_equal(dattr_cur, i, dattr_new, j))
8282 +                               goto match1;
8283 +               }
8284 +               /* no match - a current sched domain not in new doms_new[] */
8285 +               detach_destroy_domains(doms_cur + i);
8286 +match1:
8287 +               ;
8288 +       }
8289 +
8290 +       if (doms_new == NULL) {
8291 +               ndoms_cur = 0;
8292 +               doms_new = &fallback_doms;
8293 +               cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
8294 +               dattr_new = NULL;
8295 +       }
8296 +
8297 +       /* Build new domains */
8298 +       for (i = 0; i < ndoms_new; i++) {
8299 +               for (j = 0; j < ndoms_cur; j++) {
8300 +                       if (cpus_equal(doms_new[i], doms_cur[j])
8301 +                           && dattrs_equal(dattr_new, i, dattr_cur, j))
8302 +                               goto match2;
8303 +               }
8304 +               /* no match - add a new doms_new */
8305 +               __build_sched_domains(doms_new + i,
8306 +                                       dattr_new ? dattr_new + i : NULL);
8307 +match2:
8308 +               ;
8309 +       }
8310 +
8311 +       /* Remember the new sched domains */
8312 +       if (doms_cur != &fallback_doms)
8313 +               kfree(doms_cur);
8314 +       kfree(dattr_cur);       /* kfree(NULL) is safe */
8315 +       doms_cur = doms_new;
8316 +       dattr_cur = dattr_new;
8317 +       ndoms_cur = ndoms_new;
8318 +
8319 +       register_sched_domain_sysctl();
8320 +
8321 +       mutex_unlock(&sched_domains_mutex);
8322 +}
8323 +
8324 +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
8325 +int arch_reinit_sched_domains(void)
8326 +{
8327 +       get_online_cpus();
8328 +
8329 +       /* Destroy domains first to force the rebuild */
8330 +       partition_sched_domains(0, NULL, NULL);
8331 +
8332 +       rebuild_sched_domains();
8333 +       put_online_cpus();
8334 +
8335 +       return 0;
8336 +}
8337 +
8338 +static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
8339 +{
8340 +       int ret;
8341 +
8342 +       if (buf[0] != '0' && buf[0] != '1')
8343 +               return -EINVAL;
8344 +
8345 +       if (smt)
8346 +               sched_smt_power_savings = (buf[0] == '1');
8347 +       else
8348 +               sched_mc_power_savings = (buf[0] == '1');
8349 +
8350 +       ret = arch_reinit_sched_domains();
8351 +
8352 +       return ret ? ret : count;
8353 +}
8354 +
8355 +#ifdef CONFIG_SCHED_MC
8356 +static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
8357 +                                          char *page)
8358 +{
8359 +       return sprintf(page, "%u\n", sched_mc_power_savings);
8360 +}
8361 +static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
8362 +                                           const char *buf, size_t count)
8363 +{
8364 +       return sched_power_savings_store(buf, count, 0);
8365 +}
8366 +static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
8367 +                        sched_mc_power_savings_show,
8368 +                        sched_mc_power_savings_store);
8369 +#endif
8370 +
8371 +#ifdef CONFIG_SCHED_SMT
8372 +static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
8373 +                                           char *page)
8374 +{
8375 +       return sprintf(page, "%u\n", sched_smt_power_savings);
8376 +}
8377 +static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
8378 +                                            const char *buf, size_t count)
8379 +{
8380 +       return sched_power_savings_store(buf, count, 1);
8381 +}
8382 +static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
8383 +                  sched_smt_power_savings_show,
8384 +                  sched_smt_power_savings_store);
8385 +#endif
8386 +
8387 +int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
8388 +{
8389 +       int err = 0;
8390 +
8391 +#ifdef CONFIG_SCHED_SMT
8392 +       if (smt_capable())
8393 +               err = sysfs_create_file(&cls->kset.kobj,
8394 +                                       &attr_sched_smt_power_savings.attr);
8395 +#endif
8396 +#ifdef CONFIG_SCHED_MC
8397 +       if (!err && mc_capable())
8398 +               err = sysfs_create_file(&cls->kset.kobj,
8399 +                                       &attr_sched_mc_power_savings.attr);
8400 +#endif
8401 +       return err;
8402 +}
8403 +#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
8404 +
8405 +#ifndef CONFIG_CPUSETS
8406 +/*
8407 + * Add online and remove offline CPUs from the scheduler domains.
8408 + * When cpusets are enabled they take over this function.
8409 + */
8410 +static int update_sched_domains(struct notifier_block *nfb,
8411 +                               unsigned long action, void *hcpu)
8412 +{
8413 +       switch (action) {
8414 +       case CPU_ONLINE:
8415 +       case CPU_ONLINE_FROZEN:
8416 +       case CPU_DEAD:
8417 +       case CPU_DEAD_FROZEN:
8418 +               partition_sched_domains(1, NULL, NULL);
8419 +               return NOTIFY_OK;
8420 +
8421 +       default:
8422 +               return NOTIFY_DONE;
8423 +       }
8424 +}
8425 +#endif
8426 +
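+/*
+ * CPU hotplug callback: disable the runqueue's RT runtime before a CPU
+ * goes down, and re-enable it when the CPU comes (back) online or the
+ * offline attempt fails.
+ */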
8427 +static int update_runtime(struct notifier_block *nfb,
8428 +                               unsigned long action, void *hcpu)
8429 +{
8430 +       int cpu = (int)(long)hcpu;
8431 +
8432 +       switch (action) {
8433 +       case CPU_DOWN_PREPARE:
8434 +       case CPU_DOWN_PREPARE_FROZEN:
8435 +               disable_runtime(cpu_rq(cpu));
8436 +               return NOTIFY_OK;
8437 +
8438 +       case CPU_DOWN_FAILED:
8439 +       case CPU_DOWN_FAILED_FROZEN:
8440 +       case CPU_ONLINE:
8441 +       case CPU_ONLINE_FROZEN:
8442 +               enable_runtime(cpu_rq(cpu));
8443 +               return NOTIFY_OK;
8444 +
8445 +       default:
8446 +               return NOTIFY_DONE;
8447 +       }
8448 +}
8449 +
8450 +void __init sched_init_smp(void)
8451 +{
8452 +       cpumask_t non_isolated_cpus;
8453 +
8454 +#if defined(CONFIG_NUMA)
8455 +       sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
8456 +                                                               GFP_KERNEL);
8457 +       BUG_ON(sched_group_nodes_bycpu == NULL);
8458 +#endif
8459 +       get_online_cpus();
8460 +       mutex_lock(&sched_domains_mutex);
8461 +       arch_init_sched_domains(&cpu_online_map);
8462 +       cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
8463 +       if (cpus_empty(non_isolated_cpus))
8464 +               cpu_set(smp_processor_id(), non_isolated_cpus);
8465 +       mutex_unlock(&sched_domains_mutex);
8466 +       put_online_cpus();
8467 +
8468 +#ifndef CONFIG_CPUSETS
8469 +       /* XXX: Theoretical race here - CPU may be hotplugged now */
8470 +       hotcpu_notifier(update_sched_domains, 0);
8471 +#endif
8472 +
8473 +       /* RT runtime code needs to handle some hotplug events */
8474 +       hotcpu_notifier(update_runtime, 0);
8475 +
8476 +       init_hrtick();
8477 +
8478 +       /* Move init over to a non-isolated CPU */
8479 +       if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
8480 +               BUG();
8481 +       sched_init_granularity();
8482 +}
8483 +#else
8484 +void __init sched_init_smp(void)
8485 +{
8486 +       sched_init_granularity();
8487 +}
8488 +#endif /* CONFIG_SMP */
8489 +
8490 +int in_sched_functions(unsigned long addr)
8491 +{
8492 +       return in_lock_functions(addr) ||
8493 +               (addr >= (unsigned long)__sched_text_start
8494 +               && addr < (unsigned long)__sched_text_end);
8495 +}
8496 +
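+/* Initialize a CFS runqueue: empty task timeline (rbtree), empty task list,
+ * and the initial min_vruntime. */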
8497 +static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
8498 +{
8499 +       cfs_rq->tasks_timeline = RB_ROOT;
8500 +       INIT_LIST_HEAD(&cfs_rq->tasks);
8501 +#ifdef CONFIG_FAIR_GROUP_SCHED
8502 +       cfs_rq->rq = rq;
8503 +#endif
8504 +       cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8505 +}
8506 +
8507 +static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8508 +{
8509 +       struct rt_prio_array *array;
8510 +       int i;
8511 +
8512 +       array = &rt_rq->active;
8513 +       for (i = 0; i < MAX_RT_PRIO; i++) {
8514 +               INIT_LIST_HEAD(array->queue + i);
8515 +               __clear_bit(i, array->bitmap);
8516 +       }
8517 +       /* delimiter for bitsearch: */
8518 +       __set_bit(MAX_RT_PRIO, array->bitmap);
8519 +
8520 +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
8521 +       rt_rq->highest_prio = MAX_RT_PRIO;
8522 +#endif
8523 +#ifdef CONFIG_SMP
8524 +       rt_rq->rt_nr_migratory = 0;
8525 +       rt_rq->overloaded = 0;
8526 +#endif
8527 +
8528 +       rt_rq->rt_time = 0;
8529 +       rt_rq->rt_throttled = 0;
8530 +       rt_rq->rt_runtime = 0;
8531 +       spin_lock_init(&rt_rq->rt_runtime_lock);
8532 +
8533 +#ifdef CONFIG_RT_GROUP_SCHED
8534 +       rt_rq->rt_nr_boosted = 0;
8535 +       rt_rq->rq = rq;
8536 +#endif
8537 +}
8538 +
8539 +#ifdef CONFIG_FAIR_GROUP_SCHED
8540 +static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8541 +                               struct sched_entity *se, int cpu, int add,
8542 +                               struct sched_entity *parent)
8543 +{
8544 +       struct rq *rq = cpu_rq(cpu);
8545 +       tg->cfs_rq[cpu] = cfs_rq;
8546 +       init_cfs_rq(cfs_rq, rq);
8547 +       cfs_rq->tg = tg;
8548 +       if (add)
8549 +               list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
8550 +
8551 +       tg->se[cpu] = se;
8552 +       /* se could be NULL for init_task_group */
8553 +       if (!se)
8554 +               return;
8555 +
8556 +       if (!parent)
8557 +               se->cfs_rq = &rq->cfs;
8558 +       else
8559 +               se->cfs_rq = parent->my_q;
8560 +
8561 +       se->my_q = cfs_rq;
8562 +       se->load.weight = tg->shares;
8563 +       se->load.inv_weight = 0;
8564 +       se->parent = parent;
8565 +}
8566 +#endif
8567 +
8568 +#ifdef CONFIG_RT_GROUP_SCHED
8569 +static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8570 +               struct sched_rt_entity *rt_se, int cpu, int add,
8571 +               struct sched_rt_entity *parent)
8572 +{
8573 +       struct rq *rq = cpu_rq(cpu);
8574 +
8575 +       tg->rt_rq[cpu] = rt_rq;
8576 +       init_rt_rq(rt_rq, rq);
8577 +       rt_rq->tg = tg;
8578 +       rt_rq->rt_se = rt_se;
8579 +       rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8580 +       if (add)
8581 +               list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
8582 +
8583 +       tg->rt_se[cpu] = rt_se;
8584 +       if (!rt_se)
8585 +               return;
8586 +
8587 +       if (!parent)
8588 +               rt_se->rt_rq = &rq->rt;
8589 +       else
8590 +               rt_se->rt_rq = parent->my_q;
8591 +
8592 +       rt_se->my_q = rt_rq;
8593 +       rt_se->parent = parent;
8594 +       INIT_LIST_HEAD(&rt_se->run_list);
8595 +}
8596 +#endif
8597 +
8598 +void __init sched_init(void)
8599 +{
8600 +       int i, j;
8601 +       unsigned long alloc_size = 0, ptr;
8602 +
8603 +#ifdef CONFIG_FAIR_GROUP_SCHED
8604 +       alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8605 +#endif
8606 +#ifdef CONFIG_RT_GROUP_SCHED
8607 +       alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8608 +#endif
8609 +#ifdef CONFIG_USER_SCHED
8610 +       alloc_size *= 2;
8611 +#endif
8612 +       /*
8613 +        * As sched_init() is called before page_alloc is set up,
8614 +        * we use alloc_bootmem().
8615 +        */
8616 +       if (alloc_size) {
8617 +               ptr = (unsigned long)alloc_bootmem(alloc_size);
8618 +
8619 +#ifdef CONFIG_FAIR_GROUP_SCHED
8620 +               init_task_group.se = (struct sched_entity **)ptr;
8621 +               ptr += nr_cpu_ids * sizeof(void **);
8622 +
8623 +               init_task_group.cfs_rq = (struct cfs_rq **)ptr;
8624 +               ptr += nr_cpu_ids * sizeof(void **);
8625 +
8626 +#ifdef CONFIG_USER_SCHED
8627 +               root_task_group.se = (struct sched_entity **)ptr;
8628 +               ptr += nr_cpu_ids * sizeof(void **);
8629 +
8630 +               root_task_group.cfs_rq = (struct cfs_rq **)ptr;
8631 +               ptr += nr_cpu_ids * sizeof(void **);
8632 +#endif /* CONFIG_USER_SCHED */
8633 +#endif /* CONFIG_FAIR_GROUP_SCHED */
8634 +#ifdef CONFIG_RT_GROUP_SCHED
8635 +               init_task_group.rt_se = (struct sched_rt_entity **)ptr;
8636 +               ptr += nr_cpu_ids * sizeof(void **);
8637 +
8638 +               init_task_group.rt_rq = (struct rt_rq **)ptr;
8639 +               ptr += nr_cpu_ids * sizeof(void **);
8640 +
8641 +#ifdef CONFIG_USER_SCHED
8642 +               root_task_group.rt_se = (struct sched_rt_entity **)ptr;
8643 +               ptr += nr_cpu_ids * sizeof(void **);
8644 +
8645 +               root_task_group.rt_rq = (struct rt_rq **)ptr;
8646 +               ptr += nr_cpu_ids * sizeof(void **);
8647 +#endif /* CONFIG_USER_SCHED */
8648 +#endif /* CONFIG_RT_GROUP_SCHED */
8649 +       }
8650 +
8651 +#ifdef CONFIG_SMP
8652 +       init_defrootdomain();
8653 +#endif
8654 +
8655 +       init_rt_bandwidth(&def_rt_bandwidth,
8656 +                       global_rt_period(), global_rt_runtime());
8657 +
8658 +#ifdef CONFIG_RT_GROUP_SCHED
8659 +       init_rt_bandwidth(&init_task_group.rt_bandwidth,
8660 +                       global_rt_period(), global_rt_runtime());
8661 +#ifdef CONFIG_USER_SCHED
8662 +       init_rt_bandwidth(&root_task_group.rt_bandwidth,
8663 +                       global_rt_period(), RUNTIME_INF);
8664 +#endif /* CONFIG_USER_SCHED */
8665 +#endif /* CONFIG_RT_GROUP_SCHED */
8666 +
8667 +#ifdef CONFIG_GROUP_SCHED
8668 +       list_add(&init_task_group.list, &task_groups);
8669 +       INIT_LIST_HEAD(&init_task_group.children);
8670 +
8671 +#ifdef CONFIG_USER_SCHED
8672 +       INIT_LIST_HEAD(&root_task_group.children);
8673 +       init_task_group.parent = &root_task_group;
8674 +       list_add(&init_task_group.siblings, &root_task_group.children);
8675 +#endif /* CONFIG_USER_SCHED */
8676 +#endif /* CONFIG_GROUP_SCHED */
8677 +
8678 +       for_each_possible_cpu(i) {
8679 +               struct rq *rq;
8680 +
8681 +               rq = cpu_rq(i);
8682 +               spin_lock_init(&rq->lock);
8683 +               rq->nr_running = 0;
8684 +               init_cfs_rq(&rq->cfs, rq);
8685 +               init_rt_rq(&rq->rt, rq);
8686 +#ifdef CONFIG_FAIR_GROUP_SCHED
8687 +               init_task_group.shares = init_task_group_load;
8688 +               INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8689 +#ifdef CONFIG_CGROUP_SCHED
8690 +               /*
8691 +                * How much cpu bandwidth does init_task_group get?
8692 +                *
8693 +                * In case of task-groups formed through the cgroup filesystem, it
8694 +                * gets 100% of the cpu resources in the system. This overall
8695 +                * system cpu resource is divided among the tasks of
8696 +                * init_task_group and its child task-groups in a fair manner,
8697 +                * based on each entity's (task or task-group's) weight
8698 +                * (se->load.weight).
8699 +                *
8700 +                * In other words, if init_task_group has 10 tasks (each of weight
8701 +                * 1024) and two child groups A0 and A1 (of weight 1024 each),
8702 +                * then A0's share of the cpu resource is:
8703 +                *
8704 +                *      A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
8705 +                *
8706 +                * We achieve this by letting init_task_group's tasks sit
8707 +                * directly in rq->cfs (i.e init_task_group->se[] = NULL).
8708 +                */
8709 +               init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
8710 +#elif defined CONFIG_USER_SCHED
8711 +               root_task_group.shares = NICE_0_LOAD;
8712 +               init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
8713 +               /*
8714 +                * In case of task-groups formed through the user id of tasks,
8715 +                * init_task_group represents tasks belonging to root user.
8716 +                * Hence it forms a sibling of all subsequent groups formed.
8717 +                * In this case, init_task_group gets only a fraction of overall
8718 +                * system cpu resource, based on the weight assigned to root
8719 +                * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
8720 +                * by letting tasks of init_task_group sit in a separate cfs_rq
8721 +                * (init_cfs_rq) and having one entity represent this group of
8722 +                * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
8723 +                */
8724 +               init_tg_cfs_entry(&init_task_group,
8725 +                               &per_cpu(init_cfs_rq, i),
8726 +                               &per_cpu(init_sched_entity, i), i, 1,
8727 +                               root_task_group.se[i]);
8728 +
8729 +#endif
8730 +#endif /* CONFIG_FAIR_GROUP_SCHED */
8731 +
8732 +               rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
8733 +#ifdef CONFIG_RT_GROUP_SCHED
8734 +               INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
8735 +#ifdef CONFIG_CGROUP_SCHED
8736 +               init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
8737 +#elif defined CONFIG_USER_SCHED
8738 +               init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
8739 +               init_tg_rt_entry(&init_task_group,
8740 +                               &per_cpu(init_rt_rq, i),
8741 +                               &per_cpu(init_sched_rt_entity, i), i, 1,
8742 +                               root_task_group.rt_se[i]);
8743 +#endif
8744 +#endif
8745 +
8746 +               for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
8747 +                       rq->cpu_load[j] = 0;
8748 +#ifdef CONFIG_SMP
8749 +               rq->sd = NULL;
8750 +               rq->rd = NULL;
8751 +               rq->active_balance = 0;
8752 +               rq->next_balance = jiffies;
8753 +               rq->push_cpu = 0;
8754 +               rq->cpu = i;
8755 +               rq->online = 0;
8756 +               rq->migration_thread = NULL;
8757 +               INIT_LIST_HEAD(&rq->migration_queue);
8758 +               rq_attach_root(rq, &def_root_domain);
8759 +#endif
8760 +               init_rq_hrtick(rq);
8761 +               atomic_set(&rq->nr_iowait, 0);
8762 +       }
8763 +
8764 +       set_load_weight(&init_task);
8765 +
8766 +#ifdef CONFIG_PREEMPT_NOTIFIERS
8767 +       INIT_HLIST_HEAD(&init_task.preempt_notifiers);
8768 +#endif
8769 +
8770 +#ifdef CONFIG_SMP
8771 +       open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8772 +#endif
8773 +
8774 +#ifdef CONFIG_RT_MUTEXES
8775 +       plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
8776 +#endif
8777 +
8778 +       /*
8779 +        * The boot idle thread does lazy MMU switching as well:
8780 +        */
8781 +       atomic_inc(&init_mm.mm_count);
8782 +       enter_lazy_tlb(&init_mm, current);
8783 +
8784 +       /*
8785 +        * Make us the idle thread. Technically, schedule() should not be
8786 +        * called from this thread; however, somewhere below it might be,
8787 +        * but because we are the idle thread, we just pick up running again
8788 +        * when this runqueue becomes "idle".
8789 +        */
8790 +       init_idle(current, smp_processor_id());
8791 +       /*
8792 +        * During early bootup we pretend to be a normal task:
8793 +        */
8794 +       current->sched_class = &fair_sched_class;
8795 +
8796 +       scheduler_running = 1;
8797 +}
8798 +
8799 +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
8800 +void __might_sleep(char *file, int line)
8801 +{
8802 +#ifdef in_atomic
8803 +       static unsigned long prev_jiffy;        /* ratelimiting */
8804 +
8805 +       if ((in_atomic() || irqs_disabled()) &&
8806 +           system_state == SYSTEM_RUNNING && !oops_in_progress) {
8807 +               if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8808 +                       return;
8809 +               prev_jiffy = jiffies;
8810 +               printk(KERN_ERR "BUG: sleeping function called from invalid"
8811 +                               " context at %s:%d\n", file, line);
8812 +               printk("in_atomic():%d, irqs_disabled():%d\n",
8813 +                       in_atomic(), irqs_disabled());
8814 +               debug_show_held_locks(current);
8815 +               if (irqs_disabled())
8816 +                       print_irqtrace_events(current);
8817 +               dump_stack();
8818 +       }
8819 +#endif
8820 +}
8821 +EXPORT_SYMBOL(__might_sleep);
8822 +#endif
8823 +
8824 +#ifdef CONFIG_MAGIC_SYSRQ
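+/*
+ * Drop a task back to SCHED_NORMAL (static priority 0); if it was on a
+ * runqueue, requeue it and reschedule the currently running task.
+ */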
8825 +static void normalize_task(struct rq *rq, struct task_struct *p)
8826 +{
8827 +       int on_rq;
8828 +
8829 +       update_rq_clock(rq);
8830 +       on_rq = p->se.on_rq;
8831 +       if (on_rq)
8832 +               deactivate_task(rq, p, 0);
8833 +       __setscheduler(rq, p, SCHED_NORMAL, 0);
8834 +       if (on_rq) {
8835 +               activate_task(rq, p, 0);
8836 +               resched_task(rq->curr);
8837 +       }
8838 +}
8839 +
8840 +void normalize_rt_tasks(void)
8841 +{
8842 +       struct task_struct *g, *p;
8843 +       unsigned long flags;
8844 +       struct rq *rq;
8845 +
8846 +       read_lock_irqsave(&tasklist_lock, flags);
8847 +       do_each_thread(g, p) {
8848 +               /*
8849 +                * Only normalize user tasks:
8850 +                */
8851 +               if (!p->mm)
8852 +                       continue;
8853 +
8854 +               p->se.exec_start                = 0;
8855 +#ifdef CONFIG_SCHEDSTATS
8856 +               p->se.wait_start                = 0;
8857 +               p->se.sleep_start               = 0;
8858 +               p->se.block_start               = 0;
8859 +#endif
8860 +
8861 +               if (!rt_task(p)) {
8862 +                       /*
8863 +                        * Renice negative nice level userspace
8864 +                        * tasks back to 0:
8865 +                        */
8866 +                       if (TASK_NICE(p) < 0 && p->mm)
8867 +                               set_user_nice(p, 0);
8868 +                       continue;
8869 +               }
8870 +
8871 +               spin_lock(&p->pi_lock);
8872 +               rq = __task_rq_lock(p);
8873 +
8874 +               normalize_task(rq, p);
8875 +
8876 +               __task_rq_unlock(rq);
8877 +               spin_unlock(&p->pi_lock);
8878 +       } while_each_thread(g, p);
8879 +
8880 +       read_unlock_irqrestore(&tasklist_lock, flags);
8881 +}
8882 +
8883 +#endif /* CONFIG_MAGIC_SYSRQ */
8884 +
8885 +#ifdef CONFIG_IA64
8886 +/*
8887 + * These functions are only useful for the IA64 MCA handling.
8888 + *
8889 + * They can only be called when the whole system has been
8890 + * stopped - every CPU needs to be quiescent, and no scheduling
8891 + * activity can take place. Using them for anything else would
8892 + * be a serious bug, and as a result, they aren't even visible
8893 + * under any other configuration.
8894 + */
8895 +
8896 +/**
8897 + * curr_task - return the current task for a given cpu.
8898 + * @cpu: the processor in question.
8899 + *
8900 + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
8901 + */
8902 +struct task_struct *curr_task(int cpu)
8903 +{
8904 +       return cpu_curr(cpu);
8905 +}
8906 +
8907 +/**
8908 + * set_curr_task - set the current task for a given cpu.
8909 + * @cpu: the processor in question.
8910 + * @p: the task pointer to set.
8911 + *
8912 + * Description: This function must only be used when non-maskable interrupts
8913 + * are serviced on a separate stack. It allows the architecture to switch the
8914 + * notion of the current task on a cpu in a non-blocking manner. This function
8915 + * must be called with all CPUs synchronized and interrupts disabled; the
8916 + * caller must save the original value of the current task (see
8917 + * curr_task() above) and restore that value before reenabling interrupts and
8918 + * re-starting the system.
8919 + *
8920 + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
8921 + */
8922 +void set_curr_task(int cpu, struct task_struct *p)
8923 +{
8924 +       cpu_curr(cpu) = p;
8925 +}
8926 +
8927 +#endif
8928 +
8929 +#ifdef CONFIG_FAIR_GROUP_SCHED
8930 +static void free_fair_sched_group(struct task_group *tg)
8931 +{
8932 +       int i;
8933 +
8934 +       for_each_possible_cpu(i) {
8935 +               if (tg->cfs_rq)
8936 +                       kfree(tg->cfs_rq[i]);
8937 +               if (tg->se)
8938 +                       kfree(tg->se[i]);
8939 +       }
8940 +
8941 +       kfree(tg->cfs_rq);
8942 +       kfree(tg->se);
8943 +}
8944 +
8945 +static
8946 +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8947 +{
8948 +       struct cfs_rq *cfs_rq;
8949 +       struct sched_entity *se, *parent_se;
8950 +       struct rq *rq;
8951 +       int i;
8952 +
8953 +       tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8954 +       if (!tg->cfs_rq)
8955 +               goto err;
8956 +       tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8957 +       if (!tg->se)
8958 +               goto err;
8959 +
8960 +       tg->shares = NICE_0_LOAD;
8961 +
8962 +       for_each_possible_cpu(i) {
8963 +               rq = cpu_rq(i);
8964 +
8965 +               cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
8966 +                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
8967 +               if (!cfs_rq)
8968 +                       goto err;
8969 +
8970 +               se = kmalloc_node(sizeof(struct sched_entity),
8971 +                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
8972 +               if (!se)
8973 +                       goto err;
8974 +
8975 +               parent_se = parent ? parent->se[i] : NULL;
8976 +               init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
8977 +       }
8978 +
8979 +       return 1;
8980 +
8981 + err:
8982 +       return 0;
8983 +}
8984 +
8985 +static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8986 +{
8987 +       list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
8988 +                       &cpu_rq(cpu)->leaf_cfs_rq_list);
8989 +}
8990 +
8991 +static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8992 +{
8993 +       list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
8994 +}
8995 +#else /* !CONFIG_FAIR_GROUP_SCHED */
8996 +static inline void free_fair_sched_group(struct task_group *tg)
8997 +{
8998 +}
8999 +
9000 +static inline
9001 +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9002 +{
9003 +       return 1;
9004 +}
9005 +
9006 +static inline void register_fair_sched_group(struct task_group *tg, int cpu)
9007 +{
9008 +}
9009 +
9010 +static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
9011 +{
9012 +}
9013 +#endif /* CONFIG_FAIR_GROUP_SCHED */
9014 +
9015 +#ifdef CONFIG_RT_GROUP_SCHED
9016 +static void free_rt_sched_group(struct task_group *tg)
9017 +{
9018 +       int i;
9019 +
9020 +       destroy_rt_bandwidth(&tg->rt_bandwidth);
9021 +
9022 +       for_each_possible_cpu(i) {
9023 +               if (tg->rt_rq)
9024 +                       kfree(tg->rt_rq[i]);
9025 +               if (tg->rt_se)
9026 +                       kfree(tg->rt_se[i]);
9027 +       }
9028 +
9029 +       kfree(tg->rt_rq);
9030 +       kfree(tg->rt_se);
9031 +}
9032 +
9033 +static
9034 +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
9035 +{
9036 +       struct rt_rq *rt_rq;
9037 +       struct sched_rt_entity *rt_se, *parent_se;
9038 +       struct rq *rq;
9039 +       int i;
9040 +
9041 +       tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
9042 +       if (!tg->rt_rq)
9043 +               goto err;
9044 +       tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
9045 +       if (!tg->rt_se)
9046 +               goto err;
9047 +
9048 +       init_rt_bandwidth(&tg->rt_bandwidth,
9049 +                       ktime_to_ns(def_rt_bandwidth.rt_period), 0);
9050 +
9051 +       for_each_possible_cpu(i) {
9052 +               rq = cpu_rq(i);
9053 +
9054 +               rt_rq = kmalloc_node(sizeof(struct rt_rq),
9055 +                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
9056 +               if (!rt_rq)
9057 +                       goto err;
9058 +
9059 +               rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
9060 +                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
9061 +               if (!rt_se)
9062 +                       goto err;
9063 +
9064 +               parent_se = parent ? parent->rt_se[i] : NULL;
9065 +               init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
9066 +       }
9067 +
9068 +       return 1;
9069 +
9070 + err:
9071 +       return 0;
9072 +}
9073 +
9074 +static inline void register_rt_sched_group(struct task_group *tg, int cpu)
9075 +{
9076 +       list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
9077 +                       &cpu_rq(cpu)->leaf_rt_rq_list);
9078 +}
9079 +
9080 +static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
9081 +{
9082 +       list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
9083 +}
9084 +#else /* !CONFIG_RT_GROUP_SCHED */
9085 +static inline void free_rt_sched_group(struct task_group *tg)
9086 +{
9087 +}
9088 +
9089 +static inline
9090 +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
9091 +{
9092 +       return 1;
9093 +}
9094 +
9095 +static inline void register_rt_sched_group(struct task_group *tg, int cpu)
9096 +{
9097 +}
9098 +
9099 +static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
9100 +{
9101 +}
9102 +#endif /* CONFIG_RT_GROUP_SCHED */
9103 +
9104 +#ifdef CONFIG_GROUP_SCHED
9105 +static void free_sched_group(struct task_group *tg)
9106 +{
9107 +       free_fair_sched_group(tg);
9108 +       free_rt_sched_group(tg);
9109 +       kfree(tg);
9110 +}
9111 +
9112 +/* allocate runqueue etc for a new task group */
9113 +struct task_group *sched_create_group(struct task_group *parent)
9114 +{
9115 +       struct task_group *tg;
9116 +       unsigned long flags;
9117 +       int i;
9118 +
9119 +       tg = kzalloc(sizeof(*tg), GFP_KERNEL);
9120 +       if (!tg)
9121 +               return ERR_PTR(-ENOMEM);
9122 +
9123 +       if (!alloc_fair_sched_group(tg, parent))
9124 +               goto err;
9125 +
9126 +       if (!alloc_rt_sched_group(tg, parent))
9127 +               goto err;
9128 +
9129 +       spin_lock_irqsave(&task_group_lock, flags);
9130 +       for_each_possible_cpu(i) {
9131 +               register_fair_sched_group(tg, i);
9132 +               register_rt_sched_group(tg, i);
9133 +       }
9134 +       list_add_rcu(&tg->list, &task_groups);
9135 +
9136 +       WARN_ON(!parent); /* root should already exist */
9137 +
9138 +       tg->parent = parent;
9139 +       INIT_LIST_HEAD(&tg->children);
9140 +       list_add_rcu(&tg->siblings, &parent->children);
9141 +       spin_unlock_irqrestore(&task_group_lock, flags);
9142 +
9143 +       return tg;
9144 +
9145 +err:
9146 +       free_sched_group(tg);
9147 +       return ERR_PTR(-ENOMEM);
9148 +}
9149 +
9150 +/* rcu callback to free various structures associated with a task group */
9151 +static void free_sched_group_rcu(struct rcu_head *rhp)
9152 +{
9153 +       /* now it should be safe to free those cfs_rqs */
9154 +       free_sched_group(container_of(rhp, struct task_group, rcu));
9155 +}
9156 +
9157 +/* Destroy runqueue etc associated with a task group */
9158 +void sched_destroy_group(struct task_group *tg)
9159 +{
9160 +       unsigned long flags;
9161 +       int i;
9162 +
9163 +       spin_lock_irqsave(&task_group_lock, flags);
9164 +       for_each_possible_cpu(i) {
9165 +               unregister_fair_sched_group(tg, i);
9166 +               unregister_rt_sched_group(tg, i);
9167 +       }
9168 +       list_del_rcu(&tg->list);
9169 +       list_del_rcu(&tg->siblings);
9170 +       spin_unlock_irqrestore(&task_group_lock, flags);
9171 +
9172 +       /* wait for possible concurrent references to cfs_rqs to complete */
9173 +       call_rcu(&tg->rcu, free_sched_group_rcu);
9174 +}
9175 +
9176 +/* change task's runqueue when it moves between groups.
9177 + *     The caller of this function should have put the task in its new group
9178 + *     by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
9179 + *     reflect its new group.
9180 + */
9181 +void sched_move_task(struct task_struct *tsk)
9182 +{
9183 +       int on_rq, running;
9184 +       unsigned long flags;
9185 +       struct rq *rq;
9186 +
9187 +       rq = task_rq_lock(tsk, &flags);
9188 +
9189 +       update_rq_clock(rq);
9190 +
9191 +       running = task_current(rq, tsk);
9192 +       on_rq = tsk->se.on_rq;
9193 +
9194 +       if (on_rq)
9195 +               dequeue_task(rq, tsk, 0);
9196 +       if (unlikely(running))
9197 +               tsk->sched_class->put_prev_task(rq, tsk);
9198 +
9199 +       set_task_rq(tsk, task_cpu(tsk));
9200 +
9201 +#ifdef CONFIG_FAIR_GROUP_SCHED
9202 +       if (tsk->sched_class->moved_group)
9203 +               tsk->sched_class->moved_group(tsk);
9204 +#endif
9205 +
9206 +       if (unlikely(running))
9207 +               tsk->sched_class->set_curr_task(rq);
9208 +       if (on_rq)
9209 +               enqueue_task(rq, tsk, 0);
9210 +
9211 +       task_rq_unlock(rq, &flags);
9212 +}
9213 +#endif /* CONFIG_GROUP_SCHED */
9214 +
9215 +#ifdef CONFIG_FAIR_GROUP_SCHED
9216 +static void __set_se_shares(struct sched_entity *se, unsigned long shares)
9217 +{
9218 +       struct cfs_rq *cfs_rq = se->cfs_rq;
9219 +       int on_rq;
9220 +
9221 +       on_rq = se->on_rq;
9222 +       if (on_rq)
9223 +               dequeue_entity(cfs_rq, se, 0);
9224 +
9225 +       se->load.weight = shares;
9226 +       se->load.inv_weight = 0;
9227 +
9228 +       if (on_rq)
9229 +               enqueue_entity(cfs_rq, se, 0);
9230 +}
9231 +
9232 +static void set_se_shares(struct sched_entity *se, unsigned long shares)
9233 +{
9234 +       struct cfs_rq *cfs_rq = se->cfs_rq;
9235 +       struct rq *rq = cfs_rq->rq;
9236 +       unsigned long flags;
9237 +
9238 +       spin_lock_irqsave(&rq->lock, flags);
9239 +       __set_se_shares(se, shares);
9240 +       spin_unlock_irqrestore(&rq->lock, flags);
9241 +}
9242 +
9243 +static DEFINE_MUTEX(shares_mutex);
9244 +
9245 +int sched_group_set_shares(struct task_group *tg, unsigned long shares)
9246 +{
9247 +       int i;
9248 +       unsigned long flags;
9249 +
9250 +       /*
9251 +        * We can't change the weight of the root cgroup.
9252 +        */
9253 +       if (!tg->se[0])
9254 +               return -EINVAL;
9255 +
9256 +       if (shares < MIN_SHARES)
9257 +               shares = MIN_SHARES;
9258 +       else if (shares > MAX_SHARES)
9259 +               shares = MAX_SHARES;
9260 +
9261 +       mutex_lock(&shares_mutex);
9262 +       if (tg->shares == shares)
9263 +               goto done;
9264 +
9265 +       spin_lock_irqsave(&task_group_lock, flags);
9266 +       for_each_possible_cpu(i)
9267 +               unregister_fair_sched_group(tg, i);
9268 +       list_del_rcu(&tg->siblings);
9269 +       spin_unlock_irqrestore(&task_group_lock, flags);
9270 +
9271 +       /* wait for any ongoing reference to this group to finish */
9272 +       synchronize_sched();
9273 +
9274 +       /*
9275 +        * Now we are free to modify the group's share on each cpu
9276 +        * w/o tripping rebalance_share or load_balance_fair.
9277 +        */
9278 +       tg->shares = shares;
9279 +       for_each_possible_cpu(i) {
9280 +               /*
9281 +                * force a rebalance
9282 +                */
9283 +               cfs_rq_set_shares(tg->cfs_rq[i], 0);
9284 +               set_se_shares(tg->se[i], shares);
9285 +       }
9286 +
9287 +       /*
9288 +        * Enable load balance activity on this group, by inserting it back on
9289 +        * each cpu's rq->leaf_cfs_rq_list.
9290 +        */
9291 +       spin_lock_irqsave(&task_group_lock, flags);
9292 +       for_each_possible_cpu(i)
9293 +               register_fair_sched_group(tg, i);
9294 +       list_add_rcu(&tg->siblings, &tg->parent->children);
9295 +       spin_unlock_irqrestore(&task_group_lock, flags);
9296 +done:
9297 +       mutex_unlock(&shares_mutex);
9298 +       return 0;
9299 +}
9300 +
9301 +unsigned long sched_group_shares(struct task_group *tg)
9302 +{
9303 +       return tg->shares;
9304 +}
9305 +#endif
9306 +
9307 +#ifdef CONFIG_RT_GROUP_SCHED
9308 +/*
9309 + * Ensure that the real time constraints are schedulable.
9310 + */
9311 +static DEFINE_MUTEX(rt_constraints_mutex);
9312 +
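+/*
+ * Express runtime/period as a fixed-point ratio with 16 fractional bits;
+ * RUNTIME_INF maps to a ratio of 1.0 (1 << 16).
+ */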
9313 +static unsigned long to_ratio(u64 period, u64 runtime)
9314 +{
9315 +       if (runtime == RUNTIME_INF)
9316 +               return 1ULL << 16;
9317 +
9318 +       return div64_u64(runtime << 16, period);
9319 +}
9320 +
9321 +#ifdef CONFIG_CGROUP_SCHED
9322 +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
9323 +{
9324 +       struct task_group *tgi, *parent = tg->parent;
9325 +       unsigned long total = 0;
9326 +
9327 +       if (!parent) {
9328 +               if (global_rt_period() < period)
9329 +                       return 0;
9330 +
9331 +               return to_ratio(period, runtime) <
9332 +                       to_ratio(global_rt_period(), global_rt_runtime());
9333 +       }
9334 +
9335 +       if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
9336 +               return 0;
9337 +
9338 +       rcu_read_lock();
9339 +       list_for_each_entry_rcu(tgi, &parent->children, siblings) {
9340 +               if (tgi == tg)
9341 +                       continue;
9342 +
9343 +               total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
9344 +                               tgi->rt_bandwidth.rt_runtime);
9345 +       }
9346 +       rcu_read_unlock();
9347 +
9348 +       return total + to_ratio(period, runtime) <=
9349 +               to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
9350 +                               parent->rt_bandwidth.rt_runtime);
9351 +}
9352 +#elif defined CONFIG_USER_SCHED
9353 +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
9354 +{
9355 +       struct task_group *tgi;
9356 +       unsigned long total = 0;
9357 +       unsigned long global_ratio =
9358 +               to_ratio(global_rt_period(), global_rt_runtime());
9359 +
9360 +       rcu_read_lock();
9361 +       list_for_each_entry_rcu(tgi, &task_groups, list) {
9362 +               if (tgi == tg)
9363 +                       continue;
9364 +
9365 +               total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
9366 +                               tgi->rt_bandwidth.rt_runtime);
9367 +       }
9368 +       rcu_read_unlock();
9369 +
9370 +       return total + to_ratio(period, runtime) < global_ratio;
9371 +}
9372 +#endif
9373 +
9374 +/* Must be called with tasklist_lock held */
9375 +static inline int tg_has_rt_tasks(struct task_group *tg)
9376 +{
9377 +       struct task_struct *g, *p;
9378 +       do_each_thread(g, p) {
9379 +               if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
9380 +                       return 1;
9381 +       } while_each_thread(g, p);
9382 +       return 0;
9383 +}
9384 +
9385 +static int tg_set_bandwidth(struct task_group *tg,
9386 +               u64 rt_period, u64 rt_runtime)
9387 +{
9388 +       int i, err = 0;
9389 +
9390 +       mutex_lock(&rt_constraints_mutex);
9391 +       read_lock(&tasklist_lock);
9392 +       if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
9393 +               err = -EBUSY;
9394 +               goto unlock;
9395 +       }
9396 +       if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
9397 +               err = -EINVAL;
9398 +               goto unlock;
9399 +       }
9400 +
9401 +       spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
9402 +       tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
9403 +       tg->rt_bandwidth.rt_runtime = rt_runtime;
9404 +
9405 +       for_each_possible_cpu(i) {
9406 +               struct rt_rq *rt_rq = tg->rt_rq[i];
9407 +
9408 +               spin_lock(&rt_rq->rt_runtime_lock);
9409 +               rt_rq->rt_runtime = rt_runtime;
9410 +               spin_unlock(&rt_rq->rt_runtime_lock);
9411 +       }
9412 +       spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
9413 + unlock:
9414 +       read_unlock(&tasklist_lock);
9415 +       mutex_unlock(&rt_constraints_mutex);
9416 +
9417 +       return err;
9418 +}
9419 +
9420 +int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
9421 +{
9422 +       u64 rt_runtime, rt_period;
9423 +
9424 +       rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
9425 +       rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
9426 +       if (rt_runtime_us < 0)
9427 +               rt_runtime = RUNTIME_INF;
9428 +
9429 +       return tg_set_bandwidth(tg, rt_period, rt_runtime);
9430 +}
9431 +
9432 +long sched_group_rt_runtime(struct task_group *tg)
9433 +{
9434 +       u64 rt_runtime_us;
9435 +
9436 +       if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
9437 +               return -1;
9438 +
9439 +       rt_runtime_us = tg->rt_bandwidth.rt_runtime;
9440 +       do_div(rt_runtime_us, NSEC_PER_USEC);
9441 +       return rt_runtime_us;
9442 +}
9443 +
9444 +int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
9445 +{
9446 +       u64 rt_runtime, rt_period;
9447 +
9448 +       rt_period = (u64)rt_period_us * NSEC_PER_USEC;
9449 +       rt_runtime = tg->rt_bandwidth.rt_runtime;
9450 +
9451 +       if (rt_period == 0)
9452 +               return -EINVAL;
9453 +
9454 +       return tg_set_bandwidth(tg, rt_period, rt_runtime);
9455 +}
9456 +
9457 +long sched_group_rt_period(struct task_group *tg)
9458 +{
9459 +       u64 rt_period_us;
9460 +
9461 +       rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
9462 +       do_div(rt_period_us, NSEC_PER_USEC);
9463 +       return rt_period_us;
9464 +}
9465 +
9466 +static int sched_rt_global_constraints(void)
9467 +{
9468 +       struct task_group *tg = &root_task_group;
9469 +       u64 rt_runtime, rt_period;
9470 +       int ret = 0;
9471 +
9472 +       if (sysctl_sched_rt_period <= 0)
9473 +               return -EINVAL;
9474 +
9475 +       rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
9476 +       rt_runtime = tg->rt_bandwidth.rt_runtime;
9477 +
9478 +       mutex_lock(&rt_constraints_mutex);
9479 +       if (!__rt_schedulable(tg, rt_period, rt_runtime))
9480 +               ret = -EINVAL;
9481 +       mutex_unlock(&rt_constraints_mutex);
9482 +
9483 +       return ret;
9484 +}
9485 +#else /* !CONFIG_RT_GROUP_SCHED */
9486 +static int sched_rt_global_constraints(void)
9487 +{
9488 +       unsigned long flags;
9489 +       int i;
9490 +
9491 +       if (sysctl_sched_rt_period <= 0)
9492 +               return -EINVAL;
9493 +
9494 +       spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
9495 +       for_each_possible_cpu(i) {
9496 +               struct rt_rq *rt_rq = &cpu_rq(i)->rt;
9497 +
9498 +               spin_lock(&rt_rq->rt_runtime_lock);
9499 +               rt_rq->rt_runtime = global_rt_runtime();
9500 +               spin_unlock(&rt_rq->rt_runtime_lock);
9501 +       }
9502 +       spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
9503 +
9504 +       return 0;
9505 +}
9506 +#endif /* CONFIG_RT_GROUP_SCHED */
9507 +
9508 +int sched_rt_handler(struct ctl_table *table, int write,
9509 +               struct file *filp, void __user *buffer, size_t *lenp,
9510 +               loff_t *ppos)
9511 +{
9512 +       int ret;
9513 +       int old_period, old_runtime;
9514 +       static DEFINE_MUTEX(mutex);
9515 +
9516 +       mutex_lock(&mutex);
9517 +       old_period = sysctl_sched_rt_period;
9518 +       old_runtime = sysctl_sched_rt_runtime;
9519 +
9520 +       ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
9521 +
9522 +       if (!ret && write) {
9523 +               ret = sched_rt_global_constraints();
9524 +               if (ret) {
9525 +                       sysctl_sched_rt_period = old_period;
9526 +                       sysctl_sched_rt_runtime = old_runtime;
9527 +               } else {
9528 +                       def_rt_bandwidth.rt_runtime = global_rt_runtime();
9529 +                       def_rt_bandwidth.rt_period =
9530 +                               ns_to_ktime(global_rt_period());
9531 +               }
9532 +       }
9533 +       mutex_unlock(&mutex);
9534 +
9535 +       return ret;
9536 +}
9537 +
9538 +#ifdef CONFIG_CGROUP_SCHED
9539 +
9540 +/* return corresponding task_group object of a cgroup */
9541 +static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
9542 +{
9543 +       return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
9544 +                           struct task_group, css);
9545 +}
9546 +
9547 +static struct cgroup_subsys_state *
9548 +cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
9549 +{
9550 +       struct task_group *tg, *parent;
9551 +
9552 +       if (!cgrp->parent) {
9553 +               /* This is early initialization for the top cgroup */
9554 +               init_task_group.css.cgroup = cgrp;
9555 +               return &init_task_group.css;
9556 +       }
9557 +
9558 +       parent = cgroup_tg(cgrp->parent);
9559 +       tg = sched_create_group(parent);
9560 +       if (IS_ERR(tg))
9561 +               return ERR_PTR(-ENOMEM);
9562 +
9563 +       /* Bind the cgroup to task_group object we just created */
9564 +       tg->css.cgroup = cgrp;
9565 +
9566 +       return &tg->css;
9567 +}
9568 +
9569 +static void
9570 +cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9571 +{
9572 +       struct task_group *tg = cgroup_tg(cgrp);
9573 +
9574 +       sched_destroy_group(tg);
9575 +}
9576 +
9577 +static int
9578 +cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9579 +                     struct task_struct *tsk)
9580 +{
9581 +#ifdef CONFIG_RT_GROUP_SCHED
9582 +       /* Don't accept realtime tasks when there is no way for them to run */
9583 +       if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
9584 +               return -EINVAL;
9585 +#else
9586 +       /* We don't support RT-tasks being in separate groups */
9587 +       if (tsk->sched_class != &fair_sched_class)
9588 +               return -EINVAL;
9589 +#endif
9590 +
9591 +       return 0;
9592 +}
9593 +
9594 +static void
9595 +cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9596 +                       struct cgroup *old_cont, struct task_struct *tsk)
9597 +{
9598 +       sched_move_task(tsk);
9599 +}
9600 +
9601 +#ifdef CONFIG_FAIR_GROUP_SCHED
9602 +static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
9603 +                               u64 shareval)
9604 +{
9605 +       return sched_group_set_shares(cgroup_tg(cgrp), shareval);
9606 +}
9607 +
9608 +static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
9609 +{
9610 +       struct task_group *tg = cgroup_tg(cgrp);
9611 +
9612 +       return (u64) tg->shares;
9613 +}
9614 +#endif /* CONFIG_FAIR_GROUP_SCHED */
9615 +
9616 +#ifdef CONFIG_RT_GROUP_SCHED
9617 +static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
9618 +                               s64 val)
9619 +{
9620 +       return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
9621 +}
9622 +
9623 +static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
9624 +{
9625 +       return sched_group_rt_runtime(cgroup_tg(cgrp));
9626 +}
9627 +
9628 +static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
9629 +               u64 rt_period_us)
9630 +{
9631 +       return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
9632 +}
9633 +
9634 +static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
9635 +{
9636 +       return sched_group_rt_period(cgroup_tg(cgrp));
9637 +}
9638 +#endif /* CONFIG_RT_GROUP_SCHED */
9639 +
9640 +static struct cftype cpu_files[] = {
9641 +#ifdef CONFIG_FAIR_GROUP_SCHED
9642 +       {
9643 +               .name = "shares",
9644 +               .read_u64 = cpu_shares_read_u64,
9645 +               .write_u64 = cpu_shares_write_u64,
9646 +       },
9647 +#endif
9648 +#ifdef CONFIG_RT_GROUP_SCHED
9649 +       {
9650 +               .name = "rt_runtime_us",
9651 +               .read_s64 = cpu_rt_runtime_read,
9652 +               .write_s64 = cpu_rt_runtime_write,
9653 +       },
9654 +       {
9655 +               .name = "rt_period_us",
9656 +               .read_u64 = cpu_rt_period_read_uint,
9657 +               .write_u64 = cpu_rt_period_write_uint,
9658 +       },
9659 +#endif
9660 +};
9661 +
9662 +static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
9663 +{
9664 +       return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
9665 +}
9666 +
9667 +struct cgroup_subsys cpu_cgroup_subsys = {
9668 +       .name           = "cpu",
9669 +       .create         = cpu_cgroup_create,
9670 +       .destroy        = cpu_cgroup_destroy,
9671 +       .can_attach     = cpu_cgroup_can_attach,
9672 +       .attach         = cpu_cgroup_attach,
9673 +       .populate       = cpu_cgroup_populate,
9674 +       .subsys_id      = cpu_cgroup_subsys_id,
9675 +       .early_init     = 1,
9676 +};
9677 +
9678 +#endif /* CONFIG_CGROUP_SCHED */
9679 +
9680 +#ifdef CONFIG_CGROUP_CPUACCT
9681 +
9682 +/*
9683 + * CPU accounting code for task groups.
9684 + *
9685 + * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
9686 + * (balbir@in.ibm.com).
9687 + */
9688 +
9689 +/* track cpu usage of a group of tasks */
9690 +struct cpuacct {
9691 +       struct cgroup_subsys_state css;
9692 +       /* cpuusage holds pointer to a u64-type object on every cpu */
9693 +       u64 *cpuusage;
9694 +};
9695 +
9696 +struct cgroup_subsys cpuacct_subsys;
9697 +
9698 +/* return cpu accounting group corresponding to this container */
9699 +static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
9700 +{
9701 +       return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
9702 +                           struct cpuacct, css);
9703 +}
9704 +
9705 +/* return cpu accounting group to which this task belongs */
9706 +static inline struct cpuacct *task_ca(struct task_struct *tsk)
9707 +{
9708 +       return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
9709 +                           struct cpuacct, css);
9710 +}
9711 +
9712 +/* create a new cpu accounting group */
9713 +static struct cgroup_subsys_state *cpuacct_create(
9714 +       struct cgroup_subsys *ss, struct cgroup *cgrp)
9715 +{
9716 +       struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
9717 +
9718 +       if (!ca)
9719 +               return ERR_PTR(-ENOMEM);
9720 +
9721 +       ca->cpuusage = alloc_percpu(u64);
9722 +       if (!ca->cpuusage) {
9723 +               kfree(ca);
9724 +               return ERR_PTR(-ENOMEM);
9725 +       }
9726 +
9727 +       return &ca->css;
9728 +}
9729 +
9730 +/* destroy an existing cpu accounting group */
9731 +static void
9732 +cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9733 +{
9734 +       struct cpuacct *ca = cgroup_ca(cgrp);
9735 +
9736 +       free_percpu(ca->cpuusage);
9737 +       kfree(ca);
9738 +}
9739 +
9740 +/* return total cpu usage (in nanoseconds) of a group */
9741 +static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
9742 +{
9743 +       struct cpuacct *ca = cgroup_ca(cgrp);
9744 +       u64 totalcpuusage = 0;
9745 +       int i;
9746 +
9747 +       for_each_possible_cpu(i) {
9748 +               u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
9749 +
9750 +               /*
9751 +                * Take rq->lock to make 64-bit addition safe on 32-bit
9752 +                * platforms.
9753 +                */
9754 +               spin_lock_irq(&cpu_rq(i)->lock);
9755 +               totalcpuusage += *cpuusage;
9756 +               spin_unlock_irq(&cpu_rq(i)->lock);
9757 +       }
9758 +
9759 +       return totalcpuusage;
9760 +}
9761 +
9762 +static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
9763 +                                                               u64 reset)
9764 +{
9765 +       struct cpuacct *ca = cgroup_ca(cgrp);
9766 +       int err = 0;
9767 +       int i;
9768 +
9769 +       if (reset) {
9770 +               err = -EINVAL;
9771 +               goto out;
9772 +       }
9773 +
9774 +       for_each_possible_cpu(i) {
9775 +               u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
9776 +
9777 +               spin_lock_irq(&cpu_rq(i)->lock);
9778 +               *cpuusage = 0;
9779 +               spin_unlock_irq(&cpu_rq(i)->lock);
9780 +       }
9781 +out:
9782 +       return err;
9783 +}
9784 +
9785 +static struct cftype files[] = {
9786 +       {
9787 +               .name = "usage",
9788 +               .read_u64 = cpuusage_read,
9789 +               .write_u64 = cpuusage_write,
9790 +       },
9791 +};
9792 +
9793 +static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9794 +{
9795 +       return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
9796 +}
9797 +
9798 +/*
9799 + * charge this task's execution time to its accounting group.
9800 + *
9801 + * called with rq->lock held.
9802 + */
9803 +static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9804 +{
9805 +       struct cpuacct *ca;
9806 +
9807 +       if (!cpuacct_subsys.active)
9808 +               return;
9809 +
9810 +       ca = task_ca(tsk);
9811 +       if (ca) {
9812 +               u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
9813 +
9814 +               *cpuusage += cputime;
9815 +       }
9816 +}
9817 +
9818 +struct cgroup_subsys cpuacct_subsys = {
9819 +       .name = "cpuacct",
9820 +       .create = cpuacct_create,
9821 +       .destroy = cpuacct_destroy,
9822 +       .populate = cpuacct_populate,
9823 +       .subsys_id = cpuacct_subsys_id,
9824 +};
9825 +#endif /* CONFIG_CGROUP_CPUACCT */
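
The block above wires the group-scheduling knobs into cgroup files (cpu.shares, cpu.rt_runtime_us, cpu.rt_period_us) and gates every bandwidth write through __rt_schedulable(), which only admits a new runtime/period pair if the summed to_ratio() values of the sibling groups still fit under the parent's ratio (or under the global ratio for the root). A minimal user-space sketch of that invariant follows; the 16-bit fixed-point scale and the standalone to_ratio() helper are assumptions made for illustration, not code taken from this patch.

/*
 * Illustration only: user-space model of the admission test performed by
 * __rt_schedulable()/tg_set_bandwidth() above.  The fixed-point scale
 * (16 fractional bits) is an assumption for the sketch.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t to_ratio(uint64_t period_ns, uint64_t runtime_ns)
{
	/* runtime/period scaled into fixed point */
	return (runtime_ns << 16) / period_ns;
}

int main(void)
{
	/* parent group: 950ms of RT time per 1s period */
	uint64_t parent = to_ratio(1000000000ULL, 950000000ULL);
	/* two existing siblings at 300ms/1s each; new group asks for 400ms/1s */
	uint64_t used = 2 * to_ratio(1000000000ULL, 300000000ULL);
	uint64_t req  = to_ratio(1000000000ULL, 400000000ULL);

	printf("schedulable: %s\n",
	       used + req <= parent ? "yes" : "no (-EINVAL)");
	return 0;
}

With these example numbers the request is rejected, which corresponds to the -EINVAL path taken by tg_set_bandwidth() when __rt_schedulable() fails.
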
9826 diff -Nurb linux-2.6.27-590/kernel/sched.c.rej linux-2.6.27-591/kernel/sched.c.rej
9827 --- linux-2.6.27-590/kernel/sched.c.rej 1969-12-31 19:00:00.000000000 -0500
9828 +++ linux-2.6.27-591/kernel/sched.c.rej 2010-02-01 19:43:07.000000000 -0500
9829 @@ -0,0 +1,258 @@
9830 +***************
9831 +*** 23,28 ****
9832 +  #include <linux/nmi.h>
9833 +  #include <linux/init.h>
9834 +  #include <asm/uaccess.h>
9835 +  #include <linux/highmem.h>
9836 +  #include <linux/smp_lock.h>
9837 +  #include <asm/mmu_context.h>
9838 +--- 23,29 ----
9839 +  #include <linux/nmi.h>
9840 +  #include <linux/init.h>
9841 +  #include <asm/uaccess.h>
9842 ++ #include <linux/arrays.h>
9843 +  #include <linux/highmem.h>
9844 +  #include <linux/smp_lock.h>
9845 +  #include <asm/mmu_context.h>
9846 +***************
9847 +*** 451,456 ****
9848 +  
9849 +  repeat_lock_task:
9850 +       rq = task_rq(p);
9851 +       spin_lock(&rq->lock);
9852 +       if (unlikely(rq != task_rq(p))) {
9853 +               spin_unlock(&rq->lock);
9854 +--- 455,461 ----
9855 +  
9856 +  repeat_lock_task:
9857 +       rq = task_rq(p);
9858 ++ 
9859 +       spin_lock(&rq->lock);
9860 +       if (unlikely(rq != task_rq(p))) {
9861 +               spin_unlock(&rq->lock);
9862 +***************
9863 +*** 1761,1766 ****
9864 +        * event cannot wake it up and insert it on the runqueue either.
9865 +        */
9866 +       p->state = TASK_RUNNING;
9867 +  
9868 +       /*
9869 +        * Make sure we do not leak PI boosting priority to the child:
9870 +--- 1766,1786 ----
9871 +        * event cannot wake it up and insert it on the runqueue either.
9872 +        */
9873 +       p->state = TASK_RUNNING;
9874 ++ #ifdef CONFIG_CHOPSTIX
9875 ++     /* The jiffy of last interruption */
9876 ++     if (p->state & TASK_UNINTERRUPTIBLE) {
9877 ++                              p->last_interrupted=jiffies;
9878 ++      }
9879 ++     else
9880 ++     if (p->state & TASK_INTERRUPTIBLE) {
9881 ++                              p->last_interrupted=INTERRUPTIBLE;
9882 ++      }
9883 ++     else
9884 ++          p->last_interrupted=RUNNING;
9885 ++ 
9886 ++     /* The jiffy of last execution */ 
9887 ++      p->last_ran_j=jiffies;
9888 ++ #endif
9889 +  
9890 +       /*
9891 +        * Make sure we do not leak PI boosting priority to the child:
9892 +***************
9893 +*** 3628,3633 ****
9894 +  
9895 +  #endif
9896 +  
9897 +  static inline int interactive_sleep(enum sleep_type sleep_type)
9898 +  {
9899 +       return (sleep_type == SLEEP_INTERACTIVE ||
9900 +--- 3648,3654 ----
9901 +  
9902 +  #endif
9903 +  
9904 ++ 
9905 +  static inline int interactive_sleep(enum sleep_type sleep_type)
9906 +  {
9907 +       return (sleep_type == SLEEP_INTERACTIVE ||
9908 +***************
9909 +*** 3637,3652 ****
9910 +  /*
9911 +   * schedule() is the main scheduler function.
9912 +   */
9913 +  asmlinkage void __sched schedule(void)
9914 +  {
9915 +       struct task_struct *prev, *next;
9916 +       struct prio_array *array;
9917 +       struct list_head *queue;
9918 +       unsigned long long now;
9919 +-      unsigned long run_time;
9920 +       int cpu, idx, new_prio;
9921 +       long *switch_count;
9922 +       struct rq *rq;
9923 +  
9924 +       /*
9925 +        * Test if we are atomic.  Since do_exit() needs to call into
9926 +--- 3658,3685 ----
9927 +  /*
9928 +   * schedule() is the main scheduler function.
9929 +   */
9930 ++ 
9931 ++ #ifdef CONFIG_CHOPSTIX
9932 ++ extern void (*rec_event)(void *,unsigned int);
9933 ++ struct event_spec {
9934 ++      unsigned long pc;
9935 ++      unsigned long dcookie;
9936 ++      unsigned int count;
9937 ++      unsigned int reason;
9938 ++ };
9939 ++ #endif
9940 ++ 
9941 +  asmlinkage void __sched schedule(void)
9942 +  {
9943 +       struct task_struct *prev, *next;
9944 +       struct prio_array *array;
9945 +       struct list_head *queue;
9946 +       unsigned long long now;
9947 ++      unsigned long run_time, diff;
9948 +       int cpu, idx, new_prio;
9949 +       long *switch_count;
9950 +       struct rq *rq;
9951 ++      int sampling_reason;
9952 +  
9953 +       /*
9954 +        * Test if we are atomic.  Since do_exit() needs to call into
9955 +***************
9956 +*** 3700,3705 ****
9957 +       switch_count = &prev->nivcsw;
9958 +       if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
9959 +               switch_count = &prev->nvcsw;
9960 +               if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
9961 +                               unlikely(signal_pending(prev))))
9962 +                       prev->state = TASK_RUNNING;
9963 +--- 3733,3739 ----
9964 +       switch_count = &prev->nivcsw;
9965 +       if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
9966 +               switch_count = &prev->nvcsw;
9967 ++ 
9968 +               if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
9969 +                               unlikely(signal_pending(prev))))
9970 +                       prev->state = TASK_RUNNING;
9971 +***************
9972 +*** 3709,3714 ****
9973 +                               vx_uninterruptible_inc(prev);
9974 +                       }
9975 +                       deactivate_task(prev, rq);
9976 +               }
9977 +       }
9978 +  
9979 +--- 3743,3759 ----
9980 +                               vx_uninterruptible_inc(prev);
9981 +                       }
9982 +                       deactivate_task(prev, rq);
9983 ++ #ifdef CONFIG_CHOPSTIX
9984 ++             /* An uninterruptible process just yielded. Record the current jiffy */
9985 ++                      if (prev->state & TASK_UNINTERRUPTIBLE) {
9986 ++                              prev->last_interrupted=jiffies;
9987 ++                      }
9988 ++             /* An interruptible process just yielded, or it got preempted. 
9989 ++              * Mark it as interruptible */
9990 ++                      else if (prev->state & TASK_INTERRUPTIBLE) {
9991 ++                              prev->last_interrupted=INTERRUPTIBLE;
9992 ++                      }
9993 ++ #endif
9994 +               }
9995 +       }
9996 +  
9997 +***************
9998 +*** 3785,3790 ****
9999 +               prev->sleep_avg = 0;
10000 +       prev->timestamp = prev->last_ran = now;
10001 +  
10002 +       sched_info_switch(prev, next);
10003 +       if (likely(prev != next)) {
10004 +               next->timestamp = next->last_ran = now;
10005 +--- 3830,3869 ----
10006 +               prev->sleep_avg = 0;
10007 +       prev->timestamp = prev->last_ran = now;
10008 +  
10009 ++ #ifdef CONFIG_CHOPSTIX
10010 ++      /* Run only if the Chopstix module so decrees it */
10011 ++      if (rec_event) {
10012 ++              prev->last_ran_j = jiffies;
10013 ++              if (next->last_interrupted!=INTERRUPTIBLE) {
10014 ++                      if (next->last_interrupted!=RUNNING) {
10015 ++                              diff = (jiffies-next->last_interrupted);
10016 ++                              sampling_reason = 0;/* BLOCKING */
10017 ++                      }
10018 ++                      else {
10019 ++                              diff = jiffies-next->last_ran_j; 
10020 ++                              sampling_reason = 1;/* PREEMPTION */
10021 ++                      }
10022 ++ 
10023 ++                      if (diff >= HZ/10) {
10024 ++                              struct event event;
10025 ++                              struct event_spec espec;
10026 ++                 struct pt_regs *regs;
10027 ++                 regs = task_pt_regs(current);
10028 ++ 
10029 ++                              espec.reason = sampling_reason;
10030 ++                              event.event_data=&espec;
10031 ++                              event.task=next;
10032 ++                              espec.pc=regs->eip;
10033 ++                              event.event_type=2; 
10034 ++                              /* index in the event array currently set up */
10035 ++                              /* make sure the counters are loaded in the order we want them to show up*/ 
10036 ++                              (*rec_event)(&event, diff);
10037 ++                      }
10038 ++              }
10039 ++         /* next has been elected to run */
10040 ++              next->last_interrupted=0;
10041 ++      }
10042 ++ #endif
10043 +       sched_info_switch(prev, next);
10044 +       if (likely(prev != next)) {
10045 +               next->timestamp = next->last_ran = now;
10046 +***************
10047 +*** 5737,5742 ****
10048 +       jiffies_to_timespec(p->policy == SCHED_FIFO ?
10049 +                               0 : task_timeslice(p), &t);
10050 +       read_unlock(&tasklist_lock);
10051 +       retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
10052 +  out_nounlock:
10053 +       return retval;
10054 +--- 5817,5823 ----
10055 +       jiffies_to_timespec(p->policy == SCHED_FIFO ?
10056 +                               0 : task_timeslice(p), &t);
10057 +       read_unlock(&tasklist_lock);
10058 ++ 
10059 +       retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
10060 +  out_nounlock:
10061 +       return retval;
10062 +***************
10063 +*** 7980,7982 ****
10064 +  }
10065 +  
10066 +  #endif
10067 +--- 8061,8080 ----
10068 +  }
10069 +  
10070 +  #endif
10071 ++ 
10072 ++ #ifdef CONFIG_CHOPSTIX
10073 ++ void (*rec_event)(void *,unsigned int) = NULL;
10074 ++ 
10075 ++ /* To support safe calling from asm */
10076 ++ asmlinkage void rec_event_asm (struct event *event_signature_in, unsigned int count) {
10077 ++     struct pt_regs *regs;
10078 ++     struct event_spec *es = event_signature_in->event_data;
10079 ++     regs = task_pt_regs(current);
10080 ++      event_signature_in->task=current;
10081 ++      es->pc=regs->eip;
10082 ++     event_signature_in->count=1;
10083 ++     (*rec_event)(event_signature_in, count);
10084 ++ }
10085 ++ EXPORT_SYMBOL(rec_event);
10086 ++ EXPORT_SYMBOL(in_sched_functions);
10087 ++ #endif
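
In the rejected scheduler hunk above, schedule() fills a struct event whenever the Chopstix module has installed rec_event: event_type 2 with count set to the number of jiffies the incoming task spent blocked (reason 0) or preempted (reason 1), sampled only when that delay reaches HZ/10. The page-fault hook added to mm/memory.c below does the same with event_type 5 and the faulting PC masked down to its 4KB page. A user-space sketch of how a consumer might interpret those callbacks follows; the struct layouts are reduced mirrors of what linux/arrays.h is assumed to provide (only the fields this patch touches), and the handler itself is hypothetical.

/*
 * Illustration only: user-space model of a rec_event consumer.
 * The event_type values 2 and 5 are the ones used by this patch;
 * everything else here is an assumption for the sketch.
 */
#include <stdio.h>

struct event_spec {
	unsigned long pc;
	unsigned long dcookie;
	unsigned int count;
	unsigned int reason;	/* scheduler hook: 0 = blocking, 1 = preemption */
};

struct event {
	void *event_data;	/* points at a struct event_spec */
	void *task;		/* struct task_struct * in the kernel */
	unsigned int event_type;
	unsigned int count;
};

static void chopstix_handler(void *sig, unsigned int count)
{
	struct event *ev = sig;
	struct event_spec *es = ev->event_data;

	switch (ev->event_type) {
	case 2:	/* scheduler: 'count' is the delay in jiffies */
		printf("sched delay %u jiffies, reason %u, pc %#lx\n",
		       count, es->reason, es->pc);
		break;
	case 5:	/* page fault: pc is masked to the faulting page */
		printf("page fault near %#lx\n", es->pc);
		break;
	default:
		printf("event type %u, count %u\n", ev->event_type, count);
	}
}

int main(void)
{
	struct event_spec es = { .pc = 0xc0100000UL, .reason = 1, .count = 1 };
	struct event ev = { .event_data = &es, .event_type = 2, .count = 1 };

	/* in the kernel this pointer is the rec_event hook set by the module */
	void (*rec_event)(void *, unsigned int) = chopstix_handler;
	rec_event(&ev, 12);	/* e.g. a task preempted for 12 jiffies */
	return 0;
}
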
10088 diff -Nurb linux-2.6.27-590/mm/memory.c linux-2.6.27-591/mm/memory.c
10089 --- linux-2.6.27-590/mm/memory.c        2010-02-01 19:42:07.000000000 -0500
10090 +++ linux-2.6.27-591/mm/memory.c        2010-02-01 19:43:07.000000000 -0500
10091 @@ -61,6 +61,7 @@
10092  
10093  #include <linux/swapops.h>
10094  #include <linux/elf.h>
10095 +#include <linux/arrays.h>
10096  
10097  #include "internal.h"
10098  
10099 @@ -2690,6 +2691,15 @@
10100         return ret;
10101  }
10102  
10103 +extern void (*rec_event)(void *,unsigned int);
10104 +struct event_spec {
10105 +       unsigned long pc;
10106 +       unsigned long dcookie; 
10107 +       unsigned count;
10108 +       unsigned char reason;
10109 +};
10110 +
10111 +
10112  /*
10113   * By the time we get here, we already hold the mm semaphore
10114   */
10115 @@ -2719,6 +2729,24 @@
10116         if (!pte)
10117                 return VM_FAULT_OOM;
10118  
10119 +#ifdef CONFIG_CHOPSTIX
10120 +       if (rec_event) {
10121 +               struct event event;
10122 +               struct event_spec espec;
10123 +        struct pt_regs *regs;
10124 +        unsigned int pc;
10125 +        regs = task_pt_regs(current);
10126 +        pc = regs->ip & (unsigned int) ~4095;
10127 +
10128 +               espec.reason = 0; /* alloc */
10129 +               event.event_data=&espec;
10130 +               event.task = current;
10131 +               espec.pc=pc;
10132 +               event.event_type=5; 
10133 +               (*rec_event)(&event, 1);
10134 +       }
10135 +#endif
10136 +
10137         return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
10138  }
10139  
10140 diff -Nurb linux-2.6.27-590/mm/memory.c.orig linux-2.6.27-591/mm/memory.c.orig
10141 --- linux-2.6.27-590/mm/memory.c.orig   1969-12-31 19:00:00.000000000 -0500
10142 +++ linux-2.6.27-591/mm/memory.c.orig   2010-02-01 19:42:07.000000000 -0500
10143 @@ -0,0 +1,3035 @@
10144 +/*
10145 + *  linux/mm/memory.c
10146 + *
10147 + *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
10148 + */
10149 +
10150 +/*
10151 + * demand-loading started 01.12.91 - seems it is high on the list of
10152 + * things wanted, and it should be easy to implement. - Linus
10153 + */
10154 +
10155 +/*
10156 + * Ok, demand-loading was easy, shared pages a little bit tricker. Shared
10157 + * pages started 02.12.91, seems to work. - Linus.
10158 + *
10159 + * Tested sharing by executing about 30 /bin/sh: under the old kernel it
10160 + * would have taken more than the 6M I have free, but it worked well as
10161 + * far as I could see.
10162 + *
10163 + * Also corrected some "invalidate()"s - I wasn't doing enough of them.
10164 + */
10165 +
10166 +/*
10167 + * Real VM (paging to/from disk) started 18.12.91. Much more work and
10168 + * thought has to go into this. Oh, well..
10169 + * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
10170 + *             Found it. Everything seems to work now.
10171 + * 20.12.91  -  Ok, making the swap-device changeable like the root.
10172 + */
10173 +
10174 +/*
10175 + * 05.04.94  -  Multi-page memory management added for v1.1.
10176 + *             Idea by Alex Bligh (alex@cconcepts.co.uk)
10177 + *
10178 + * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
10179 + *             (Gerhard.Wichert@pdb.siemens.de)
10180 + *
10181 + * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
10182 + */
10183 +
10184 +#include <linux/kernel_stat.h>
10185 +#include <linux/mm.h>
10186 +#include <linux/hugetlb.h>
10187 +#include <linux/mman.h>
10188 +#include <linux/swap.h>
10189 +#include <linux/highmem.h>
10190 +#include <linux/pagemap.h>
10191 +#include <linux/rmap.h>
10192 +#include <linux/module.h>
10193 +#include <linux/delayacct.h>
10194 +#include <linux/init.h>
10195 +#include <linux/writeback.h>
10196 +#include <linux/memcontrol.h>
10197 +#include <linux/mmu_notifier.h>
10198 +
10199 +#include <asm/pgalloc.h>
10200 +#include <asm/uaccess.h>
10201 +#include <asm/tlb.h>
10202 +#include <asm/tlbflush.h>
10203 +#include <asm/pgtable.h>
10204 +
10205 +#include <linux/swapops.h>
10206 +#include <linux/elf.h>
10207 +
10208 +#include "internal.h"
10209 +
10210 +#ifndef CONFIG_NEED_MULTIPLE_NODES
10211 +/* use the per-pgdat data instead for discontigmem - mbligh */
10212 +unsigned long max_mapnr;
10213 +struct page *mem_map;
10214 +
10215 +EXPORT_SYMBOL(max_mapnr);
10216 +EXPORT_SYMBOL(mem_map);
10217 +#endif
10218 +
10219 +unsigned long num_physpages;
10220 +/*
10221 + * A number of key systems in x86 including ioremap() rely on the assumption
10222 + * that high_memory defines the upper bound on direct map memory, then end
10223 + * of ZONE_NORMAL.  Under CONFIG_DISCONTIG this means that max_low_pfn and
10224 + * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
10225 + * and ZONE_HIGHMEM.
10226 + */
10227 +void * high_memory;
10228 +
10229 +EXPORT_SYMBOL(num_physpages);
10230 +EXPORT_SYMBOL(high_memory);
10231 +
10232 +/*
10233 + * Randomize the address space (stacks, mmaps, brk, etc.).
10234 + *
10235 + * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
10236 + *   as ancient (libc5 based) binaries can segfault. )
10237 + */
10238 +int randomize_va_space __read_mostly =
10239 +#ifdef CONFIG_COMPAT_BRK
10240 +                                       1;
10241 +#else
10242 +                                       2;
10243 +#endif
10244 +
10245 +static int __init disable_randmaps(char *s)
10246 +{
10247 +       randomize_va_space = 0;
10248 +       return 1;
10249 +}
10250 +__setup("norandmaps", disable_randmaps);
10251 +
10252 +
10253 +/*
10254 + * If a p?d_bad entry is found while walking page tables, report
10255 + * the error, before resetting entry to p?d_none.  Usually (but
10256 + * very seldom) called out from the p?d_none_or_clear_bad macros.
10257 + */
10258 +
10259 +void pgd_clear_bad(pgd_t *pgd)
10260 +{
10261 +       pgd_ERROR(*pgd);
10262 +       pgd_clear(pgd);
10263 +}
10264 +
10265 +void pud_clear_bad(pud_t *pud)
10266 +{
10267 +       pud_ERROR(*pud);
10268 +       pud_clear(pud);
10269 +}
10270 +
10271 +void pmd_clear_bad(pmd_t *pmd)
10272 +{
10273 +       pmd_ERROR(*pmd);
10274 +       pmd_clear(pmd);
10275 +}
10276 +
10277 +/*
10278 + * Note: this doesn't free the actual pages themselves. That
10279 + * has been handled earlier when unmapping all the memory regions.
10280 + */
10281 +static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
10282 +{
10283 +       pgtable_t token = pmd_pgtable(*pmd);
10284 +       pmd_clear(pmd);
10285 +       pte_free_tlb(tlb, token);
10286 +       tlb->mm->nr_ptes--;
10287 +}
10288 +
10289 +static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
10290 +                               unsigned long addr, unsigned long end,
10291 +                               unsigned long floor, unsigned long ceiling)
10292 +{
10293 +       pmd_t *pmd;
10294 +       unsigned long next;
10295 +       unsigned long start;
10296 +
10297 +       start = addr;
10298 +       pmd = pmd_offset(pud, addr);
10299 +       do {
10300 +               next = pmd_addr_end(addr, end);
10301 +               if (pmd_none_or_clear_bad(pmd))
10302 +                       continue;
10303 +               free_pte_range(tlb, pmd);
10304 +       } while (pmd++, addr = next, addr != end);
10305 +
10306 +       start &= PUD_MASK;
10307 +       if (start < floor)
10308 +               return;
10309 +       if (ceiling) {
10310 +               ceiling &= PUD_MASK;
10311 +               if (!ceiling)
10312 +                       return;
10313 +       }
10314 +       if (end - 1 > ceiling - 1)
10315 +               return;
10316 +
10317 +       pmd = pmd_offset(pud, start);
10318 +       pud_clear(pud);
10319 +       pmd_free_tlb(tlb, pmd);
10320 +}
10321 +
10322 +static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
10323 +                               unsigned long addr, unsigned long end,
10324 +                               unsigned long floor, unsigned long ceiling)
10325 +{
10326 +       pud_t *pud;
10327 +       unsigned long next;
10328 +       unsigned long start;
10329 +
10330 +       start = addr;
10331 +       pud = pud_offset(pgd, addr);
10332 +       do {
10333 +               next = pud_addr_end(addr, end);
10334 +               if (pud_none_or_clear_bad(pud))
10335 +                       continue;
10336 +               free_pmd_range(tlb, pud, addr, next, floor, ceiling);
10337 +       } while (pud++, addr = next, addr != end);
10338 +
10339 +       start &= PGDIR_MASK;
10340 +       if (start < floor)
10341 +               return;
10342 +       if (ceiling) {
10343 +               ceiling &= PGDIR_MASK;
10344 +               if (!ceiling)
10345 +                       return;
10346 +       }
10347 +       if (end - 1 > ceiling - 1)
10348 +               return;
10349 +
10350 +       pud = pud_offset(pgd, start);
10351 +       pgd_clear(pgd);
10352 +       pud_free_tlb(tlb, pud);
10353 +}
10354 +
10355 +/*
10356 + * This function frees user-level page tables of a process.
10357 + *
10358 + * Must be called with pagetable lock held.
10359 + */
10360 +void free_pgd_range(struct mmu_gather *tlb,
10361 +                       unsigned long addr, unsigned long end,
10362 +                       unsigned long floor, unsigned long ceiling)
10363 +{
10364 +       pgd_t *pgd;
10365 +       unsigned long next;
10366 +       unsigned long start;
10367 +
10368 +       /*
10369 +        * The next few lines have given us lots of grief...
10370 +        *
10371 +        * Why are we testing PMD* at this top level?  Because often
10372 +        * there will be no work to do at all, and we'd prefer not to
10373 +        * go all the way down to the bottom just to discover that.
10374 +        *
10375 +        * Why all these "- 1"s?  Because 0 represents both the bottom
10376 +        * of the address space and the top of it (using -1 for the
10377 +        * top wouldn't help much: the masks would do the wrong thing).
10378 +        * The rule is that addr 0 and floor 0 refer to the bottom of
10379 +        * the address space, but end 0 and ceiling 0 refer to the top
10380 +        * Comparisons need to use "end - 1" and "ceiling - 1" (though
10381 +        * that end 0 case should be mythical).
10382 +        *
10383 +        * Wherever addr is brought up or ceiling brought down, we must
10384 +        * be careful to reject "the opposite 0" before it confuses the
10385 +        * subsequent tests.  But what about where end is brought down
10386 +        * by PMD_SIZE below? no, end can't go down to 0 there.
10387 +        *
10388 +        * Whereas we round start (addr) and ceiling down, by different
10389 +        * masks at different levels, in order to test whether a table
10390 +        * now has no other vmas using it, so can be freed, we don't
10391 +        * bother to round floor or end up - the tests don't need that.
10392 +        */
10393 +
10394 +       addr &= PMD_MASK;
10395 +       if (addr < floor) {
10396 +               addr += PMD_SIZE;
10397 +               if (!addr)
10398 +                       return;
10399 +       }
10400 +       if (ceiling) {
10401 +               ceiling &= PMD_MASK;
10402 +               if (!ceiling)
10403 +                       return;
10404 +       }
10405 +       if (end - 1 > ceiling - 1)
10406 +               end -= PMD_SIZE;
10407 +       if (addr > end - 1)
10408 +               return;
10409 +
10410 +       start = addr;
10411 +       pgd = pgd_offset(tlb->mm, addr);
10412 +       do {
10413 +               next = pgd_addr_end(addr, end);
10414 +               if (pgd_none_or_clear_bad(pgd))
10415 +                       continue;
10416 +               free_pud_range(tlb, pgd, addr, next, floor, ceiling);
10417 +       } while (pgd++, addr = next, addr != end);
10418 +}
10419 +
10420 +void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
10421 +               unsigned long floor, unsigned long ceiling)
10422 +{
10423 +       while (vma) {
10424 +               struct vm_area_struct *next = vma->vm_next;
10425 +               unsigned long addr = vma->vm_start;
10426 +
10427 +               /*
10428 +                * Hide vma from rmap and vmtruncate before freeing pgtables
10429 +                */
10430 +               anon_vma_unlink(vma);
10431 +               unlink_file_vma(vma);
10432 +
10433 +               if (is_vm_hugetlb_page(vma)) {
10434 +                       hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
10435 +                               floor, next? next->vm_start: ceiling);
10436 +               } else {
10437 +                       /*
10438 +                        * Optimization: gather nearby vmas into one call down
10439 +                        */
10440 +                       while (next && next->vm_start <= vma->vm_end + PMD_SIZE
10441 +                              && !is_vm_hugetlb_page(next)) {
10442 +                               vma = next;
10443 +                               next = vma->vm_next;
10444 +                               anon_vma_unlink(vma);
10445 +                               unlink_file_vma(vma);
10446 +                       }
10447 +                       free_pgd_range(tlb, addr, vma->vm_end,
10448 +                               floor, next? next->vm_start: ceiling);
10449 +               }
10450 +               vma = next;
10451 +       }
10452 +}
10453 +
10454 +int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
10455 +{
10456 +       pgtable_t new = pte_alloc_one(mm, address);
10457 +       if (!new)
10458 +               return -ENOMEM;
10459 +
10460 +       /*
10461 +        * Ensure all pte setup (eg. pte page lock and page clearing) are
10462 +        * visible before the pte is made visible to other CPUs by being
10463 +        * put into page tables.
10464 +        *
10465 +        * The other side of the story is the pointer chasing in the page
10466 +        * table walking code (when walking the page table without locking;
10467 +        * ie. most of the time). Fortunately, these data accesses consist
10468 +        * of a chain of data-dependent loads, meaning most CPUs (alpha
10469 +        * being the notable exception) will already guarantee loads are
10470 +        * seen in-order. See the alpha page table accessors for the
10471 +        * smp_read_barrier_depends() barriers in page table walking code.
10472 +        */
10473 +       smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
10474 +
10475 +       spin_lock(&mm->page_table_lock);
10476 +       if (!pmd_present(*pmd)) {       /* Has another populated it ? */
10477 +               mm->nr_ptes++;
10478 +               pmd_populate(mm, pmd, new);
10479 +               new = NULL;
10480 +       }
10481 +       spin_unlock(&mm->page_table_lock);
10482 +       if (new)
10483 +               pte_free(mm, new);
10484 +       return 0;
10485 +}
10486 +
10487 +int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
10488 +{
10489 +       pte_t *new = pte_alloc_one_kernel(&init_mm, address);
10490 +       if (!new)
10491 +               return -ENOMEM;
10492 +
10493 +       smp_wmb(); /* See comment in __pte_alloc */
10494 +
10495 +       spin_lock(&init_mm.page_table_lock);
10496 +       if (!pmd_present(*pmd)) {       /* Has another populated it ? */
10497 +               pmd_populate_kernel(&init_mm, pmd, new);
10498 +               new = NULL;
10499 +       }
10500 +       spin_unlock(&init_mm.page_table_lock);
10501 +       if (new)
10502 +               pte_free_kernel(&init_mm, new);
10503 +       return 0;
10504 +}
10505 +
10506 +static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
10507 +{
10508 +       if (file_rss)
10509 +               add_mm_counter(mm, file_rss, file_rss);
10510 +       if (anon_rss)
10511 +               add_mm_counter(mm, anon_rss, anon_rss);
10512 +}
10513 +
10514 +/*
10515 + * This function is called to print an error when a bad pte
10516 + * is found. For example, we might have a PFN-mapped pte in
10517 + * a region that doesn't allow it.
10518 + *
10519 + * The calling function must still handle the error.
10520 + */
10521 +static void print_bad_pte(struct vm_area_struct *vma, pte_t pte,
10522 +                         unsigned long vaddr)
10523 +{
10524 +       printk(KERN_ERR "Bad pte = %08llx, process = %s, "
10525 +                       "vm_flags = %lx, vaddr = %lx\n",
10526 +               (long long)pte_val(pte),
10527 +               (vma->vm_mm == current->mm ? current->comm : "???"),
10528 +               vma->vm_flags, vaddr);
10529 +       dump_stack();
10530 +}
10531 +
10532 +static inline int is_cow_mapping(unsigned int flags)
10533 +{
10534 +       return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
10535 +}
10536 +
10537 +/*
10538 + * vm_normal_page -- This function gets the "struct page" associated with a pte.
10539 + *
10540 + * "Special" mappings do not wish to be associated with a "struct page" (either
10541 + * it doesn't exist, or it exists but they don't want to touch it). In this
10542 + * case, NULL is returned here. "Normal" mappings do have a struct page.
10543 + *
10544 + * There are 2 broad cases. Firstly, an architecture may define a pte_special()
10545 + * pte bit, in which case this function is trivial. Secondly, an architecture
10546 + * may not have a spare pte bit, which requires a more complicated scheme,
10547 + * described below.
10548 + *
10549 + * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
10550 + * special mapping (even if there are underlying and valid "struct pages").
10551 + * COWed pages of a VM_PFNMAP are always normal.
10552 + *
10553 + * The way we recognize COWed pages within VM_PFNMAP mappings is through the
10554 + * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
10555 + * set, and the vm_pgoff will point to the first PFN mapped: thus every special
10556 + * mapping will always honor the rule
10557 + *
10558 + *     pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
10559 + *
10560 + * And for normal mappings this is false.
10561 + *
10562 + * This restricts such mappings to be a linear translation from virtual address
10563 + * to pfn. To get around this restriction, we allow arbitrary mappings so long
10564 + * as the vma is not a COW mapping; in that case, we know that all ptes are
10565 + * special (because none can have been COWed).
10566 + *
10567 + *
10568 + * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
10569 + *
10570 + * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
10571 + * page" backing, however the difference is that _all_ pages with a struct
10572 + * page (that is, those where pfn_valid is true) are refcounted and considered
10573 + * normal pages by the VM. The disadvantage is that pages are refcounted
10574 + * (which can be slower and simply not an option for some PFNMAP users). The
10575 + * advantage is that we don't have to follow the strict linearity rule of
10576 + * PFNMAP mappings in order to support COWable mappings.
10577 + *
10578 + */
10579 +#ifdef __HAVE_ARCH_PTE_SPECIAL
10580 +# define HAVE_PTE_SPECIAL 1
10581 +#else
10582 +# define HAVE_PTE_SPECIAL 0
10583 +#endif
10584 +struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
10585 +                               pte_t pte)
10586 +{
10587 +       unsigned long pfn;
10588 +
10589 +       if (HAVE_PTE_SPECIAL) {
10590 +               if (likely(!pte_special(pte))) {
10591 +                       VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
10592 +                       return pte_page(pte);
10593 +               }
10594 +               VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
10595 +               return NULL;
10596 +       }
10597 +
10598 +       /* !HAVE_PTE_SPECIAL case follows: */
10599 +
10600 +       pfn = pte_pfn(pte);
10601 +
10602 +       if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
10603 +               if (vma->vm_flags & VM_MIXEDMAP) {
10604 +                       if (!pfn_valid(pfn))
10605 +                               return NULL;
10606 +                       goto out;
10607 +               } else {
10608 +                       unsigned long off;
10609 +                       off = (addr - vma->vm_start) >> PAGE_SHIFT;
10610 +                       if (pfn == vma->vm_pgoff + off)
10611 +                               return NULL;
10612 +                       if (!is_cow_mapping(vma->vm_flags))
10613 +                               return NULL;
10614 +               }
10615 +       }
10616 +
10617 +       VM_BUG_ON(!pfn_valid(pfn));
10618 +
10619 +       /*
10620 +        * NOTE! We still have PageReserved() pages in the page tables.
10621 +        *
10622 +        * eg. VDSO mappings can cause them to exist.
10623 +        */
10624 +out:
10625 +       return pfn_to_page(pfn);
10626 +}
10627 +
10628 +/*
10629 + * copy one vm_area from one task to the other. Assumes the page tables
10630 + * already present in the new task to be cleared in the whole range
10631 + * covered by this vma.
10632 + */
10633 +
10634 +static inline void
10635 +copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
10636 +               pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
10637 +               unsigned long addr, int *rss)
10638 +{
10639 +       unsigned long vm_flags = vma->vm_flags;
10640 +       pte_t pte = *src_pte;
10641 +       struct page *page;
10642 +
10643 +       /* pte contains position in swap or file, so copy. */
10644 +       if (unlikely(!pte_present(pte))) {
10645 +               if (!pte_file(pte)) {
10646 +                       swp_entry_t entry = pte_to_swp_entry(pte);
10647 +
10648 +                       swap_duplicate(entry);
10649 +                       /* make sure dst_mm is on swapoff's mmlist. */
10650 +                       if (unlikely(list_empty(&dst_mm->mmlist))) {
10651 +                               spin_lock(&mmlist_lock);
10652 +                               if (list_empty(&dst_mm->mmlist))
10653 +                                       list_add(&dst_mm->mmlist,
10654 +                                                &src_mm->mmlist);
10655 +                               spin_unlock(&mmlist_lock);
10656 +                       }
10657 +                       if (is_write_migration_entry(entry) &&
10658 +                                       is_cow_mapping(vm_flags)) {
10659 +                               /*
10660 +                                * COW mappings require pages in both parent
10661 +                                * and child to be set to read.
10662 +                                */
10663 +                               make_migration_entry_read(&entry);
10664 +                               pte = swp_entry_to_pte(entry);
10665 +                               set_pte_at(src_mm, addr, src_pte, pte);
10666 +                       }
10667 +               }
10668 +               goto out_set_pte;
10669 +       }
10670 +
10671 +       /*
10672 +        * If it's a COW mapping, write protect it both
10673 +        * in the parent and the child
10674 +        */
10675 +       if (is_cow_mapping(vm_flags)) {
10676 +               ptep_set_wrprotect(src_mm, addr, src_pte);
10677 +               pte = pte_wrprotect(pte);
10678 +       }
10679 +
10680 +       /*
10681 +        * If it's a shared mapping, mark it clean in
10682 +        * the child
10683 +        */
10684 +       if (vm_flags & VM_SHARED)
10685 +               pte = pte_mkclean(pte);
10686 +       pte = pte_mkold(pte);
10687 +
10688 +       page = vm_normal_page(vma, addr, pte);
10689 +       if (page) {
10690 +               get_page(page);
10691 +               page_dup_rmap(page, vma, addr);
10692 +               rss[!!PageAnon(page)]++;
10693 +       }
10694 +
10695 +out_set_pte:
10696 +       set_pte_at(dst_mm, addr, dst_pte, pte);
10697 +}
10698 +
10699 +static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
10700 +               pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
10701 +               unsigned long addr, unsigned long end)
10702 +{
10703 +       pte_t *src_pte, *dst_pte;
10704 +       spinlock_t *src_ptl, *dst_ptl;
10705 +       int progress = 0;
10706 +       int rss[2];
10707 +
10708 +       if (!vx_rss_avail(dst_mm, ((end - addr)/PAGE_SIZE + 1)))
10709 +               return -ENOMEM;
10710 +
10711 +again:
10712 +       rss[1] = rss[0] = 0;
10713 +       dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
10714 +       if (!dst_pte)
10715 +               return -ENOMEM;
10716 +       src_pte = pte_offset_map_nested(src_pmd, addr);
10717 +       src_ptl = pte_lockptr(src_mm, src_pmd);
10718 +       spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
10719 +       arch_enter_lazy_mmu_mode();
10720 +
10721 +       do {
10722 +               /*
10723 +                * We are holding two locks at this point - either of them
10724 +                * could generate latencies in another task on another CPU.
10725 +                */
10726 +               if (progress >= 32) {
10727 +                       progress = 0;
10728 +                       if (need_resched() ||
10729 +                           spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
10730 +                               break;
10731 +               }
10732 +               if (pte_none(*src_pte)) {
10733 +                       progress++;
10734 +                       continue;
10735 +               }
10736 +               copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
10737 +               progress += 8;
10738 +       } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
10739 +
10740 +       arch_leave_lazy_mmu_mode();
10741 +       spin_unlock(src_ptl);
10742 +       pte_unmap_nested(src_pte - 1);
10743 +       add_mm_rss(dst_mm, rss[0], rss[1]);
10744 +       pte_unmap_unlock(dst_pte - 1, dst_ptl);
10745 +       cond_resched();
10746 +       if (addr != end)
10747 +               goto again;
10748 +       return 0;
10749 +}
10750 +
10751 +static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
10752 +               pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
10753 +               unsigned long addr, unsigned long end)
10754 +{
10755 +       pmd_t *src_pmd, *dst_pmd;
10756 +       unsigned long next;
10757 +
10758 +       dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
10759 +       if (!dst_pmd)
10760 +               return -ENOMEM;
10761 +       src_pmd = pmd_offset(src_pud, addr);
10762 +       do {
10763 +               next = pmd_addr_end(addr, end);
10764 +               if (pmd_none_or_clear_bad(src_pmd))
10765 +                       continue;
10766 +               if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
10767 +                                               vma, addr, next))
10768 +                       return -ENOMEM;
10769 +       } while (dst_pmd++, src_pmd++, addr = next, addr != end);
10770 +       return 0;
10771 +}
10772 +
10773 +static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
10774 +               pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
10775 +               unsigned long addr, unsigned long end)
10776 +{
10777 +       pud_t *src_pud, *dst_pud;
10778 +       unsigned long next;
10779 +
10780 +       dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
10781 +       if (!dst_pud)
10782 +               return -ENOMEM;
10783 +       src_pud = pud_offset(src_pgd, addr);
10784 +       do {
10785 +               next = pud_addr_end(addr, end);
10786 +               if (pud_none_or_clear_bad(src_pud))
10787 +                       continue;
10788 +               if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
10789 +                                               vma, addr, next))
10790 +                       return -ENOMEM;
10791 +       } while (dst_pud++, src_pud++, addr = next, addr != end);
10792 +       return 0;
10793 +}
10794 +
10795 +int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
10796 +               struct vm_area_struct *vma)
10797 +{
10798 +       pgd_t *src_pgd, *dst_pgd;
10799 +       unsigned long next;
10800 +       unsigned long addr = vma->vm_start;
10801 +       unsigned long end = vma->vm_end;
10802 +       int ret;
10803 +
10804 +       /*
10805 +        * Don't copy ptes where a page fault will fill them correctly.
10806 +        * Fork becomes much lighter when there are big shared or private
10807 +        * readonly mappings. The tradeoff is that copy_page_range is more
10808 +        * efficient than faulting.
10809 +        */
10810 +       if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
10811 +               if (!vma->anon_vma)
10812 +                       return 0;
10813 +       }
10814 +
10815 +       if (is_vm_hugetlb_page(vma))
10816 +               return copy_hugetlb_page_range(dst_mm, src_mm, vma);
10817 +
10818 +       /*
10819 +        * We need to invalidate the secondary MMU mappings only when
10820 +        * there could be a permission downgrade on the ptes of the
10821 +        * parent mm. And a permission downgrade will only happen if
10822 +        * is_cow_mapping() returns true.
10823 +        */
10824 +       if (is_cow_mapping(vma->vm_flags))
10825 +               mmu_notifier_invalidate_range_start(src_mm, addr, end);
10826 +
10827 +       ret = 0;
10828 +       dst_pgd = pgd_offset(dst_mm, addr);
10829 +       src_pgd = pgd_offset(src_mm, addr);
10830 +       do {
10831 +               next = pgd_addr_end(addr, end);
10832 +               if (pgd_none_or_clear_bad(src_pgd))
10833 +                       continue;
10834 +               if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
10835 +                                           vma, addr, next))) {
10836 +                       ret = -ENOMEM;
10837 +                       break;
10838 +               }
10839 +       } while (dst_pgd++, src_pgd++, addr = next, addr != end);
10840 +
10841 +       if (is_cow_mapping(vma->vm_flags))
10842 +               mmu_notifier_invalidate_range_end(src_mm,
10843 +                                                 vma->vm_start, end);
10844 +       return ret;
10845 +}
10846 +
10847 +static unsigned long zap_pte_range(struct mmu_gather *tlb,
10848 +                               struct vm_area_struct *vma, pmd_t *pmd,
10849 +                               unsigned long addr, unsigned long end,
10850 +                               long *zap_work, struct zap_details *details)
10851 +{
10852 +       struct mm_struct *mm = tlb->mm;
10853 +       pte_t *pte;
10854 +       spinlock_t *ptl;
10855 +       int file_rss = 0;
10856 +       int anon_rss = 0;
10857 +
10858 +       pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
10859 +       arch_enter_lazy_mmu_mode();
10860 +       do {
10861 +               pte_t ptent = *pte;
10862 +               if (pte_none(ptent)) {
10863 +                       (*zap_work)--;
10864 +                       continue;
10865 +               }
10866 +
10867 +               (*zap_work) -= PAGE_SIZE;
10868 +
10869 +               if (pte_present(ptent)) {
10870 +                       struct page *page;
10871 +
10872 +                       page = vm_normal_page(vma, addr, ptent);
10873 +                       if (unlikely(details) && page) {
10874 +                               /*
10875 +                                * unmap_shared_mapping_pages() wants to
10876 +                                * invalidate cache without truncating:
10877 +                                * unmap shared but keep private pages.
10878 +                                */
10879 +                               if (details->check_mapping &&
10880 +                                   details->check_mapping != page->mapping)
10881 +                                       continue;
10882 +                               /*
10883 +                                * Each page->index must be checked when
10884 +                                * invalidating or truncating nonlinear.
10885 +                                */
10886 +                               if (details->nonlinear_vma &&
10887 +                                   (page->index < details->first_index ||
10888 +                                    page->index > details->last_index))
10889 +                                       continue;
10890 +                       }
10891 +                       ptent = ptep_get_and_clear_full(mm, addr, pte,
10892 +                                                       tlb->fullmm);
10893 +                       tlb_remove_tlb_entry(tlb, pte, addr);
10894 +                       if (unlikely(!page))
10895 +                               continue;
10896 +                       if (unlikely(details) && details->nonlinear_vma
10897 +                           && linear_page_index(details->nonlinear_vma,
10898 +                                               addr) != page->index)
10899 +                               set_pte_at(mm, addr, pte,
10900 +                                          pgoff_to_pte(page->index));
10901 +                       if (PageAnon(page))
10902 +                               anon_rss--;
10903 +                       else {
10904 +                               if (pte_dirty(ptent))
10905 +                                       set_page_dirty(page);
10906 +                               if (pte_young(ptent))
10907 +                                       SetPageReferenced(page);
10908 +                               file_rss--;
10909 +                       }
10910 +                       page_remove_rmap(page, vma);
10911 +                       tlb_remove_page(tlb, page);
10912 +                       continue;
10913 +               }
10914 +               /*
10915 +                * If details->check_mapping, we leave swap entries;
10916 +                * if details->nonlinear_vma, we leave file entries.
10917 +                */
10918 +               if (unlikely(details))
10919 +                       continue;
10920 +               if (!pte_file(ptent))
10921 +                       free_swap_and_cache(pte_to_swp_entry(ptent));
10922 +               pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
10923 +       } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
10924 +
10925 +       add_mm_rss(mm, file_rss, anon_rss);
10926 +       arch_leave_lazy_mmu_mode();
10927 +       pte_unmap_unlock(pte - 1, ptl);
10928 +
10929 +       return addr;
10930 +}
10931 +
10932 +static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
10933 +                               struct vm_area_struct *vma, pud_t *pud,
10934 +                               unsigned long addr, unsigned long end,
10935 +                               long *zap_work, struct zap_details *details)
10936 +{
10937 +       pmd_t *pmd;
10938 +       unsigned long next;
10939 +
10940 +       pmd = pmd_offset(pud, addr);
10941 +       do {
10942 +               next = pmd_addr_end(addr, end);
10943 +               if (pmd_none_or_clear_bad(pmd)) {
10944 +                       (*zap_work)--;
10945 +                       continue;
10946 +               }
10947 +               next = zap_pte_range(tlb, vma, pmd, addr, next,
10948 +                                               zap_work, details);
10949 +       } while (pmd++, addr = next, (addr != end && *zap_work > 0));
10950 +
10951 +       return addr;
10952 +}
10953 +
10954 +static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
10955 +                               struct vm_area_struct *vma, pgd_t *pgd,
10956 +                               unsigned long addr, unsigned long end,
10957 +                               long *zap_work, struct zap_details *details)
10958 +{
10959 +       pud_t *pud;
10960 +       unsigned long next;
10961 +
10962 +       pud = pud_offset(pgd, addr);
10963 +       do {
10964 +               next = pud_addr_end(addr, end);
10965 +               if (pud_none_or_clear_bad(pud)) {
10966 +                       (*zap_work)--;
10967 +                       continue;
10968 +               }
10969 +               next = zap_pmd_range(tlb, vma, pud, addr, next,
10970 +                                               zap_work, details);
10971 +       } while (pud++, addr = next, (addr != end && *zap_work > 0));
10972 +
10973 +       return addr;
10974 +}
10975 +
10976 +static unsigned long unmap_page_range(struct mmu_gather *tlb,
10977 +                               struct vm_area_struct *vma,
10978 +                               unsigned long addr, unsigned long end,
10979 +                               long *zap_work, struct zap_details *details)
10980 +{
10981 +       pgd_t *pgd;
10982 +       unsigned long next;
10983 +
10984 +       if (details && !details->check_mapping && !details->nonlinear_vma)
10985 +               details = NULL;
10986 +
10987 +       BUG_ON(addr >= end);
10988 +       tlb_start_vma(tlb, vma);
10989 +       pgd = pgd_offset(vma->vm_mm, addr);
10990 +       do {
10991 +               next = pgd_addr_end(addr, end);
10992 +               if (pgd_none_or_clear_bad(pgd)) {
10993 +                       (*zap_work)--;
10994 +                       continue;
10995 +               }
10996 +               next = zap_pud_range(tlb, vma, pgd, addr, next,
10997 +                                               zap_work, details);
10998 +       } while (pgd++, addr = next, (addr != end && *zap_work > 0));
10999 +       tlb_end_vma(tlb, vma);
11000 +
11001 +       return addr;
11002 +}
11003 +
11004 +#ifdef CONFIG_PREEMPT
11005 +# define ZAP_BLOCK_SIZE        (8 * PAGE_SIZE)
11006 +#else
11007 +/* No preempt: go for improved straight-line efficiency */
11008 +# define ZAP_BLOCK_SIZE        (1024 * PAGE_SIZE)
11009 +#endif
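
For a sense of scale (assuming 4 KiB pages; PAGE_SIZE is architecture dependent): with CONFIG_PREEMPT a batch is 8 * 4 KiB = 32 KiB, so the mmu_gather is flushed and a reschedule point is reached roughly every eight pages, whereas without preemption up to 1024 * 4 KiB = 4 MiB is zapped per batch before unmap_vmas() below considers breaking out.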
11010 +
11011 +/**
11012 + * unmap_vmas - unmap a range of memory covered by a list of vma's
11013 + * @tlbp: address of the caller's struct mmu_gather
11014 + * @vma: the starting vma
11015 + * @start_addr: virtual address at which to start unmapping
11016 + * @end_addr: virtual address at which to end unmapping
11017 + * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
11018 + * @details: details of nonlinear truncation or shared cache invalidation
11019 + *
11020 + * Returns the end address of the unmapping (restart addr if interrupted).
11021 + *
11022 + * Unmap all pages in the vma list.
11023 + *
11024 + * We aim to not hold locks for too long (for scheduling latency reasons).
11025 + * So zap pages in ZAP_BLOCK_SIZE bytecounts.  This means we need to
11026 + * return the ending mmu_gather to the caller.
11027 + *
11028 + * Only addresses between `start' and `end' will be unmapped.
11029 + *
11030 + * The VMA list must be sorted in ascending virtual address order.
11031 + *
11032 + * unmap_vmas() assumes that the caller will flush the whole unmapped address
11033 + * range after unmap_vmas() returns.  So the only responsibility here is to
11034 + * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
11035 + * drops the lock and schedules.
11036 + */
11037 +unsigned long unmap_vmas(struct mmu_gather **tlbp,
11038 +               struct vm_area_struct *vma, unsigned long start_addr,
11039 +               unsigned long end_addr, unsigned long *nr_accounted,
11040 +               struct zap_details *details)
11041 +{
11042 +       long zap_work = ZAP_BLOCK_SIZE;
11043 +       unsigned long tlb_start = 0;    /* For tlb_finish_mmu */
11044 +       int tlb_start_valid = 0;
11045 +       unsigned long start = start_addr;
11046 +       spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
11047 +       int fullmm = (*tlbp)->fullmm;
11048 +       struct mm_struct *mm = vma->vm_mm;
11049 +
11050 +       mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
11051 +       for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
11052 +               unsigned long end;
11053 +
11054 +               start = max(vma->vm_start, start_addr);
11055 +               if (start >= vma->vm_end)
11056 +                       continue;
11057 +               end = min(vma->vm_end, end_addr);
11058 +               if (end <= vma->vm_start)
11059 +                       continue;
11060 +
11061 +               if (vma->vm_flags & VM_ACCOUNT)
11062 +                       *nr_accounted += (end - start) >> PAGE_SHIFT;
11063 +
11064 +               while (start != end) {
11065 +                       if (!tlb_start_valid) {
11066 +                               tlb_start = start;
11067 +                               tlb_start_valid = 1;
11068 +                       }
11069 +
11070 +                       if (unlikely(is_vm_hugetlb_page(vma))) {
11071 +                               /*
11072 +                                * It is undesirable to test vma->vm_file as it
11073 +                                * should be non-null for a valid hugetlb area.
11074 +                                * However, vm_file will be NULL in the error
11075 +                                * cleanup path of do_mmap_pgoff. When
11076 +                                * hugetlbfs ->mmap method fails,
11077 +                                * the hugetlbfs ->mmap method fails,
11078 +                                * before calling this function to clean up.
11079 +                                * Since no pte has actually been setup, it is
11080 +                                * safe to do nothing in this case.
11081 +                                */
11082 +                               if (vma->vm_file) {
11083 +                                       unmap_hugepage_range(vma, start, end, NULL);
11084 +                                       zap_work -= (end - start) /
11085 +                                       pages_per_huge_page(hstate_vma(vma));
11086 +                               }
11087 +
11088 +                               start = end;
11089 +                       } else
11090 +                               start = unmap_page_range(*tlbp, vma,
11091 +                                               start, end, &zap_work, details);
11092 +
11093 +                       if (zap_work > 0) {
11094 +                               BUG_ON(start != end);
11095 +                               break;
11096 +                       }
11097 +
11098 +                       tlb_finish_mmu(*tlbp, tlb_start, start);
11099 +
11100 +                       if (need_resched() ||
11101 +                               (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
11102 +                               if (i_mmap_lock) {
11103 +                                       *tlbp = NULL;
11104 +                                       goto out;
11105 +                               }
11106 +                               cond_resched();
11107 +                       }
11108 +
11109 +                       *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
11110 +                       tlb_start_valid = 0;
11111 +                       zap_work = ZAP_BLOCK_SIZE;
11112 +               }
11113 +       }
11114 +out:
11115 +       mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
11116 +       return start;   /* which is now the end (or restart) address */
11117 +}
11118 +
11119 +/**
11120 + * zap_page_range - remove user pages in a given range
11121 + * @vma: vm_area_struct holding the applicable pages
11122 + * @address: starting address of pages to zap
11123 + * @size: number of bytes to zap
11124 + * @details: details of nonlinear truncation or shared cache invalidation
11125 + */
11126 +unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
11127 +               unsigned long size, struct zap_details *details)
11128 +{
11129 +       struct mm_struct *mm = vma->vm_mm;
11130 +       struct mmu_gather *tlb;
11131 +       unsigned long end = address + size;
11132 +       unsigned long nr_accounted = 0;
11133 +
11134 +       lru_add_drain();
11135 +       tlb = tlb_gather_mmu(mm, 0);
11136 +       update_hiwater_rss(mm);
11137 +       end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
11138 +       if (tlb)
11139 +               tlb_finish_mmu(tlb, address, end);
11140 +       return end;
11141 +}
11142 +
11143 +/**
11144 + * zap_vma_ptes - remove ptes mapping the vma
11145 + * @vma: vm_area_struct holding ptes to be zapped
11146 + * @address: starting address of pages to zap
11147 + * @size: number of bytes to zap
11148 + *
11149 + * This function only unmaps ptes assigned to VM_PFNMAP vmas.
11150 + *
11151 + * The entire address range must be fully contained within the vma.
11152 + *
11153 + * Returns 0 if successful.
11154 + */
11155 +int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
11156 +               unsigned long size)
11157 +{
11158 +       if (address < vma->vm_start || address + size > vma->vm_end ||
11159 +                       !(vma->vm_flags & VM_PFNMAP))
11160 +               return -1;
11161 +       zap_page_range(vma, address, size, NULL);
11162 +       return 0;
11163 +}
11164 +EXPORT_SYMBOL_GPL(zap_vma_ptes);
11165 +
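
For illustration only (not part of the patch): a minimal sketch of the kind of caller zap_vma_ptes() above is meant for, a driver that installed ptes into a VM_PFNMAP vma and later needs to revoke one buffer. All mydrv_* names are hypothetical; only the zap_vma_ptes() call is taken from the code above, and the caller is assumed to hold the mm's mmap_sem.

#include <linux/kernel.h>
#include <linux/mm.h>

/*
 * Hypothetical revoke path: tear down the ptes backing one buffer that
 * was previously mapped into a VM_PFNMAP vma (e.g. with vm_insert_pfn).
 */
static void mydrv_revoke_buffer(struct vm_area_struct *vma,
				unsigned long offset, unsigned long len)
{
	/* Returns 0 on success, -1 if the range is not fully inside the
	 * vma or the vma is not VM_PFNMAP (see the checks above). */
	if (zap_vma_ptes(vma, vma->vm_start + offset, len))
		printk(KERN_WARNING "mydrv: failed to zap ptes\n");
}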
11166 +/*
11167 + * Do a quick page-table lookup for a single page.
11168 + */
11169 +struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
11170 +                       unsigned int flags)
11171 +{
11172 +       pgd_t *pgd;
11173 +       pud_t *pud;
11174 +       pmd_t *pmd;
11175 +       pte_t *ptep, pte;
11176 +       spinlock_t *ptl;
11177 +       struct page *page;
11178 +       struct mm_struct *mm = vma->vm_mm;
11179 +
11180 +       page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
11181 +       if (!IS_ERR(page)) {
11182 +               BUG_ON(flags & FOLL_GET);
11183 +               goto out;
11184 +       }
11185 +
11186 +       page = NULL;
11187 +       pgd = pgd_offset(mm, address);
11188 +       if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
11189 +               goto no_page_table;
11190 +
11191 +       pud = pud_offset(pgd, address);
11192 +       if (pud_none(*pud))
11193 +               goto no_page_table;
11194 +       if (pud_huge(*pud)) {
11195 +               BUG_ON(flags & FOLL_GET);
11196 +               page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
11197 +               goto out;
11198 +       }
11199 +       if (unlikely(pud_bad(*pud)))
11200 +               goto no_page_table;
11201 +
11202 +       pmd = pmd_offset(pud, address);
11203 +       if (pmd_none(*pmd))
11204 +               goto no_page_table;
11205 +       if (pmd_huge(*pmd)) {
11206 +               BUG_ON(flags & FOLL_GET);
11207 +               page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
11208 +               goto out;
11209 +       }
11210 +       if (unlikely(pmd_bad(*pmd)))
11211 +               goto no_page_table;
11212 +
11213 +       ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
11214 +
11215 +       pte = *ptep;
11216 +       if (!pte_present(pte))
11217 +               goto no_page;
11218 +       if ((flags & FOLL_WRITE) && !pte_write(pte))
11219 +               goto unlock;
11220 +       page = vm_normal_page(vma, address, pte);
11221 +       if (unlikely(!page))
11222 +               goto bad_page;
11223 +
11224 +       if (flags & FOLL_GET)
11225 +               get_page(page);
11226 +       if (flags & FOLL_TOUCH) {
11227 +               if ((flags & FOLL_WRITE) &&
11228 +                   !pte_dirty(pte) && !PageDirty(page))
11229 +                       set_page_dirty(page);
11230 +               mark_page_accessed(page);
11231 +       }
11232 +unlock:
11233 +       pte_unmap_unlock(ptep, ptl);
11234 +out:
11235 +       return page;
11236 +
11237 +bad_page:
11238 +       pte_unmap_unlock(ptep, ptl);
11239 +       return ERR_PTR(-EFAULT);
11240 +
11241 +no_page:
11242 +       pte_unmap_unlock(ptep, ptl);
11243 +       if (!pte_none(pte))
11244 +               return page;
11245 +       /* Fall through to ZERO_PAGE handling */
11246 +no_page_table:
11247 +       /*
11248 +        * When core dumping an enormous anonymous area that nobody
11249 +        * has touched so far, we don't want to allocate page tables.
11250 +        */
11251 +       if (flags & FOLL_ANON) {
11252 +               page = ZERO_PAGE(0);
11253 +               if (flags & FOLL_GET)
11254 +                       get_page(page);
11255 +               BUG_ON(flags & FOLL_WRITE);
11256 +       }
11257 +       return page;
11258 +}
11259 +
11260 +/* Can we do the FOLL_ANON optimization? */
11261 +static inline int use_zero_page(struct vm_area_struct *vma)
11262 +{
11263 +       /*
11264 +        * We don't want to optimize FOLL_ANON for make_pages_present()
11265 +        * when it tries to page in a VM_LOCKED region. As to VM_SHARED,
11266 +        * we want to get the page from the page tables to make sure
11267 +        * that we serialize and update with any other user of that
11268 +        * mapping.
11269 +        */
11270 +       if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
11271 +               return 0;
11272 +       /*
11273 +        * And if we have a fault routine, it's not an anonymous region.
11274 +        */
11275 +       return !vma->vm_ops || !vma->vm_ops->fault;
11276 +}
11277 +
11278 +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
11279 +               unsigned long start, int len, int write, int force,
11280 +               struct page **pages, struct vm_area_struct **vmas)
11281 +{
11282 +       int i;
11283 +       unsigned int vm_flags;
11284 +
11285 +       if (len <= 0)
11286 +               return 0;
11287 +       /* 
11288 +        * Require read or write permissions.
11289 +        * If 'force' is set, we only require the "MAY" flags.
11290 +        */
11291 +       vm_flags  = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
11292 +       vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
11293 +       i = 0;
11294 +
11295 +       do {
11296 +               struct vm_area_struct *vma;
11297 +               unsigned int foll_flags;
11298 +
11299 +               vma = find_extend_vma(mm, start);
11300 +               if (!vma && in_gate_area(tsk, start)) {
11301 +                       unsigned long pg = start & PAGE_MASK;
11302 +                       struct vm_area_struct *gate_vma = get_gate_vma(tsk);
11303 +                       pgd_t *pgd;
11304 +                       pud_t *pud;
11305 +                       pmd_t *pmd;
11306 +                       pte_t *pte;
11307 +                       if (write) /* user gate pages are read-only */
11308 +                               return i ? : -EFAULT;
11309 +                       if (pg > TASK_SIZE)
11310 +                               pgd = pgd_offset_k(pg);
11311 +                       else
11312 +                               pgd = pgd_offset_gate(mm, pg);
11313 +                       BUG_ON(pgd_none(*pgd));
11314 +                       pud = pud_offset(pgd, pg);
11315 +                       BUG_ON(pud_none(*pud));
11316 +                       pmd = pmd_offset(pud, pg);
11317 +                       if (pmd_none(*pmd))
11318 +                               return i ? : -EFAULT;
11319 +                       pte = pte_offset_map(pmd, pg);
11320 +                       if (pte_none(*pte)) {
11321 +                               pte_unmap(pte);
11322 +                               return i ? : -EFAULT;
11323 +                       }
11324 +                       if (pages) {
11325 +                               struct page *page = vm_normal_page(gate_vma, start, *pte);
11326 +                               pages[i] = page;
11327 +                               if (page)
11328 +                                       get_page(page);
11329 +                       }
11330 +                       pte_unmap(pte);
11331 +                       if (vmas)
11332 +                               vmas[i] = gate_vma;
11333 +                       i++;
11334 +                       start += PAGE_SIZE;
11335 +                       len--;
11336 +                       continue;
11337 +               }
11338 +
11339 +               if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
11340 +                               || !(vm_flags & vma->vm_flags))
11341 +                       return i ? : -EFAULT;
11342 +
11343 +               if (is_vm_hugetlb_page(vma)) {
11344 +                       i = follow_hugetlb_page(mm, vma, pages, vmas,
11345 +                                               &start, &len, i, write);
11346 +                       continue;
11347 +               }
11348 +
11349 +               foll_flags = FOLL_TOUCH;
11350 +               if (pages)
11351 +                       foll_flags |= FOLL_GET;
11352 +               if (!write && use_zero_page(vma))
11353 +                       foll_flags |= FOLL_ANON;
11354 +
11355 +               do {
11356 +                       struct page *page;
11357 +
11358 +                       /*
11359 +                        * If tsk is ooming, cut off its access to large memory
11360 +                        * allocations. It has a pending SIGKILL, but it can't
11361 +                        * be processed until returning to user space.
11362 +                        */
11363 +                       if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
11364 +                               return i ? i : -ENOMEM;
11365 +
11366 +                       if (write)
11367 +                               foll_flags |= FOLL_WRITE;
11368 +
11369 +                       cond_resched();
11370 +                       while (!(page = follow_page(vma, start, foll_flags))) {
11371 +                               int ret;
11372 +                               ret = handle_mm_fault(mm, vma, start,
11373 +                                               foll_flags & FOLL_WRITE);
11374 +                               if (ret & VM_FAULT_ERROR) {
11375 +                                       if (ret & VM_FAULT_OOM)
11376 +                                               return i ? i : -ENOMEM;
11377 +                                       else if (ret & VM_FAULT_SIGBUS)
11378 +                                               return i ? i : -EFAULT;
11379 +                                       BUG();
11380 +                               }
11381 +                               if (ret & VM_FAULT_MAJOR)
11382 +                                       tsk->maj_flt++;
11383 +                               else
11384 +                                       tsk->min_flt++;
11385 +
11386 +                               /*
11387 +                                * The VM_FAULT_WRITE bit tells us that
11388 +                                * do_wp_page has broken COW when necessary,
11389 +                                * even if maybe_mkwrite decided not to set
11390 +                                * pte_write. We can thus safely do subsequent
11391 +                                * page lookups as if they were reads.
11392 +                                */
11393 +                               if (ret & VM_FAULT_WRITE)
11394 +                                       foll_flags &= ~FOLL_WRITE;
11395 +
11396 +                               cond_resched();
11397 +                       }
11398 +                       if (IS_ERR(page))
11399 +                               return i ? i : PTR_ERR(page);
11400 +                       if (pages) {
11401 +                               pages[i] = page;
11402 +
11403 +                               flush_anon_page(vma, page, start);
11404 +                               flush_dcache_page(page);
11405 +                       }
11406 +                       if (vmas)
11407 +                               vmas[i] = vma;
11408 +                       i++;
11409 +                       start += PAGE_SIZE;
11410 +                       len--;
11411 +               } while (len && start < vma->vm_end);
11412 +       } while (len);
11413 +       return i;
11414 +}
11415 +EXPORT_SYMBOL(get_user_pages);
11416 +
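
For illustration only: the usual pinning pattern built on the get_user_pages() definition above — take mmap_sem for read, request write access, and drop each page reference when done. The mydrv_* wrappers and the abbreviated error handling are assumptions, not part of the patch.

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/sched.h>

/* Pin nr_pages user pages starting at uaddr for writing.  Returns the
 * number of pages actually pinned, or a negative errno. */
static int mydrv_pin_pages(unsigned long uaddr, int nr_pages,
			   struct page **pages)
{
	int pinned;

	down_read(&current->mm->mmap_sem);
	pinned = get_user_pages(current, current->mm, uaddr & PAGE_MASK,
				nr_pages, 1 /* write */, 0 /* force */,
				pages, NULL);
	up_read(&current->mm->mmap_sem);
	return pinned;
}

/* Drop the references taken via FOLL_GET once the I/O has completed. */
static void mydrv_unpin_pages(struct page **pages, int nr)
{
	while (nr-- > 0)
		page_cache_release(pages[nr]);
}

Note that get_user_pages() may return fewer pages than requested, so a caller should release exactly the count it returned.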
11417 +pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
11418 +                       spinlock_t **ptl)
11419 +{
11420 +       pgd_t * pgd = pgd_offset(mm, addr);
11421 +       pud_t * pud = pud_alloc(mm, pgd, addr);
11422 +       if (pud) {
11423 +               pmd_t * pmd = pmd_alloc(mm, pud, addr);
11424 +               if (pmd)
11425 +                       return pte_alloc_map_lock(mm, pmd, addr, ptl);
11426 +       }
11427 +       return NULL;
11428 +}
11429 +
11430 +/*
11431 + * This is the old fallback for page remapping.
11432 + *
11433 + * For historical reasons, it only allows reserved pages. Only
11434 + * old drivers should use this, and they needed to mark their
11435 + * pages reserved for the old functions anyway.
11436 + */
11437 +static int insert_page(struct vm_area_struct *vma, unsigned long addr,
11438 +                       struct page *page, pgprot_t prot)
11439 +{
11440 +       struct mm_struct *mm = vma->vm_mm;
11441 +       int retval;
11442 +       pte_t *pte;
11443 +       spinlock_t *ptl;
11444 +
11445 +       retval = mem_cgroup_charge(page, mm, GFP_KERNEL);
11446 +       if (retval)
11447 +               goto out;
11448 +
11449 +       retval = -EINVAL;
11450 +       if (PageAnon(page))
11451 +               goto out_uncharge;
11452 +       retval = -ENOMEM;
11453 +       flush_dcache_page(page);
11454 +       pte = get_locked_pte(mm, addr, &ptl);
11455 +       if (!pte)
11456 +               goto out_uncharge;
11457 +       retval = -EBUSY;
11458 +       if (!pte_none(*pte))
11459 +               goto out_unlock;
11460 +
11461 +       /* Ok, finally just insert the thing.. */
11462 +       get_page(page);
11463 +       inc_mm_counter(mm, file_rss);
11464 +       page_add_file_rmap(page);
11465 +       set_pte_at(mm, addr, pte, mk_pte(page, prot));
11466 +
11467 +       retval = 0;
11468 +       pte_unmap_unlock(pte, ptl);
11469 +       return retval;
11470 +out_unlock:
11471 +       pte_unmap_unlock(pte, ptl);
11472 +out_uncharge:
11473 +       mem_cgroup_uncharge_page(page);
11474 +out:
11475 +       return retval;
11476 +}
11477 +
11478 +/**
11479 + * vm_insert_page - insert single page into user vma
11480 + * @vma: user vma to map to
11481 + * @addr: target user address of this page
11482 + * @page: source kernel page
11483 + *
11484 + * This allows drivers to insert individual pages they've allocated
11485 + * into a user vma.
11486 + *
11487 + * The page has to be a nice clean _individual_ kernel allocation.
11488 + * If you allocate a compound page, you need to have marked it as
11489 + * such (__GFP_COMP), or manually just split the page up yourself
11490 + * (see split_page()).
11491 + *
11492 + * NOTE! Traditionally this was done with "remap_pfn_range()" which
11493 + * took an arbitrary page protection parameter. This doesn't allow
11494 + * that. Your vma protection will have to be set up correctly, which
11495 + * means that if you want a shared writable mapping, you'd better
11496 + * ask for a shared writable mapping!
11497 + *
11498 + * The page does not need to be reserved.
11499 + */
11500 +int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
11501 +                       struct page *page)
11502 +{
11503 +       if (addr < vma->vm_start || addr >= vma->vm_end)
11504 +               return -EFAULT;
11505 +       if (!page_count(page))
11506 +               return -EINVAL;
11507 +       vma->vm_flags |= VM_INSERTPAGE;
11508 +       return insert_page(vma, addr, page, vma->vm_page_prot);
11509 +}
11510 +EXPORT_SYMBOL(vm_insert_page);
11511 +
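
A sketch of the caller the comment above describes: a driver ->mmap method inserting pages it allocated itself, one at a time. struct mybuf and its page array are hypothetical; only vm_insert_page() comes from the code above.

#include <linux/fs.h>
#include <linux/mm.h>

struct mybuf {
	struct page **pages;	/* order-0 pages from alloc_page(GFP_KERNEL) */
	unsigned long npages;
};

static int mybuf_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct mybuf *buf = file->private_data;
	unsigned long npages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
	unsigned long i;

	if (npages > buf->npages)
		return -EINVAL;
	for (i = 0; i < npages; i++) {
		int err = vm_insert_page(vma,
					 vma->vm_start + (i << PAGE_SHIFT),
					 buf->pages[i]);
		if (err)
			return err;
	}
	return 0;
}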
11512 +static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
11513 +                       unsigned long pfn, pgprot_t prot)
11514 +{
11515 +       struct mm_struct *mm = vma->vm_mm;
11516 +       int retval;
11517 +       pte_t *pte, entry;
11518 +       spinlock_t *ptl;
11519 +
11520 +       retval = -ENOMEM;
11521 +       pte = get_locked_pte(mm, addr, &ptl);
11522 +       if (!pte)
11523 +               goto out;
11524 +       retval = -EBUSY;
11525 +       if (!pte_none(*pte))
11526 +               goto out_unlock;
11527 +
11528 +       /* Ok, finally just insert the thing.. */
11529 +       entry = pte_mkspecial(pfn_pte(pfn, prot));
11530 +       set_pte_at(mm, addr, pte, entry);
11531 +       update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */
11532 +
11533 +       retval = 0;
11534 +out_unlock:
11535 +       pte_unmap_unlock(pte, ptl);
11536 +out:
11537 +       return retval;
11538 +}
11539 +
11540 +/**
11541 + * vm_insert_pfn - insert single pfn into user vma
11542 + * @vma: user vma to map to
11543 + * @addr: target user address of this page
11544 + * @pfn: source kernel pfn
11545 + *
11546 + * Similar to vm_insert_page, this allows drivers to insert individual pages
11547 + * they've allocated into a user vma. Same comments apply.
11548 + *
11549 + * This function should only be called from a vm_ops->fault handler, and
11550 + * in that case the handler should return VM_FAULT_NOPAGE.
11551 + *
11552 + * vma cannot be a COW mapping.
11553 + *
11554 + * As this is called only for pages that do not currently exist, we
11555 + * do not need to flush old virtual caches or the TLB.
11556 + */
11557 +int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
11558 +                       unsigned long pfn)
11559 +{
11560 +       /*
11561 +        * Technically, architectures with pte_special can avoid all these
11562 +        * restrictions (same for remap_pfn_range).  However we would like
11563 +        * consistency in testing and feature parity among all, so we should
11564 +        * try to keep these invariants in place for everybody.
11565 +        */
11566 +       BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
11567 +       BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
11568 +                                               (VM_PFNMAP|VM_MIXEDMAP));
11569 +       BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
11570 +       BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
11571 +
11572 +       if (addr < vma->vm_start || addr >= vma->vm_end)
11573 +               return -EFAULT;
11574 +       return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
11575 +}
11576 +EXPORT_SYMBOL(vm_insert_pfn);
11577 +
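
A hedged sketch of a ->fault handler built on vm_insert_pfn() as described above. The device pfn base is a placeholder, returning VM_FAULT_NOPAGE after a successful insert is the convention for fault handlers of this kernel generation, and the error handling is simplified.

#include <linux/mm.h>

static unsigned long mydev_base_pfn;	/* hypothetical: set from the BAR at probe time */

static int mydev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long addr = (unsigned long)vmf->virtual_address;
	int err;

	err = vm_insert_pfn(vma, addr, mydev_base_pfn + vmf->pgoff);
	if (err == -ENOMEM)
		return VM_FAULT_OOM;
	if (err && err != -EBUSY)	/* -EBUSY: raced with another fault */
		return VM_FAULT_SIGBUS;
	return VM_FAULT_NOPAGE;		/* pte installed, nothing more to do */
}

static struct vm_operations_struct mydev_vm_ops = {
	.fault = mydev_fault,
};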
11578 +int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
11579 +                       unsigned long pfn)
11580 +{
11581 +       BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
11582 +
11583 +       if (addr < vma->vm_start || addr >= vma->vm_end)
11584 +               return -EFAULT;
11585 +
11586 +       /*
11587 +        * If we don't have pte special, then we have to use the pfn_valid()
11588 +        * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
11589 +        * refcount the page if pfn_valid is true (hence insert_page rather
11590 +        * than insert_pfn).
11591 +        */
11592 +       if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
11593 +               struct page *page;
11594 +
11595 +               page = pfn_to_page(pfn);
11596 +               return insert_page(vma, addr, page, vma->vm_page_prot);
11597 +       }
11598 +       return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
11599 +}
11600 +EXPORT_SYMBOL(vm_insert_mixed);
11601 +
11602 +/*
11603 + * Maps a range of physical memory into the requested pages. The old
11604 + * mappings are removed. Any references to nonexistent pages result
11605 + * in null mappings (currently treated as "copy-on-access")
11606 + */
11607 +static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
11608 +                       unsigned long addr, unsigned long end,
11609 +                       unsigned long pfn, pgprot_t prot)
11610 +{
11611 +       pte_t *pte;
11612 +       spinlock_t *ptl;
11613 +
11614 +       pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
11615 +       if (!pte)
11616 +               return -ENOMEM;
11617 +       arch_enter_lazy_mmu_mode();
11618 +       do {
11619 +               BUG_ON(!pte_none(*pte));
11620 +               set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
11621 +               pfn++;
11622 +       } while (pte++, addr += PAGE_SIZE, addr != end);
11623 +       arch_leave_lazy_mmu_mode();
11624 +       pte_unmap_unlock(pte - 1, ptl);
11625 +       return 0;
11626 +}
11627 +
11628 +static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
11629 +                       unsigned long addr, unsigned long end,
11630 +                       unsigned long pfn, pgprot_t prot)
11631 +{
11632 +       pmd_t *pmd;
11633 +       unsigned long next;
11634 +
11635 +       pfn -= addr >> PAGE_SHIFT;
11636 +       pmd = pmd_alloc(mm, pud, addr);
11637 +       if (!pmd)
11638 +               return -ENOMEM;
11639 +       do {
11640 +               next = pmd_addr_end(addr, end);
11641 +               if (remap_pte_range(mm, pmd, addr, next,
11642 +                               pfn + (addr >> PAGE_SHIFT), prot))
11643 +                       return -ENOMEM;
11644 +       } while (pmd++, addr = next, addr != end);
11645 +       return 0;
11646 +}
11647 +
11648 +static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
11649 +                       unsigned long addr, unsigned long end,
11650 +                       unsigned long pfn, pgprot_t prot)
11651 +{
11652 +       pud_t *pud;
11653 +       unsigned long next;
11654 +
11655 +       pfn -= addr >> PAGE_SHIFT;
11656 +       pud = pud_alloc(mm, pgd, addr);
11657 +       if (!pud)
11658 +               return -ENOMEM;
11659 +       do {
11660 +               next = pud_addr_end(addr, end);
11661 +               if (remap_pmd_range(mm, pud, addr, next,
11662 +                               pfn + (addr >> PAGE_SHIFT), prot))
11663 +                       return -ENOMEM;
11664 +       } while (pud++, addr = next, addr != end);
11665 +       return 0;
11666 +}
11667 +
11668 +/**
11669 + * remap_pfn_range - remap kernel memory to userspace
11670 + * @vma: user vma to map to
11671 + * @addr: target user address to start at
11672 + * @pfn: physical address of kernel memory
11673 + * @size: size of map area
11674 + * @prot: page protection flags for this mapping
11675 + *
11676 + *  Note: this is only safe if the mm semaphore is held when called.
11677 + */
11678 +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
11679 +                   unsigned long pfn, unsigned long size, pgprot_t prot)
11680 +{
11681 +       pgd_t *pgd;
11682 +       unsigned long next;
11683 +       unsigned long end = addr + PAGE_ALIGN(size);
11684 +       struct mm_struct *mm = vma->vm_mm;
11685 +       int err;
11686 +
11687 +       /*
11688 +        * Physically remapped pages are special. Tell the
11689 +        * rest of the world about it:
11690 +        *   VM_IO tells people not to look at these pages
11691 +        *      (accesses can have side effects).
11692 +        *   VM_RESERVED is specified all over the place, because
11693 +        *      in 2.4 it kept swapout's vma scan off this vma; but
11694 +        *      in 2.6 the LRU scan won't even find its pages, so this
11695 +        *      flag means no more than count its pages in reserved_vm,
11696 +        *      and omit it from core dump, even when VM_IO turned off.
11697 +        *   VM_PFNMAP tells the core MM that the base pages are just
11698 +        *      raw PFN mappings, and do not have a "struct page" associated
11699 +        *      with them.
11700 +        *
11701 +        * There's a horrible special case to handle copy-on-write
11702 +        * behaviour that some programs depend on. We mark the "original"
11703 +        * un-COW'ed pages by matching them up with "vma->vm_pgoff".
11704 +        */
11705 +       if (is_cow_mapping(vma->vm_flags)) {
11706 +               if (addr != vma->vm_start || end != vma->vm_end)
11707 +                       return -EINVAL;
11708 +               vma->vm_pgoff = pfn;
11709 +       }
11710 +
11711 +       vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
11712 +
11713 +       BUG_ON(addr >= end);
11714 +       pfn -= addr >> PAGE_SHIFT;
11715 +       pgd = pgd_offset(mm, addr);
11716 +       flush_cache_range(vma, addr, end);
11717 +       do {
11718 +               next = pgd_addr_end(addr, end);
11719 +               err = remap_pud_range(mm, pgd, addr, next,
11720 +                               pfn + (addr >> PAGE_SHIFT), prot);
11721 +               if (err)
11722 +                       break;
11723 +       } while (pgd++, addr = next, addr != end);
11724 +       return err;
11725 +}
11726 +EXPORT_SYMBOL(remap_pfn_range);
11727 +
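
The classic consumer of remap_pfn_range() is a character-device ->mmap method; a minimal sketch follows. The MMIO base address is a made-up placeholder, and pgprot_noncached() is the architecture-provided helper commonly used for device memory.

#include <linux/fs.h>
#include <linux/mm.h>

#define MYMMIO_PHYS_BASE	0xfd000000UL	/* hypothetical device region */

static int mymmio_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	/* remap_pfn_range() marks the vma VM_IO | VM_RESERVED | VM_PFNMAP
	 * itself, as the comment above explains. */
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	return remap_pfn_range(vma, vma->vm_start,
			       MYMMIO_PHYS_BASE >> PAGE_SHIFT,
			       size, vma->vm_page_prot);
}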
11728 +static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
11729 +                                    unsigned long addr, unsigned long end,
11730 +                                    pte_fn_t fn, void *data)
11731 +{
11732 +       pte_t *pte;
11733 +       int err;
11734 +       pgtable_t token;
11735 +       spinlock_t *uninitialized_var(ptl);
11736 +
11737 +       pte = (mm == &init_mm) ?
11738 +               pte_alloc_kernel(pmd, addr) :
11739 +               pte_alloc_map_lock(mm, pmd, addr, &ptl);
11740 +       if (!pte)
11741 +               return -ENOMEM;
11742 +
11743 +       BUG_ON(pmd_huge(*pmd));
11744 +
11745 +       token = pmd_pgtable(*pmd);
11746 +
11747 +       do {
11748 +               err = fn(pte, token, addr, data);
11749 +               if (err)
11750 +                       break;
11751 +       } while (pte++, addr += PAGE_SIZE, addr != end);
11752 +
11753 +       if (mm != &init_mm)
11754 +               pte_unmap_unlock(pte-1, ptl);
11755 +       return err;
11756 +}
11757 +
11758 +static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
11759 +                                    unsigned long addr, unsigned long end,
11760 +                                    pte_fn_t fn, void *data)
11761 +{
11762 +       pmd_t *pmd;
11763 +       unsigned long next;
11764 +       int err;
11765 +
11766 +       BUG_ON(pud_huge(*pud));
11767 +
11768 +       pmd = pmd_alloc(mm, pud, addr);
11769 +       if (!pmd)
11770 +               return -ENOMEM;
11771 +       do {
11772 +               next = pmd_addr_end(addr, end);
11773 +               err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
11774 +               if (err)
11775 +                       break;
11776 +       } while (pmd++, addr = next, addr != end);
11777 +       return err;
11778 +}
11779 +
11780 +static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
11781 +                                    unsigned long addr, unsigned long end,
11782 +                                    pte_fn_t fn, void *data)
11783 +{
11784 +       pud_t *pud;
11785 +       unsigned long next;
11786 +       int err;
11787 +
11788 +       pud = pud_alloc(mm, pgd, addr);
11789 +       if (!pud)
11790 +               return -ENOMEM;
11791 +       do {
11792 +               next = pud_addr_end(addr, end);
11793 +               err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
11794 +               if (err)
11795 +                       break;
11796 +       } while (pud++, addr = next, addr != end);
11797 +       return err;
11798 +}
11799 +
11800 +/*
11801 + * Scan a region of virtual memory, filling in page tables as necessary
11802 + * and calling a provided function on each leaf page table.
11803 + */
11804 +int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
11805 +                       unsigned long size, pte_fn_t fn, void *data)
11806 +{
11807 +       pgd_t *pgd;
11808 +       unsigned long next;
11809 +       unsigned long start = addr, end = addr + size;
11810 +       int err;
11811 +
11812 +       BUG_ON(addr >= end);
11813 +       mmu_notifier_invalidate_range_start(mm, start, end);
11814 +       pgd = pgd_offset(mm, addr);
11815 +       do {
11816 +               next = pgd_addr_end(addr, end);
11817 +               err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
11818 +               if (err)
11819 +                       break;
11820 +       } while (pgd++, addr = next, addr != end);
11821 +       mmu_notifier_invalidate_range_end(mm, start, end);
11822 +       return err;
11823 +}
11824 +EXPORT_SYMBOL_GPL(apply_to_page_range);
11825 +
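
A small sketch of the pte_fn_t contract used by apply_to_page_range() above: the callback is invoked once per pte slot with the pte pointer, the page-table page token, the virtual address and the opaque data pointer, and a non-zero return aborts the walk. The counting callback below is hypothetical, and note that the walk allocates intermediate page tables where they are missing, so this is only a demonstration of the callback signature.

#include <linux/mm.h>

/* Count how many pte slots in [addr, addr + size) are already present. */
static int count_present_pte(pte_t *pte, pgtable_t token,
			     unsigned long addr, void *data)
{
	if (pte_present(*pte))
		(*(unsigned long *)data)++;
	return 0;		/* returning non-zero would stop the walk */
}

static unsigned long count_present(struct mm_struct *mm,
				   unsigned long addr, unsigned long size)
{
	unsigned long n = 0;

	apply_to_page_range(mm, addr, size, count_present_pte, &n);
	return n;
}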
11826 +/*
11827 + * handle_pte_fault chooses page fault handler according to an entry
11828 + * which was read non-atomically.  Before making any commitment, on
11829 + * those architectures or configurations (e.g. i386 with PAE) which
11830 + * might give a mix of unmatched parts, do_swap_page and do_file_page
11831 + * must check under lock before unmapping the pte and proceeding
11832 + * (but do_wp_page is only called after already making such a check;
11833 + * and do_anonymous_page and do_no_page can safely check later on).
11834 + */
11835 +static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
11836 +                               pte_t *page_table, pte_t orig_pte)
11837 +{
11838 +       int same = 1;
11839 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
11840 +       if (sizeof(pte_t) > sizeof(unsigned long)) {
11841 +               spinlock_t *ptl = pte_lockptr(mm, pmd);
11842 +               spin_lock(ptl);
11843 +               same = pte_same(*page_table, orig_pte);
11844 +               spin_unlock(ptl);
11845 +       }
11846 +#endif
11847 +       pte_unmap(page_table);
11848 +       return same;
11849 +}
11850 +
11851 +/*
11852 + * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
11853 + * servicing faults for write access.  In the normal case, we always want
11854 + * pte_mkwrite.  But get_user_pages can cause write faults for mappings
11855 + * that do not have writing enabled, when used by access_process_vm.
11856 + */
11857 +static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
11858 +{
11859 +       if (likely(vma->vm_flags & VM_WRITE))
11860 +               pte = pte_mkwrite(pte);
11861 +       return pte;
11862 +}
11863 +
11864 +static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
11865 +{
11866 +       /*
11867 +        * If the source page was a PFN mapping, we don't have
11868 +        * a "struct page" for it. We do a best-effort copy by
11869 +        * just copying from the original user address. If that
11870 +        * fails, we just zero-fill it. Live with it.
11871 +        */
11872 +       if (unlikely(!src)) {
11873 +               void *kaddr = kmap_atomic(dst, KM_USER0);
11874 +               void __user *uaddr = (void __user *)(va & PAGE_MASK);
11875 +
11876 +               /*
11877 +                * This really shouldn't fail, because the page is there
11878 +                * in the page tables. But it might just be unreadable,
11879 +                * in which case we just give up and fill the result with
11880 +                * zeroes.
11881 +                */
11882 +               if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
11883 +                       memset(kaddr, 0, PAGE_SIZE);
11884 +               kunmap_atomic(kaddr, KM_USER0);
11885 +               flush_dcache_page(dst);
11886 +       } else
11887 +               copy_user_highpage(dst, src, va, vma);
11888 +}
11889 +
11890 +/*
11891 + * This routine handles present pages, when users try to write
11892 + * to a shared page. It is done by copying the page to a new address
11893 + * and decrementing the shared-page counter for the old page.
11894 + *
11895 + * Note that this routine assumes that the protection checks have been
11896 + * done by the caller (the low-level page fault routine in most cases).
11897 + * Thus we can safely just mark it writable once we've done any necessary
11898 + * COW.
11899 + *
11900 + * We also mark the page dirty at this point even though the page will
11901 + * change only once the write actually happens. This avoids a few races,
11902 + * and potentially makes it more efficient.
11903 + *
11904 + * We enter with non-exclusive mmap_sem (to exclude vma changes,
11905 + * but allow concurrent faults), with pte both mapped and locked.
11906 + * We return with mmap_sem still held, but pte unmapped and unlocked.
11907 + */
11908 +static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
11909 +               unsigned long address, pte_t *page_table, pmd_t *pmd,
11910 +               spinlock_t *ptl, pte_t orig_pte)
11911 +{
11912 +       struct page *old_page, *new_page;
11913 +       pte_t entry;
11914 +       int reuse = 0, ret = 0;
11915 +       int page_mkwrite = 0;
11916 +       struct page *dirty_page = NULL;
11917 +
11918 +       old_page = vm_normal_page(vma, address, orig_pte);
11919 +       if (!old_page) {
11920 +               /*
11921 +                * VM_MIXEDMAP !pfn_valid() case
11922 +                *
11923 +                * We should not cow pages in a shared writeable mapping.
11924 +                * Just mark the pages writable as we can't do any dirty
11925 +                * accounting on raw pfn maps.
11926 +                */
11927 +               if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
11928 +                                    (VM_WRITE|VM_SHARED))
11929 +                       goto reuse;
11930 +               goto gotten;
11931 +       }
11932 +
11933 +       /*
11934 +        * Take out anonymous pages first, anonymous shared vmas are
11935 +        * not dirty accountable.
11936 +        */
11937 +       if (PageAnon(old_page)) {
11938 +               if (trylock_page(old_page)) {
11939 +                       reuse = can_share_swap_page(old_page);
11940 +                       unlock_page(old_page);
11941 +               }
11942 +       } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
11943 +                                       (VM_WRITE|VM_SHARED))) {
11944 +               /*
11945 +                * Only catch write-faults on shared writable pages,
11946 +                * read-only shared pages can get COWed by
11947 +                * get_user_pages(.write=1, .force=1).
11948 +                */
11949 +               if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
11950 +                       /*
11951 +                        * Notify the address space that the page is about to
11952 +                        * become writable so that it can prohibit this or wait
11953 +                        * for the page to get into an appropriate state.
11954 +                        *
11955 +                        * We do this without the lock held, so that it can
11956 +                        * sleep if it needs to.
11957 +                        */
11958 +                       page_cache_get(old_page);
11959 +                       pte_unmap_unlock(page_table, ptl);
11960 +
11961 +                       if (vma->vm_ops->page_mkwrite(vma, old_page) < 0)
11962 +                               goto unwritable_page;
11963 +
11964 +                       /*
11965 +                        * Since we dropped the lock we need to revalidate
11966 +                        * the PTE as someone else may have changed it.  If
11967 +                        * they did, we just return, as we can count on the
11968 +                        * MMU to tell us if they didn't also make it writable.
11969 +                        */
11970 +                       page_table = pte_offset_map_lock(mm, pmd, address,
11971 +                                                        &ptl);
11972 +                       page_cache_release(old_page);
11973 +                       if (!pte_same(*page_table, orig_pte))
11974 +                               goto unlock;
11975 +
11976 +                       page_mkwrite = 1;
11977 +               }
11978 +               dirty_page = old_page;
11979 +               get_page(dirty_page);
11980 +               reuse = 1;
11981 +       }
11982 +
11983 +       if (reuse) {
11984 +reuse:
11985 +               flush_cache_page(vma, address, pte_pfn(orig_pte));
11986 +               entry = pte_mkyoung(orig_pte);
11987 +               entry = maybe_mkwrite(pte_mkdirty(entry), vma);
11988 +               if (ptep_set_access_flags(vma, address, page_table, entry,1))
11989 +                       update_mmu_cache(vma, address, entry);
11990 +               ret |= VM_FAULT_WRITE;
11991 +               goto unlock;
11992 +       }
11993 +
11994 +       /*
11995 +        * Ok, we need to copy. Oh, well..
11996 +        */
11997 +       page_cache_get(old_page);
11998 +gotten:
11999 +       pte_unmap_unlock(page_table, ptl);
12000 +
12001 +       if (unlikely(anon_vma_prepare(vma)))
12002 +               goto oom;
12003 +       VM_BUG_ON(old_page == ZERO_PAGE(0));
12004 +       new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
12005 +       if (!new_page)
12006 +               goto oom;
12007 +       cow_user_page(new_page, old_page, address, vma);
12008 +       __SetPageUptodate(new_page);
12009 +
12010 +       if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
12011 +               goto oom_free_new;
12012 +
12013 +       /*
12014 +        * Re-check the pte - we dropped the lock
12015 +        */
12016 +       page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
12017 +       if (likely(pte_same(*page_table, orig_pte))) {
12018 +               if (old_page) {
12019 +                       if (!PageAnon(old_page)) {
12020 +                               dec_mm_counter(mm, file_rss);
12021 +                               inc_mm_counter(mm, anon_rss);
12022 +                       }
12023 +               } else
12024 +                       inc_mm_counter(mm, anon_rss);
12025 +               flush_cache_page(vma, address, pte_pfn(orig_pte));
12026 +               entry = mk_pte(new_page, vma->vm_page_prot);
12027 +               entry = maybe_mkwrite(pte_mkdirty(entry), vma);
12028 +               /*
12029 +                * Clear the pte entry and flush it first, before updating the
12030 +                * pte with the new entry. This will avoid a race condition
12031 +                * seen in the presence of one thread doing SMC and another
12032 +                * thread doing COW.
12033 +                */
12034 +               ptep_clear_flush_notify(vma, address, page_table);
12035 +               set_pte_at(mm, address, page_table, entry);
12036 +               update_mmu_cache(vma, address, entry);
12037 +               lru_cache_add_active(new_page);
12038 +               page_add_new_anon_rmap(new_page, vma, address);
12039 +
12040 +               if (old_page) {
12041 +                       /*
12042 +                        * Only after switching the pte to the new page may
12043 +                        * we remove the mapcount here. Otherwise another
12044 +                        * process may come and find the rmap count decremented
12045 +                        * before the pte is switched to the new page, and
12046 +                        * "reuse" the old page writing into it while our pte
12047 +                        * here still points into it and can be read by other
12048 +                        * threads.
12049 +                        *
12050 +                        * The critical issue is to order this
12051 +                        * page_remove_rmap with the ptep_clear_flush above.
12052 +                        * Those stores are ordered by (if nothing else,)
12053 +                        * the barrier present in the atomic_add_negative
12054 +                        * in page_remove_rmap.
12055 +                        *
12056 +                        * Then the TLB flush in ptep_clear_flush ensures that
12057 +                        * no process can access the old page before the
12058 +                        * decremented mapcount is visible. And the old page
12059 +                        * cannot be reused until after the decremented
12060 +                        * mapcount is visible. So transitively, TLBs to
12061 +                        * old page will be flushed before it can be reused.
12062 +                        */
12063 +                       page_remove_rmap(old_page, vma);
12064 +               }
12065 +
12066 +               /* Free the old page.. */
12067 +               new_page = old_page;
12068 +               ret |= VM_FAULT_WRITE;
12069 +       } else
12070 +               mem_cgroup_uncharge_page(new_page);
12071 +
12072 +       if (new_page)
12073 +               page_cache_release(new_page);
12074 +       if (old_page)
12075 +               page_cache_release(old_page);
12076 +unlock:
12077 +       pte_unmap_unlock(page_table, ptl);
12078 +       if (dirty_page) {
12079 +               if (vma->vm_file)
12080 +                       file_update_time(vma->vm_file);
12081 +
12082 +               /*
12083 +                * Yes, Virginia, this is actually required to prevent a race
12084 +                * with clear_page_dirty_for_io() from clearing the page dirty
12085 +                * bit after it clears all dirty ptes, but before a racing
12086 +                * do_wp_page installs a dirty pte.
12087 +                *
12088 +                * do_no_page is protected similarly.
12089 +                */
12090 +               wait_on_page_locked(dirty_page);
12091 +               set_page_dirty_balance(dirty_page, page_mkwrite);
12092 +               put_page(dirty_page);
12093 +       }
12094 +       return ret;
12095 +oom_free_new:
12096 +       page_cache_release(new_page);
12097 +oom:
12098 +       if (old_page)
12099 +               page_cache_release(old_page);
12100 +       return VM_FAULT_OOM;
12101 +
12102 +unwritable_page:
12103 +       page_cache_release(old_page);
12104 +       return VM_FAULT_SIGBUS;
12105 +}
12106 +
12107 +/*
12108 + * Helper functions for unmap_mapping_range().
12109 + *
12110 + * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
12111 + *
12112 + * We have to restart searching the prio_tree whenever we drop the lock,
12113 + * since the iterator is only valid while the lock is held, and anyway
12114 + * a later vma might be split and reinserted earlier while lock dropped.
12115 + *
12116 + * The list of nonlinear vmas could be handled more efficiently, using
12117 + * a placeholder, but handle it in the same way until a need is shown.
12118 + * It is important to search the prio_tree before nonlinear list: a vma
12119 + * may become nonlinear and be shifted from prio_tree to nonlinear list
12120 + * while the lock is dropped; but never shifted from list to prio_tree.
12121 + *
12122 + * In order to make forward progress despite restarting the search,
12123 + * vm_truncate_count is used to mark a vma as now dealt with, so we can
12124 + * quickly skip it next time around.  Since the prio_tree search only
12125 + * shows us those vmas affected by unmapping the range in question, we
12126 + * can't efficiently keep all vmas in step with mapping->truncate_count:
12127 + * so instead reset them all whenever it wraps back to 0 (then go to 1).
12128 + * mapping->truncate_count and vma->vm_truncate_count are protected by
12129 + * i_mmap_lock.
12130 + *
12131 + * In order to make forward progress despite repeatedly restarting some
12132 + * large vma, note the restart_addr from unmap_vmas when it breaks out:
12133 + * and restart from that address when we reach that vma again.  It might
12134 + * have been split or merged, shrunk or extended, but never shifted: so
12135 + * restart_addr remains valid so long as it remains in the vma's range.
12136 + * unmap_mapping_range forces truncate_count to leap over page-aligned
12137 + * values so we can save vma's restart_addr in its truncate_count field.
12138 + */
12139 +#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
12140 +
12141 +static void reset_vma_truncate_counts(struct address_space *mapping)
12142 +{
12143 +       struct vm_area_struct *vma;
12144 +       struct prio_tree_iter iter;
12145 +
12146 +       vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
12147 +               vma->vm_truncate_count = 0;
12148 +       list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
12149 +               vma->vm_truncate_count = 0;
12150 +}
12151 +
12152 +static int unmap_mapping_range_vma(struct vm_area_struct *vma,
12153 +               unsigned long start_addr, unsigned long end_addr,
12154 +               struct zap_details *details)
12155 +{
12156 +       unsigned long restart_addr;
12157 +       int need_break;
12158 +
12159 +       /*
12160 +        * files that support invalidating or truncating portions of the
12161 +        * file from under mmapped areas must have their ->fault function
12162 +        * return a locked page (and set VM_FAULT_LOCKED in the return).
12163 +        * This provides synchronisation against concurrent unmapping here.
12164 +        */
12165 +
12166 +again:
12167 +       restart_addr = vma->vm_truncate_count;
12168 +       if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
12169 +               start_addr = restart_addr;
12170 +               if (start_addr >= end_addr) {
12171 +                       /* Top of vma has been split off since last time */
12172 +                       vma->vm_truncate_count = details->truncate_count;
12173 +                       return 0;
12174 +               }
12175 +       }
12176 +
12177 +       restart_addr = zap_page_range(vma, start_addr,
12178 +                                       end_addr - start_addr, details);
12179 +       need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
12180 +
12181 +       if (restart_addr >= end_addr) {
12182 +               /* We have now completed this vma: mark it so */
12183 +               vma->vm_truncate_count = details->truncate_count;
12184 +               if (!need_break)
12185 +                       return 0;
12186 +       } else {
12187 +               /* Note restart_addr in vma's truncate_count field */
12188 +               vma->vm_truncate_count = restart_addr;
12189 +               if (!need_break)
12190 +                       goto again;
12191 +       }
12192 +
12193 +       spin_unlock(details->i_mmap_lock);
12194 +       cond_resched();
12195 +       spin_lock(details->i_mmap_lock);
12196 +       return -EINTR;
12197 +}
12198 +
12199 +static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
12200 +                                           struct zap_details *details)
12201 +{
12202 +       struct vm_area_struct *vma;
12203 +       struct prio_tree_iter iter;
12204 +       pgoff_t vba, vea, zba, zea;
12205 +
12206 +restart:
12207 +       vma_prio_tree_foreach(vma, &iter, root,
12208 +                       details->first_index, details->last_index) {
12209 +               /* Skip quickly over those we have already dealt with */
12210 +               if (vma->vm_truncate_count == details->truncate_count)
12211 +                       continue;
12212 +
12213 +               vba = vma->vm_pgoff;
12214 +               vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
12215 +               /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
12216 +               zba = details->first_index;
12217 +               if (zba < vba)
12218 +                       zba = vba;
12219 +               zea = details->last_index;
12220 +               if (zea > vea)
12221 +                       zea = vea;
12222 +
12223 +               if (unmap_mapping_range_vma(vma,
12224 +                       ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
12225 +                       ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
12226 +                               details) < 0)
12227 +                       goto restart;
12228 +       }
12229 +}
12230 +
12231 +static inline void unmap_mapping_range_list(struct list_head *head,
12232 +                                           struct zap_details *details)
12233 +{
12234 +       struct vm_area_struct *vma;
12235 +
12236 +       /*
12237 +        * In nonlinear VMAs there is no correspondence between virtual address
12238 +        * offset and file offset.  So we must perform an exhaustive search
12239 +        * across *all* the pages in each nonlinear VMA, not just the pages
12240 +        * whose virtual address lies outside the file truncation point.
12241 +        */
12242 +restart:
12243 +       list_for_each_entry(vma, head, shared.vm_set.list) {
12244 +               /* Skip quickly over those we have already dealt with */
12245 +               if (vma->vm_truncate_count == details->truncate_count)
12246 +                       continue;
12247 +               details->nonlinear_vma = vma;
12248 +               if (unmap_mapping_range_vma(vma, vma->vm_start,
12249 +                                       vma->vm_end, details) < 0)
12250 +                       goto restart;
12251 +       }
12252 +}
12253 +
12254 +/**
12255 + * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file.
12256 + * @mapping: the address space containing mmaps to be unmapped.
12257 + * @holebegin: byte in first page to unmap, relative to the start of
12258 + * the underlying file.  This will be rounded down to a PAGE_SIZE
12259 + * boundary.  Note that this is different from vmtruncate(), which
12260 + * must keep the partial page.  In contrast, we must get rid of
12261 + * partial pages.
12262 + * @holelen: size of prospective hole in bytes.  This will be rounded
12263 + * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
12264 + * end of the file.
12265 + * @even_cows: 1 when truncating a file, unmap even private COWed pages;
12266 + * but 0 when invalidating pagecache, don't throw away private data.
12267 + */
12268 +void unmap_mapping_range(struct address_space *mapping,
12269 +               loff_t const holebegin, loff_t const holelen, int even_cows)
12270 +{
12271 +       struct zap_details details;
12272 +       pgoff_t hba = holebegin >> PAGE_SHIFT;
12273 +       pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
12274 +
12275 +       /* Check for overflow. */
12276 +       if (sizeof(holelen) > sizeof(hlen)) {
12277 +               long long holeend =
12278 +                       (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
12279 +               if (holeend & ~(long long)ULONG_MAX)
12280 +                       hlen = ULONG_MAX - hba + 1;
12281 +       }
12282 +
12283 +       details.check_mapping = even_cows? NULL: mapping;
12284 +       details.nonlinear_vma = NULL;
12285 +       details.first_index = hba;
12286 +       details.last_index = hba + hlen - 1;
12287 +       if (details.last_index < details.first_index)
12288 +               details.last_index = ULONG_MAX;
12289 +       details.i_mmap_lock = &mapping->i_mmap_lock;
12290 +
12291 +       spin_lock(&mapping->i_mmap_lock);
12292 +
12293 +       /* Protect against endless unmapping loops */
12294 +       mapping->truncate_count++;
12295 +       if (unlikely(is_restart_addr(mapping->truncate_count))) {
12296 +               if (mapping->truncate_count == 0)
12297 +                       reset_vma_truncate_counts(mapping);
12298 +               mapping->truncate_count++;
12299 +       }
12300 +       details.truncate_count = mapping->truncate_count;
12301 +
12302 +       if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
12303 +               unmap_mapping_range_tree(&mapping->i_mmap, &details);
12304 +       if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
12305 +               unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
12306 +       spin_unlock(&mapping->i_mmap_lock);
12307 +}
12308 +EXPORT_SYMBOL(unmap_mapping_range);
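As the kernel-doc above describes, callers pass a byte range and the rounding to page boundaries is done internally. A usage sketch (illustrative only, not taken from the patch; the function and variable names are hypothetical) for a filesystem invalidating the mappings behind bytes [lstart, lend] of an inode without discarding private COWed pages:

    #include <linux/fs.h>
    #include <linux/mm.h>

    static void sketch_unmap_byte_range(struct inode *inode,
                                        loff_t lstart, loff_t lend)
    {
            /* holebegin is rounded down and holelen rounded up to PAGE_SIZE
             * internally; even_cows == 0 keeps private COWed copies */
            unmap_mapping_range(inode->i_mapping, lstart,
                                lend - lstart + 1, 0);
    }

vmtruncate() below shows the other mode: even_cows == 1 and holelen == 0, i.e. unmap everything from the truncation point to the end of the file.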
12309 +
12310 +/**
12311 + * vmtruncate - unmap mappings "freed" by truncate() syscall
12312 + * @inode: inode of the file used
12313 + * @offset: file offset to start truncating
12314 + *
12315 + * NOTE! We have to be ready to update the memory sharing
12316 + * between the file and the memory map for a potential last
12317 + * incomplete page.  Ugly, but necessary.
12318 + */
12319 +int vmtruncate(struct inode * inode, loff_t offset)
12320 +{
12321 +       if (inode->i_size < offset) {
12322 +               unsigned long limit;
12323 +
12324 +               limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
12325 +               if (limit != RLIM_INFINITY && offset > limit)
12326 +                       goto out_sig;
12327 +               if (offset > inode->i_sb->s_maxbytes)
12328 +                       goto out_big;
12329 +               i_size_write(inode, offset);
12330 +       } else {
12331 +               struct address_space *mapping = inode->i_mapping;
12332 +
12333 +               /*
12334 +                * truncation of in-use swapfiles is disallowed - it would
12335 +                * cause subsequent swapout to scribble on the now-freed
12336 +                * blocks.
12337 +                */
12338 +               if (IS_SWAPFILE(inode))
12339 +                       return -ETXTBSY;
12340 +               i_size_write(inode, offset);
12341 +
12342 +               /*
12343 +                * unmap_mapping_range is called twice, first simply for
12344 +                * efficiency so that truncate_inode_pages does fewer
12345 +                * single-page unmaps.  However after this first call, and
12346 +                * before truncate_inode_pages finishes, it is possible for
12347 +                * private pages to be COWed, which remain after
12348 +                * truncate_inode_pages finishes, hence the second
12349 +                * unmap_mapping_range call must be made for correctness.
12350 +                */
12351 +               unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
12352 +               truncate_inode_pages(mapping, offset);
12353 +               unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
12354 +       }
12355 +
12356 +       if (inode->i_op && inode->i_op->truncate)
12357 +               inode->i_op->truncate(inode);
12358 +       return 0;
12359 +
12360 +out_sig:
12361 +       send_sig(SIGXFSZ, current, 0);
12362 +out_big:
12363 +       return -EFBIG;
12364 +}
12365 +EXPORT_SYMBOL(vmtruncate);
12366 +
12367 +int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
12368 +{
12369 +       struct address_space *mapping = inode->i_mapping;
12370 +
12371 +       /*
12372 +        * If the underlying filesystem is not going to provide
12373 +        * a way to truncate a range of blocks (punch a hole) -
12374 +        * we should return failure right now.
12375 +        */
12376 +       if (!inode->i_op || !inode->i_op->truncate_range)
12377 +               return -ENOSYS;
12378 +
12379 +       mutex_lock(&inode->i_mutex);
12380 +       down_write(&inode->i_alloc_sem);
12381 +       unmap_mapping_range(mapping, offset, (end - offset), 1);
12382 +       truncate_inode_pages_range(mapping, offset, end);
12383 +       unmap_mapping_range(mapping, offset, (end - offset), 1);
12384 +       inode->i_op->truncate_range(inode, offset, end);
12385 +       up_write(&inode->i_alloc_sem);
12386 +       mutex_unlock(&inode->i_mutex);
12387 +
12388 +       return 0;
12389 +}
12390 +
12391 +/*
12392 + * We enter with non-exclusive mmap_sem (to exclude vma changes,
12393 + * but allow concurrent faults), and pte mapped but not yet locked.
12394 + * We return with mmap_sem still held, but pte unmapped and unlocked.
12395 + */
12396 +static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
12397 +               unsigned long address, pte_t *page_table, pmd_t *pmd,
12398 +               int write_access, pte_t orig_pte)
12399 +{
12400 +       spinlock_t *ptl;
12401 +       struct page *page;
12402 +       swp_entry_t entry;
12403 +       pte_t pte;
12404 +       int ret = 0;
12405 +
12406 +       if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
12407 +               goto out;
12408 +
12409 +       entry = pte_to_swp_entry(orig_pte);
12410 +       if (is_migration_entry(entry)) {
12411 +               migration_entry_wait(mm, pmd, address);
12412 +               goto out;
12413 +       }
12414 +       delayacct_set_flag(DELAYACCT_PF_SWAPIN);
12415 +       page = lookup_swap_cache(entry);
12416 +       if (!page) {
12417 +               grab_swap_token(); /* Contend for token _before_ read-in */
12418 +               page = swapin_readahead(entry,
12419 +                                       GFP_HIGHUSER_MOVABLE, vma, address);
12420 +               if (!page) {
12421 +                       /*
12422 +                        * Back out if somebody else faulted in this pte
12423 +                        * while we released the pte lock.
12424 +                        */
12425 +                       page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
12426 +                       if (likely(pte_same(*page_table, orig_pte)))
12427 +                               ret = VM_FAULT_OOM;
12428 +                       delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
12429 +                       goto unlock;
12430 +               }
12431 +
12432 +               /* Had to read the page from swap area: Major fault */
12433 +               ret = VM_FAULT_MAJOR;
12434 +               count_vm_event(PGMAJFAULT);
12435 +       }
12436 +
12437 +       if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
12438 +               delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
12439 +               ret = VM_FAULT_OOM;
12440 +               goto out;
12441 +       }
12442 +
12443 +       if (!vx_rss_avail(mm, 1)) {
12444 +               ret = VM_FAULT_OOM;
12445 +               goto out;
12446 +       }
12447 +
12448 +       mark_page_accessed(page);
12449 +       lock_page(page);
12450 +       delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
12451 +
12452 +       /*
12453 +        * Back out if somebody else already faulted in this pte.
12454 +        */
12455 +       page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
12456 +       if (unlikely(!pte_same(*page_table, orig_pte)))
12457 +               goto out_nomap;
12458 +
12459 +       if (unlikely(!PageUptodate(page))) {
12460 +               ret = VM_FAULT_SIGBUS;
12461 +               goto out_nomap;
12462 +       }
12463 +
12464 +       /* The page isn't present yet, go ahead with the fault. */
12465 +
12466 +       inc_mm_counter(mm, anon_rss);
12467 +       pte = mk_pte(page, vma->vm_page_prot);
12468 +       if (write_access && can_share_swap_page(page)) {
12469 +               pte = maybe_mkwrite(pte_mkdirty(pte), vma);
12470 +               write_access = 0;
12471 +       }
12472 +
12473 +       flush_icache_page(vma, page);
12474 +       set_pte_at(mm, address, page_table, pte);
12475 +       page_add_anon_rmap(page, vma, address);
12476 +
12477 +       swap_free(entry);
12478 +       if (vm_swap_full())
12479 +               remove_exclusive_swap_page(page);
12480 +       unlock_page(page);
12481 +
12482 +       if (write_access) {
12483 +               ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
12484 +               if (ret & VM_FAULT_ERROR)
12485 +                       ret &= VM_FAULT_ERROR;
12486 +               goto out;
12487 +       }
12488 +
12489 +       /* No need to invalidate - it was non-present before */
12490 +       update_mmu_cache(vma, address, pte);
12491 +unlock:
12492 +       pte_unmap_unlock(page_table, ptl);
12493 +out:
12494 +       return ret;
12495 +out_nomap:
12496 +       mem_cgroup_uncharge_page(page);
12497 +       pte_unmap_unlock(page_table, ptl);
12498 +       unlock_page(page);
12499 +       page_cache_release(page);
12500 +       return ret;
12501 +}
12502 +
12503 +/*
12504 + * We enter with non-exclusive mmap_sem (to exclude vma changes,
12505 + * but allow concurrent faults), and pte mapped but not yet locked.
12506 + * We return with mmap_sem still held, but pte unmapped and unlocked.
12507 + */
12508 +static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
12509 +               unsigned long address, pte_t *page_table, pmd_t *pmd,
12510 +               int write_access)
12511 +{
12512 +       struct page *page;
12513 +       spinlock_t *ptl;
12514 +       pte_t entry;
12515 +
12516 +       /* Allocate our own private page. */
12517 +       pte_unmap(page_table);
12518 +
12519 +       if (!vx_rss_avail(mm, 1))
12520 +               goto oom;
12521 +       if (unlikely(anon_vma_prepare(vma)))
12522 +               goto oom;
12523 +       page = alloc_zeroed_user_highpage_movable(vma, address);
12524 +       if (!page)
12525 +               goto oom;
12526 +       __SetPageUptodate(page);
12527 +
12528 +       if (mem_cgroup_charge(page, mm, GFP_KERNEL))
12529 +               goto oom_free_page;
12530 +
12531 +       entry = mk_pte(page, vma->vm_page_prot);
12532 +       entry = maybe_mkwrite(pte_mkdirty(entry), vma);
12533 +
12534 +       page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
12535 +       if (!pte_none(*page_table))
12536 +               goto release;
12537 +       inc_mm_counter(mm, anon_rss);
12538 +       lru_cache_add_active(page);
12539 +       page_add_new_anon_rmap(page, vma, address);
12540 +       set_pte_at(mm, address, page_table, entry);
12541 +
12542 +       /* No need to invalidate - it was non-present before */
12543 +       update_mmu_cache(vma, address, entry);
12544 +unlock:
12545 +       pte_unmap_unlock(page_table, ptl);
12546 +       return 0;
12547 +release:
12548 +       mem_cgroup_uncharge_page(page);
12549 +       page_cache_release(page);
12550 +       goto unlock;
12551 +oom_free_page:
12552 +       page_cache_release(page);
12553 +oom:
12554 +       return VM_FAULT_OOM;
12555 +}
12556 +
12557 +/*
12558 + * __do_fault() tries to create a new page mapping. It aggressively
12559 + * tries to share with existing pages, but makes a separate copy if
12560 + * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid
12561 + * the next page fault.
12562 + *
12563 + * As this is called only for pages that do not currently exist, we
12564 + * do not need to flush old virtual caches or the TLB.
12565 + *
12566 + * We enter with non-exclusive mmap_sem (to exclude vma changes,
12567 + * but allow concurrent faults), and pte neither mapped nor locked.
12568 + * We return with mmap_sem still held, but pte unmapped and unlocked.
12569 + */
12570 +static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
12571 +               unsigned long address, pmd_t *pmd,
12572 +               pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
12573 +{
12574 +       pte_t *page_table;
12575 +       spinlock_t *ptl;
12576 +       struct page *page;
12577 +       pte_t entry;
12578 +       int anon = 0;
12579 +       struct page *dirty_page = NULL;
12580 +       struct vm_fault vmf;
12581 +       int ret;
12582 +       int page_mkwrite = 0;
12583 +
12584 +       vmf.virtual_address = (void __user *)(address & PAGE_MASK);
12585 +       vmf.pgoff = pgoff;
12586 +       vmf.flags = flags;
12587 +       vmf.page = NULL;
12588 +
12589 +       ret = vma->vm_ops->fault(vma, &vmf);
12590 +       if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
12591 +               return ret;
12592 +
12593 +       /*
12594 +        * For consistency in subsequent calls, make the faulted page always
12595 +        * locked.
12596 +        */
12597 +       if (unlikely(!(ret & VM_FAULT_LOCKED)))
12598 +               lock_page(vmf.page);
12599 +       else
12600 +               VM_BUG_ON(!PageLocked(vmf.page));
12601 +
12602 +       /*
12603 +        * Should we do an early C-O-W break?
12604 +        */
12605 +       page = vmf.page;
12606 +       if (flags & FAULT_FLAG_WRITE) {
12607 +               if (!(vma->vm_flags & VM_SHARED)) {
12608 +                       anon = 1;
12609 +                       if (unlikely(anon_vma_prepare(vma))) {
12610 +                               ret = VM_FAULT_OOM;
12611 +                               goto out;
12612 +                       }
12613 +                       page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
12614 +                                               vma, address);
12615 +                       if (!page) {
12616 +                               ret = VM_FAULT_OOM;
12617 +                               goto out;
12618 +                       }
12619 +                       copy_user_highpage(page, vmf.page, address, vma);
12620 +                       __SetPageUptodate(page);
12621 +               } else {
12622 +                       /*
12623 +                        * If the page will be shareable, see if the backing
12624 +                        * address space wants to know that the page is about
12625 +                        * to become writable
12626 +                        */
12627 +                       if (vma->vm_ops->page_mkwrite) {
12628 +                               unlock_page(page);
12629 +                               if (vma->vm_ops->page_mkwrite(vma, page) < 0) {
12630 +                                       ret = VM_FAULT_SIGBUS;
12631 +                                       anon = 1; /* no anon but release vmf.page */
12632 +                                       goto out_unlocked;
12633 +                               }
12634 +                               lock_page(page);
12635 +                               /*
12636 +                                * XXX: this is not quite right (racy vs
12637 +                                * invalidate) to unlock and relock the page
12638 +                                * like this, however a better fix requires
12639 +                                * reworking page_mkwrite locking API, which
12640 +                                * is better done later.
12641 +                                */
12642 +                               if (!page->mapping) {
12643 +                                       ret = 0;
12644 +                                       anon = 1; /* no anon but release vmf.page */
12645 +                                       goto out;
12646 +                               }
12647 +                               page_mkwrite = 1;
12648 +                       }
12649 +               }
12650 +
12651 +       }
12652 +
12653 +       if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
12654 +               ret = VM_FAULT_OOM;
12655 +               goto out;
12656 +       }
12657 +
12658 +       page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
12659 +
12660 +       /*
12661 +        * This silly early PAGE_DIRTY setting removes a race
12662 +        * due to the bad i386 page protection. But it's valid
12663 +        * for other architectures too.
12664 +        *
12665 +        * Note that if write_access is true, we either now have
12666 +        * an exclusive copy of the page, or this is a shared mapping,
12667 +        * so we can make it writable and dirty to avoid having to
12668 +        * handle that later.
12669 +        */
12670 +       /* Only go through if we didn't race with anybody else... */
12671 +       if (likely(pte_same(*page_table, orig_pte))) {
12672 +               flush_icache_page(vma, page);
12673 +               entry = mk_pte(page, vma->vm_page_prot);
12674 +               if (flags & FAULT_FLAG_WRITE)
12675 +                       entry = maybe_mkwrite(pte_mkdirty(entry), vma);
12676 +               set_pte_at(mm, address, page_table, entry);
12677 +               if (anon) {
12678 +                        inc_mm_counter(mm, anon_rss);
12679 +                        lru_cache_add_active(page);
12680 +                        page_add_new_anon_rmap(page, vma, address);
12681 +               } else {
12682 +                       inc_mm_counter(mm, file_rss);
12683 +                       page_add_file_rmap(page);
12684 +                       if (flags & FAULT_FLAG_WRITE) {
12685 +                               dirty_page = page;
12686 +                               get_page(dirty_page);
12687 +                       }
12688 +               }
12689 +
12690 +               /* no need to invalidate: a not-present page won't be cached */
12691 +               update_mmu_cache(vma, address, entry);
12692 +       } else {
12693 +               mem_cgroup_uncharge_page(page);
12694 +               if (anon)
12695 +                       page_cache_release(page);
12696 +               else
12697 +                       anon = 1; /* no anon but release faulted_page */
12698 +       }
12699 +
12700 +       pte_unmap_unlock(page_table, ptl);
12701 +
12702 +out:
12703 +       unlock_page(vmf.page);
12704 +out_unlocked:
12705 +       if (anon)
12706 +               page_cache_release(vmf.page);
12707 +       else if (dirty_page) {
12708 +               if (vma->vm_file)
12709 +                       file_update_time(vma->vm_file);
12710 +
12711 +               set_page_dirty_balance(dirty_page, page_mkwrite);
12712 +               put_page(dirty_page);
12713 +       }
12714 +
12715 +       return ret;
12716 +}
12717 +
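The shared-writable branch above unlocks the page and calls vma->vm_ops->page_mkwrite() so the backing filesystem can prepare for, or refuse, the write. A sketch of the shape such a handler takes (illustrative only; a real filesystem would reserve blocks or start a transaction here, and the sketch_* names are made up):

    #include <linux/mm.h>

    /* returning a negative value makes __do_fault() report VM_FAULT_SIGBUS,
     * as in the error path above */
    static int sketch_page_mkwrite(struct vm_area_struct *vma, struct page *page)
    {
            return 0;
    }

    static struct vm_operations_struct sketch_vm_ops = {
            .fault          = filemap_fault,
            .page_mkwrite   = sketch_page_mkwrite,
    };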
12718 +static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
12719 +               unsigned long address, pte_t *page_table, pmd_t *pmd,
12720 +               int write_access, pte_t orig_pte)
12721 +{
12722 +       pgoff_t pgoff = (((address & PAGE_MASK)
12723 +                       - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
12724 +       unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
12725 +
12726 +       pte_unmap(page_table);
12727 +       return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
12728 +}
12729 +
12730 +/*
12731 + * Fault of a previously existing named mapping. Repopulate the pte
12732 + * from the encoded file_pte if possible. This enables swappable
12733 + * nonlinear vmas.
12734 + *
12735 + * We enter with non-exclusive mmap_sem (to exclude vma changes,
12736 + * but allow concurrent faults), and pte mapped but not yet locked.
12737 + * We return with mmap_sem still held, but pte unmapped and unlocked.
12738 + */
12739 +static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
12740 +               unsigned long address, pte_t *page_table, pmd_t *pmd,
12741 +               int write_access, pte_t orig_pte)
12742 +{
12743 +       unsigned int flags = FAULT_FLAG_NONLINEAR |
12744 +                               (write_access ? FAULT_FLAG_WRITE : 0);
12745 +       pgoff_t pgoff;
12746 +
12747 +       if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
12748 +               return 0;
12749 +
12750 +       if (unlikely(!(vma->vm_flags & VM_NONLINEAR) ||
12751 +                       !(vma->vm_flags & VM_CAN_NONLINEAR))) {
12752 +               /*
12753 +                * Page table corrupted: show pte and kill process.
12754 +                */
12755 +               print_bad_pte(vma, orig_pte, address);
12756 +               return VM_FAULT_OOM;
12757 +       }
12758 +
12759 +       pgoff = pte_to_pgoff(orig_pte);
12760 +       return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
12761 +}
12762 +
12763 +/*
12764 + * These routines also need to handle stuff like marking pages dirty
12765 + * and/or accessed for architectures that don't do it in hardware (most
12766 + * RISC architectures).  The early dirtying is also good on the i386.
12767 + *
12768 + * There is also a hook called "update_mmu_cache()" that architectures
12769 + * with external mmu caches can use to update those (ie the Sparc or
12770 + * PowerPC hashed page tables that act as extended TLBs).
12771 + *
12772 + * We enter with non-exclusive mmap_sem (to exclude vma changes,
12773 + * but allow concurrent faults), and pte mapped but not yet locked.
12774 + * We return with mmap_sem still held, but pte unmapped and unlocked.
12775 + */
12776 +static inline int handle_pte_fault(struct mm_struct *mm,
12777 +               struct vm_area_struct *vma, unsigned long address,
12778 +               pte_t *pte, pmd_t *pmd, int write_access)
12779 +{
12780 +       pte_t entry;
12781 +       spinlock_t *ptl;
12782 +       int ret = 0, type = VXPT_UNKNOWN;
12783 +
12784 +       entry = *pte;
12785 +       if (!pte_present(entry)) {
12786 +               if (pte_none(entry)) {
12787 +                       if (vma->vm_ops) {
12788 +                               if (likely(vma->vm_ops->fault))
12789 +                                       return do_linear_fault(mm, vma, address,
12790 +                                               pte, pmd, write_access, entry);
12791 +                       }
12792 +                       return do_anonymous_page(mm, vma, address,
12793 +                                                pte, pmd, write_access);
12794 +               }
12795 +               if (pte_file(entry))
12796 +                       return do_nonlinear_fault(mm, vma, address,
12797 +                                       pte, pmd, write_access, entry);
12798 +               return do_swap_page(mm, vma, address,
12799 +                                       pte, pmd, write_access, entry);
12800 +       }
12801 +
12802 +       ptl = pte_lockptr(mm, pmd);
12803 +       spin_lock(ptl);
12804 +       if (unlikely(!pte_same(*pte, entry)))
12805 +               goto unlock;
12806 +       if (write_access) {
12807 +               if (!pte_write(entry)) {
12808 +                       ret = do_wp_page(mm, vma, address,
12809 +                                       pte, pmd, ptl, entry);
12810 +                       type = VXPT_WRITE;
12811 +                       goto out;
12812 +               }
12813 +               entry = pte_mkdirty(entry);
12814 +       }
12815 +       entry = pte_mkyoung(entry);
12816 +       if (ptep_set_access_flags(vma, address, pte, entry, write_access)) {
12817 +               update_mmu_cache(vma, address, entry);
12818 +       } else {
12819 +               /*
12820 +                * This is needed only for protection faults but the arch code
12821 +                * is not yet telling us if this is a protection fault or not.
12822 +                * This still avoids useless tlb flushes for .text page faults
12823 +                * with threads.
12824 +                */
12825 +               if (write_access)
12826 +                       flush_tlb_page(vma, address);
12827 +       }
12828 +unlock:
12829 +       pte_unmap_unlock(pte, ptl);
12830 +       ret = 0;
12831 +out:
12832 +       vx_page_fault(mm, vma, type, ret);
12833 +       return ret;
12834 +}
12835 +
12836 +/*
12837 + * By the time we get here, we already hold the mm semaphore
12838 + */
12839 +int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
12840 +               unsigned long address, int write_access)
12841 +{
12842 +       pgd_t *pgd;
12843 +       pud_t *pud;
12844 +       pmd_t *pmd;
12845 +       pte_t *pte;
12846 +
12847 +       __set_current_state(TASK_RUNNING);
12848 +
12849 +       count_vm_event(PGFAULT);
12850 +
12851 +       if (unlikely(is_vm_hugetlb_page(vma)))
12852 +               return hugetlb_fault(mm, vma, address, write_access);
12853 +
12854 +       pgd = pgd_offset(mm, address);
12855 +       pud = pud_alloc(mm, pgd, address);
12856 +       if (!pud)
12857 +               return VM_FAULT_OOM;
12858 +       pmd = pmd_alloc(mm, pud, address);
12859 +       if (!pmd)
12860 +               return VM_FAULT_OOM;
12861 +       pte = pte_alloc_map(mm, pmd, address);
12862 +       if (!pte)
12863 +               return VM_FAULT_OOM;
12864 +
12865 +       return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
12866 +}
12867 +
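handle_mm_fault() above is entered with mmap_sem already held for read, as the comment notes. A sketch of that calling convention (illustrative only; real architecture fault handlers do considerably more validation, and sketch_* is a made-up name):

    #include <linux/mm.h>
    #include <linux/sched.h>

    static int sketch_fault_in_one_address(struct mm_struct *mm,
                                           unsigned long address, int write)
    {
            struct vm_area_struct *vma;
            int fault;

            down_read(&mm->mmap_sem);       /* "we already hold the mm semaphore" */
            vma = find_vma(mm, address);
            if (!vma || vma->vm_start > address) {
                    up_read(&mm->mmap_sem);
                    return -EFAULT;
            }
            fault = handle_mm_fault(mm, vma, address, write);
            up_read(&mm->mmap_sem);

            return (fault & VM_FAULT_ERROR) ? -EFAULT : 0;
    }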
12868 +#ifndef __PAGETABLE_PUD_FOLDED
12869 +/*
12870 + * Allocate page upper directory.
12871 + * We've already handled the fast-path in-line.
12872 + */
12873 +int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
12874 +{
12875 +       pud_t *new = pud_alloc_one(mm, address);
12876 +       if (!new)
12877 +               return -ENOMEM;
12878 +
12879 +       smp_wmb(); /* See comment in __pte_alloc */
12880 +
12881 +       spin_lock(&mm->page_table_lock);
12882 +       if (pgd_present(*pgd))          /* Another has populated it */
12883 +               pud_free(mm, new);
12884 +       else
12885 +               pgd_populate(mm, pgd, new);
12886 +       spin_unlock(&mm->page_table_lock);
12887 +       return 0;
12888 +}
12889 +#endif /* __PAGETABLE_PUD_FOLDED */
12890 +
12891 +#ifndef __PAGETABLE_PMD_FOLDED
12892 +/*
12893 + * Allocate page middle directory.
12894 + * We've already handled the fast-path in-line.
12895 + */
12896 +int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
12897 +{
12898 +       pmd_t *new = pmd_alloc_one(mm, address);
12899 +       if (!new)
12900 +               return -ENOMEM;
12901 +
12902 +       smp_wmb(); /* See comment in __pte_alloc */
12903 +
12904 +       spin_lock(&mm->page_table_lock);
12905 +#ifndef __ARCH_HAS_4LEVEL_HACK
12906 +       if (pud_present(*pud))          /* Another has populated it */
12907 +               pmd_free(mm, new);
12908 +       else
12909 +               pud_populate(mm, pud, new);
12910 +#else
12911 +       if (pgd_present(*pud))          /* Another has populated it */
12912 +               pmd_free(mm, new);
12913 +       else
12914 +               pgd_populate(mm, pud, new);
12915 +#endif /* __ARCH_HAS_4LEVEL_HACK */
12916 +       spin_unlock(&mm->page_table_lock);
12917 +       return 0;
12918 +}
12919 +#endif /* __PAGETABLE_PMD_FOLDED */
12920 +
12921 +int make_pages_present(unsigned long addr, unsigned long end)
12922 +{
12923 +       int ret, len, write;
12924 +       struct vm_area_struct * vma;
12925 +
12926 +       vma = find_vma(current->mm, addr);
12927 +       if (!vma)
12928 +               return -ENOMEM;
12929 +       write = (vma->vm_flags & VM_WRITE) != 0;
12930 +       BUG_ON(addr >= end);
12931 +       BUG_ON(end > vma->vm_end);
12932 +       len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
12933 +       ret = get_user_pages(current, current->mm, addr,
12934 +                       len, write, 0, NULL, NULL);
12935 +       if (ret < 0) {
12936 +               /*
12937 +                  SUS requires strange return values from mlock:
12938 +                   - an invalid addr should generate ENOMEM.
12939 +                   - out of memory should generate EAGAIN.
12940 +               */
12941 +               if (ret == -EFAULT)
12942 +                       ret = -ENOMEM;
12943 +               else if (ret == -ENOMEM)
12944 +                       ret = -EAGAIN;
12945 +               return ret;
12946 +       }
12947 +       return ret == len ? 0 : -ENOMEM;
12948 +}
12949 +
12950 +#if !defined(__HAVE_ARCH_GATE_AREA)
12951 +
12952 +#if defined(AT_SYSINFO_EHDR)
12953 +static struct vm_area_struct gate_vma;
12954 +
12955 +static int __init gate_vma_init(void)
12956 +{
12957 +       gate_vma.vm_mm = NULL;
12958 +       gate_vma.vm_start = FIXADDR_USER_START;
12959 +       gate_vma.vm_end = FIXADDR_USER_END;
12960 +       gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
12961 +       gate_vma.vm_page_prot = __P101;
12962 +       /*
12963 +        * Make sure the vDSO gets into every core dump.
12964 +        * Dumping its contents makes post-mortem fully interpretable later
12965 +        * without matching up the same kernel and hardware config to see
12966 +        * what PC values meant.
12967 +        */
12968 +       gate_vma.vm_flags |= VM_ALWAYSDUMP;
12969 +       return 0;
12970 +}
12971 +__initcall(gate_vma_init);
12972 +#endif
12973 +
12974 +struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
12975 +{
12976 +#ifdef AT_SYSINFO_EHDR
12977 +       return &gate_vma;
12978 +#else
12979 +       return NULL;
12980 +#endif
12981 +}
12982 +
12983 +int in_gate_area_no_task(unsigned long addr)
12984 +{
12985 +#ifdef AT_SYSINFO_EHDR
12986 +       if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
12987 +               return 1;
12988 +#endif
12989 +       return 0;
12990 +}
12991 +
12992 +#endif /* __HAVE_ARCH_GATE_AREA */
12993 +
12994 +#ifdef CONFIG_HAVE_IOREMAP_PROT
12995 +static resource_size_t follow_phys(struct vm_area_struct *vma,
12996 +                       unsigned long address, unsigned int flags,
12997 +                       unsigned long *prot)
12998 +{
12999 +       pgd_t *pgd;
13000 +       pud_t *pud;
13001 +       pmd_t *pmd;
13002 +       pte_t *ptep, pte;
13003 +       spinlock_t *ptl;
13004 +       resource_size_t phys_addr = 0;
13005 +       struct mm_struct *mm = vma->vm_mm;
13006 +
13007 +       VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP)));
13008 +
13009 +       pgd = pgd_offset(mm, address);
13010 +       if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
13011 +               goto no_page_table;
13012 +
13013 +       pud = pud_offset(pgd, address);
13014 +       if (pud_none(*pud) || unlikely(pud_bad(*pud)))
13015 +               goto no_page_table;
13016 +
13017 +       pmd = pmd_offset(pud, address);
13018 +       if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
13019 +               goto no_page_table;
13020 +
13021 +       /* We cannot handle huge page PFN maps. Luckily they don't exist. */
13022 +       if (pmd_huge(*pmd))
13023 +               goto no_page_table;
13024 +
13025 +       ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
13026 +       if (!ptep)
13027 +               goto out;
13028 +
13029 +       pte = *ptep;
13030 +       if (!pte_present(pte))
13031 +               goto unlock;
13032 +       if ((flags & FOLL_WRITE) && !pte_write(pte))
13033 +               goto unlock;
13034 +       phys_addr = pte_pfn(pte);
13035 +       phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
13036 +
13037 +       *prot = pgprot_val(pte_pgprot(pte));
13038 +
13039 +unlock:
13040 +       pte_unmap_unlock(ptep, ptl);
13041 +out:
13042 +       return phys_addr;
13043 +no_page_table:
13044 +       return 0;
13045 +}
13046 +
13047 +int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
13048 +                       void *buf, int len, int write)
13049 +{
13050 +       resource_size_t phys_addr;
13051 +       unsigned long prot = 0;
13052 +       void *maddr;
13053 +       int offset = addr & (PAGE_SIZE-1);
13054 +
13055 +       if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
13056 +               return -EINVAL;
13057 +
13058 +       phys_addr = follow_phys(vma, addr, write, &prot);
13059 +
13060 +       if (!phys_addr)
13061 +               return -EINVAL;
13062 +
13063 +       maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
13064 +       if (write)
13065 +               memcpy_toio(maddr + offset, buf, len);
13066 +       else
13067 +               memcpy_fromio(buf, maddr + offset, len);
13068 +       iounmap(maddr);
13069 +
13070 +       return len;
13071 +}
13072 +#endif
13073 +
13074 +/*
13075 + * Access another process' address space.
13076 + * The source/target buffer must be in kernel space.
13077 + * Do not walk the page tables directly; use get_user_pages() instead.
13078 + */
13079 +int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
13080 +{
13081 +       struct mm_struct *mm;
13082 +       struct vm_area_struct *vma;
13083 +       void *old_buf = buf;
13084 +
13085 +       mm = get_task_mm(tsk);
13086 +       if (!mm)
13087 +               return 0;
13088 +
13089 +       down_read(&mm->mmap_sem);
13090 +       /* ignore errors, just check how much was successfully transferred */
13091 +       while (len) {
13092 +               int bytes, ret, offset;
13093 +               void *maddr;
13094 +               struct page *page = NULL;
13095 +
13096 +               ret = get_user_pages(tsk, mm, addr, 1,
13097 +                               write, 1, &page, &vma);
13098 +               if (ret <= 0) {
13099 +                       /*
13100 +                        * Check if this is a VM_IO | VM_PFNMAP VMA, which
13101 +                        * we can access using slightly different code.
13102 +                        */
13103 +#ifdef CONFIG_HAVE_IOREMAP_PROT
13104 +                       vma = find_vma(mm, addr);
13105 +                       if (!vma)
13106 +                               break;
13107 +                       if (vma->vm_ops && vma->vm_ops->access)
13108 +                               ret = vma->vm_ops->access(vma, addr, buf,
13109 +                                                         len, write);
13110 +                       if (ret <= 0)
13111 +#endif
13112 +                               break;
13113 +                       bytes = ret;
13114 +               } else {
13115 +                       bytes = len;
13116 +                       offset = addr & (PAGE_SIZE-1);
13117 +                       if (bytes > PAGE_SIZE-offset)
13118 +                               bytes = PAGE_SIZE-offset;
13119 +
13120 +                       maddr = kmap(page);
13121 +                       if (write) {
13122 +                               copy_to_user_page(vma, page, addr,
13123 +                                                 maddr + offset, buf, bytes);
13124 +                               set_page_dirty_lock(page);
13125 +                       } else {
13126 +                               copy_from_user_page(vma, page, addr,
13127 +                                                   buf, maddr + offset, bytes);
13128 +                       }
13129 +                       kunmap(page);
13130 +                       page_cache_release(page);
13131 +               }
13132 +               len -= bytes;
13133 +               buf += bytes;
13134 +               addr += bytes;
13135 +       }
13136 +       up_read(&mm->mmap_sem);
13137 +       mmput(mm);
13138 +
13139 +       return buf - old_buf;
13140 +}
13141 +
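access_process_vm() above is the helper that ptrace-style readers of another task's memory go through. A usage sketch (illustrative only; sketch_peek_remote is a made-up wrapper):

    #include <linux/mm.h>
    #include <linux/sched.h>

    /* returns the number of bytes actually copied out of tsk's address
     * space, or 0 if its mm has already gone away */
    static int sketch_peek_remote(struct task_struct *tsk, unsigned long addr,
                                  void *out, int len)
    {
            return access_process_vm(tsk, addr, out, len, 0);
    }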
13142 +/*
13143 + * Print the name of a VMA.
13144 + */
13145 +void print_vma_addr(char *prefix, unsigned long ip)
13146 +{
13147 +       struct mm_struct *mm = current->mm;
13148 +       struct vm_area_struct *vma;
13149 +
13150 +       /*
13151 +        * Do not print if we are in atomic
13152 +        * contexts (in exception stacks, etc.):
13153 +        */
13154 +       if (preempt_count())
13155 +               return;
13156 +
13157 +       down_read(&mm->mmap_sem);
13158 +       vma = find_vma(mm, ip);
13159 +       if (vma && vma->vm_file) {
13160 +               struct file *f = vma->vm_file;
13161 +               char *buf = (char *)__get_free_page(GFP_KERNEL);
13162 +               if (buf) {
13163 +                       char *p, *s;
13164 +
13165 +                       p = d_path(&f->f_path, buf, PAGE_SIZE);
13166 +                       if (IS_ERR(p))
13167 +                               p = "?";
13168 +                       s = strrchr(p, '/');
13169 +                       if (s)
13170 +                               p = s+1;
13171 +                       printk("%s%s[%lx+%lx]", prefix, p,
13172 +                                       vma->vm_start,
13173 +                                       vma->vm_end - vma->vm_start);
13174 +                       free_page((unsigned long)buf);
13175 +               }
13176 +       }
13177 +       up_read(&current->mm->mmap_sem);
13178 +}
13179 diff -Nurb linux-2.6.27-590/mm/slab.c linux-2.6.27-591/mm/slab.c
13180 --- linux-2.6.27-590/mm/slab.c  2010-02-01 19:42:07.000000000 -0500
13181 +++ linux-2.6.27-591/mm/slab.c  2010-02-01 19:43:07.000000000 -0500
13182 @@ -110,6 +110,7 @@
13183  #include       <linux/fault-inject.h>
13184  #include       <linux/rtmutex.h>
13185  #include       <linux/reciprocal_div.h>
13186 +#include       <linux/arrays.h>
13187  #include       <linux/debugobjects.h>
13188  
13189  #include       <asm/cacheflush.h>
13190 @@ -248,6 +249,14 @@
13191         void *addr;
13192  };
13193  
13194 +extern void (*rec_event)(void *, unsigned int);
13195 +struct event_spec {
13196 +       unsigned long pc;
13197 +       unsigned long dcookie;
13198 +       unsigned count;
13199 +       unsigned char reason;
13200 +};
13201 +
13202  /*
13203   * struct array_cache
13204   *
13205 @@ -3469,6 +3478,19 @@
13206         local_irq_restore(save_flags);
13207         objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
13208         prefetchw(objp);
13209 +#ifdef CONFIG_CHOPSTIX
13210 +       if (rec_event && objp) {
13211 +               struct event event;
13212 +               struct event_spec espec;
13213 +
13214 +               espec.reason = 0; /* alloc */
13215 +               event.event_data = &espec;
13216 +               event.task = current;
13217 +               espec.pc = caller;
13218 +               event.event_type = 5;
13219 +               (*rec_event)(&event, cachep->buffer_size);
13220 +       }
13221 +#endif
13222  
13223         if (unlikely((flags & __GFP_ZERO) && objp))
13224                 memset(objp, 0, obj_size(cachep));
13225 @@ -3578,12 +3600,26 @@
13226   * Release an obj back to its cache. If the obj has a constructed state, it must
13227   * be in this state _before_ it is released.  Called with disabled ints.
13228   */
13229 -static inline void __cache_free(struct kmem_cache *cachep, void *objp)
13230 +static inline void __cache_free(struct kmem_cache *cachep, void *objp, void *caller)
13231  {
13232         struct array_cache *ac = cpu_cache_get(cachep);
13233  
13234         check_irq_off();
13235 -       objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
13236 +       objp = cache_free_debugcheck(cachep, objp, caller);
13237 +#ifdef CONFIG_CHOPSTIX
13238 +       if (rec_event && objp) {
13239 +               struct event event;
13240 +               struct event_spec espec;
13241 +
13242 +               espec.reason = 1; /* free */
13243 +               event.event_data = &espec;
13244 +               event.task = current;
13245 +               espec.pc = caller;
13246 +               event.event_type = 4;
13247 +               (*rec_event)(&event, cachep->buffer_size);
13248 +       }
13249 +#endif
13250 +
13251         vx_slab_free(cachep);
13252  
13253         /*
13254 @@ -3714,6 +3750,7 @@
13255                                           void *caller)
13256  {
13257         struct kmem_cache *cachep;
13258 +       void *ret;
13259  
13260         /* If you want to save a few bytes .text space: replace
13261          * __ with kmem_.
13262 @@ -3741,10 +3778,17 @@
13263  EXPORT_SYMBOL(__kmalloc_track_caller);
13264  
13265  #else
13266 +#ifdef CONFIG_CHOPSTIX
13267 +void *__kmalloc(size_t size, gfp_t flags)
13268 +{
13269 +       return __do_kmalloc(size, flags, __builtin_return_address(0));
13270 +}
13271 +#else
13272  void *__kmalloc(size_t size, gfp_t flags)
13273  {
13274         return __do_kmalloc(size, flags, NULL);
13275  }
13276 +#endif
13277  EXPORT_SYMBOL(__kmalloc);
13278  #endif
13279  
13280 @@ -3764,7 +3808,7 @@
13281         debug_check_no_locks_freed(objp, obj_size(cachep));
13282         if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
13283                 debug_check_no_obj_freed(objp, obj_size(cachep));
13284 -       __cache_free(cachep, objp);
13285 +       __cache_free(cachep, objp, __builtin_return_address(0));
13286         local_irq_restore(flags);
13287  }
13288  EXPORT_SYMBOL(kmem_cache_free);
13289 @@ -3790,7 +3834,7 @@
13290         c = virt_to_cache(objp);
13291         debug_check_no_locks_freed(objp, obj_size(c));
13292         debug_check_no_obj_freed(objp, obj_size(c));
13293 -       __cache_free(c, (void *)objp);
13294 +       __cache_free(c, (void *)objp, __builtin_return_address(0));
13295         local_irq_restore(flags);
13296  }
13297  EXPORT_SYMBOL(kfree);
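The two CONFIG_CHOPSTIX blocks above hand slab alloc/free events (event_type 5 and 4) to whatever handler is installed in rec_event. A hypothetical consumer sketch (not part of the patch; it assumes struct event / struct event_spec come from linux/arrays.h with the fields used above, and that rec_event is exported to modules elsewhere in the patch):

    #include <linux/module.h>
    #include <linux/arrays.h>

    extern void (*rec_event)(void *, unsigned int);

    static void sketch_handler(void *data, unsigned int size)
    {
            struct event *e = data;

            /* event_type 4 == slab free, 5 == slab alloc (see above);
             * size is cachep->buffer_size at the call sites */
            (void)e;
            (void)size;
    }

    static int __init sketch_init(void)
    {
            rec_event = sketch_handler;     /* no synchronisation shown; sketch only */
            return 0;
    }

    static void __exit sketch_exit(void)
    {
            rec_event = NULL;
    }

    module_init(sketch_init);
    module_exit(sketch_exit);
    MODULE_LICENSE("GPL");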