1 /*
2  *  kernel/sched.c
3  *
4  *  Kernel scheduler and related syscalls
5  *
6  *  Copyright (C) 1991-2002  Linus Torvalds
7  *
8  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
9  *              make semaphores SMP safe
10  *  1998-11-19  Implemented schedule_timeout() and related stuff
11  *              by Andrea Arcangeli
12  *  2002-01-04  New ultra-scalable O(1) scheduler by Ingo Molnar:
13  *              hybrid priority-list and round-robin design with
14  *              an array-switch method of distributing timeslices
15  *              and per-CPU runqueues.  Cleanups and useful suggestions
16  *              by Davide Libenzi, preemptible kernel bits by Robert Love.
17  *  2003-09-03  Interactivity tuning by Con Kolivas.
18  *  2004-04-02  Scheduler domains code by Nick Piggin
19  */
20 #include <linux/mm.h>
21 #include <linux/module.h>
22 #include <linux/nmi.h>
23 #include <linux/init.h>
24 #include <asm/uaccess.h>
25 #include <linux/highmem.h>
26 #include <linux/smp_lock.h>
27 #include <asm/mmu_context.h>
28 #include <linux/interrupt.h>
29 #include <linux/completion.h>
30 #include <linux/kernel_stat.h>
31 #include <linux/security.h>
32 #include <linux/notifier.h>
33 #include <linux/profile.h>
34 #include <linux/suspend.h>
35 #include <linux/blkdev.h>
36 #include <linux/delay.h>
37 #include <linux/smp.h>
38 #include <linux/timer.h>
39 #include <linux/rcupdate.h>
40 #include <linux/cpu.h>
41 #include <linux/percpu.h>
42 #include <linux/kthread.h>
43 #include <linux/seq_file.h>
44 #include <linux/times.h>
45 #include <linux/vserver/sched.h>
46 #include <linux/vs_base.h>
47 #include <linux/vs_context.h>
48 #include <linux/vs_cvirt.h>
49 #include <asm/tlb.h>
50
51 #include <asm/unistd.h>
52
53 #ifdef CONFIG_NUMA
54 #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
55 #else
56 #define cpu_to_node_mask(cpu) (cpu_online_map)
57 #endif
58
59 /* used to soft spin in sched while dump is in progress */
60 unsigned long dump_oncpu;
61 EXPORT_SYMBOL(dump_oncpu);
62
63 /*
64  * Convert user-nice values [ -20 ... 0 ... 19 ]
65  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
66  * and back.
67  */
68 #define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
69 #define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)
70 #define TASK_NICE(p)            PRIO_TO_NICE((p)->static_prio)
71
72 /*
73  * 'User priority' is the nice value converted to something we
74  * can work with better when scaling various scheduler parameters;
75  * it's a [ 0 ... 39 ] range.
76  */
77 #define USER_PRIO(p)            ((p)-MAX_RT_PRIO)
78 #define TASK_USER_PRIO(p)       USER_PRIO((p)->static_prio)
79 #define MAX_USER_PRIO           (USER_PRIO(MAX_PRIO))
80
81 /*
82  * Some helpers for converting nanosecond timing to jiffy resolution
83  */
84 #define NS_TO_JIFFIES(TIME)     ((TIME) / (1000000000 / HZ))
85 #define JIFFIES_TO_NS(TIME)     ((TIME) * (1000000000 / HZ))
86
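/*
 * Worked example (illustrative, assuming HZ=1000): one jiffy is then
 * 1,000,000 ns, so NS_TO_JIFFIES(2500000) == 2 and
 * JIFFIES_TO_NS(100) == 100,000,000 ns (100 ms).
 */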
87 /*
88  * These are the 'tuning knobs' of the scheduler:
89  *
90  * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
91  * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
92  * Timeslices get refilled after they expire.
93  */
94 #define MIN_TIMESLICE           max(5 * HZ / 1000, 1)
95 #define DEF_TIMESLICE           (100 * HZ / 1000)
96 #define ON_RUNQUEUE_WEIGHT       30
97 #define CHILD_PENALTY            95
98 #define PARENT_PENALTY          100
99 #define EXIT_WEIGHT               3
100 #define PRIO_BONUS_RATIO         25
101 #define MAX_BONUS               (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
102 #define INTERACTIVE_DELTA         2
103 #define MAX_SLEEP_AVG           (DEF_TIMESLICE * MAX_BONUS)
104 #define STARVATION_LIMIT        (MAX_SLEEP_AVG)
105 #define NS_MAX_SLEEP_AVG        (JIFFIES_TO_NS(MAX_SLEEP_AVG))
106 #define CREDIT_LIMIT            100
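/*
 * For illustration, assuming HZ=1000 and the usual MAX_RT_PRIO == 100,
 * MAX_PRIO == 140: MAX_USER_PRIO is 40, so MAX_BONUS = 40 * 25 / 100
 * = 10, MAX_SLEEP_AVG = 100 * 10 = 1000 jiffies (one second), and
 * NS_MAX_SLEEP_AVG = 1,000,000,000 ns.
 */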
107
108 /*
109  * If a task is 'interactive' then we reinsert it in the active
110  * array after it has expired its current timeslice. (it will not
111  * continue to run immediately, it will still round-robin with
112  * other interactive tasks.)
113  *
114  * This part scales the interactivity limit depending on niceness.
115  *
116  * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
117  * Here are a few examples of different nice levels:
118  *
119  *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
120  *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
121  *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
122  *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
123  *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
124  *
125  * (the X axis represents the possible -5 ... 0 ... +5 dynamic
126  *  priority range a task can explore, a value of '1' means the
127  *  task is rated interactive.)
128  *
129  * I.e. nice +19 tasks can never get 'interactive' enough to be
130  * reinserted into the active array. And only heavy CPU-hog nice -20
131  * tasks will be expired. Default nice 0 tasks are somewhere in between:
132  * it takes some effort for them to get interactive, but it's not
133  * too hard.
134  */
135
136 #define CURRENT_BONUS(p) \
137         (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
138                 MAX_SLEEP_AVG)
139
140 #ifdef CONFIG_SMP
141 #define TIMESLICE_GRANULARITY(p)        (MIN_TIMESLICE * \
142                 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
143                         num_online_cpus())
144 #else
145 #define TIMESLICE_GRANULARITY(p)        (MIN_TIMESLICE * \
146                 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
147 #endif
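/*
 * Example (assuming HZ=1000 on a 2-CPU SMP box): a maximally
 * interactive task (CURRENT_BONUS == MAX_BONUS) gets a granularity of
 * MIN_TIMESLICE * (1 << 0) * 2 = 10 ms, while a task with
 * CURRENT_BONUS == 5 gets MIN_TIMESLICE * (1 << 4) * 2 = 160 ms.
 */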
148
149 #define SCALE(v1,v1_max,v2_max) \
150         (v1) * (v2_max) / (v1_max)
151
152 #define DELTA(p) \
153         (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA)
154
155 #define TASK_INTERACTIVE(p) \
156         ((p)->prio <= (p)->static_prio - DELTA(p))
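/*
 * Worked example for a nice 0 task: DELTA(p) = SCALE(0, 40, 10) +
 * INTERACTIVE_DELTA = 2, so the task counts as interactive once its
 * dynamic priority is at least 2 below its static priority, i.e. once
 * CURRENT_BONUS(p) reaches 7 out of MAX_BONUS == 10.
 */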
157
158 #define INTERACTIVE_SLEEP(p) \
159         (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
160                 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
161
162 #define HIGH_CREDIT(p) \
163         ((p)->interactive_credit > CREDIT_LIMIT)
164
165 #define LOW_CREDIT(p) \
166         ((p)->interactive_credit < -CREDIT_LIMIT)
167
168 #ifdef CONFIG_CKRM_CPU_SCHEDULE
169 /*
170  *  if the tasks belong to different classes, compare class priority,
171  *  otherwise compare task priority
172  */
173 #define TASK_PREEMPTS_CURR(p, rq) \
174         ( ((p)->cpu_class != (rq)->curr->cpu_class) \
175           && ((rq)->curr != (rq)->idle) && ((p) != (rq)->idle )) \
176           ? class_preempts_curr((p),(rq)->curr)  \
177           : ((p)->prio < (rq)->curr->prio)
178 #else
179 #define TASK_PREEMPTS_CURR(p, rq) \
180         ((p)->prio < (rq)->curr->prio)
181 #endif
182
183 /*
184  * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
185  * to time slice values: [800ms ... 100ms ... 5ms]
186  *
187  * The higher a thread's priority, the bigger timeslices
188  * it gets during one round of execution. But even the lowest
189  * priority thread gets MIN_TIMESLICE worth of execution time.
190  */
191
192 #define SCALE_PRIO(x, prio) \
193         max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
194
195 unsigned int task_timeslice(task_t *p)
196 {
197         if (p->static_prio < NICE_TO_PRIO(0))
198                 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
199         else
200                 return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
201 }
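/*
 * For example, with HZ=1000: nice -20 (static_prio 100) gets
 * SCALE_PRIO(400, 100) = 800 ms, nice 0 (static_prio 120) gets
 * SCALE_PRIO(100, 120) = 100 ms, and nice +19 (static_prio 139)
 * bottoms out at MIN_TIMESLICE = 5 ms.
 */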
202 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran)       \
203                                 < (long long) (sd)->cache_hot_time)
204
205 enum idle_type
206 {
207         IDLE,
208         NOT_IDLE,
209         NEWLY_IDLE,
210         MAX_IDLE_TYPES
211 };
212
213 struct sched_domain;
214
215 /*
216  * These are the runqueue data structures:
217  */
218
219 typedef struct runqueue runqueue_t;
220 #include <linux/ckrm_classqueue.h>
221 #include <linux/ckrm_sched.h>
222
223 /*
224  * This is the main, per-CPU runqueue data structure.
225  *
226  * Locking rule: code that needs to lock multiple runqueues (such as
227  * the load balancing or the thread migration code) must acquire the
228  * locks in ascending &runqueue order.
229  */
230 struct runqueue {
231         spinlock_t lock;
232
233         /*
234          * nr_running and cpu_load should be in the same cacheline because
235          * remote CPUs use both these fields when doing load calculation.
236          */
237         unsigned long nr_running;
238 #ifdef CONFIG_SMP
239         unsigned long cpu_load;
240 #endif
241         unsigned long long nr_switches;
242         unsigned long expired_timestamp, nr_uninterruptible;
243         unsigned long long timestamp_last_tick;
244         task_t *curr, *idle;
245         struct mm_struct *prev_mm;
246 #ifdef CONFIG_CKRM_CPU_SCHEDULE
247         struct classqueue_struct classqueue;   
248         ckrm_load_t ckrm_load;
249 #else
250         prio_array_t *active, *expired, arrays[2];
251 #endif
252         int best_expired_prio;
253         atomic_t nr_iowait;
254
255 #ifdef CONFIG_SMP
256         struct sched_domain *sd;
257
258         /* For active balancing */
259         int active_balance;
260         int push_cpu;
261
262         task_t *migration_thread;
263         struct list_head migration_queue;
264 #endif
265
266 #ifdef CONFIG_VSERVER_HARDCPU
267         struct list_head hold_queue;
268         int idle_tokens;
269 #endif
270
271 #ifdef CONFIG_SCHEDSTATS
272         /* latency stats */
273         struct sched_info rq_sched_info;
274
275         /* sys_sched_yield() stats */
276         unsigned long yld_exp_empty;
277         unsigned long yld_act_empty;
278         unsigned long yld_both_empty;
279         unsigned long yld_cnt;
280
281         /* schedule() stats */
282         unsigned long sched_noswitch;
283         unsigned long sched_switch;
284         unsigned long sched_cnt;
285         unsigned long sched_goidle;
286
287         /* pull_task() stats */
288         unsigned long pt_gained[MAX_IDLE_TYPES];
289         unsigned long pt_lost[MAX_IDLE_TYPES];
290
291         /* active_load_balance() stats */
292         unsigned long alb_cnt;
293         unsigned long alb_lost;
294         unsigned long alb_gained;
295         unsigned long alb_failed;
296
297         /* try_to_wake_up() stats */
298         unsigned long ttwu_cnt;
299         unsigned long ttwu_attempts;
300         unsigned long ttwu_moved;
301
302         /* wake_up_new_task() stats */
303         unsigned long wunt_cnt;
304         unsigned long wunt_moved;
305
306         /* sched_migrate_task() stats */
307         unsigned long smt_cnt;
308
309         /* sched_balance_exec() stats */
310         unsigned long sbe_cnt;
311 #endif
312 };
313
314 static DEFINE_PER_CPU(struct runqueue, runqueues);
315
316 /*
317  * sched-domains (multiprocessor balancing) declarations:
318  */
319 #ifdef CONFIG_SMP
320 #define SCHED_LOAD_SCALE        128UL   /* increase resolution of load */
321
322 #define SD_BALANCE_NEWIDLE      1       /* Balance when about to become idle */
323 #define SD_BALANCE_EXEC         2       /* Balance on exec */
324 #define SD_WAKE_IDLE            4       /* Wake to idle CPU on task wakeup */
325 #define SD_WAKE_AFFINE          8       /* Wake task to waking CPU */
326 #define SD_WAKE_BALANCE         16      /* Perform balancing at task wakeup */
327 #define SD_SHARE_CPUPOWER       32      /* Domain members share cpu power */
328
329 struct sched_group {
330         struct sched_group *next;       /* Must be a circular list */
331         cpumask_t cpumask;
332
333         /*
334          * CPU power of this group, SCHED_LOAD_SCALE being max power for a
335          * single CPU. This is read only (except for setup, hotplug CPU).
336          */
337         unsigned long cpu_power;
338 };
339
340 struct sched_domain {
341         /* These fields must be setup */
342         struct sched_domain *parent;    /* top domain must be null terminated */
343         struct sched_group *groups;     /* the balancing groups of the domain */
344         cpumask_t span;                 /* span of all CPUs in this domain */
345         unsigned long min_interval;     /* Minimum balance interval ms */
346         unsigned long max_interval;     /* Maximum balance interval ms */
347         unsigned int busy_factor;       /* less balancing by factor if busy */
348         unsigned int imbalance_pct;     /* No balance until over watermark */
349         unsigned long long cache_hot_time; /* Task considered cache hot (ns) */
350         unsigned int cache_nice_tries;  /* Leave cache hot tasks for # tries */
351         unsigned int per_cpu_gain;      /* CPU % gained by adding domain cpus */
352         int flags;                      /* See SD_* */
353
354         /* Runtime fields. */
355         unsigned long last_balance;     /* init to jiffies. units in jiffies */
356         unsigned int balance_interval;  /* initialise to 1. units in ms. */
357         unsigned int nr_balance_failed; /* initialise to 0 */
358
359 #ifdef CONFIG_SCHEDSTATS
360         /* load_balance() stats */
361         unsigned long lb_cnt[MAX_IDLE_TYPES];
362         unsigned long lb_failed[MAX_IDLE_TYPES];
363         unsigned long lb_imbalance[MAX_IDLE_TYPES];
364         unsigned long lb_nobusyg[MAX_IDLE_TYPES];
365         unsigned long lb_nobusyq[MAX_IDLE_TYPES];
366
367         /* sched_balance_exec() stats */
368         unsigned long sbe_attempts;
369         unsigned long sbe_pushed;
370
371         /* try_to_wake_up() stats */
372         unsigned long ttwu_wake_affine;
373         unsigned long ttwu_wake_balance;
374 #endif
375 };
376
377 #ifndef ARCH_HAS_SCHED_TUNE
378 #ifdef CONFIG_SCHED_SMT
379 #define ARCH_HAS_SCHED_WAKE_IDLE
380 /* Common values for SMT siblings */
381 #define SD_SIBLING_INIT (struct sched_domain) {         \
382         .span                   = CPU_MASK_NONE,        \
383         .parent                 = NULL,                 \
384         .groups                 = NULL,                 \
385         .min_interval           = 1,                    \
386         .max_interval           = 2,                    \
387         .busy_factor            = 8,                    \
388         .imbalance_pct          = 110,                  \
389         .cache_hot_time         = 0,                    \
390         .cache_nice_tries       = 0,                    \
391         .per_cpu_gain           = 25,                   \
392         .flags                  = SD_BALANCE_NEWIDLE    \
393                                 | SD_BALANCE_EXEC       \
394                                 | SD_WAKE_AFFINE        \
395                                 | SD_WAKE_IDLE          \
396                                 | SD_SHARE_CPUPOWER,    \
397         .last_balance           = jiffies,              \
398         .balance_interval       = 1,                    \
399         .nr_balance_failed      = 0,                    \
400 }
401 #endif
402
403 /* Common values for CPUs */
404 #define SD_CPU_INIT (struct sched_domain) {             \
405         .span                   = CPU_MASK_NONE,        \
406         .parent                 = NULL,                 \
407         .groups                 = NULL,                 \
408         .min_interval           = 1,                    \
409         .max_interval           = 4,                    \
410         .busy_factor            = 64,                   \
411         .imbalance_pct          = 125,                  \
412         .cache_hot_time         = cache_decay_ticks*1000000 ? : (5*1000000/2),\
413         .cache_nice_tries       = 1,                    \
414         .per_cpu_gain           = 100,                  \
415         .flags                  = SD_BALANCE_NEWIDLE    \
416                                 | SD_BALANCE_EXEC       \
417                                 | SD_WAKE_AFFINE        \
418                                 | SD_WAKE_BALANCE,      \
419         .last_balance           = jiffies,              \
420         .balance_interval       = 1,                    \
421         .nr_balance_failed      = 0,                    \
422 }
423
424 /* Arch can override this macro in processor.h */
425 #if defined(CONFIG_NUMA) && !defined(SD_NODE_INIT)
426 #define SD_NODE_INIT (struct sched_domain) {            \
427         .span                   = CPU_MASK_NONE,        \
428         .parent                 = NULL,                 \
429         .groups                 = NULL,                 \
430         .min_interval           = 8,                    \
431         .max_interval           = 32,                   \
432         .busy_factor            = 32,                   \
433         .imbalance_pct          = 125,                  \
434         .cache_hot_time         = (10*1000000),         \
435         .cache_nice_tries       = 1,                    \
436         .per_cpu_gain           = 100,                  \
437         .flags                  = SD_BALANCE_EXEC       \
438                                 | SD_WAKE_BALANCE,      \
439         .last_balance           = jiffies,              \
440         .balance_interval       = 1,                    \
441         .nr_balance_failed      = 0,                    \
442 }
443 #endif
444 #endif /* ARCH_HAS_SCHED_TUNE */
445 #endif
446
447
448 #define for_each_domain(cpu, domain) \
449         for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
450
451 #define cpu_rq(cpu)             (&per_cpu(runqueues, (cpu)))
452 #define this_rq()               (&__get_cpu_var(runqueues))
453 #define task_rq(p)              cpu_rq(task_cpu(p))
454 #define cpu_curr(cpu)           (cpu_rq(cpu)->curr)
455
456 /*
457  * Default context-switch locking:
458  */
459 #ifndef prepare_arch_switch
460 # define prepare_arch_switch(rq, next)  do { } while (0)
461 # define finish_arch_switch(rq, next)   spin_unlock_irq(&(rq)->lock)
462 # define task_running(rq, p)            ((rq)->curr == (p))
463 #endif
464
465 /*
466  * task_rq_lock - lock the runqueue a given task resides on and disable
467  * interrupts.  Note the ordering: we can safely lookup the task_rq without
468  * explicitly disabling preemption.
469  */
470 static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
471 {
472         struct runqueue *rq;
473
474 repeat_lock_task:
475         local_irq_save(*flags);
476         rq = task_rq(p);
477         spin_lock(&rq->lock);
478         if (unlikely(rq != task_rq(p))) {
479                 spin_unlock_irqrestore(&rq->lock, *flags);
480                 goto repeat_lock_task;
481         }
482         return rq;
483 }
484
485 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
486 {
487         spin_unlock_irqrestore(&rq->lock, *flags);
488 }
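/*
 * Typical usage pattern (see e.g. sched_exit() further down):
 *
 *      unsigned long flags;
 *      runqueue_t *rq = task_rq_lock(p, &flags);
 *      ... operate on rq with interrupts disabled ...
 *      task_rq_unlock(rq, &flags);
 */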
489
490 #ifdef CONFIG_SCHEDSTATS
491 /*
492  * bump this up when changing the output format or the meaning of an existing
493  * format, so that tools can adapt (or abort)
494  */
495 #define SCHEDSTAT_VERSION 10
496
497 static int show_schedstat(struct seq_file *seq, void *v)
498 {
499         int cpu;
500         enum idle_type itype;
501
502         seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
503         seq_printf(seq, "timestamp %lu\n", jiffies);
504         for_each_online_cpu(cpu) {
505                 runqueue_t *rq = cpu_rq(cpu);
506 #ifdef CONFIG_SMP
507                 struct sched_domain *sd;
508                 int dcnt = 0;
509 #endif
510
511                 /* runqueue-specific stats */
512                 seq_printf(seq,
513                     "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu "
514                     "%lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
515                     cpu, rq->yld_both_empty,
516                     rq->yld_act_empty, rq->yld_exp_empty,
517                     rq->yld_cnt, rq->sched_noswitch,
518                     rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
519                     rq->alb_cnt, rq->alb_gained, rq->alb_lost,
520                     rq->alb_failed,
521                     rq->ttwu_cnt, rq->ttwu_moved, rq->ttwu_attempts,
522                     rq->wunt_cnt, rq->wunt_moved,
523                     rq->smt_cnt, rq->sbe_cnt, rq->rq_sched_info.cpu_time,
524                     rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
525
526                 for (itype = IDLE; itype < MAX_IDLE_TYPES; itype++)
527                         seq_printf(seq, " %lu %lu", rq->pt_gained[itype],
528                                                     rq->pt_lost[itype]);
529                 seq_printf(seq, "\n");
530
531 #ifdef CONFIG_SMP
532                 /* domain-specific stats */
533                 for_each_domain(cpu, sd) {
534                         char mask_str[NR_CPUS];
535
536                         cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
537                         seq_printf(seq, "domain%d %s", dcnt++, mask_str);
538                         for (itype = IDLE; itype < MAX_IDLE_TYPES; itype++) {
539                                 seq_printf(seq, " %lu %lu %lu %lu %lu",
540                                     sd->lb_cnt[itype],
541                                     sd->lb_failed[itype],
542                                     sd->lb_imbalance[itype],
543                                     sd->lb_nobusyq[itype],
544                                     sd->lb_nobusyg[itype]);
545                         }
546                         seq_printf(seq, " %lu %lu %lu %lu\n",
547                             sd->sbe_pushed, sd->sbe_attempts,
548                             sd->ttwu_wake_affine, sd->ttwu_wake_balance);
549                 }
550 #endif
551         }
552         return 0;
553 }
554
555 static int schedstat_open(struct inode *inode, struct file *file)
556 {
557         unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
558         char *buf = kmalloc(size, GFP_KERNEL);
559         struct seq_file *m;
560         int res;
561
562         if (!buf)
563                 return -ENOMEM;
564         res = single_open(file, show_schedstat, NULL);
565         if (!res) {
566                 m = file->private_data;
567                 m->buf = buf;
568                 m->size = size;
569         } else
570                 kfree(buf);
571         return res;
572 }
573
574 struct file_operations proc_schedstat_operations = {
575         .open    = schedstat_open,
576         .read    = seq_read,
577         .llseek  = seq_lseek,
578         .release = single_release,
579 };
580
581 # define schedstat_inc(rq, field)       rq->field++;
582 # define schedstat_add(rq, field, amt)  rq->field += amt;
583 #else /* !CONFIG_SCHEDSTATS */
584 # define schedstat_inc(rq, field)       do { } while (0);
585 # define schedstat_add(rq, field, amt)  do { } while (0);
586 #endif
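/*
 * For example, schedstat_inc(rq, ttwu_cnt) expands to rq->ttwu_cnt++;
 * when CONFIG_SCHEDSTATS is set, and to a no-op otherwise.
 */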
587
588 /*
589  * this_rq_lock - lock the current CPU's runqueue and disable interrupts.
590  */
591 static runqueue_t *this_rq_lock(void)
592 {
593         runqueue_t *rq;
594
595         local_irq_disable();
596         rq = this_rq();
597         spin_lock(&rq->lock);
598
599         return rq;
600 }
601
602 static inline void rq_unlock(runqueue_t *rq)
603 {
604         spin_unlock_irq(&rq->lock);
605 }
606
607 #ifdef CONFIG_SCHEDSTATS
608 /*
609  * Called when a process is dequeued from the active array and given
610  * the cpu.  We should note that with the exception of interactive
611  * tasks, the expired queue will become the active queue after the active
612  * queue is empty, without explicitly dequeuing and requeuing tasks in the
613  * expired queue.  (Interactive tasks may be requeued directly to the
614  * active queue, thus delaying tasks in the expired queue from running;
615  * see scheduler_tick()).
616  *
617  * This function is only called from sched_info_arrive(), rather than
618  * dequeue_task(). Even though a task may be queued and dequeued multiple
619  * times as it is shuffled about, we're really interested in knowing how
620  * long it was from the *first* time it was queued to the time that it
621  * finally hit a cpu.
622  */
623 static inline void sched_info_dequeued(task_t *t)
624 {
625         t->sched_info.last_queued = 0;
626 }
627
628 /*
629  * Called when a task finally hits the cpu.  We can now calculate how
630  * long it was waiting to run.  We also note when it began so that we
631  * can keep stats on how long its timeslice is.
632  */
633 static inline void sched_info_arrive(task_t *t)
634 {
635         unsigned long now = jiffies, diff = 0;
636         struct runqueue *rq = task_rq(t);
637
638         if (t->sched_info.last_queued)
639                 diff = now - t->sched_info.last_queued;
640         sched_info_dequeued(t);
641         t->sched_info.run_delay += diff;
642         t->sched_info.last_arrival = now;
643         t->sched_info.pcnt++;
644
645         if (!rq)
646                 return;
647
648         rq->rq_sched_info.run_delay += diff;
649         rq->rq_sched_info.pcnt++;
650 }
651
652 /*
653  * Called when a process is queued into either the active or expired
654  * array.  The time is noted and later used to determine how long the
655  * task had to wait to reach the cpu.  Since the expired queue will
656  * become the active queue after the active queue is empty, without dequeuing
657  * and requeuing any tasks, we are interested in queuing to either. It
658  * is unusual but not impossible for tasks to be dequeued and immediately
659  * requeued in the same or another array: this can happen in sched_yield(),
660  * set_user_nice(), and even load_balance() as it moves tasks from runqueue
661  * to runqueue.
662  *
663  * This function is only called from enqueue_task(), and it only updates
664  * the timestamp if it is not already set.  It's assumed that
665  * sched_info_dequeued() will clear that stamp when appropriate.
666  */
667 static inline void sched_info_queued(task_t *t)
668 {
669         if (!t->sched_info.last_queued)
670                 t->sched_info.last_queued = jiffies;
671 }
672
673 /*
674  * Called when a process ceases being the active-running process, either
675  * voluntarily or involuntarily.  Now we can calculate how long we ran.
676  */
677 static inline void sched_info_depart(task_t *t)
678 {
679         struct runqueue *rq = task_rq(t);
680         unsigned long diff = jiffies - t->sched_info.last_arrival;
681
682         t->sched_info.cpu_time += diff;
683
684         if (rq)
685                 rq->rq_sched_info.cpu_time += diff;
686 }
687
688 /*
689  * Called when tasks are switched involuntarily, typically due to expiring
690  * their time slice.  (This may also be called when switching to or from
691  * the idle task.)  We are only called when prev != next.
692  */
693 static inline void sched_info_switch(task_t *prev, task_t *next)
694 {
695         struct runqueue *rq = task_rq(prev);
696
697         /*
698          * prev now departs the cpu.  It's not interesting to record
699          * stats about how efficient we were at scheduling the idle
700          * process, however.
701          */
702         if (prev != rq->idle)
703                 sched_info_depart(prev);
704
705         if (next != rq->idle)
706                 sched_info_arrive(next);
707 }
708 #else
709 #define sched_info_queued(t)            do { } while (0)
710 #define sched_info_switch(t, next)      do { } while (0)
711 #endif /* CONFIG_SCHEDSTATS */
712
713 #ifdef CONFIG_CKRM_CPU_SCHEDULE
714 static inline ckrm_lrq_t *rq_get_next_class(struct runqueue *rq)
715 {
716         cq_node_t *node = classqueue_get_head(&rq->classqueue);
717         return ((node) ? class_list_entry(node) : NULL);
718 }
719
720 /*
721  * Return the cvt of the currently running class;
722  * if there is no currently running class, return 0.
723  * Assumes the cpu is valid (cpu_online(cpu) == 1).
724  */
725 CVT_t get_local_cur_cvt(int cpu)
726 {
727         ckrm_lrq_t * lrq = rq_get_next_class(cpu_rq(cpu));
728
729         if (lrq)
730                 return lrq->local_cvt;
731         else    
732                 return 0;
733 }
734
735 static inline struct task_struct * rq_get_next_task(struct runqueue* rq) 
736 {
737         prio_array_t               *array;
738         struct task_struct         *next;
739         ckrm_lrq_t *queue;
740         int idx;
741         int cpu = smp_processor_id();
742
743         // it is guaranteed by the ( rq->nr_running > 0 ) check in
744         // schedule() that a task will be found.
745
746  retry_next_class:
747         queue = rq_get_next_class(rq);
748         // BUG_ON( !queue );
749
750         array = queue->active;
751         if (unlikely(!array->nr_active)) {
752                 queue->active = queue->expired;
753                 queue->expired = array;
754                 queue->expired_timestamp = 0;
755
756                 schedstat_inc(rq, sched_switch);
757                 if (queue->active->nr_active)
758                         set_top_priority(queue,
759                                          find_first_bit(queue->active->bitmap, MAX_PRIO));
760                 else {
761                         classqueue_dequeue(queue->classqueue,
762                                            &queue->classqueue_linkobj);
763                         cpu_demand_event(get_rq_local_stat(queue,cpu),CPU_DEMAND_DEQUEUE,0);
764                 }
765                 goto retry_next_class;                          
766         } else
767                 schedstat_inc(rq, sched_noswitch);
768         // BUG_ON(!array->nr_active);
769
770         idx = queue->top_priority;
771         // BUG_ON (idx == MAX_PRIO);
772         next = task_list_entry(array->queue[idx].next);
773         return next;
774 }
775 #else /*! CONFIG_CKRM_CPU_SCHEDULE*/
776 static inline struct task_struct * rq_get_next_task(struct runqueue* rq) 
777 {
778         prio_array_t *array;
779         struct list_head *queue;
780         int idx;
781
782         array = rq->active;
783         if (unlikely(!array->nr_active)) {
784                 /*
785                  * Switch the active and expired arrays.
786                  */
787                 schedstat_inc(rq, sched_switch);
788                 rq->active = rq->expired;
789                 rq->expired = array;
790                 array = rq->active;
791                 rq->expired_timestamp = 0;
792                 rq->best_expired_prio = MAX_PRIO;
793         } else 
794                 schedstat_inc(rq, sched_noswitch);
795
796         idx = sched_find_first_bit(array->bitmap);
797         queue = array->queue + idx;
798         return list_entry(queue->next, task_t, run_list);
799 }
800
801 static inline void class_enqueue_task(struct task_struct* p, prio_array_t *array) { }
802 static inline void class_dequeue_task(struct task_struct* p, prio_array_t *array) { }
803 static inline void init_cpu_classes(void) { }
804 #define rq_ckrm_load(rq) NULL
805 static inline void ckrm_sched_tick(int j,int this_cpu,void* name) {}
806 #endif  /* CONFIG_CKRM_CPU_SCHEDULE */
807
808 /*
809  * Adding/removing a task to/from a priority array:
810  */
811 static void dequeue_task(struct task_struct *p, prio_array_t *array)
812 {
813         array->nr_active--;
814         list_del(&p->run_list);
815         if (list_empty(array->queue + p->prio))
816                 __clear_bit(p->prio, array->bitmap);
817         class_dequeue_task(p,array);
818 }
819
820 static void enqueue_task(struct task_struct *p, prio_array_t *array)
821 {
822         sched_info_queued(p);
823         list_add_tail(&p->run_list, array->queue + p->prio);
824         __set_bit(p->prio, array->bitmap);
825         array->nr_active++;
826         p->array = array;
827         class_enqueue_task(p,array);
828 }
829
830 /*
831  * Used by the migration code - we pull tasks from the head of the
832  * remote queue so we want these tasks to show up at the head of the
833  * local queue:
834  */
835 static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
836 {
837         list_add(&p->run_list, array->queue + p->prio);
838         __set_bit(p->prio, array->bitmap);
839         array->nr_active++;
840         p->array = array;
841         class_enqueue_task(p,array);
842 }
843
844 /*
845  * effective_prio - return the priority that is based on the static
846  * priority but is modified by bonuses/penalties.
847  *
848  * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
849  * into the -5 ... 0 ... +5 bonus/penalty range.
850  *
851  * We use 25% of the full 0...39 priority range so that:
852  *
853  * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
854  * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
855  *
856  * Both properties are important to certain workloads.
857  */
858 static int effective_prio(task_t *p)
859 {
860         int bonus, prio;
861
862         if (rt_task(p))
863                 return p->prio;
864
865         bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
866
867         prio = p->static_prio - bonus;
868
869 #ifdef CONFIG_VSERVER_HARDCPU
870         if (task_vx_flags(p, VXF_SCHED_PRIO, 0))
871                 prio += effective_vavavoom(p, MAX_USER_PRIO);
872 #endif
873
874         if (prio < MAX_RT_PRIO)
875                 prio = MAX_RT_PRIO;
876         if (prio > MAX_PRIO-1)
877                 prio = MAX_PRIO-1;
878         return prio;
879 }
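/*
 * Worked example (assuming HZ=1000): a nice 0 task (static_prio 120)
 * with a sleep_avg of 500 ms has CURRENT_BONUS 5, so bonus == 0 and
 * it keeps prio 120; with sleep_avg near the 1000 ms maximum the
 * bonus becomes +5 and prio improves to 115.
 */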
880
881 /*
882  * __activate_task - move a task to the runqueue.
883  */
884 static inline void __activate_task(task_t *p, runqueue_t *rq)
885 {
886         enqueue_task(p, rq_active(p,rq));
887         rq->nr_running++;
888 }
889
890 /*
891  * __activate_idle_task - move idle task to the _front_ of runqueue.
892  */
893 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
894 {
895         enqueue_task_head(p, rq_active(p,rq));
896         rq->nr_running++;
897 }
898
899 static void recalc_task_prio(task_t *p, unsigned long long now)
900 {
901         unsigned long long __sleep_time = now - p->timestamp;
902         unsigned long sleep_time;
903
904         if (__sleep_time > NS_MAX_SLEEP_AVG)
905                 sleep_time = NS_MAX_SLEEP_AVG;
906         else
907                 sleep_time = (unsigned long)__sleep_time;
908
909         if (likely(sleep_time > 0)) {
910                 /*
911                  * User tasks that sleep a long time are categorised as
912                  * idle and will get just interactive status to stay active and
913                  * to prevent them from suddenly becoming cpu hogs and starving
914                  * other processes.
915                  */
916                 if (p->mm && p->activated != -1 &&
917                         sleep_time > INTERACTIVE_SLEEP(p)) {
918                                 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
919                                                 DEF_TIMESLICE);
920                                 if (!HIGH_CREDIT(p))
921                                         p->interactive_credit++;
922                 } else {
923                         /*
924                          * The lower the sleep avg a task has the more
925                          * rapidly it will rise with sleep time.
926                          */
927                         sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
928
929                         /*
930                          * Tasks with low interactive_credit are limited to
931                          * one timeslice worth of sleep avg bonus.
932                          */
933                         if (LOW_CREDIT(p) &&
934                             sleep_time > JIFFIES_TO_NS(task_timeslice(p)))
935                                 sleep_time = JIFFIES_TO_NS(task_timeslice(p));
936
937                         /*
938                          * Non high_credit tasks waking from uninterruptible
939                          * sleep are limited in their sleep_avg rise as they
940                          * are likely to be cpu hogs waiting on I/O
941                          */
942                         if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm) {
943                                 if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
944                                         sleep_time = 0;
945                                 else if (p->sleep_avg + sleep_time >=
946                                                 INTERACTIVE_SLEEP(p)) {
947                                         p->sleep_avg = INTERACTIVE_SLEEP(p);
948                                         sleep_time = 0;
949                                 }
950                         }
951
952                         /*
953                          * This code gives a bonus to interactive tasks.
954                          *
955                          * The boost works by updating the 'average sleep time'
956                          * value here, based on ->timestamp. The more time a
957                          * task spends sleeping, the higher the average gets -
958                          * and the higher the priority boost gets as well.
959                          */
960                         p->sleep_avg += sleep_time;
961
962                         if (p->sleep_avg > NS_MAX_SLEEP_AVG) {
963                                 p->sleep_avg = NS_MAX_SLEEP_AVG;
964                                 if (!HIGH_CREDIT(p))
965                                         p->interactive_credit++;
966                         }
967                 }
968         }
969
970         p->prio = effective_prio(p);
971 }
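/*
 * To illustrate the scaling above: a task with an empty sleep_avg
 * (CURRENT_BONUS == 0) has each measured sleep_time multiplied by
 * MAX_BONUS == 10, while a task already at the maximum bonus has the
 * factor clamped to 1, so its average rises much more slowly.
 */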
972
973 /*
974  * activate_task - move a task to the runqueue and do priority recalculation
975  *
976  * Update all the scheduling statistics stuff. (sleep average
977  * calculation, priority modifiers, etc.)
978  */
979 static void activate_task(task_t *p, runqueue_t *rq, int local)
980 {
981         unsigned long long now;
982
983         now = sched_clock();
984 #ifdef CONFIG_SMP
985         if (!local) {
986                 /* Compensate for drifting sched_clock */
987                 runqueue_t *this_rq = this_rq();
988                 now = (now - this_rq->timestamp_last_tick)
989                         + rq->timestamp_last_tick;
990         }
991 #endif
992
993         recalc_task_prio(p, now);
994
995         /*
996          * This checks to make sure it's not an uninterruptible task
997          * that is now waking up.
998          */
999         if (!p->activated) {
1000                 /*
1001                  * Tasks which were woken up by interrupts (ie. hw events)
1002                  * are most likely of interactive nature. So we give them
1003                  * the credit of extending their sleep time to the period
1004                  * of time they spend on the runqueue, waiting for execution
1005                  * on a CPU, first time around:
1006                  */
1007                 if (in_interrupt())
1008                         p->activated = 2;
1009                 else {
1010                         /*
1011                          * Normal first-time wakeups get a credit too for
1012                          * on-runqueue time, but it will be weighted down:
1013                          */
1014                         p->activated = 1;
1015                 }
1016         }
1017         p->timestamp = now;
1018
1019         vx_activate_task(p);
1020         __activate_task(p, rq);
1021 }
1022
1023 /*
1024  * deactivate_task - remove a task from the runqueue.
1025  */
1026 static void __deactivate_task(struct task_struct *p, runqueue_t *rq)
1027 {
1028         rq->nr_running--;
1029         if (p->state == TASK_UNINTERRUPTIBLE)
1030                 rq->nr_uninterruptible++;
1031         dequeue_task(p, p->array);
1032
1033         p->array = NULL;
1034 }
1035
1036 static void deactivate_task(struct task_struct *p, runqueue_t *rq)
1037 {
1038         __deactivate_task(p, rq);
1039         vx_deactivate_task(p);
1040 }
1041
1042 /*
1043  * resched_task - mark a task 'to be rescheduled now'.
1044  *
1045  * On UP this means the setting of the need_resched flag, on SMP it
1046  * might also involve a cross-CPU call to trigger the scheduler on
1047  * the target CPU.
1048  */
1049 #ifdef CONFIG_SMP
1050 static void resched_task(task_t *p)
1051 {
1052         int need_resched, nrpolling;
1053
1054         BUG_ON(!spin_is_locked(&task_rq(p)->lock));
1055
1056         /* minimise the chance of sending an interrupt to poll_idle() */
1057         nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
1058         need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
1059         nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
1060
1061         if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
1062                 smp_send_reschedule(task_cpu(p));
1063 }
1064 #else
1065 static inline void resched_task(task_t *p)
1066 {
1067         set_tsk_need_resched(p);
1068 }
1069 #endif
1070
1071 /**
1072  * task_curr - is this task currently executing on a CPU?
1073  * @p: the task in question.
1074  */
1075 inline int task_curr(const task_t *p)
1076 {
1077         return cpu_curr(task_cpu(p)) == p;
1078 }
1079
1080 #ifdef CONFIG_SMP
1081 enum request_type {
1082         REQ_MOVE_TASK,
1083         REQ_SET_DOMAIN,
1084 };
1085
1086 typedef struct {
1087         struct list_head list;
1088         enum request_type type;
1089
1090         /* For REQ_MOVE_TASK */
1091         task_t *task;
1092         int dest_cpu;
1093
1094         /* For REQ_SET_DOMAIN */
1095         struct sched_domain *sd;
1096
1097         struct completion done;
1098 } migration_req_t;
1099
1100 /*
1101  * The task's runqueue lock must be held.
1102  * Returns true if you have to wait for the migration thread.
1103  */
1104 static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
1105 {
1106         runqueue_t *rq = task_rq(p);
1107
1108         /*
1109          * If the task is not on a runqueue (and not running), then
1110          * it is sufficient to simply update the task's cpu field.
1111          */
1112         if (!p->array && !task_running(rq, p)) {
1113                 set_task_cpu(p, dest_cpu);
1114                 return 0;
1115         }
1116
1117         init_completion(&req->done);
1118         req->type = REQ_MOVE_TASK;
1119         req->task = p;
1120         req->dest_cpu = dest_cpu;
1121         list_add(&req->list, &rq->migration_queue);
1122         return 1;
1123 }
1124
1125 /*
1126  * wait_task_inactive - wait for a thread to unschedule.
1127  *
1128  * The caller must ensure that the task *will* unschedule sometime soon,
1129  * else this function might spin for a *long* time. This function can't
1130  * be called with interrupts off, or it may introduce deadlock with
1131  * smp_call_function() if an IPI is sent by the same process we are
1132  * waiting to become inactive.
1133  */
1134 void wait_task_inactive(task_t * p)
1135 {
1136         unsigned long flags;
1137         runqueue_t *rq;
1138         int preempted;
1139
1140 repeat:
1141         rq = task_rq_lock(p, &flags);
1142         /* Must be off runqueue entirely, not preempted. */
1143         if (unlikely(p->array)) {
1144                 /* If it's preempted, we yield.  It could be a while. */
1145                 preempted = !task_running(rq, p);
1146                 task_rq_unlock(rq, &flags);
1147                 cpu_relax();
1148                 if (preempted)
1149                         yield();
1150                 goto repeat;
1151         }
1152         task_rq_unlock(rq, &flags);
1153 }
1154
1155 /***
1156  * kick_process - kick a running thread to enter/exit the kernel
1157  * @p: the to-be-kicked thread
1158  *
1159  * Cause a process which is running on another CPU to enter
1160  * kernel-mode, without any delay. (to get signals handled.)
1161  */
1162 void kick_process(task_t *p)
1163 {
1164         int cpu;
1165
1166         preempt_disable();
1167         cpu = task_cpu(p);
1168         if ((cpu != smp_processor_id()) && task_curr(p))
1169                 smp_send_reschedule(cpu);
1170         preempt_enable();
1171 }
1172
1173 /*
1174  * Return a low guess at the load of a migration-source cpu.
1175  *
1176  * We want to under-estimate the load of migration sources, to
1177  * balance conservatively.
1178  */
1179 static inline unsigned long source_load(int cpu)
1180 {
1181         runqueue_t *rq = cpu_rq(cpu);
1182         unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
1183
1184         return min(rq->cpu_load, load_now);
1185 }
1186
1187 /*
1188  * Return a high guess at the load of a migration-target cpu
1189  */
1190 static inline unsigned long target_load(int cpu)
1191 {
1192         runqueue_t *rq = cpu_rq(cpu);
1193         unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
1194
1195         return max(rq->cpu_load, load_now);
1196 }
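/*
 * Example: with 3 runnable tasks, load_now = 3 * SCHED_LOAD_SCALE =
 * 384; if the decayed rq->cpu_load is 200, source_load() reports the
 * optimistic 200 while target_load() reports the pessimistic 384.
 */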
1197
1198 #endif
1199
1200 /*
1201  * wake_idle() is useful especially on SMT architectures to wake a
1202  * task onto an idle sibling if we would otherwise wake it onto a
1203  * busy sibling.
1204  *
1205  * Returns the CPU we should wake onto.
1206  */
1207 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1208 static int wake_idle(int cpu, task_t *p)
1209 {
1210         cpumask_t tmp;
1211         runqueue_t *rq = cpu_rq(cpu);
1212         struct sched_domain *sd;
1213         int i;
1214
1215         if (idle_cpu(cpu))
1216                 return cpu;
1217
1218         sd = rq->sd;
1219         if (!(sd->flags & SD_WAKE_IDLE))
1220                 return cpu;
1221
1222         cpus_and(tmp, sd->span, p->cpus_allowed);
1223
1224         for_each_cpu_mask(i, tmp) {
1225                 if (idle_cpu(i))
1226                         return i;
1227         }
1228
1229         return cpu;
1230 }
1231 #else
1232 static inline int wake_idle(int cpu, task_t *p)
1233 {
1234         return cpu;
1235 }
1236 #endif
1237
1238 /***
1239  * try_to_wake_up - wake up a thread
1240  * @p: the to-be-woken-up thread
1241  * @state: the mask of task states that can be woken
1242  * @sync: do a synchronous wakeup?
1243  *
1244  * Put it on the run-queue if it's not already there. The "current"
1245  * thread is always on the run-queue (except when the actual
1246  * re-schedule is in progress), and as such you're allowed to do
1247  * the simpler "current->state = TASK_RUNNING" to mark yourself
1248  * runnable without the overhead of this.
1249  *
1250  * returns failure only if the task is already active.
1251  */
1252 static int try_to_wake_up(task_t * p, unsigned int state, int sync)
1253 {
1254         int cpu, this_cpu, success = 0;
1255         unsigned long flags;
1256         long old_state;
1257         runqueue_t *rq;
1258 #ifdef CONFIG_SMP
1259         unsigned long load, this_load;
1260         struct sched_domain *sd;
1261         int new_cpu;
1262 #endif
1263
1264         rq = task_rq_lock(p, &flags);
1265         schedstat_inc(rq, ttwu_cnt);
1266         old_state = p->state;
1267         if (!(old_state & state))
1268                 goto out;
1269
1270         if (p->array)
1271                 goto out_running;
1272
1273         cpu = task_cpu(p);
1274         this_cpu = smp_processor_id();
1275
1276 #ifdef CONFIG_SMP
1277         if (unlikely(task_running(rq, p)))
1278                 goto out_activate;
1279
1280         new_cpu = cpu;
1281
1282         if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1283                 goto out_set_cpu;
1284
1285         load = source_load(cpu);
1286         this_load = target_load(this_cpu);
1287
1288         /*
1289          * If sync wakeup then subtract the (maximum possible) effect of
1290          * the currently running task from the load of the current CPU:
1291          */
1292         if (sync)
1293                 this_load -= SCHED_LOAD_SCALE;
1294
1295         /* Don't pull the task off an idle CPU to a busy one */
1296         if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
1297                 goto out_set_cpu;
1298
1299         new_cpu = this_cpu; /* Wake to this CPU if we can */
1300
1301         /*
1302          * Scan domains for affine wakeup and passive balancing
1303          * possibilities.
1304          */
1305         for_each_domain(this_cpu, sd) {
1306                 unsigned int imbalance;
1307                 /*
1308                  * Start passive balancing when half the imbalance_pct
1309                  * limit is reached.
1310                  */
1311                 imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
1312
1313                 if ((sd->flags & SD_WAKE_AFFINE) &&
1314                                 !task_hot(p, rq->timestamp_last_tick, sd)) {
1315                         /*
1316                          * This domain has SD_WAKE_AFFINE and p is cache cold
1317                          * in this domain.
1318                          */
1319                         if (cpu_isset(cpu, sd->span)) {
1320                                 schedstat_inc(sd, ttwu_wake_affine);
1321                                 goto out_set_cpu;
1322                         }
1323                 } else if ((sd->flags & SD_WAKE_BALANCE) &&
1324                                 imbalance*this_load <= 100*load) {
1325                         /*
1326                          * This domain has SD_WAKE_BALANCE and there is
1327                          * an imbalance.
1328                          */
1329                         if (cpu_isset(cpu, sd->span)) {
1330                                 schedstat_inc(sd, ttwu_wake_balance);
1331                                 goto out_set_cpu;
1332                         }
1333                 }
1334         }
1335
1336         new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1337 out_set_cpu:
1338         schedstat_inc(rq, ttwu_attempts);
1339         new_cpu = wake_idle(new_cpu, p);
1340         if (new_cpu != cpu && cpu_isset(new_cpu, p->cpus_allowed)) {
1341                 schedstat_inc(rq, ttwu_moved);
1342                 set_task_cpu(p, new_cpu);
1343                 task_rq_unlock(rq, &flags);
1344                 /* might preempt at this point */
1345                 rq = task_rq_lock(p, &flags);
1346                 old_state = p->state;
1347                 if (!(old_state & state))
1348                         goto out;
1349                 if (p->array)
1350                         goto out_running;
1351
1352                 this_cpu = smp_processor_id();
1353                 cpu = task_cpu(p);
1354         }
1355
1356 out_activate:
1357 #endif /* CONFIG_SMP */
1358         if (old_state == TASK_UNINTERRUPTIBLE) {
1359                 rq->nr_uninterruptible--;
1360                 /*
1361                  * Tasks on involuntary sleep don't earn
1362                  * sleep_avg beyond just interactive state.
1363                  */
1364                 p->activated = -1;
1365         }
1366
1367         /*
1368          * Sync wakeups (i.e. those types of wakeups where the waker
1369          * has indicated that it will leave the CPU in short order)
1370          * don't trigger a preemption, if the woken up task will run on
1371          * this cpu. (in this case the 'I will reschedule' promise of
1372          * the waker guarantees that the freshly woken up task is going
1373          * to be considered on this CPU.)
1374          */
1375         activate_task(p, rq, cpu == this_cpu);
1376         if (!sync || cpu != this_cpu) {
1377                 if (TASK_PREEMPTS_CURR(p, rq))
1378                         resched_task(rq->curr);
1379         }
1380         success = 1;
1381
1382 out_running:
1383         p->state = TASK_RUNNING;
1384 out:
1385         task_rq_unlock(rq, &flags);
1386
1387         return success;
1388 }
1389
1390 int fastcall wake_up_process(task_t * p)
1391 {
1392         return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1393                                  TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1394 }
1395
1396 EXPORT_SYMBOL(wake_up_process);
1397
1398 int fastcall wake_up_state(task_t *p, unsigned int state)
1399 {
1400         return try_to_wake_up(p, state, 0);
1401 }
1402
1403 #ifdef CONFIG_SMP
1404 static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1405                            struct sched_domain *sd);
1406 #endif
1407
1408 /*
1409  * Perform scheduler related setup for a newly forked process p.
1410  * p is forked by current.
1411  */
1412 void fastcall sched_fork(task_t *p)
1413 {
1414         /*
1415          * We mark the process as running here, but have not actually
1416          * inserted it onto the runqueue yet. This guarantees that
1417          * nobody will actually run it, and a signal or other external
1418          * event cannot wake it up and insert it on the runqueue either.
1419          */
1420         p->state = TASK_RUNNING;
1421         INIT_LIST_HEAD(&p->run_list);
1422         p->array = NULL;
1423         spin_lock_init(&p->switch_lock);
1424 #ifdef CONFIG_SCHEDSTATS
1425         memset(&p->sched_info, 0, sizeof(p->sched_info));
1426 #endif
1427 #ifdef CONFIG_CKRM_CPU_SCHEDULE
1428         cpu_demand_event(&p->demand_stat,CPU_DEMAND_INIT,0);
1429 #endif
1430 #ifdef CONFIG_PREEMPT
1431         /*
1432          * During context-switch we hold precisely one spinlock, which
1433          * schedule_tail drops. (in the common case it's this_rq()->lock,
1434          * but it also can be p->switch_lock.) So we compensate with a count
1435          * of 1. Also, we want to start with kernel preemption disabled.
1436          */
1437         p->thread_info->preempt_count = 1;
1438 #endif
1439         /*
1440          * Share the timeslice between parent and child, so that the
1441          * total amount of pending timeslices in the system doesn't change,
1442          * resulting in more scheduling fairness.
1443          */
1444         local_irq_disable();
1445         p->time_slice = (current->time_slice + 1) >> 1;
1446         /*
1447          * The remainder of the first timeslice might be recovered by
1448          * the parent if the child exits early enough.
1449          */
1450         p->first_time_slice = 1;
1451         current->time_slice >>= 1;
1452         p->timestamp = sched_clock();
1453         if (unlikely(!current->time_slice)) {
1454                 /*
1455                  * This case is rare, it happens when the parent has only
1456                  * a single jiffy left from its timeslice. Taking the
1457                  * runqueue lock is not a problem.
1458                  */
1459                 current->time_slice = 1;
1460                 preempt_disable();
1461                 scheduler_tick(0, 0);
1462                 local_irq_enable();
1463                 preempt_enable();
1464         } else
1465                 local_irq_enable();
1466 }
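/*
 * Timeslice split example: a parent with 100 jiffies left gives the
 * child (100 + 1) >> 1 = 50 and keeps 100 >> 1 = 50 itself; an odd
 * remainder such as 7 splits into 4 for the child and 3 for the
 * parent.
 */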
1467
1468 /*
1469  * wake_up_new_task - wake up a newly created task for the first time.
1470  *
1471  * This function will do some initial scheduler statistics housekeeping
1472  * that must be done for every newly created context, then puts the task
1473  * on the runqueue and wakes it.
1474  */
1475 void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
1476 {
1477         unsigned long flags;
1478         int this_cpu, cpu;
1479         runqueue_t *rq, *this_rq;
1480
1481         rq = task_rq_lock(p, &flags);
1482         cpu = task_cpu(p);
1483         this_cpu = smp_processor_id();
1484
1485         BUG_ON(p->state != TASK_RUNNING);
1486
1487         schedstat_inc(rq, wunt_cnt);
1488         /*
1489          * We decrease the sleep average of forking parents
1490          * and children as well, to keep max-interactive tasks
1491          * from forking tasks that are max-interactive. The parent
1492          * (current) is done further down, under its lock.
1493          */
1494         p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
1495                 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1496
1497         p->interactive_credit = 0;
1498
1499         p->prio = effective_prio(p);
1500
1501         vx_activate_task(p);
1502         if (likely(cpu == this_cpu)) {
1503                 if (!(clone_flags & CLONE_VM)) {
1504                         /*
1505                          * The VM isn't cloned, so we're in a good position to
1506                          * do child-runs-first in anticipation of an exec. This
1507                          * usually avoids a lot of COW overhead.
1508                          */
1509                         if (unlikely(!current->array))
1510                                 __activate_task(p, rq);
1511                         else {
1512                                 p->prio = current->prio;
1513                                 list_add_tail(&p->run_list, &current->run_list);
1514                                 p->array = current->array;
1515                                 p->array->nr_active++;
1516                                 rq->nr_running++;
1517                                 class_enqueue_task(p,p->array);
1518                         }
1519                         set_need_resched();
1520                 } else
1521                         /* Run child last */
1522                         __activate_task(p, rq);
1523                 /*
1524                  * We skip the following code due to cpu == this_cpu
1525                  *
1526                  *   task_rq_unlock(rq, &flags);
1527                  *   this_rq = task_rq_lock(current, &flags);
1528                  */
1529                 this_rq = rq;
1530         } else {
1531                 this_rq = cpu_rq(this_cpu);
1532
1533                 /*
1534                  * Not the local CPU - must adjust timestamp. This should
1535                  * get optimised away in the !CONFIG_SMP case.
1536                  */
1537                 p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
1538                                         + rq->timestamp_last_tick;
1539                 __activate_task(p, rq);
1540                 if (TASK_PREEMPTS_CURR(p, rq))
1541                         resched_task(rq->curr);
1542
1543                 schedstat_inc(rq, wunt_moved);
1544                 /*
1545                  * Parent and child are on different CPUs, now get the
1546                  * parent runqueue to update the parent's ->sleep_avg:
1547                  */
1548                 task_rq_unlock(rq, &flags);
1549                 this_rq = task_rq_lock(current, &flags);
1550         }
1551         current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
1552                 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1553         task_rq_unlock(this_rq, &flags);
1554 }
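
/*
 * Illustrative note on the penalty arithmetic above (the percentages are
 * assumptions about CHILD_PENALTY/PARENT_PENALTY): the expression maps
 * the task's current interactivity bonus back into sleep_avg units and
 * then scales it, so with e.g. CHILD_PENALTY == 95 the child restarts
 * with roughly 95% of the sleep_avg its bonus corresponded to, while the
 * parent is rescaled the same way under its own runqueue lock just
 * before the function above returns.  A max-interactive parent therefore
 * cannot mint max-interactive children for free.
 */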
1555
1556 /*
1557  * Potentially available exiting-child timeslices are
1558  * retrieved here - this way the parent does not get
1559  * penalized for creating too many threads.
1560  *
1561  * (this cannot be used to 'generate' timeslices
1562  * artificially, because any timeslice recovered here
1563  * was given away by the parent in the first place.)
1564  */
1565 void fastcall sched_exit(task_t * p)
1566 {
1567         unsigned long flags;
1568         runqueue_t *rq;
1569
1570         /*
1571          * If the child was a (relative-) CPU hog then decrease
1572          * the sleep_avg of the parent as well.
1573          */
1574         rq = task_rq_lock(p->parent, &flags);
1575         if (p->first_time_slice) {
1576                 p->parent->time_slice += p->time_slice;
1577                 if (unlikely(p->parent->time_slice > task_timeslice(p)))
1578                         p->parent->time_slice = task_timeslice(p);
1579         }
1580         if (p->sleep_avg < p->parent->sleep_avg)
1581                 p->parent->sleep_avg = p->parent->sleep_avg /
1582                 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
1583                 (EXIT_WEIGHT + 1);
1584         task_rq_unlock(rq, &flags);
1585 }
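
/*
 * Worked example of the sleep_avg blend above, assuming EXIT_WEIGHT == 3:
 *
 *	parent->sleep_avg = parent->sleep_avg * 3/4 + child->sleep_avg * 1/4
 *
 * e.g. a parent at 800ms whose CPU-hog child exits with 200ms drops to
 * 800*3/4 + 200*1/4 = 650ms, so spawning CPU hogs costs the parent a
 * quarter-weight slice of its own interactivity credit.
 */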
1586
1587 /**
1588  * finish_task_switch - clean up after a task-switch
1589  * @prev: the thread we just switched away from.
1590  *
1591  * We enter this with the runqueue still locked, and finish_arch_switch()
1592  * will unlock it along with doing any other architecture-specific cleanup
1593  * actions.
1594  *
1595  * Note that we may have delayed dropping an mm in context_switch(). If
1596  * so, we finish that here outside of the runqueue lock.  (Doing it
1597  * with the lock held can cause deadlocks; see schedule() for
1598  * details.)
1599  */
1600 static void finish_task_switch(task_t *prev)
1601 {
1602         runqueue_t *rq = this_rq();
1603         struct mm_struct *mm = rq->prev_mm;
1604         unsigned long prev_task_flags;
1605
1606         rq->prev_mm = NULL;
1607
1608         /*
1609          * A task struct has one reference for the use as "current".
1610          * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and
1611          * calls schedule one last time. The schedule call will never return,
1612          * and the scheduled task must drop that reference.
1613          * The test for EXIT_ZOMBIE must occur while the runqueue locks are
1614          * still held, otherwise prev could be scheduled on another cpu, die
1615          * there before we look at prev->state, and then the reference would
1616          * be dropped twice.
1617          *              Manfred Spraul <manfred@colorfullife.com>
1618          */
1619         prev_task_flags = prev->flags;
1620         finish_arch_switch(rq, prev);
1621         if (mm)
1622                 mmdrop(mm);
1623         if (unlikely(prev_task_flags & PF_DEAD))
1624                 put_task_struct(prev);
1625 }
1626
1627 /**
1628  * schedule_tail - first thing a freshly forked thread must call.
1629  * @prev: the thread we just switched away from.
1630  */
1631 asmlinkage void schedule_tail(task_t *prev)
1632 {
1633         finish_task_switch(prev);
1634
1635         if (current->set_child_tid)
1636                 put_user(current->pid, current->set_child_tid);
1637 }
1638
1639 /*
1640  * context_switch - switch to the new MM and the new
1641  * thread's register state.
1642  */
1643 static inline
1644 task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
1645 {
1646         struct mm_struct *mm = next->mm;
1647         struct mm_struct *oldmm = prev->active_mm;
1648
1649         if (unlikely(!mm)) {
1650                 next->active_mm = oldmm;
1651                 atomic_inc(&oldmm->mm_count);
1652                 enter_lazy_tlb(oldmm, next);
1653         } else
1654                 switch_mm(oldmm, mm, next);
1655
1656         if (unlikely(!prev->mm)) {
1657                 prev->active_mm = NULL;
1658                 WARN_ON(rq->prev_mm);
1659                 rq->prev_mm = oldmm;
1660         }
1661
1662         /* Here we just switch the register state and the stack. */
1663         switch_to(prev, next, prev);
1664
1665         return prev;
1666 }
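
/*
 * Illustrative note on the lazy-mm handling above: a kernel thread has
 * no mm of its own (next->mm == NULL), so it borrows the previous task's
 * active_mm and only bumps mm_count instead of switching page tables:
 *
 *	user task -> user task:  switch_mm(oldmm, mm, next)
 *	user task -> kthread:    next->active_mm = oldmm;
 *	                         atomic_inc(&oldmm->mm_count)
 *
 * When the kernel thread is later switched away from, its borrowed mm is
 * parked in rq->prev_mm and dropped in finish_task_switch(), outside the
 * runqueue lock.
 */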
1667
1668 /*
1669  * nr_running, nr_uninterruptible and nr_context_switches:
1670  *
1671  * externally visible scheduler statistics: current number of runnable
1672  * threads, current number of uninterruptible-sleeping threads, total
1673  * number of context switches performed since bootup.
1674  */
1675 unsigned long nr_running(void)
1676 {
1677         unsigned long i, sum = 0;
1678
1679         for_each_online_cpu(i)
1680                 sum += cpu_rq(i)->nr_running;
1681
1682         return sum;
1683 }
1684
1685 unsigned long nr_uninterruptible(void)
1686 {
1687         unsigned long i, sum = 0;
1688
1689         for_each_cpu(i)
1690                 sum += cpu_rq(i)->nr_uninterruptible;
1691
1692         return sum;
1693 }
1694
1695 unsigned long long nr_context_switches(void)
1696 {
1697         unsigned long long i, sum = 0;
1698
1699         for_each_cpu(i)
1700                 sum += cpu_rq(i)->nr_switches;
1701
1702         return sum;
1703 }
1704
1705 unsigned long nr_iowait(void)
1706 {
1707         unsigned long i, sum = 0;
1708
1709         for_each_cpu(i)
1710                 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1711
1712         return sum;
1713 }
1714
1715 #ifdef CONFIG_SMP
1716
1717 /*
1718  * double_rq_lock - safely lock two runqueues
1719  *
1720  * Note this does not disable interrupts like task_rq_lock,
1721  * you need to do so manually before calling.
1722  */
1723 static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1724 {
1725         if (rq1 == rq2)
1726                 spin_lock(&rq1->lock);
1727         else {
1728                 if (rq1 < rq2) {
1729                         spin_lock(&rq1->lock);
1730                         spin_lock(&rq2->lock);
1731                 } else {
1732                         spin_lock(&rq2->lock);
1733                         spin_lock(&rq1->lock);
1734                 }
1735         }
1736 }
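
/*
 * Why the address ordering above matters (illustrative): if one CPU did
 * lock(rq1); lock(rq2) while another did lock(rq2); lock(rq1), each
 * could hold one lock and spin forever on the other (ABBA deadlock).
 * Always taking the lower-addressed runqueue lock first imposes a single
 * global ordering, so that interleaving cannot occur.
 */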
1737
1738 /*
1739  * double_rq_unlock - safely unlock two runqueues
1740  *
1741  * Note this does not restore interrupts like task_rq_unlock,
1742  * you need to do so manually after calling.
1743  */
1744 static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
1745 {
1746         spin_unlock(&rq1->lock);
1747         if (rq1 != rq2)
1748                 spin_unlock(&rq2->lock);
1749 }
1750
1751 /*
1752  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1753  */
1754 static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1755 {
1756         if (unlikely(!spin_trylock(&busiest->lock))) {
1757                 if (busiest < this_rq) {
1758                         spin_unlock(&this_rq->lock);
1759                         spin_lock(&busiest->lock);
1760                         spin_lock(&this_rq->lock);
1761                 } else
1762                         spin_lock(&busiest->lock);
1763         }
1764 }
1765
1766 /*
1767  * find_idlest_cpu - find the least busy runqueue.
1768  */
1769 static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1770                            struct sched_domain *sd)
1771 {
1772         unsigned long load, min_load, this_load;
1773         int i, min_cpu;
1774         cpumask_t mask;
1775
1776         min_cpu = UINT_MAX;
1777         min_load = ULONG_MAX;
1778
1779         cpus_and(mask, sd->span, p->cpus_allowed);
1780
1781         for_each_cpu_mask(i, mask) {
1782                 load = target_load(i);
1783
1784                 if (load < min_load) {
1785                         min_cpu = i;
1786                         min_load = load;
1787
1788                         /* break out early on an idle CPU: */
1789                         if (!min_load)
1790                                 break;
1791                 }
1792         }
1793
1794         /* account for the new task by adding one load unit (SCHED_LOAD_SCALE) */
1795         this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
1796
1797         /*
1798          * Would there be an imbalance between this CPU and the
1799          * idlest CPU once the new task is added to the
1800          * current CPU?
1801          *
1802          * Use half of the balancing threshold - new-context is
1803          * a good opportunity to balance.
1804          */
1805         if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
1806                 return min_cpu;
1807
1808         return this_cpu;
1809 }
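
/*
 * Worked example of the check above (imbalance_pct value is an
 * assumption): with sd->imbalance_pct == 125 the halved threshold is
 * 100 + (125-100)/2 == 112, so the task goes to the idlest CPU only when
 *
 *	min_load * 112 < (source_load(this_cpu) + SCHED_LOAD_SCALE) * 100
 *
 * i.e. the local CPU, counting the new task, must be more than ~12%
 * busier than the idlest candidate before the move is worth it.
 */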
1810
1811 /*
1812  * If dest_cpu is allowed for this process, migrate the task to it.
1813  * This is accomplished by forcing the cpus_allowed mask to only
1814  * allow dest_cpu, which will force the task onto dest_cpu.  Then
1815  * the cpus_allowed mask is restored.
1816  */
1817 static void sched_migrate_task(task_t *p, int dest_cpu)
1818 {
1819         migration_req_t req;
1820         runqueue_t *rq;
1821         unsigned long flags;
1822
1823         rq = task_rq_lock(p, &flags);
1824         if (!cpu_isset(dest_cpu, p->cpus_allowed)
1825             || unlikely(cpu_is_offline(dest_cpu)))
1826                 goto out;
1827
1828         schedstat_inc(rq, smt_cnt);
1829         /* force the process onto the specified CPU */
1830         if (migrate_task(p, dest_cpu, &req)) {
1831                 /* Need to wait for migration thread (might exit: take ref). */
1832                 struct task_struct *mt = rq->migration_thread;
1833                 get_task_struct(mt);
1834                 task_rq_unlock(rq, &flags);
1835                 wake_up_process(mt);
1836                 put_task_struct(mt);
1837                 wait_for_completion(&req.done);
1838                 return;
1839         }
1840 out:
1841         task_rq_unlock(rq, &flags);
1842 }
1843
1844 /*
1845  * sched_exec(): find the highest-level, exec-balance-capable
1846  * domain and try to migrate the task to the least loaded CPU.
1847  *
1848  * execve() is a valuable balancing opportunity, because at this point
1849  * the task has the smallest effective memory and cache footprint.
1850  */
1851 void sched_exec(void)
1852 {
1853         struct sched_domain *tmp, *sd = NULL;
1854         int new_cpu, this_cpu = get_cpu();
1855
1856         schedstat_inc(this_rq(), sbe_cnt);
1857         /* Prefer the current CPU if there's only this task running */
1858         if (this_rq()->nr_running <= 1)
1859                 goto out;
1860
1861         for_each_domain(this_cpu, tmp)
1862                 if (tmp->flags & SD_BALANCE_EXEC)
1863                         sd = tmp;
1864
1865         if (sd) {
1866                 schedstat_inc(sd, sbe_attempts);
1867                 new_cpu = find_idlest_cpu(current, this_cpu, sd);
1868                 if (new_cpu != this_cpu) {
1869                         schedstat_inc(sd, sbe_pushed);
1870                         put_cpu();
1871                         sched_migrate_task(current, new_cpu);
1872                         return;
1873                 }
1874         }
1875 out:
1876         put_cpu();
1877 }
1878
1879 /*
1880  * pull_task - move a task from a remote runqueue to the local runqueue.
1881  * Both runqueues must be locked.
1882  */
1883 static inline
1884 void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1885                runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1886 {
1887         dequeue_task(p, src_array);
1888         src_rq->nr_running--;
1889         set_task_cpu(p, this_cpu);
1890         this_rq->nr_running++;
1891         enqueue_task(p, this_array);
1892         p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1893                                 + this_rq->timestamp_last_tick;
1894         /*
1895          * Note that idle threads have a prio of MAX_PRIO, so this test
1896          * is always true for them.
1897          */
1898         if (TASK_PREEMPTS_CURR(p, this_rq))
1899                 resched_task(this_rq->curr);
1900 }
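
/*
 * Illustrative note on the timestamp fixup above: sched_clock() values
 * are per-CPU and may be skewed between CPUs, so the migrated task's
 * timestamp is rebased by the difference between the two runqueues'
 * timestamp_last_tick values:
 *
 *	new_ts = old_ts - src_last_tick + dst_last_tick
 *
 * which keeps "time since this task last ran" comparable on the new CPU.
 */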
1901
1902 /*
1903  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1904  */
1905 static inline
1906 int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1907                      struct sched_domain *sd, enum idle_type idle)
1908 {
1909         /*
1910          * We do not migrate tasks that are:
1911          * 1) running (obviously), or
1912          * 2) cannot be migrated to this CPU due to cpus_allowed, or
1913          * 3) are cache-hot on their current CPU.
1914          */
1915         if (task_running(rq, p))
1916                 return 0;
1917         if (!cpu_isset(this_cpu, p->cpus_allowed))
1918                 return 0;
1919
1920         /* Aggressive migration if we've failed balancing */
1921         if (idle == NEWLY_IDLE ||
1922                         sd->nr_balance_failed < sd->cache_nice_tries) {
1923                 if (task_hot(p, rq->timestamp_last_tick, sd))
1924                         return 0;
1925         }
1926
1927         return 1;
1928 }
1929
1930 #ifdef CONFIG_CKRM_CPU_SCHEDULE
1931 static inline int ckrm_preferred_task(task_t *tmp,long min, long max, 
1932                                       int phase, enum idle_type idle)
1933 {
1934         long pressure = task_load(tmp);
1935         
1936         if (pressure > max) 
1937                 return 0;
1938
1939         if ((idle == NOT_IDLE) && ! phase && (pressure <= min))
1940                 return 0;
1941         return 1;
1942 }
1943
1944 /*
1945  * move tasks for a specific local class
1946  * return number of tasks pulled
1947  */
1948 static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq,
1949                                       runqueue_t *this_rq,
1950                                       runqueue_t *busiest,
1951                                       struct sched_domain *sd,
1952                                       int this_cpu,
1953                                       enum idle_type idle,
1954                                       long* pressure_imbalance) 
1955 {
1956         prio_array_t *array, *dst_array;
1957         struct list_head *head, *curr;
1958         task_t *tmp;
1959         int idx;
1960         int pulled = 0;
1961         int phase = -1;
1962         long pressure_min, pressure_max;
1963         /*hzheng: magic : 90% balance is enough*/
1964         long balance_min = *pressure_imbalance / 10; 
1965 /*
1966  * we don't want to migrate tasks that will reverse the balance,
1967  * or tasks that make too small a difference
1968  */
1969 #define CKRM_BALANCE_MAX_RATIO  100
1970 #define CKRM_BALANCE_MIN_RATIO  1
1971  start:
1972         phase ++;
1973         /*
1974          * We first consider expired tasks. Those will likely not be
1975          * executed in the near future, and they are most likely to
1976          * be cache-cold, thus switching CPUs has the least effect
1977          * on them.
1978          */
1979         if (src_lrq->expired->nr_active) {
1980                 array = src_lrq->expired;
1981                 dst_array = dst_lrq->expired;
1982         } else {
1983                 array = src_lrq->active;
1984                 dst_array = dst_lrq->active;
1985         }
1986         
1987  new_array:
1988         /* Start searching at priority 0: */
1989         idx = 0;
1990  skip_bitmap:
1991         if (!idx)
1992                 idx = sched_find_first_bit(array->bitmap);
1993         else
1994                 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1995         if (idx >= MAX_PRIO) {
1996                 if (array == src_lrq->expired && src_lrq->active->nr_active) {
1997                         array = src_lrq->active;
1998                         dst_array = dst_lrq->active;
1999                         goto new_array;
2000                 }
2001                 if ((! phase) && (! pulled) && (idle != IDLE))
2002                         goto start; //try again
2003                 else 
2004                         goto out; //finished search for this lrq
2005         }
2006         
2007         head = array->queue + idx;
2008         curr = head->prev;
2009  skip_queue:
2010         tmp = list_entry(curr, task_t, run_list);
2011         
2012         curr = curr->prev;
2013         
2014         if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
2015                 if (curr != head)
2016                         goto skip_queue;
2017                 idx++;
2018                 goto skip_bitmap;
2019         }
2020
2021         pressure_min = *pressure_imbalance * CKRM_BALANCE_MIN_RATIO/100;
2022         pressure_max = *pressure_imbalance * CKRM_BALANCE_MAX_RATIO/100;
2023         /*
2024          * skip the tasks that will reverse the balance too much
2025          */
2026         if (ckrm_preferred_task(tmp,pressure_min,pressure_max,phase,idle)) {
2027                 *pressure_imbalance -= task_load(tmp);
2028                 pull_task(busiest, array, tmp, 
2029                           this_rq, dst_array, this_cpu);
2030                 pulled++;
2031
2032                 if (*pressure_imbalance <= balance_min)
2033                         goto out;
2034         }
2035                 
2036         if (curr != head)
2037                 goto skip_queue;
2038         idx++;
2039         goto skip_bitmap;
2040  out:          
2041         return pulled;
2042 }
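
/*
 * Worked example of the stop condition above: with an initial
 * *pressure_imbalance of 1000, balance_min is 1000/10 == 100, so pulling
 * stops once the remaining imbalance is 100 or less, i.e. once roughly
 * 90% of the requested pressure has been transferred (the "90% balance
 * is enough" heuristic noted at the top of the function).
 */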
2043
2044 static inline long ckrm_rq_imbalance(runqueue_t *this_rq,runqueue_t *dst_rq)
2045 {
2046         long imbalance;
2047         /*
2048          * make sure that after balancing, imbalance' > -imbalance/2;
2049          * we don't want the imbalance to be reversed too much
2050          */
2051         imbalance = pid_get_pressure(rq_ckrm_load(dst_rq),0) 
2052                 - pid_get_pressure(rq_ckrm_load(this_rq),1);
2053         imbalance /= 2;
2054         return imbalance;
2055 }
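
/*
 * Example of the halving above: if dst_rq reports pressure 600 and
 * this_rq reports 200, the raw gap is 400 but only 200 is returned, so
 * even a full correction cannot overshoot past the midpoint and flip
 * the imbalance the other way.
 */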
2056
2057 /*
2058  * try to balance the two runqueues
2059  *
2060  * Called with both runqueues locked.
2061  * if move_tasks is called, it will try to move at least one task over
2062  */
2063 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
2064                       unsigned long max_nr_move, struct sched_domain *sd,
2065                       enum idle_type idle)
2066 {
2067         struct ckrm_cpu_class *clsptr,*vip_cls = NULL;
2068         ckrm_lrq_t* src_lrq,*dst_lrq;
2069         long pressure_imbalance, pressure_imbalance_old;
2070         int src_cpu = task_cpu(busiest->curr);
2071         struct list_head *list;
2072         int pulled = 0;
2073         long imbalance;
2074
2075         imbalance =  ckrm_rq_imbalance(this_rq,busiest);
2076
2077         if ((idle == NOT_IDLE && imbalance <= 0) || busiest->nr_running <= 1)
2078                 goto out;
2079
2080         //try to find the vip class
2081         list_for_each_entry(clsptr,&active_cpu_classes,links) {
2082                 src_lrq = get_ckrm_lrq(clsptr,src_cpu);
2083
2084                 if (! lrq_nr_running(src_lrq))
2085                         continue;
2086
2087                 if (!vip_cls || cpu_class_weight(vip_cls) < cpu_class_weight(clsptr))
2088                         vip_cls = clsptr;
2091         }
2092
2093         /*
2094          * search from the most significant class first;
2095          * hopefully, fewer tasks will be migrated this way
2096          */
2097         clsptr = vip_cls;
2098
2099  move_class:
2100         if (! clsptr)
2101                 goto out;
2102         
2103
2104         src_lrq = get_ckrm_lrq(clsptr,src_cpu);
2105         if (! lrq_nr_running(src_lrq))
2106                 goto other_class;
2107         
2108         dst_lrq = get_ckrm_lrq(clsptr,this_cpu);
2109
2110         //how much pressure for this class should be transferred
2111         pressure_imbalance = src_lrq->lrq_load * imbalance/src_lrq->local_weight;
2112         if (pulled && ! pressure_imbalance) 
2113                 goto other_class;
2114         
2115         pressure_imbalance_old = pressure_imbalance;
2116         
2117         //move tasks
2118         pulled += 
2119                 ckrm_cls_move_tasks(src_lrq,dst_lrq,
2120                                     this_rq,
2121                                     busiest,
2122                                     sd,this_cpu,idle,
2123                                     &pressure_imbalance);
2124
2125         /* 
2126          * hzheng: the shift by 2 is another magic number:
2127          * stop balancing once the remaining imbalance is below 25% of the original
2128          */
2129         if (pressure_imbalance <= (pressure_imbalance_old >> 2))
2130                 goto out;
2131                 
2132         //update imbalance
2133         imbalance *= pressure_imbalance / pressure_imbalance_old;
2134  other_class:
2135         //who is next?
2136         list = clsptr->links.next;
2137         if (list == &active_cpu_classes)
2138                 list = list->next;
2139         clsptr = list_entry(list, typeof(*clsptr), links);
2140         if (clsptr != vip_cls)
2141                 goto move_class;
2142  out:
2143         return pulled;
2144 }
2145
2146 /**
2147  * ckrm_check_balance - is load balancing necessary?
2148  * return 0 if load balancing is not necessary
2149  * otherwise return the average load of the system
2150  * also, update nr_group
2151  *
2152  * heuristics:
2153  *   no load balancing if this CPU's load is above the average
2154  *   no load balancing if this CPU's load is far greater than the minimum
2155  * method:
2156  *   read the status of all the runqueues
2157  */
2158 static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu,
2159                                              enum idle_type idle, int* nr_group)
2160 {
2161         struct sched_group *group = sd->groups;
2162         unsigned long min_load, max_load, avg_load;
2163         unsigned long total_load, this_load, total_pwr;
2164
2165         max_load = this_load = total_load = total_pwr = 0;
2166         min_load = 0xFFFFFFFF;
2167         *nr_group = 0;
2168
2169         do {
2170                 cpumask_t tmp;
2171                 unsigned long load;
2172                 int local_group;
2173                 int i, nr_cpus = 0;
2174
2175                 /* Tally up the load of all CPUs in the group */
2176                 cpus_and(tmp, group->cpumask, cpu_online_map);
2177                 if (unlikely(cpus_empty(tmp)))
2178                         goto nextgroup;
2179
2180                 avg_load = 0;
2181                 local_group = cpu_isset(this_cpu, group->cpumask);
2182
2183                 for_each_cpu_mask(i, tmp) {
2184                         load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),local_group);
2185                         nr_cpus++;
2186                         avg_load += load;
2187                 }
2188
2189                 if (!nr_cpus)
2190                         goto nextgroup;
2191
2192                 total_load += avg_load;
2193                 total_pwr += group->cpu_power;
2194
2195                 /* Adjust by relative CPU power of the group */
2196                 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
2197
2198                 if (local_group) {
2199                         this_load = avg_load;
2200                         goto nextgroup;
2201                 } else if (avg_load > max_load) {
2202                         max_load = avg_load;
2203                 }      
2204                 if (avg_load < min_load) {
2205                         min_load = avg_load;
2206                 }
2207 nextgroup:
2208                 group = group->next;
2209                 *nr_group = *nr_group + 1;
2210         } while (group != sd->groups);
2211
2212         if (!max_load || this_load >= max_load)
2213                 goto out_balanced;
2214
2215         avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2216
2217         /* hzheng: debugging: 105 is a magic number;
2218          * this should really use sd->imbalance_pct instead, i.e.
2219          * 100*max_load <= sd->imbalance_pct*this_load
2220          */
2221         if (this_load > avg_load 
2222             || 100*max_load < 105*this_load
2223             || 100*min_load < 70*this_load
2224             )
2225                 goto out_balanced;
2226
2227         return avg_load;
2228  out_balanced:
2229         return 0;
2230 }
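
/*
 * Worked example of the thresholds above: with this_load == 100, a
 * non-zero average is returned only when this CPU is at or below the
 * power-weighted average, some group's max_load reaches at least 105
 * (the 100*max_load < 105*this_load test) and the lightest group's
 * min_load is at least 70 (the 100*min_load < 70*this_load test);
 * otherwise the system is reported as balanced.
 */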
2231
2232 /**
2233  * ckrm_find_busy_queue - any group with above-average load is considered busy;
2234  * find the busiest runqueue from any busy group
2235  */
2236 static runqueue_t *
2237 ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu,
2238                      unsigned long avg_load, enum idle_type idle,
2239                      int nr_group)
2240 {
2241         struct sched_group *group;
2242         runqueue_t * busiest=NULL;
2243         unsigned long rand;
2244         
2245         group = sd->groups;
2246         rand = get_ckrm_rand(nr_group);
2247         nr_group = 0;
2248
2249         do {
2250                 unsigned long load,total_load,max_load;
2251                 cpumask_t tmp;
2252                 int i;
2253                 runqueue_t * grp_busiest;
2254
2255                 cpus_and(tmp, group->cpumask, cpu_online_map);
2256                 if (unlikely(cpus_empty(tmp)))
2257                         goto find_nextgroup;
2258
2259                 total_load = 0;
2260                 max_load = 0;
2261                 grp_busiest = NULL;
2262                 for_each_cpu_mask(i, tmp) {
2263                         load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),0);
2264                         total_load += load;
2265                         if (load > max_load) {
2266                                 max_load = load;
2267                                 grp_busiest = cpu_rq(i);
2268                         }                               
2269                 }
2270
2271                 total_load = (total_load * SCHED_LOAD_SCALE) / group->cpu_power;
2272                 if (total_load > avg_load) {
2273                         busiest = grp_busiest;
2274                         if (nr_group >= rand)
2275                                 break;
2276                 }
2277         find_nextgroup:         
2278                 group = group->next;
2279                 nr_group ++;
2280         } while (group != sd->groups);
2281
2282         return busiest;
2283 }
2284
2285 /**
2286  * ckrm_load_balance - pressure-based load balancing algorithm used by CKRM
2287  */
2288 static int ckrm_load_balance(int this_cpu, runqueue_t *this_rq,
2289                         struct sched_domain *sd, enum idle_type idle)
2290 {
2291         runqueue_t *busiest;
2292         unsigned long avg_load;
2293         int nr_moved,nr_group;
2294
2295         avg_load = ckrm_check_balance(sd, this_cpu, idle, &nr_group);
2296         if (! avg_load)
2297                 goto out_balanced;
2298
2299         busiest = ckrm_find_busy_queue(sd,this_cpu,avg_load,idle,nr_group);
2300         if (! busiest)
2301                 goto out_balanced;
2302         /*
2303          * This should be "impossible", but since load
2304          * balancing is inherently racy and statistical,
2305          * it could happen in theory.
2306          */
2307         if (unlikely(busiest == this_rq)) {
2308                 WARN_ON(1);
2309                 goto out_balanced;
2310         }
2311
2312         nr_moved = 0;
2313         if (busiest->nr_running > 1) {
2314                 /*
2315                  * Attempt to move tasks. If find_busiest_group has found
2316                  * an imbalance but busiest->nr_running <= 1, the group is
2317                  * still unbalanced. nr_moved simply stays zero, so it is
2318                  * correctly treated as an imbalance.
2319                  */
2320                 double_lock_balance(this_rq, busiest);
2321                 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2322                                       0,sd, idle);              
2323                 spin_unlock(&busiest->lock);
2324                 if (nr_moved) {
2325                         adjust_local_weight();
2326                 }
2327         }
2328
2329         if (!nr_moved) 
2330                 sd->nr_balance_failed ++;
2331         else
2332                 sd->nr_balance_failed  = 0;             
2333
2334         /* We were unbalanced, so reset the balancing interval */
2335         sd->balance_interval = sd->min_interval;
2336
2337         return nr_moved;
2338
2339 out_balanced:
2340         /* tune up the balancing interval */
2341         if (sd->balance_interval < sd->max_interval)
2342                 sd->balance_interval *= 2;
2343
2344         return 0;
2345 }
2346
2347 /*
2348  * this_rq->lock is already held
2349  */
2350 static inline int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2351                                        struct sched_domain *sd)
2352 {
2353         int ret;
2354         read_lock(&class_list_lock);
2355         ret = ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE);
2356         read_unlock(&class_list_lock);
2357         return ret;
2358 }
2359
2360 static inline int load_balance(int this_cpu, runqueue_t *this_rq,
2361                         struct sched_domain *sd, enum idle_type idle)
2362 {
2363         int ret;
2364
2365         spin_lock(&this_rq->lock);
2366         read_lock(&class_list_lock);
2367         ret = ckrm_load_balance(this_cpu, this_rq, sd, idle);
2368         read_unlock(&class_list_lock);
2369         spin_unlock(&this_rq->lock);
2370         return ret;
2371 }
2372 #else /* !CONFIG_CKRM_CPU_SCHEDULE */
2373 /*
2374  * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
2375  * as part of a balancing operation within "domain". Returns the number of
2376  * tasks moved.
2377  *
2378  * Called with both runqueues locked.
2379  */
2380 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
2381                       unsigned long max_nr_move, struct sched_domain *sd,
2382                       enum idle_type idle)
2383 {
2384         prio_array_t *array, *dst_array;
2385         struct list_head *head, *curr;
2386         int idx, pulled = 0;
2387         task_t *tmp;
2388
2389         if (max_nr_move <= 0 || busiest->nr_running <= 1)
2390                 goto out;
2391
2392         /*
2393          * We first consider expired tasks. Those will likely not be
2394          * executed in the near future, and they are most likely to
2395          * be cache-cold, thus switching CPUs has the least effect
2396          * on them.
2397          */
2398         if (busiest->expired->nr_active) {
2399                 array = busiest->expired;
2400                 dst_array = this_rq->expired;
2401         } else {
2402                 array = busiest->active;
2403                 dst_array = this_rq->active;
2404         }
2405
2406 new_array:
2407         /* Start searching at priority 0: */
2408         idx = 0;
2409 skip_bitmap:
2410         if (!idx)
2411                 idx = sched_find_first_bit(array->bitmap);
2412         else
2413                 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
2414         if (idx >= MAX_PRIO) {
2415                 if (array == busiest->expired && busiest->active->nr_active) {
2416                         array = busiest->active;
2417                         dst_array = this_rq->active;
2418                         goto new_array;
2419                 }
2420                 goto out;
2421         }
2422
2423         head = array->queue + idx;
2424         curr = head->prev;
2425 skip_queue:
2426         tmp = list_entry(curr, task_t, run_list);
2427
2428         curr = curr->prev;
2429
2430         if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
2431                 if (curr != head)
2432                         goto skip_queue;
2433                 idx++;
2434                 goto skip_bitmap;
2435         }
2436
2437         /*
2438          * Right now, this is the only place pull_task() is called,
2439          * so we can safely collect pull_task() stats here rather than
2440          * inside pull_task().
2441          */
2442         schedstat_inc(this_rq, pt_gained[idle]);
2443         schedstat_inc(busiest, pt_lost[idle]);
2444
2445         pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
2446         pulled++;
2447
2448         /* We only want to steal up to the prescribed number of tasks. */
2449         if (pulled < max_nr_move) {
2450                 if (curr != head)
2451                         goto skip_queue;
2452                 idx++;
2453                 goto skip_bitmap;
2454         }
2455 out:
2456         return pulled;
2457 }
2458
2459 /*
2460  * find_busiest_group finds and returns the busiest CPU group within the
2461  * domain. It calculates and returns the number of tasks which should be
2462  * moved to restore balance via the imbalance parameter.
2463  */
2464 static struct sched_group *
2465 find_busiest_group(struct sched_domain *sd, int this_cpu,
2466                    unsigned long *imbalance, enum idle_type idle)
2467 {
2468         struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2469         unsigned long max_load, avg_load, total_load, this_load, total_pwr;
2470
2471         max_load = this_load = total_load = total_pwr = 0;
2472
2473         do {
2474                 unsigned long load;
2475                 int local_group;
2476                 int i, nr_cpus = 0;
2477
2478                 local_group = cpu_isset(this_cpu, group->cpumask);
2479
2480                 /* Tally up the load of all CPUs in the group */
2481                 avg_load = 0;
2482
2483                 for_each_cpu_mask(i, group->cpumask) {
2484                         /* Bias balancing toward cpus of our domain */
2485                         if (local_group)
2486                                 load = target_load(i);
2487                         else
2488                                 load = source_load(i);
2489
2490                         nr_cpus++;
2491                         avg_load += load;
2492                 }
2493
2494                 if (!nr_cpus)
2495                         goto nextgroup;
2496
2497                 total_load += avg_load;
2498                 total_pwr += group->cpu_power;
2499
2500                 /* Adjust by relative CPU power of the group */
2501                 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
2502
2503                 if (local_group) {
2504                         this_load = avg_load;
2505                         this = group;
2506                         goto nextgroup;
2507                 } else if (avg_load > max_load) {
2508                         max_load = avg_load;
2509                         busiest = group;
2510                 }
2511 nextgroup:
2512                 group = group->next;
2513         } while (group != sd->groups);
2514
2515         if (!busiest || this_load >= max_load)
2516                 goto out_balanced;
2517
2518         avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2519
2520         if (this_load >= avg_load ||
2521                         100*max_load <= sd->imbalance_pct*this_load)
2522                 goto out_balanced;
2523
2524         /*
2525          * We're trying to get all the cpus to the average_load, so we don't
2526          * want to push ourselves above the average load, nor do we wish to
2527          * reduce the max loaded cpu below the average load, as either of these
2528          * actions would just result in more rebalancing later, and ping-pong
2529          * tasks around. Thus we look for the minimum possible imbalance.
2530          * Negative imbalances (*we* are more loaded than anyone else) will
2531          * be counted as no imbalance for these purposes -- we can't fix that
2532          * by pulling tasks to us.  Be careful of negative numbers as they'll
2533          * appear as very large values with unsigned longs.
2534          */
2535         *imbalance = min(max_load - avg_load, avg_load - this_load);
2536
2537         /* How much load to actually move to equalise the imbalance */
2538         *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
2539                                 / SCHED_LOAD_SCALE;
2540
2541         if (*imbalance < SCHED_LOAD_SCALE - 1) {
2542                 unsigned long pwr_now = 0, pwr_move = 0;
2543                 unsigned long tmp;
2544
2545                 if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
2546                         *imbalance = 1;
2547                         return busiest;
2548                 }
2549
2550                 /*
2551                  * OK, we don't have enough imbalance to justify moving tasks,
2552                  * however we may be able to increase total CPU power used by
2553                  * moving them.
2554                  */
2555
2556                 pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
2557                 pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
2558                 pwr_now /= SCHED_LOAD_SCALE;
2559
2560                 /* Amount of load we'd subtract */
2561                 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
2562                 if (max_load > tmp)
2563                         pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
2564                                                         max_load - tmp);
2565
2566                 /* Amount of load we'd add */
2567                 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
2568                 if (max_load < tmp)
2569                         tmp = max_load;
2570                 pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
2571                 pwr_move /= SCHED_LOAD_SCALE;
2572
2573                 /* Move if we gain another 8th of a CPU worth of throughput */
2574                 if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
2575                         goto out_balanced;
2576
2577                 *imbalance = 1;
2578                 return busiest;
2579         }
2580
2581         /* Get rid of the scaling factor, rounding down as we divide */
2582         *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE;
2583
2584         return busiest;
2585
2586 out_balanced:
2587         if (busiest && (idle == NEWLY_IDLE ||
2588                         (idle == IDLE && max_load > SCHED_LOAD_SCALE)) ) {
2589                 *imbalance = 1;
2590                 return busiest;
2591         }
2592
2593         *imbalance = 0;
2594         return NULL;
2595 }
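
/*
 * Worked example of the imbalance computation above (loads given in
 * SCHED_LOAD_SCALE units; the cpu_power scaling steps cancel when
 * cpu_power == SCHED_LOAD_SCALE): with max_load == 3, avg_load == 2 and
 * this_load == 1,
 *
 *	*imbalance = min(3 - 2, 2 - 1) == 1
 *
 * so only enough load is pulled to bring the busiest group down to the
 * average without pushing this CPU above it, avoiding the ping-pong the
 * comment above warns about.
 */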
2596
2597 /*
2598  * find_busiest_queue - find the busiest runqueue among the cpus in group.
2599  */
2600 static runqueue_t *find_busiest_queue(struct sched_group *group)
2601 {
2602         unsigned long load, max_load = 0;
2603         runqueue_t *busiest = NULL;
2604         int i;
2605
2606         for_each_cpu_mask(i, group->cpumask) {
2607                 load = source_load(i);
2608
2609                 if (load > max_load) {
2610                         max_load = load;
2611                         busiest = cpu_rq(i);
2612                 }
2613         }
2614
2615         return busiest;
2616 }
2617
2618 /*
2619  * Check this_cpu to ensure it is balanced within domain. Attempt to move
2620  * tasks if there is an imbalance.
2621  *
2622  * Called with this_rq unlocked.
2623  */
2624 static int load_balance(int this_cpu, runqueue_t *this_rq,
2625                         struct sched_domain *sd, enum idle_type idle)
2626 {
2627         struct sched_group *group;
2628         runqueue_t *busiest;
2629         unsigned long imbalance;
2630         int nr_moved;
2631
2632         spin_lock(&this_rq->lock);
2633         schedstat_inc(sd, lb_cnt[idle]);
2634
2635         group = find_busiest_group(sd, this_cpu, &imbalance, idle);
2636         if (!group) {
2637                 schedstat_inc(sd, lb_nobusyg[idle]);
2638                 goto out_balanced;
2639         }
2640
2641         busiest = find_busiest_queue(group);
2642         if (!busiest) {
2643                 schedstat_inc(sd, lb_nobusyq[idle]);
2644                 goto out_balanced;
2645         }
2646
2647         /*
2648          * This should be "impossible", but since load
2649          * balancing is inherently racy and statistical,
2650          * it could happen in theory.
2651          */
2652         if (unlikely(busiest == this_rq)) {
2653                 WARN_ON(1);
2654                 goto out_balanced;
2655         }
2656
2657         schedstat_add(sd, lb_imbalance[idle], imbalance);
2658
2659         nr_moved = 0;
2660         if (busiest->nr_running > 1) {
2661                 /*
2662                  * Attempt to move tasks. If find_busiest_group has found
2663                  * an imbalance but busiest->nr_running <= 1, the group is
2664                  * still unbalanced. nr_moved simply stays zero, so it is
2665                  * correctly treated as an imbalance.
2666                  */
2667                 double_lock_balance(this_rq, busiest);
2668                 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2669                                                 imbalance, sd, idle);
2670                 spin_unlock(&busiest->lock);
2671         }
2672         spin_unlock(&this_rq->lock);
2673
2674         if (!nr_moved) {
2675                 schedstat_inc(sd, lb_failed[idle]);
2676                 sd->nr_balance_failed++;
2677
2678                 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2679                         int wake = 0;
2680
2681                         spin_lock(&busiest->lock);
2682                         if (!busiest->active_balance) {
2683                                 busiest->active_balance = 1;
2684                                 busiest->push_cpu = this_cpu;
2685                                 wake = 1;
2686                         }
2687                         spin_unlock(&busiest->lock);
2688                         if (wake)
2689                                 wake_up_process(busiest->migration_thread);
2690
2691                         /*
2692                          * We've kicked active balancing, reset the failure
2693                          * counter.
2694                          */
2695                         sd->nr_balance_failed = sd->cache_nice_tries;
2696                 }
2697         } else
2698                 sd->nr_balance_failed = 0;
2699
2700         /* We were unbalanced, so reset the balancing interval */
2701         sd->balance_interval = sd->min_interval;
2702
2703         return nr_moved;
2704
2705 out_balanced:
2706         spin_unlock(&this_rq->lock);
2707
2708         /* tune up the balancing interval */
2709         if (sd->balance_interval < sd->max_interval)
2710                 sd->balance_interval *= 2;
2711
2712         return 0;
2713 }
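
/*
 * Illustrative timeline for the active-balance kick above (assuming
 * sd->cache_nice_tries == 2): the migration thread is woken on the 5th
 * consecutive failed attempt (nr_balance_failed > 2 + 2) to push a
 * *running* task off the busiest CPU, and the failure counter is then
 * reset to 2 so the very next miss does not immediately kick it again.
 */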
2714
2715 /*
2716  * Check this_cpu to ensure it is balanced within domain. Attempt to move
2717  * tasks if there is an imbalance.
2718  *
2719  * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
2720  * this_rq is locked.
2721  */
2722 static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2723                                 struct sched_domain *sd)
2724 {
2725         struct sched_group *group;
2726         runqueue_t *busiest = NULL;
2727         unsigned long imbalance;
2728         int nr_moved = 0;
2729
2730         schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2731         group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
2732         if (!group) {
2733                 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2734                 goto out;
2735         }
2736
2737         busiest = find_busiest_queue(group);
2738         if (!busiest || busiest == this_rq) {
2739                 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2740                 goto out;
2741         }
2742
2743         /* Attempt to move tasks */
2744         double_lock_balance(this_rq, busiest);
2745
2746         schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
2747         nr_moved = move_tasks(this_rq, this_cpu, busiest,
2748                                         imbalance, sd, NEWLY_IDLE);
2749         if (!nr_moved)
2750                 schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
2751
2752         spin_unlock(&busiest->lock);
2753
2754 out:
2755         return nr_moved;
2756 }
2757 #endif /* CONFIG_CKRM_CPU_SCHEDULE */
2758
2759
2760 /*
2761  * idle_balance is called by schedule() if this_cpu is about to become
2762  * idle. Attempts to pull tasks from other CPUs.
2763  */
2764 static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
2765 {
2766         struct sched_domain *sd;
2767
2768         for_each_domain(this_cpu, sd) {
2769                 if (sd->flags & SD_BALANCE_NEWIDLE) {
2770                         if (load_balance_newidle(this_cpu, this_rq, sd)) {
2771                                 /* We've pulled tasks over so stop searching */
2772                                 break;
2773                         }
2774                 }
2775         }
2776 }
2777
2778 /*
2779  * active_load_balance is run by migration threads. It pushes a running
2780  * task off the cpu. It may be needed to ensure that, where possible, each
2781  * physical CPU has at least one task running and that there is no
2782  * physical / logical imbalance.
2783  *
2784  * Called with busiest locked.
2785  */
2786 static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
2787 {
2788         struct sched_domain *sd;
2789         struct sched_group *group, *busy_group;
2790         int i;
2791
2792         schedstat_inc(busiest, alb_cnt);
2793         if (busiest->nr_running <= 1)
2794                 return;
2795
2796         for_each_domain(busiest_cpu, sd)
2797                 if (cpu_isset(busiest->push_cpu, sd->span))
2798                         break;
2799         if (!sd)
2800                 return;
2801
2802         group = sd->groups;
2803         while (!cpu_isset(busiest_cpu, group->cpumask))
2804                 group = group->next;
2805         busy_group = group;
2806
2807         group = sd->groups;
2808         do {
2809                 runqueue_t *rq;
2810                 int push_cpu = 0;
2811
2812                 if (group == busy_group)
2813                         goto next_group;
2814
2815                 for_each_cpu_mask(i, group->cpumask) {
2816                         if (!idle_cpu(i))
2817                                 goto next_group;
2818                         push_cpu = i;
2819                 }
2820
2821                 rq = cpu_rq(push_cpu);
2822
2823                 /*
2824                  * This condition is "impossible", but since load
2825                  * balancing is inherently a bit racy and statistical,
2826                  * it can trigger.. Reported by Bjorn Helgaas on a
2827                  * 128-cpu setup.
2828                  */
2829                 if (unlikely(busiest == rq))
2830                         goto next_group;
2831                 double_lock_balance(busiest, rq);
2832                 if (move_tasks(rq, push_cpu, busiest, 1, sd, IDLE)) {
2833                         schedstat_inc(busiest, alb_lost);
2834                         schedstat_inc(rq, alb_gained);
2835                 } else {
2836                         schedstat_inc(busiest, alb_failed);
2837                 }
2838                 spin_unlock(&rq->lock);
2839 next_group:
2840                 group = group->next;
2841         } while (group != sd->groups);
2842 }
2843
2844 /*
2845  * rebalance_tick will get called every timer tick, on every CPU.
2846  *
2847  * It checks each scheduling domain to see if it is due to be balanced,
2848  * and initiates a balancing operation if so.
2849  *
2850  * Balancing parameters are set up in arch_init_sched_domains.
2851  */
2852
2853 /* Don't have all balancing operations going off at once */
2854 #define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
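
/*
 * Example of the stagger above (assuming HZ == 1000 and NR_CPUS == 4):
 * CPUs 0..3 get offsets of 0, 250, 500 and 750 jiffies respectively, so
 * their balance intervals expire on different ticks instead of every CPU
 * probing remote runqueues in the same jiffy.
 */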
2855
2856 static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2857                            enum idle_type idle)
2858 {
2859         unsigned long old_load, this_load;
2860         unsigned long j = jiffies + CPU_OFFSET(this_cpu);
2861         struct sched_domain *sd;
2862
2863         /* Update our load */
2864         old_load = this_rq->cpu_load;
2865         this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
2866         /*
2867          * Round up the averaging division if load is increasing. This
2868          * prevents us from getting stuck on 9 if the load is 10, for
2869          * example.
2870          */
2871         if (this_load > old_load)
2872                 old_load++;
2873         this_rq->cpu_load = (old_load + this_load) / 2;
2874
2875         for_each_domain(this_cpu, sd) {
2876                 unsigned long interval = sd->balance_interval;
2877
2878                 if (idle != IDLE)
2879                         interval *= sd->busy_factor;
2880
2881                 /* scale ms to jiffies */
2882                 interval = msecs_to_jiffies(interval);
2883                 if (unlikely(!interval))
2884                         interval = 1;
2885
2886                 if (j - sd->last_balance >= interval) {
2887                         if (load_balance(this_cpu, this_rq, sd, idle)) {
2888                                 /* We've pulled tasks over so no longer idle */
2889                                 idle = NOT_IDLE;
2890                         }
2891                         sd->last_balance += interval;
2892                 }
2893         }
2894 }
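
/*
 * Worked example of the load averaging above: with old_load == 9 and
 * this_load == 10, a plain (9 + 10) / 2 stays stuck at 9 forever;
 * bumping old_load to 10 first lets the average actually reach 10,
 * which is what the "round up if load is increasing" comment refers to.
 */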
2895 #else /* !CONFIG_SMP */
2896 /*
2897  * on UP we do not need to balance between CPUs:
2898  */
2899 static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
2900 {
2901 }
2902 static inline void idle_balance(int cpu, runqueue_t *rq)
2903 {
2904 }
2905 #endif
2906
2907 static inline int wake_priority_sleeper(runqueue_t *rq)
2908 {
2909         int ret = 0;
2910 #ifdef CONFIG_SCHED_SMT
2911         spin_lock(&rq->lock);
2912         /*
2913          * If an SMT sibling task has been put to sleep for priority
2914          * reasons, reschedule the idle task to see if it can now run.
2915          */
2916         if (rq->nr_running) {
2917                 resched_task(rq->idle);
2918                 ret = 1;
2919         }
2920         spin_unlock(&rq->lock);
2921 #endif
2922         return ret;
2923 }
2924
2925 DEFINE_PER_CPU(struct kernel_stat, kstat);
2926 EXPORT_PER_CPU_SYMBOL(kstat);
2927
2928 /*
2929  * We place interactive tasks back into the active array, if possible.
2930  *
2931  * To guarantee that this does not starve expired tasks we ignore the
2932  * interactivity of a task if the first expired task had to wait more
2933  * than a 'reasonable' amount of time. This deadline timeout is
2934  * load-dependent, as the frequency of array switches decreases with
2935  * increasing number of running tasks. We also ignore the interactivity
2936  * if a better static_prio task has expired:
2937  */
2938
2939 #ifndef CONFIG_CKRM_CPU_SCHEDULE
2940 #define EXPIRED_STARVING(rq) \
2941         ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
2942                 (jiffies - (rq)->expired_timestamp >= \
2943                         STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
2944                         ((rq)->curr->static_prio > (rq)->best_expired_prio))
2945 #else
2946 #define EXPIRED_STARVING(rq) \
2947                 (STARVATION_LIMIT && ((rq)->expired_timestamp && \
2948                 (jiffies - (rq)->expired_timestamp >= \
2949                         STARVATION_LIMIT * (lrq_nr_running(rq)) + 1)))
2950 #endif
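
/*
 * Example of the starvation check above (the STARVATION_LIMIT value is
 * an assumption): with STARVATION_LIMIT equal to one second's worth of
 * jiffies and 4 runnable tasks, reinsertion of interactive tasks into
 * the active array is cut off once the oldest expired task has waited
 * more than about four seconds, so expired tasks are guaranteed to run
 * eventually no matter how interactive the rest of the runqueue is.
 */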
2951
2952 /*
2953  * This function gets called by the timer code, with HZ frequency.
2954  * We call it with interrupts disabled.
2955  *
2956  * It also gets called by the fork code, when changing the parent's
2957  * timeslices.
2958  */
2959 void scheduler_tick(int user_ticks, int sys_ticks)
2960 {
2961         int cpu = smp_processor_id();
2962         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2963         runqueue_t *rq = this_rq();
2964         task_t *p = current;
2965         struct vx_info *vxi = p->vx_info;
2966
2967         rq->timestamp_last_tick = sched_clock();
2968
2969         if (rcu_pending(cpu))
2970                 rcu_check_callbacks(cpu, user_ticks);
2971
2972
2973         if (vxi) {
2974                 vxi->sched.cpu[cpu].user_ticks += user_ticks;
2975                 vxi->sched.cpu[cpu].sys_ticks += sys_ticks;
2976         }
2977
2978         /* note: this timer irq context must be accounted for as well */
2979         if (hardirq_count() - HARDIRQ_OFFSET) {
2980                 cpustat->irq += sys_ticks;
2981                 sys_ticks = 0;
2982         } else if (softirq_count()) {
2983                 cpustat->softirq += sys_ticks;
2984                 sys_ticks = 0;
2985         }
2986
2987         if (p == rq->idle) {
2988                 if (atomic_read(&rq->nr_iowait) > 0)
2989                         cpustat->iowait += sys_ticks;
2990                         // vx_cpustat_acc(vxi, iowait, cpu, cpustat, sys_ticks);
2991                 else
2992                         cpustat->idle += sys_ticks;
2993                         // vx_cpustat_acc(vxi, idle, cpu, cpustat, sys_ticks);
2994
2995                 if (wake_priority_sleeper(rq))
2996                         goto out;
2997
2998                 ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq));
2999
3000 #ifdef CONFIG_VSERVER_HARDCPU_IDLE
3001                 if (!--rq->idle_tokens && !list_empty(&rq->hold_queue))
3002                         set_need_resched();
3003 #endif
3004                 rebalance_tick(cpu, rq, IDLE);
3005                 return;
3006         }
3007         if (TASK_NICE(p) > 0)
3008                 cpustat->nice += user_ticks;
3009         else
3010                 cpustat->user += user_ticks;
3011         cpustat->system += sys_ticks;
3012
3013         /* Task might have expired already, but not scheduled off yet */
3014         if (p->array != rq_active(p,rq)) {
3015                 set_tsk_need_resched(p);
3016                 goto out;
3017         }
3018         spin_lock(&rq->lock);
3019         /*
3020          * The task was running during this tick - update the
3021          * time slice counter. Note: we do not update a thread's
3022          * priority until it either goes to sleep or uses up its
3023          * timeslice. This makes it possible for interactive tasks
3024          * to use up their timeslices at their highest priority levels.
3025          */
3026         if (rt_task(p)) {
3027                 /*
3028                  * RR tasks need a special form of timeslice management.
3029                  * FIFO tasks have no timeslices.
3030                  */
3031                 if ((p->policy == SCHED_RR) && !--p->time_slice) {
3032                         p->time_slice = task_timeslice(p);
3033                         p->first_time_slice = 0;
3034                         set_tsk_need_resched(p);
3035
3036                         /* put it at the end of the queue: */
3037                         dequeue_task(p, rq_active(p,rq));
3038                         enqueue_task(p, rq_active(p,rq));
3039                 }
3040                 goto out_unlock;
3041         }
3042         if (vx_need_resched(p)) {
3043 #ifdef CONFIG_CKRM_CPU_SCHEDULE
3044                 /* Hubertus ... we can abstract this out */
3045                 ckrm_lrq_t* rq = get_task_lrq(p);
3046 #endif
3047                 dequeue_task(p, rq->active);
3048                 set_tsk_need_resched(p);
3049                 p->prio = effective_prio(p);
3050                 p->time_slice = task_timeslice(p);
3051                 p->first_time_slice = 0;
3052
3053                 if (!rq->expired_timestamp)
3054                         rq->expired_timestamp = jiffies;
3055                 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
3056                         enqueue_task(p, rq->expired);
3057                         if (p->static_prio < this_rq()->best_expired_prio)
3058                                 this_rq()->best_expired_prio = p->static_prio;
3059                 } else
3060                         enqueue_task(p, rq->active);
3061         } else {
3062                 /*
3063                  * Prevent a too long timeslice allowing a task to monopolize
3064                  * the CPU. We do this by splitting up the timeslice into
3065                  * smaller pieces.
3066                  *
3067                  * Note: this does not mean the task's timeslices expire or
3068                  * get lost in any way, they just might be preempted by
3069                  * another task of equal priority. (one with higher
3070                  * priority would have preempted this task already.) We
3071                  * requeue this task to the end of the list on this priority
3072                  * level, which is in essence a round-robin of tasks with
3073                  * equal priority.
3074                  *
3075                  * This only applies to tasks in the interactive
3076                  * delta range with at least TIMESLICE_GRANULARITY to requeue.
3077                  */
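                /*
                 * Worked example (illustrative numbers only): with a 100 msec
                 * timeslice and TIMESLICE_GRANULARITY(p) of 25 msecs, the
                 * test below requeues the task after 25, 50 and 75 msecs of
                 * CPU use, so equal-priority interactive tasks round-robin
                 * roughly every 25 msecs instead of every 100 msecs.
                 */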
3078                 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
3079                         p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
3080                         (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
3081                         (p->array == rq_active(p,rq))) {
3082
3083                         dequeue_task(p, rq_active(p,rq));
3084                         set_tsk_need_resched(p);
3085                         p->prio = effective_prio(p);
3086                         enqueue_task(p, rq_active(p,rq));
3087                 }
3088         }
3089 out_unlock:
3090         spin_unlock(&rq->lock);
3091 out:
3092         ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq));
3093         rebalance_tick(cpu, rq, NOT_IDLE);
3094 }
3095
3096 #ifdef CONFIG_SCHED_SMT
3097 static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
3098 {
3099         struct sched_domain *sd = this_rq->sd;
3100         cpumask_t sibling_map;
3101         int i;
3102
3103         if (!(sd->flags & SD_SHARE_CPUPOWER))
3104                 return;
3105
3119         /*
3120          * Unlock the current runqueue because we have to lock in
3121          * CPU order to avoid deadlocks. Caller knows that we might
3122          * unlock. We keep IRQs disabled.
3123          */
3124         spin_unlock(&this_rq->lock);
3125
3126         sibling_map = sd->span;
3127
3128         for_each_cpu_mask(i, sibling_map)
3129                 spin_lock(&cpu_rq(i)->lock);
3130         /*
3131          * We clear this CPU from the mask. This both simplifies the
3132          * inner loop and keeps this_rq locked when we exit:
3133          */
3134         cpu_clear(this_cpu, sibling_map);
3135
3136         for_each_cpu_mask(i, sibling_map) {
3137                 runqueue_t *smt_rq = cpu_rq(i);
3138
3139                 /*
3140                  * If an SMT sibling task is sleeping due to priority
3141                  * reasons wake it up now.
3142                  */
3143                 if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
3144                         resched_task(smt_rq->idle);
3145         }
3146
3147         for_each_cpu_mask(i, sibling_map)
3148                 spin_unlock(&cpu_rq(i)->lock);
3149         /*
3150          * We exit with this_cpu's rq still held and IRQs
3151          * still disabled:
3152          */
3153 }
3154
3155 static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
3156 {
3157         struct sched_domain *sd = this_rq->sd;
3158         cpumask_t sibling_map;
3159         prio_array_t *array;
3160         int ret = 0, i;
3161         task_t *p;
3162
3163         if (!(sd->flags & SD_SHARE_CPUPOWER))
3164                 return 0;
3165
3166         /*
3167          * The same locking rules and details apply as for
3168          * wake_sleeping_dependent():
3169          */
3170         spin_unlock(&this_rq->lock);
3171         sibling_map = sd->span;
3172         for_each_cpu_mask(i, sibling_map)
3173                 spin_lock(&cpu_rq(i)->lock);
3174         cpu_clear(this_cpu, sibling_map);
3175
3176         /*
3177          * Establish next task to be run - it might have gone away because
3178          * we released the runqueue lock above:
3179          */
3180         if (!this_rq->nr_running)
3181                 goto out_unlock;
3182         array = this_rq->active;
3183         if (!array->nr_active)
3184                 array = this_rq->expired;
3185         BUG_ON(!array->nr_active);
3186
3187         p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next,
3188                 task_t, run_list);
3189
3190         for_each_cpu_mask(i, sibling_map) {
3191                 runqueue_t *smt_rq = cpu_rq(i);
3192                 task_t *smt_curr = smt_rq->curr;
3193
3194                 /*
3195                  * If a user task with lower static priority than the
3196                  * running task on the SMT sibling is trying to schedule,
3197                  * delay it till there is proportionately less timeslice
3198                  * left of the sibling task to prevent a lower priority
3199                  * task from using an unfair proportion of the
3200                  * physical cpu's resources. -ck
3201                  */
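                /*
                 * Worked example (illustrative numbers): with per_cpu_gain of
                 * 25, only 100 - 25 = 75% of the sibling's remaining timeslice
                 * is counted, so a sibling with 40 msecs left counts as 30.
                 * If the task about to run here has a default timeslice
                 * shorter than that, ret is set to 1 and the caller picks the
                 * idle thread instead, until the imbalance shrinks.
                 */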
3202                 if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) >
3203                         task_timeslice(p) || rt_task(smt_curr)) &&
3204                         p->mm && smt_curr->mm && !rt_task(p))
3205                                 ret = 1;
3206
3207                 /*
3208                  * Reschedule a lower priority task on the SMT sibling,
3209                  * or wake it up if it has been put to sleep for priority
3210                  * reasons.
3211                  */
3212                 if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) >
3213                         task_timeslice(smt_curr) || rt_task(p)) &&
3214                         smt_curr->mm && p->mm && !rt_task(smt_curr)) ||
3215                         (smt_curr == smt_rq->idle && smt_rq->nr_running))
3216                                 resched_task(smt_curr);
3217         }
3218 out_unlock:
3219         for_each_cpu_mask(i, sibling_map)
3220                 spin_unlock(&cpu_rq(i)->lock);
3221         return ret;
3222 }
3223 #else
3224 static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
3225 {
3226 }
3227
3228 static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
3229 {
3230         return 0;
3231 }
3232 #endif
3233
3234 /*
3235  * schedule() is the main scheduler function.
3236  */
3237 asmlinkage void __sched schedule(void)
3238 {
3239         long *switch_count;
3240         task_t *prev, *next;
3241         runqueue_t *rq;
3242         prio_array_t *array;
3243         unsigned long long now;
3244         unsigned long run_time;
3245         int cpu;
3246 #ifdef  CONFIG_VSERVER_HARDCPU
3247         struct vx_info *vxi;
3248         int maxidle = -HZ;
3249 #endif
3250
3251         /*
3252          * If crash dump is in progress, the other CPUs
3253          * need to wait until it completes.
3254          * NB: this code is optimized away for kernels without
3255          * dumping enabled.
3256          */
3257          if (unlikely(dump_oncpu))
3258                  goto dump_scheduling_disabled;
3259
3260
3261         //WARN_ON(system_state == SYSTEM_BOOTING);
3262         /*
3263          * Test if we are atomic.  Since do_exit() needs to call into
3264          * schedule() atomically, we ignore that path for now.
3265          * Otherwise, whine if we are scheduling when we should not be.
3266          */
3267         if (likely(!(current->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)))) {
3268                 if (unlikely(in_atomic())) {
3269                         printk(KERN_ERR "bad: scheduling while atomic!\n");
3270                         dump_stack();
3271                 }
3272         }
3273
3274 need_resched:
3275         preempt_disable();
3276         prev = current;
3277         rq = this_rq();
3278
3279         /*
3280          * The idle thread is not allowed to schedule!
3281          * Remove this check after it has been exercised a bit.
3282          */
3283         if (unlikely(current == rq->idle) && current->state != TASK_RUNNING) {
3284                 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
3285                 dump_stack();
3286         }
3287
3288         release_kernel_lock(prev);
3289         schedstat_inc(rq, sched_cnt);
3290         now = sched_clock();
3291         if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
3292                 run_time = now - prev->timestamp;
3293         else
3294                 run_time = NS_MAX_SLEEP_AVG;
3295
3296         /*
3297          * Tasks with interactive credits get charged less run_time
3298          * at high sleep_avg to delay them losing their interactive
3299          * status
3300          */
3301         if (HIGH_CREDIT(prev))
3302                 run_time /= (CURRENT_BONUS(prev) ? : 1);
3303
3304         spin_lock_irq(&rq->lock);
3305
3306 #ifdef CONFIG_CKRM_CPU_SCHEDULE
3307         if (prev != rq->idle) {
3308                 unsigned long long run = now - prev->timestamp;
3309                 ckrm_lrq_t * lrq = get_task_lrq(prev);
3310
3311                 lrq->lrq_load -= task_load(prev);
3312                 cpu_demand_event(&prev->demand_stat,CPU_DEMAND_DESCHEDULE,run);
3313                 lrq->lrq_load += task_load(prev);
3314
3315                 cpu_demand_event(get_task_lrq_stat(prev),CPU_DEMAND_DESCHEDULE,run);
3316                 update_local_cvt(prev, run);
3317         }
3318 #endif
3319
3320         if (unlikely(current->flags & PF_DEAD))
3321                 current->state = EXIT_DEAD;
3322         /*
3323          * if entering off of a kernel preemption go straight
3324          * to picking the next task.
3325          */
3326         switch_count = &prev->nivcsw;
3327         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3328                 switch_count = &prev->nvcsw;
3329                 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3330                                 unlikely(signal_pending(prev))))
3331                         prev->state = TASK_RUNNING;
3332                 else
3333                         deactivate_task(prev, rq);
3334         }
3335
3336 #ifdef CONFIG_VSERVER_HARDCPU
3337         if (!list_empty(&rq->hold_queue)) {
3338                 struct list_head *l, *n;
3339                 int ret;
3340
3341                 vxi = NULL;
3342                 list_for_each_safe(l, n, &rq->hold_queue) {
3343                         next = list_entry(l, task_t, run_list);
3344                         if (vxi == next->vx_info)
3345                                 continue;
3346
3347                         vxi = next->vx_info;
3348                         ret = vx_tokens_recalc(vxi);
3349                         // tokens = vx_tokens_avail(next);
3350
3351                         if (ret > 0) {
3352                                 list_del(&next->run_list);
3353                                 next->state &= ~TASK_ONHOLD;
3354                                 // one less waiting
3355                                 vx_onhold_dec(vxi);
3356                                 array = rq->expired;
3357                                 next->prio = MAX_PRIO-1;
3358                                 enqueue_task(next, array);
3359                                 rq->nr_running++;
3360                                 if (next->static_prio < rq->best_expired_prio)
3361                                         rq->best_expired_prio = next->static_prio;
3362
3363                                 // printk("··· %8lu unhold %p [%d]\n", jiffies, next, next->prio);
3364                                 break;
3365                         }
3366                         if ((ret < 0) && (maxidle < ret))
3367                                 maxidle = ret;
3368                 }
3369         }
3370         rq->idle_tokens = -maxidle;
3371
3372 pick_next:
3373 #endif
3374
3375         cpu = smp_processor_id();
3376         if (unlikely(!rq->nr_running)) {
3377 go_idle:
3378                 idle_balance(cpu, rq);
3379                 if (!rq->nr_running) {
3380                         next = rq->idle;
3381                         rq->expired_timestamp = 0;
3382                         wake_sleeping_dependent(cpu, rq);
3383                         /*
3384                          * wake_sleeping_dependent() might have released
3385                          * the runqueue, so break out if we got new
3386                          * tasks meanwhile:
3387                          */
3388                         if (!rq->nr_running)
3389                                 goto switch_tasks;
3390                 }
3391         } else {
3392                 if (dependent_sleeper(cpu, rq)) {
3393                         schedstat_inc(rq, sched_goidle);
3394                         next = rq->idle;
3395                         goto switch_tasks;
3396                 }
3397                 /*
3398                  * dependent_sleeper() releases and reacquires the runqueue
3399                  * lock, hence go into the idle loop if the rq went
3400                  * empty meanwhile:
3401                  */
3402                 if (unlikely(!rq->nr_running))
3403                         goto go_idle;
3404         }
3405
3406         /* MEF: CKRM refactored code into rq_get_next_task(); make
3407          * sure that when upgrading, the changes are reflected in both
3408          * versions of the code.
3409          */
3410         next = rq_get_next_task(rq);
3411
3412 #ifdef  CONFIG_VSERVER_HARDCPU
3413         vxi = next->vx_info;
3414         if (vx_info_flags(vxi, VXF_SCHED_PAUSE|VXF_SCHED_HARD, 0)) {
3415                 int ret = vx_tokens_recalc(vxi);
3416
3417                 if (unlikely(ret <= 0)) {
3418                         if (ret && (rq->idle_tokens > -ret))
3419                                 rq->idle_tokens = -ret;
3420                         __deactivate_task(next, rq);
3421                         recalc_task_prio(next, now);
3422                         // a new one on hold
3423                         vx_onhold_inc(vxi);
3424                         next->state |= TASK_ONHOLD;
3425                         list_add_tail(&next->run_list, &rq->hold_queue);
3426                         //printk("··· %8lu hold   %p [%d]\n", jiffies, next, next->prio);
3427                         goto pick_next;
3428                 }
3429         }
3430 #endif
3431
3432         if (!rt_task(next) && next->activated > 0) {
3433                 unsigned long long delta = now - next->timestamp;
3434
3435                 if (next->activated == 1)
3436                         delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
3437
3438                 array = next->array;
3439                 dequeue_task(next, array);
3440                 recalc_task_prio(next, next->timestamp + delta);
3441                 enqueue_task(next, array);
3442         }
3443         next->activated = 0;
3444 switch_tasks:
3445         prefetch(next);
3446         clear_tsk_need_resched(prev);
3447         rcu_qsctr_inc(task_cpu(prev));
3448
3449         prev->sleep_avg -= run_time;
3450         if ((long)prev->sleep_avg <= 0) {
3451                 prev->sleep_avg = 0;
3452                 if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev)))
3453                         prev->interactive_credit--;
3454         }
3455         prev->timestamp = prev->last_ran = now;
3456
3457         sched_info_switch(prev, next);
3458         if (likely(prev != next)) {
3459                 next->timestamp = now;
3460                 rq->nr_switches++;
3461                 rq->curr = next;
3462                 ++*switch_count;
3463
3464                 prepare_arch_switch(rq, next);
3465                 prev = context_switch(rq, prev, next);
3466                 barrier();
3467
3468                 finish_task_switch(prev);
3469         } else
3470                 spin_unlock_irq(&rq->lock);
3471
3472         reacquire_kernel_lock(current);
3473         preempt_enable_no_resched();
3474         if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3475                 goto need_resched;
3476
3477         return;
3478
3479  dump_scheduling_disabled:
3480         /* allow scheduling only if this is the dumping cpu */
3481         if (dump_oncpu != smp_processor_id()+1) {
3482                 while (dump_oncpu)
3483                         cpu_relax();
3484         }
3485         return;
3486 }
3487
3488 EXPORT_SYMBOL(schedule);
3489 #ifdef CONFIG_PREEMPT
3490 /*
3491  * this is the entry point to schedule() from in-kernel preemption
3492  * off of preempt_enable.  Kernel preemption off of a return from
3493  * interrupt is handled in the interrupt-return path, which calls schedule() directly.
3494  */
3495 asmlinkage void __sched preempt_schedule(void)
3496 {
3497         struct thread_info *ti = current_thread_info();
3498
3499         /*
3500          * If there is a non-zero preempt_count or interrupts are disabled,
3501          * we do not want to preempt the current task.  Just return..
3502          */
3503         if (unlikely(ti->preempt_count || irqs_disabled()))
3504                 return;
3505
3506 need_resched:
3507         ti->preempt_count = PREEMPT_ACTIVE;
3508         schedule();
3509         ti->preempt_count = 0;
3510
3511         /* we could miss a preemption opportunity between schedule and now */
3512         barrier();
3513         if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3514                 goto need_resched;
3515 }
3516
3517 EXPORT_SYMBOL(preempt_schedule);
3518 #endif /* CONFIG_PREEMPT */
3519
3520 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
3521 {
3522         task_t *p = curr->task;
3523         return try_to_wake_up(p, mode, sync);
3524 }
3525
3526 EXPORT_SYMBOL(default_wake_function);
3527
3528 /*
3529  * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
3530  * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small +ve
3531  * number) then we wake all the non-exclusive tasks and one exclusive task.
3532  *
3533  * There are circumstances in which we can try to wake a task which has already
3534  * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
3535  * zero in this (rare) case, and we handle it by continuing to scan the queue.
3536  */
3537 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3538                              int nr_exclusive, int sync, void *key)
3539 {
3540         struct list_head *tmp, *next;
3541
3542         list_for_each_safe(tmp, next, &q->task_list) {
3543                 wait_queue_t *curr;
3544                 unsigned flags;
3545                 curr = list_entry(tmp, wait_queue_t, task_list);
3546                 flags = curr->flags;
3547                 if (curr->func(curr, mode, sync, key) &&
3548                     (flags & WQ_FLAG_EXCLUSIVE) &&
3549                     !--nr_exclusive)
3550                         break;
3551         }
3552 }
3553
3554 /**
3555  * __wake_up - wake up threads blocked on a waitqueue.
3556  * @q: the waitqueue
3557  * @mode: which threads
3558  * @nr_exclusive: how many wake-one or wake-many threads to wake up
3559  */
3560 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
3561                                 int nr_exclusive, void *key)
3562 {
3563         unsigned long flags;
3564
3565         spin_lock_irqsave(&q->lock, flags);
3566         __wake_up_common(q, mode, nr_exclusive, 0, key);
3567         spin_unlock_irqrestore(&q->lock, flags);
3568 }
3569
3570 EXPORT_SYMBOL(__wake_up);
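
/*
 * Minimal usage sketch for the wake-up primitives above (illustrative
 * only; 'my_queue' and 'my_condition' are made-up names):
 *
 *	static DECLARE_WAIT_QUEUE_HEAD(my_queue);
 *	static int my_condition;
 *
 *	sleeper side:
 *		wait_event_interruptible(my_queue, my_condition != 0);
 *
 *	waker side:
 *		my_condition = 1;
 *		wake_up(&my_queue);		(ends up in __wake_up())
 *
 * The other wake_up_*() flavours (interruptible, sync, locked) are thin
 * wrappers around the same __wake_up_common() core.
 */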
3571
3572 /*
3573  * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3574  */
3575 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3576 {
3577         __wake_up_common(q, mode, 1, 0, NULL);
3578 }
3579
3580 /**
3581  * __wake_up_sync - wake up threads blocked on a waitqueue.
3582  * @q: the waitqueue
3583  * @mode: which threads
3584  * @nr_exclusive: how many wake-one or wake-many threads to wake up
3585  *
3586  * The sync wakeup differs in that the waker knows that it will schedule
3587  * away soon, so while the target thread will be woken up, it will not
3588  * be migrated to another CPU - ie. the two threads are 'synchronized'
3589  * with each other. This can prevent needless bouncing between CPUs.
3590  *
3591  * On UP it can prevent extra preemption.
3592  */
3593 void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3594 {
3595         unsigned long flags;
3596         int sync = 1;
3597
3598         if (unlikely(!q))
3599                 return;
3600
3601         if (unlikely(!nr_exclusive))
3602                 sync = 0;
3603
3604         spin_lock_irqsave(&q->lock, flags);
3605         __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3606         spin_unlock_irqrestore(&q->lock, flags);
3607 }
3608 EXPORT_SYMBOL_GPL(__wake_up_sync);      /* For internal use only */
3609
3610 void fastcall complete(struct completion *x)
3611 {
3612         unsigned long flags;
3613
3614         spin_lock_irqsave(&x->wait.lock, flags);
3615         x->done++;
3616         __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3617                          1, 0, NULL);
3618         spin_unlock_irqrestore(&x->wait.lock, flags);
3619 }
3620 EXPORT_SYMBOL(complete);
3621
3622 void fastcall complete_all(struct completion *x)
3623 {
3624         unsigned long flags;
3625
3626         spin_lock_irqsave(&x->wait.lock, flags);
3627         x->done += UINT_MAX/2;
3628         __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3629                          0, 0, NULL);
3630         spin_unlock_irqrestore(&x->wait.lock, flags);
3631 }
3632 EXPORT_SYMBOL(complete_all);
3633
3634 void fastcall __sched wait_for_completion(struct completion *x)
3635 {
3636         might_sleep();
3637         spin_lock_irq(&x->wait.lock);
3638         if (!x->done) {
3639                 DECLARE_WAITQUEUE(wait, current);
3640
3641                 wait.flags |= WQ_FLAG_EXCLUSIVE;
3642                 __add_wait_queue_tail(&x->wait, &wait);
3643                 do {
3644                         __set_current_state(TASK_UNINTERRUPTIBLE);
3645                         spin_unlock_irq(&x->wait.lock);
3646                         schedule();
3647                         spin_lock_irq(&x->wait.lock);
3648                 } while (!x->done);
3649                 __remove_wait_queue(&x->wait, &wait);
3650         }
3651         x->done--;
3652         spin_unlock_irq(&x->wait.lock);
3653 }
3654 EXPORT_SYMBOL(wait_for_completion);
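
/*
 * Completion usage sketch (illustrative only; names are made up):
 *
 *	static DECLARE_COMPLETION(setup_done);
 *
 *	waiting thread:
 *		wait_for_completion(&setup_done);
 *
 *	signalling thread:
 *		complete(&setup_done);
 *
 * complete() releases exactly one exclusive waiter; complete_all()
 * releases every current and future waiter by making ->done effectively
 * sticky.
 */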
3655
3656 #define SLEEP_ON_VAR                                    \
3657         unsigned long flags;                            \
3658         wait_queue_t wait;                              \
3659         init_waitqueue_entry(&wait, current);
3660
3661 #define SLEEP_ON_HEAD                                   \
3662         spin_lock_irqsave(&q->lock,flags);              \
3663         __add_wait_queue(q, &wait);                     \
3664         spin_unlock(&q->lock);
3665
3666 #define SLEEP_ON_TAIL                                   \
3667         spin_lock_irq(&q->lock);                        \
3668         __remove_wait_queue(q, &wait);                  \
3669         spin_unlock_irqrestore(&q->lock, flags);
3670
3671 #define SLEEP_ON_BKLCHECK                               \
3672         if (unlikely(!kernel_locked()) &&               \
3673             sleep_on_bkl_warnings < 10) {               \
3674                 sleep_on_bkl_warnings++;                \
3675                 WARN_ON(1);                             \
3676         }
3677
3678 static int sleep_on_bkl_warnings;
3679
3680 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3681 {
3682         SLEEP_ON_VAR
3683
3684         SLEEP_ON_BKLCHECK
3685
3686         current->state = TASK_INTERRUPTIBLE;
3687
3688         SLEEP_ON_HEAD
3689         schedule();
3690         SLEEP_ON_TAIL
3691 }
3692
3693 EXPORT_SYMBOL(interruptible_sleep_on);
3694
3695 long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3696 {
3697         SLEEP_ON_VAR
3698
3699         SLEEP_ON_BKLCHECK
3700
3701         current->state = TASK_INTERRUPTIBLE;
3702
3703         SLEEP_ON_HEAD
3704         timeout = schedule_timeout(timeout);
3705         SLEEP_ON_TAIL
3706
3707         return timeout;
3708 }
3709
3710 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3711
3712 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3713 {
3714         SLEEP_ON_VAR
3715
3716         SLEEP_ON_BKLCHECK
3717
3718         current->state = TASK_UNINTERRUPTIBLE;
3719
3720         SLEEP_ON_HEAD
3721         timeout = schedule_timeout(timeout);
3722         SLEEP_ON_TAIL
3723
3724         return timeout;
3725 }
3726
3727 EXPORT_SYMBOL(sleep_on_timeout);
3728
3729 void set_user_nice(task_t *p, long nice)
3730 {
3731         unsigned long flags;
3732         prio_array_t *array;
3733         runqueue_t *rq;
3734         int old_prio, new_prio, delta;
3735
3736         if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3737                 return;
3738         /*
3739          * We have to be careful, if called from sys_setpriority(),
3740          * the task might be in the middle of scheduling on another CPU.
3741          */
3742         rq = task_rq_lock(p, &flags);
3743         /*
3744          * The RT priorities are set via setscheduler(), but we still
3745          * allow the 'normal' nice value to be set - but as expected
3746          * it won't have any effect on scheduling until the task
3747          * returns to SCHED_NORMAL:
3748          */
3749         if (rt_task(p)) {
3750                 p->static_prio = NICE_TO_PRIO(nice);
3751                 goto out_unlock;
3752         }
3753         array = p->array;
3754         if (array)
3755                 dequeue_task(p, array);
3756
3757         old_prio = p->prio;
3758         new_prio = NICE_TO_PRIO(nice);
3759         delta = new_prio - old_prio;
3760         p->static_prio = NICE_TO_PRIO(nice);
3761         p->prio += delta;
3762
3763         if (array) {
3764                 enqueue_task(p, array);
3765                 /*
3766                  * If the task increased its priority or is running and
3767                  * lowered its priority, then reschedule its CPU:
3768                  */
3769                 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3770                         resched_task(rq->curr);
3771         }
3772 out_unlock:
3773         task_rq_unlock(rq, &flags);
3774 }
3775
3776 EXPORT_SYMBOL(set_user_nice);
3777
3778 #ifdef __ARCH_WANT_SYS_NICE
3779
3780 /*
3781  * sys_nice - change the priority of the current process.
3782  * @increment: priority increment
3783  *
3784  * sys_setpriority is a more generic, but much slower function that
3785  * does similar things.
3786  */
3787 asmlinkage long sys_nice(int increment)
3788 {
3789         int retval;
3790         long nice;
3791
3792         /*
3793          * Setpriority might change our priority at the same moment.
3794          * We don't have to worry. Conceptually one call occurs first
3795          * and we have a single winner.
3796          */
3797         if (increment < 0) {
3798                 if (vx_flags(VXF_IGNEG_NICE, 0))
3799                         return 0;
3800                 if (!capable(CAP_SYS_NICE))
3801                         return -EPERM;
3802                 if (increment < -40)
3803                         increment = -40;
3804         }
3805         if (increment > 40)
3806                 increment = 40;
3807
3808         nice = PRIO_TO_NICE(current->static_prio) + increment;
3809         if (nice < -20)
3810                 nice = -20;
3811         if (nice > 19)
3812                 nice = 19;
3813
3814         retval = security_task_setnice(current, nice);
3815         if (retval)
3816                 return retval;
3817
3818         set_user_nice(current, nice);
3819         return 0;
3820 }
3821
3822 #endif
3823
3824 /**
3825  * task_prio - return the priority value of a given task.
3826  * @p: the task in question.
3827  *
3828  * This is the priority value as seen by users in /proc.
3829  * RT tasks are offset by -200. Normal tasks are centered
3830  * around 0, value goes from -16 to +15.
3831  */
3832 int task_prio(const task_t *p)
3833 {
3834         return p->prio - MAX_RT_PRIO;
3835 }
3836
3837 /**
3838  * task_nice - return the nice value of a given task.
3839  * @p: the task in question.
3840  */
3841 int task_nice(const task_t *p)
3842 {
3843         return TASK_NICE(p);
3844 }
3845
3846 /**
3847  * idle_cpu - is a given cpu idle currently?
3848  * @cpu: the processor in question.
3849  */
3850 int idle_cpu(int cpu)
3851 {
3852         return cpu_curr(cpu) == cpu_rq(cpu)->idle;
3853 }
3854
3855 EXPORT_SYMBOL_GPL(idle_cpu);
3856
3857 /**
3858  * find_process_by_pid - find a process with a matching PID value.
3859  * @pid: the pid in question.
3860  */
3861 static inline task_t *find_process_by_pid(pid_t pid)
3862 {
3863         return pid ? find_task_by_pid(pid) : current;
3864 }
3865
3866 /* Actually do priority change: must hold rq lock. */
3867 static void __setscheduler(struct task_struct *p, int policy, int prio)
3868 {
3869         BUG_ON(p->array);
3870         p->policy = policy;
3871         p->rt_priority = prio;
3872         if (policy != SCHED_NORMAL)
3873                 p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
3874         else
3875                 p->prio = p->static_prio;
3876 }
3877
3878 /*
3879  * setscheduler - change the scheduling policy and/or RT priority of a thread.
3880  */
3881 static int setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3882 {
3883         struct sched_param lp;
3884         int retval = -EINVAL;
3885         int oldprio, oldpolicy = -1;
3886         prio_array_t *array;
3887         unsigned long flags;
3888         runqueue_t *rq;
3889         task_t *p;
3890
3891         if (!param || pid < 0)
3892                 goto out_nounlock;
3893
3894         retval = -EFAULT;
3895         if (copy_from_user(&lp, param, sizeof(struct sched_param)))
3896                 goto out_nounlock;
3897
3898         /*
3899          * We play safe to avoid deadlocks.
3900          */
3901         read_lock_irq(&tasklist_lock);
3902
3903         p = find_process_by_pid(pid);
3904
3905         retval = -ESRCH;
3906         if (!p)
3907                 goto out_unlock;
3908 recheck:
3909         /* double check policy once rq lock held */
3910         if (policy < 0)
3911                 policy = oldpolicy = p->policy;
3912         else {
3913                 retval = -EINVAL;
3914                 if (policy != SCHED_FIFO && policy != SCHED_RR &&
3915                                 policy != SCHED_NORMAL)
3916                         goto out_unlock;
3917         }
3918         profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3919
3920         /*
3921          * Valid priorities for SCHED_FIFO and SCHED_RR are
3922          * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
3923          */
3924         retval = -EINVAL;
3925         if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1)
3926                 goto out_unlock;
3927         if ((policy == SCHED_NORMAL) != (lp.sched_priority == 0))
3928                 goto out_unlock;
3929
3930         retval = -EPERM;
3931         if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
3932             !capable(CAP_SYS_NICE))
3933                 goto out_unlock;
3934         if ((current->euid != p->euid) && (current->euid != p->uid) &&
3935             !capable(CAP_SYS_NICE))
3936                 goto out_unlock;
3937
3938         retval = security_task_setscheduler(p, policy, &lp);
3939         if (retval)
3940                 goto out_unlock;
3941
3942         /*
3943          * To be able to change p->policy safely, the appropriate
3944          * runqueue lock must be held.
3945          */
3946         rq = task_rq_lock(p, &flags);
3947         /* recheck policy now with rq lock held */
3948         if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
3949                 policy = oldpolicy = -1;
3950                 task_rq_unlock(rq, &flags);
3951                 goto recheck;
3952         }
3953         array = p->array;
3954         if (array)
3955                 deactivate_task(p, task_rq(p));
3956         retval = 0;
3957         oldprio = p->prio;
3958         __setscheduler(p, policy, lp.sched_priority);
3959         if (array) {
3960                 vx_activate_task(p);
3961                 __activate_task(p, task_rq(p));
3962                 /*
3963                  * Reschedule if we are currently running on this runqueue and
3964                  * our priority decreased, or if we are not currently running on
3965                  * this runqueue and our priority is higher than the current's
3966                  */
3967                 if (task_running(rq, p)) {
3968                         if (p->prio > oldprio)
3969                                 resched_task(rq->curr);
3970                 } else if (TASK_PREEMPTS_CURR(p, rq))
3971                         resched_task(rq->curr);
3972         }
3973         task_rq_unlock(rq, &flags);
3974 out_unlock:
3975         read_unlock_irq(&tasklist_lock);
3976 out_nounlock:
3977         return retval;
3978 }
3979
3980 /**
3981  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
3982  * @pid: the pid in question.
3983  * @policy: new policy
3984  * @param: structure containing the new RT priority.
3985  */
3986 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
3987                                        struct sched_param __user *param)
3988 {
3989         return setscheduler(pid, policy, param);
3990 }
3991
3992 /**
3993  * sys_sched_setparam - set/change the RT priority of a thread
3994  * @pid: the pid in question.
3995  * @param: structure containing the new RT priority.
3996  */
3997 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
3998 {
3999         return setscheduler(pid, -1, param);
4000 }
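
/*
 * Userspace view of the two syscalls above (illustrative sketch using
 * the glibc wrappers; error handling omitted):
 *
 *	#include <sched.h>
 *
 *	struct sched_param sp = { .sched_priority = 10 };
 *
 *	sched_setscheduler(pid, SCHED_FIFO, &sp);	set policy + RT prio
 *	sp.sched_priority = 20;
 *	sched_setparam(pid, &sp);			change RT prio only
 *
 * Both paths funnel into setscheduler() above; SCHED_NORMAL (SCHED_OTHER
 * in userspace) callers must pass sched_priority == 0.
 */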
4001
4002 /**
4003  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4004  * @pid: the pid in question.
4005  */
4006 asmlinkage long sys_sched_getscheduler(pid_t pid)
4007 {
4008         int retval = -EINVAL;
4009         task_t *p;
4010
4011         if (pid < 0)
4012                 goto out_nounlock;
4013
4014         retval = -ESRCH;
4015         read_lock(&tasklist_lock);
4016         p = find_process_by_pid(pid);
4017         if (p) {
4018                 retval = security_task_getscheduler(p);
4019                 if (!retval)
4020                         retval = p->policy;
4021         }
4022         read_unlock(&tasklist_lock);
4023
4024 out_nounlock:
4025         return retval;
4026 }
4027
4028 /**
4029  * sys_sched_getparam - get the RT priority of a thread
4030  * @pid: the pid in question.
4031  * @param: structure containing the RT priority.
4032  */
4033 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4034 {
4035         struct sched_param lp;
4036         int retval = -EINVAL;
4037         task_t *p;
4038
4039         if (!param || pid < 0)
4040                 goto out_nounlock;
4041
4042         read_lock(&tasklist_lock);
4043         p = find_process_by_pid(pid);
4044         retval = -ESRCH;
4045         if (!p)
4046                 goto out_unlock;
4047
4048         retval = security_task_getscheduler(p);
4049         if (retval)
4050                 goto out_unlock;
4051
4052         lp.sched_priority = p->rt_priority;
4053         read_unlock(&tasklist_lock);
4054
4055         /*
4056          * This one might sleep, we cannot do it with a spinlock held ...
4057          */
4058         retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4059
4060 out_nounlock:
4061         return retval;
4062
4063 out_unlock:
4064         read_unlock(&tasklist_lock);
4065         return retval;
4066 }
4067
4068 long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4069 {
4070         task_t *p;
4071         int retval;
4072
4073         lock_cpu_hotplug();
4074         read_lock(&tasklist_lock);
4075
4076         p = find_process_by_pid(pid);
4077         if (!p) {
4078                 read_unlock(&tasklist_lock);
4079                 unlock_cpu_hotplug();
4080                 return -ESRCH;
4081         }
4082
4083         /*
4084          * It is not safe to call set_cpus_allowed with the
4085          * tasklist_lock held.  We will bump the task_struct's
4086          * usage count and then drop tasklist_lock.
4087          */
4088         get_task_struct(p);
4089         read_unlock(&tasklist_lock);
4090
4091         retval = -EPERM;
4092         if ((current->euid != p->euid) && (current->euid != p->uid) &&
4093                         !capable(CAP_SYS_NICE))
4094                 goto out_unlock;
4095
4096         retval = set_cpus_allowed(p, new_mask);
4097
4098 out_unlock:
4099         put_task_struct(p);
4100         unlock_cpu_hotplug();
4101         return retval;
4102 }
4103
4104 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4105                              cpumask_t *new_mask)
4106 {
4107         if (len < sizeof(cpumask_t)) {
4108                 memset(new_mask, 0, sizeof(cpumask_t));
4109         } else if (len > sizeof(cpumask_t)) {
4110                 len = sizeof(cpumask_t);
4111         }
4112         return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4113 }
4114
4115 /**
4116  * sys_sched_setaffinity - set the cpu affinity of a process
4117  * @pid: pid of the process
4118  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4119  * @user_mask_ptr: user-space pointer to the new cpu mask
4120  */
4121 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4122                                       unsigned long __user *user_mask_ptr)
4123 {
4124         cpumask_t new_mask;
4125         int retval;
4126
4127         retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
4128         if (retval)
4129                 return retval;
4130
4131         return sched_setaffinity(pid, new_mask);
4132 }
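
/*
 * Userspace sketch for the affinity syscall above (illustrative only):
 *
 *	#include <sched.h>
 *
 *	cpu_set_t set;
 *	CPU_ZERO(&set);
 *	CPU_SET(0, &set);				pin to CPU 0
 *	sched_setaffinity(pid, sizeof(set), &set);
 *
 * The kernel copies in at most sizeof(cpumask_t) bytes (see
 * get_user_cpu_mask() above) and set_cpus_allowed() rejects masks that
 * do not intersect the online map.
 */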
4133
4134 /*
4135  * Represents all CPUs present in the system.
4136  * In systems capable of hotplug, this map can grow dynamically
4137  * as new CPUs are detected via a platform-specific
4138  * method such as ACPI.
4139  */
4140
4141 cpumask_t cpu_present_map;
4142 EXPORT_SYMBOL(cpu_present_map);
4143
4144 #ifndef CONFIG_SMP
4145 cpumask_t cpu_online_map = CPU_MASK_ALL;
4146 cpumask_t cpu_possible_map = CPU_MASK_ALL;
4147 #endif
4148
4149 long sched_getaffinity(pid_t pid, cpumask_t *mask)
4150 {
4151         int retval;
4152         task_t *p;
4153
4154         lock_cpu_hotplug();
4155         read_lock(&tasklist_lock);
4156
4157         retval = -ESRCH;
4158         p = find_process_by_pid(pid);
4159         if (!p)
4160                 goto out_unlock;
4161
4162         retval = 0;
4163         cpus_and(*mask, p->cpus_allowed, cpu_possible_map);
4164
4165 out_unlock:
4166         read_unlock(&tasklist_lock);
4167         unlock_cpu_hotplug();
4168         if (retval)
4169                 return retval;
4170
4171         return 0;
4172 }
4173
4174 /**
4175  * sys_sched_getaffinity - get the cpu affinity of a process
4176  * @pid: pid of the process
4177  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4178  * @user_mask_ptr: user-space pointer to hold the current cpu mask
4179  */
4180 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4181                                       unsigned long __user *user_mask_ptr)
4182 {
4183         int ret;
4184         cpumask_t mask;
4185
4186         if (len < sizeof(cpumask_t))
4187                 return -EINVAL;
4188
4189         ret = sched_getaffinity(pid, &mask);
4190         if (ret < 0)
4191                 return ret;
4192
4193         if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
4194                 return -EFAULT;
4195
4196         return sizeof(cpumask_t);
4197 }
4198
4199 /**
4200  * sys_sched_yield - yield the current processor to other threads.
4201  *
4202  * this function yields the current CPU by moving the calling thread
4203  * to the expired array. If there are no other threads running on this
4204  * CPU then this function will return.
4205  */
4206 asmlinkage long sys_sched_yield(void)
4207 {
4208         runqueue_t *rq = this_rq_lock();
4209         prio_array_t *array = current->array;
4210         prio_array_t *target = rq_expired(current,rq);
4211
4212         schedstat_inc(rq, yld_cnt);
4213         /*
4214          * We implement yielding by moving the task into the expired
4215          * queue.
4216          *
4217          * (special rule: RT tasks will just roundrobin in the active
4218          *  array.)
4219          */
4220         if (rt_task(current))
4221                 target = rq_active(current,rq);
4222
4223 #warning MEF need to fix up SCHEDSTATS code, but I hope this is fixed by the 2.6.10 CKRM patch
4224 #ifdef CONFIG_SCHEDSTATS
4225         if (current->array->nr_active == 1) {
4226                 schedstat_inc(rq, yld_act_empty);
4227                 if (!rq->expired->nr_active)
4228                         schedstat_inc(rq, yld_both_empty);
4229         } else if (!rq->expired->nr_active)
4230                 schedstat_inc(rq, yld_exp_empty);
4231 #endif
4232
4233         dequeue_task(current, array);
4234         enqueue_task(current, target);
4235
4236         /*
4237          * Since we are going to call schedule() anyway, there's
4238          * no need to preempt or enable interrupts:
4239          */
4240         _raw_spin_unlock(&rq->lock);
4241         preempt_enable_no_resched();
4242
4243         schedule();
4244
4245         return 0;
4246 }
4247
4248 void __sched __cond_resched(void)
4249 {
4250 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4251         __might_sleep(__FILE__, __LINE__, 0);
4252 #endif
4253         /*
4254          * The system_state check is somewhat ugly but we might be
4255          * called during early boot when we are not yet ready to reschedule.
4256          */
4257          if (need_resched() && system_state >= SYSTEM_BOOTING_SCHEDULER_OK) {
4258                 set_current_state(TASK_RUNNING);
4259                 schedule();
4260         }
4261
4262 }
4263
4264 EXPORT_SYMBOL(__cond_resched);
4265
4266 void __sched __cond_resched_lock(spinlock_t * lock)
4267 {
4268         if (need_resched()) {
4269                 _raw_spin_unlock(lock);
4270                 preempt_enable_no_resched();
4271                 set_current_state(TASK_RUNNING);
4272                 schedule();
4273                 spin_lock(lock);
4274         }
4275 }
4276
4277 EXPORT_SYMBOL(__cond_resched_lock);
4278
4279
4280 /**
4281  * yield - yield the current processor to other threads.
4282  *
4283  * this is a shortcut for kernel-space yielding - it marks the
4284  * thread runnable and calls sys_sched_yield().
4285  */
4286 void __sched yield(void)
4287 {
4288         set_current_state(TASK_RUNNING);
4289         sys_sched_yield();
4290 }
4291
4292 EXPORT_SYMBOL(yield);
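
/*
 * Typical in-kernel use of the voluntary-preemption helpers above
 * (illustrative sketch; 'item' and 'big_list' are made-up names):
 *
 *	list_for_each_entry(item, &big_list, node) {
 *		process_item(item);
 *		cond_resched();		give others a chance on long walks
 *	}
 *
 * cond_resched() only reschedules when TIF_NEED_RESCHED is already set,
 * so it is cheap enough to sprinkle inside long loops; yield() instead
 * unconditionally moves the caller to the expired array and schedules.
 */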
4293
4294 /*
4295  * This task is about to go to sleep on IO.  Increment rq->nr_iowait so
4296  * that process accounting knows that this is a task in IO wait state.
4297  *
4298  * But don't do that if it is a deliberate, throttling IO wait (this task
4299  * has set its backing_dev_info: the queue against which it should throttle)
4300  */
4301 void __sched io_schedule(void)
4302 {
4303         struct runqueue *rq = this_rq();
4304
4305         atomic_inc(&rq->nr_iowait);
4306         schedule();
4307         atomic_dec(&rq->nr_iowait);
4308 }
4309
4310 EXPORT_SYMBOL(io_schedule);
4311
4312 long __sched io_schedule_timeout(long timeout)
4313 {
4314         struct runqueue *rq = this_rq();
4315         long ret;
4316
4317         atomic_inc(&rq->nr_iowait);
4318         ret = schedule_timeout(timeout);
4319         atomic_dec(&rq->nr_iowait);
4320         return ret;
4321 }
4322
4323 /**
4324  * sys_sched_get_priority_max - return maximum RT priority.
4325  * @policy: scheduling class.
4326  *
4327  * this syscall returns the maximum rt_priority that can be used
4328  * by a given scheduling class.
4329  */
4330 asmlinkage long sys_sched_get_priority_max(int policy)
4331 {
4332         int ret = -EINVAL;
4333
4334         switch (policy) {
4335         case SCHED_FIFO:
4336         case SCHED_RR:
4337                 ret = MAX_USER_RT_PRIO-1;
4338                 break;
4339         case SCHED_NORMAL:
4340                 ret = 0;
4341                 break;
4342         }
4343         return ret;
4344 }
4345
4346 /**
4347  * sys_sched_get_priority_min - return minimum RT priority.
4348  * @policy: scheduling class.
4349  *
4350  * this syscall returns the minimum rt_priority that can be used
4351  * by a given scheduling class.
4352  */
4353 asmlinkage long sys_sched_get_priority_min(int policy)
4354 {
4355         int ret = -EINVAL;
4356
4357         switch (policy) {
4358         case SCHED_FIFO:
4359         case SCHED_RR:
4360                 ret = 1;
4361                 break;
4362         case SCHED_NORMAL:
4363                 ret = 0;
4364         }
4365         return ret;
4366 }
4367
4368 /**
4369  * sys_sched_rr_get_interval - return the default timeslice of a process.
4370  * @pid: pid of the process.
4371  * @interval: userspace pointer to the timeslice value.
4372  *
4373  * this syscall writes the default timeslice value of a given process
4374  * into the user-space timespec buffer. A value of '0' means infinity.
4375  */
4376 asmlinkage
4377 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4378 {
4379         int retval = -EINVAL;
4380         struct timespec t;
4381         task_t *p;
4382
4383         if (pid < 0)
4384                 goto out_nounlock;
4385
4386         retval = -ESRCH;
4387         read_lock(&tasklist_lock);
4388         p = find_process_by_pid(pid);
4389         if (!p)
4390                 goto out_unlock;
4391
4392         retval = security_task_getscheduler(p);
4393         if (retval)
4394                 goto out_unlock;
4395
4396         jiffies_to_timespec(p->policy & SCHED_FIFO ?
4397                                 0 : task_timeslice(p), &t);
4398         read_unlock(&tasklist_lock);
4399         retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4400 out_nounlock:
4401         return retval;
4402 out_unlock:
4403         read_unlock(&tasklist_lock);
4404         return retval;
4405 }
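
/*
 * Userspace sketch for the syscall above (illustrative only):
 *
 *	#include <sched.h>
 *
 *	struct timespec ts;
 *	sched_rr_get_interval(pid, &ts);
 *	(ts now holds the task's default timeslice, or 0 for SCHED_FIFO)
 */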
4406
4407 static inline struct task_struct *eldest_child(struct task_struct *p)
4408 {
4409         if (list_empty(&p->children)) return NULL;
4410         return list_entry(p->children.next,struct task_struct,sibling);
4411 }
4412
4413 static inline struct task_struct *older_sibling(struct task_struct *p)
4414 {
4415         if (p->sibling.prev==&p->parent->children) return NULL;
4416         return list_entry(p->sibling.prev,struct task_struct,sibling);
4417 }
4418
4419 static inline struct task_struct *younger_sibling(struct task_struct *p)
4420 {
4421         if (p->sibling.next==&p->parent->children) return NULL;
4422         return list_entry(p->sibling.next,struct task_struct,sibling);
4423 }
4424
4425 static void show_task(task_t * p)
4426 {
4427         task_t *relative;
4428         unsigned state;
4429         unsigned long free = 0;
4430         static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" };
4431
4432         printk("%-13.13s ", p->comm);
4433         state = p->state ? __ffs(p->state) + 1 : 0;
4434         if (state < ARRAY_SIZE(stat_nam))
4435                 printk(stat_nam[state]);
4436         else
4437                 printk("?");
4438 #if (BITS_PER_LONG == 32)
4439         if (state == TASK_RUNNING)
4440                 printk(" running ");
4441         else
4442                 printk(" %08lX ", thread_saved_pc(p));
4443 #else
4444         if (state == TASK_RUNNING)
4445                 printk("  running task   ");
4446         else
4447                 printk(" %016lx ", thread_saved_pc(p));
4448 #endif
4449 #ifdef CONFIG_DEBUG_STACK_USAGE
4450         {
4451                 unsigned long * n = (unsigned long *) (p->thread_info+1);
4452                 while (!*n)
4453                         n++;
4454                 free = (unsigned long) n - (unsigned long)(p->thread_info+1);
4455         }
4456 #endif
4457         printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
4458         if ((relative = eldest_child(p)))
4459                 printk("%5d ", relative->pid);
4460         else
4461                 printk("      ");
4462         if ((relative = younger_sibling(p)))
4463                 printk("%7d", relative->pid);
4464         else
4465                 printk("       ");
4466         if ((relative = older_sibling(p)))
4467                 printk(" %5d", relative->pid);
4468         else
4469                 printk("      ");
4470         if (!p->mm)
4471                 printk(" (L-TLB)\n");
4472         else
4473                 printk(" (NOTLB)\n");
4474
4475         if (state != TASK_RUNNING)
4476                 show_stack(p, NULL);
4477 }
4478
4479 void show_state(void)
4480 {
4481         task_t *g, *p;
4482
4483 #if (BITS_PER_LONG == 32)
4484         printk("\n"
4485                "                                               sibling\n");
4486         printk("  task             PC      pid father child younger older\n");
4487 #else
4488         printk("\n"
4489                "                                                       sibling\n");
4490         printk("  task                 PC          pid father child younger older\n");
4491 #endif
4492         read_lock(&tasklist_lock);
4493         do_each_thread(g, p) {
4494                 /*
4495                  * reset the NMI-timeout, listing all tasks on a slow
4496                  * console might take a lot of time:
4497                  */
4498                 touch_nmi_watchdog();
4499                 show_task(p);
4500         } while_each_thread(g, p);
4501
4502         read_unlock(&tasklist_lock);
4503 }
4504
4505 void __devinit init_idle(task_t *idle, int cpu)
4506 {
4507         runqueue_t *rq = cpu_rq(cpu);
4508         unsigned long flags;
4509
4510         idle->sleep_avg = 0;
4511         idle->interactive_credit = 0;
4512         idle->array = NULL;
4513         idle->prio = MAX_PRIO;
4514         idle->state = TASK_RUNNING;
4515         set_task_cpu(idle, cpu);
4516
4517 #ifdef CONFIG_CKRM_CPU_SCHEDULE
4518         cpu_demand_event(&(idle->demand_stat),CPU_DEMAND_INIT,0);
4519         idle->cpu_class = get_default_cpu_class();
4520         idle->array = NULL;
4521 #endif
4522
4523         spin_lock_irqsave(&rq->lock, flags);
4524         rq->curr = rq->idle = idle;
4525         set_tsk_need_resched(idle);
4526         spin_unlock_irqrestore(&rq->lock, flags);
4527
4528         /* Set the preempt count _outside_ the spinlocks! */
4529 #ifdef CONFIG_PREEMPT
4530         idle->thread_info->preempt_count = (idle->lock_depth >= 0);
4531 #else
4532         idle->thread_info->preempt_count = 0;
4533 #endif
4534 }
4535
4536 /*
4537  * In a system that switches off the HZ timer nohz_cpu_mask
4538  * indicates which cpus entered this state. This is used
4539  * in the rcu update to wait only for active cpus. For system
4540  * which do not switch off the HZ timer nohz_cpu_mask should
4541  * always be CPU_MASK_NONE.
4542  */
4543 cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4544
4545 #ifdef CONFIG_SMP
4546 /*
4547  * This is how migration works:
4548  *
4549  * 1) we queue a migration_req_t structure in the source CPU's
4550  *    runqueue and wake up that CPU's migration thread.
4551  * 2) we down() the locked semaphore => thread blocks.
4552  * 3) migration thread wakes up (implicitly it forces the migrated
4553  *    thread off the CPU)
4554  * 4) it gets the migration request and checks whether the migrated
4555  *    task is still in the wrong runqueue.
4556  * 5) if it's in the wrong runqueue then the migration thread removes
4557  *    it and puts it into the right queue.
4558  * 6) migration thread up()s the semaphore.
4559  * 7) we wake up and the migration is done.
4560  */
4561
4562 /*
4563  * Change a given task's CPU affinity. Migrate the thread to a
4564  * proper CPU and schedule it away if the CPU it's executing on
4565  * is removed from the allowed bitmask.
4566  *
4567  * NOTE: the caller must have a valid reference to the task, the
4568  * task must not exit() & deallocate itself prematurely.  The
4569  * call is not atomic; no spinlocks may be held.
4570  */
4571 int set_cpus_allowed(task_t *p, cpumask_t new_mask)
4572 {
4573         unsigned long flags;
4574         int ret = 0;
4575         migration_req_t req;
4576         runqueue_t *rq;
4577
4578         rq = task_rq_lock(p, &flags);
4579         if (!cpus_intersects(new_mask, cpu_online_map)) {
4580                 ret = -EINVAL;
4581                 goto out;
4582         }
4583
4584         p->cpus_allowed = new_mask;
4585         /* Can the task run on the task's current CPU? If so, we're done */
4586         if (cpu_isset(task_cpu(p), new_mask))
4587                 goto out;
4588
4589         if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4590                 /* Need help from migration thread: drop lock and wait. */
4591                 task_rq_unlock(rq, &flags);
4592                 wake_up_process(rq->migration_thread);
4593                 wait_for_completion(&req.done);
4594                 tlb_migrate_finish(p->mm);
4595                 return 0;
4596         }
4597 out:
4598         task_rq_unlock(rq, &flags);
4599         return ret;
4600 }
4601
4602 EXPORT_SYMBOL_GPL(set_cpus_allowed);
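
/*
 * In-kernel usage sketch for set_cpus_allowed() above (illustrative
 * only; 'worker' stands for a task pointer obtained elsewhere with a
 * reference held):
 *
 *	cpumask_t mask = cpumask_of_cpu(2);
 *
 *	if (set_cpus_allowed(worker, mask) < 0)
 *		printk(KERN_WARNING "could not move worker to CPU 2\n");
 *
 * If the task is currently running on a CPU outside the new mask, the
 * request is queued and the per-CPU migration thread (migration_thread()
 * below) pushes it over, following the seven steps described above.
 */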
4603
4604 /*
4605  * Move (not current) task off this cpu, onto dest cpu.  We're doing
4606  * this because either it can't run here any more (set_cpus_allowed()
4607  * away from this CPU, or CPU going down), or because we're
4608  * attempting to rebalance this task on exec (sched_exec).
4609  *
4610  * So we race with normal scheduler movements, but that's OK, as long
4611  * as the task is no longer on this CPU.
4612  */
4613 static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4614 {
4615         runqueue_t *rq_dest, *rq_src;
4616
4617         if (unlikely(cpu_is_offline(dest_cpu)))
4618                 return;
4619
4620         rq_src = cpu_rq(src_cpu);
4621         rq_dest = cpu_rq(dest_cpu);
4622
4623         double_rq_lock(rq_src, rq_dest);
4624         /* Already moved. */
4625         if (task_cpu(p) != src_cpu)
4626                 goto out;
4627         /* Affinity changed (again). */
4628         if (!cpu_isset(dest_cpu, p->cpus_allowed))
4629                 goto out;
4630
4631         if (p->array) {
4632                 /*
4633                  * Sync timestamp with rq_dest's before activating.
4634                  * The same thing could be achieved by doing this step
4635                  * afterwards, and pretending it was a local activate.
4636                  * This way is cleaner and logically correct.
4637                  */
4638                 p->timestamp = p->timestamp - rq_src->timestamp_last_tick
4639                                 + rq_dest->timestamp_last_tick;
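                /*
                 * Worked example (illustrative numbers): if p->timestamp was
                 * 1000, rq_src->timestamp_last_tick 1200 and
                 * rq_dest->timestamp_last_tick 1500, the new timestamp is
                 * 1000 - 1200 + 1500 = 1300, i.e. the task stays the same
                 * distance behind the destination's last tick as it was
                 * behind the source's.
                 */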
4640                 deactivate_task(p, rq_src);
4641                 set_task_cpu(p, dest_cpu);
4642                 activate_task(p, rq_dest, 0);
4643                 if (TASK_PREEMPTS_CURR(p, rq_dest))
4644                         resched_task(rq_dest->curr);
4645         } else
4646                 set_task_cpu(p, dest_cpu);
4647
4648 out:
4649         double_rq_unlock(rq_src, rq_dest);
4650 }
4651
4652 /*
4653  * migration_thread - this is a highprio system thread that performs
4654  * thread migration by bumping thread off CPU then 'pushing' onto
4655  * another runqueue.
4656  */
4657 static int migration_thread(void * data)
4658 {
4659         runqueue_t *rq;
4660         int cpu = (long)data;
4661
4662         rq = cpu_rq(cpu);
4663         BUG_ON(rq->migration_thread != current);
4664
4665         set_current_state(TASK_INTERRUPTIBLE);
4666         while (!kthread_should_stop()) {
4667                 struct list_head *head;
4668                 migration_req_t *req;
4669
4670                 if (current->flags & PF_FREEZE)
4671                         refrigerator(PF_FREEZE);
4672
4673                 spin_lock_irq(&rq->lock);
4674
4675                 if (cpu_is_offline(cpu)) {
4676                         spin_unlock_irq(&rq->lock);
4677                         goto wait_to_die;
4678                 }
4679
4680                 if (rq->active_balance) {
4681                         active_load_balance(rq, cpu);
4682                         rq->active_balance = 0;
4683                 }
4684
4685                 head = &rq->migration_queue;
4686
4687                 if (list_empty(head)) {
4688                         spin_unlock_irq(&rq->lock);
4689                         schedule();
4690                         set_current_state(TASK_INTERRUPTIBLE);
4691                         continue;
4692                 }
4693                 req = list_entry(head->next, migration_req_t, list);
4694                 list_del_init(head->next);
4695
4696                 if (req->type == REQ_MOVE_TASK) {
4697                         spin_unlock(&rq->lock);
4698                         __migrate_task(req->task, smp_processor_id(),
4699                                         req->dest_cpu);
4700                         local_irq_enable();
4701                 } else if (req->type == REQ_SET_DOMAIN) {
4702                         rq->sd = req->sd;
4703                         spin_unlock_irq(&rq->lock);
4704                 } else {
4705                         spin_unlock_irq(&rq->lock);
4706                         WARN_ON(1);
4707                 }
4708
4709                 complete(&req->done);
4710         }
4711         __set_current_state(TASK_RUNNING);
4712         return 0;
4713
4714 wait_to_die:
4715         /* Wait for kthread_stop */
4716         set_current_state(TASK_INTERRUPTIBLE);
4717         while (!kthread_should_stop()) {
4718                 schedule();
4719                 set_current_state(TASK_INTERRUPTIBLE);
4720         }
4721         __set_current_state(TASK_RUNNING);
4722         return 0;
4723 }
4724
4725 #ifdef CONFIG_HOTPLUG_CPU
4726 /* Figure out where a task on a dead CPU should go; use force if necessary. */
4727 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
4728 {
4729         int dest_cpu;
4730         cpumask_t mask;
4731
4732         /* On same node? */
4733         mask = node_to_cpumask(cpu_to_node(dead_cpu));
4734         cpus_and(mask, mask, tsk->cpus_allowed);
4735         dest_cpu = any_online_cpu(mask);
4736
4737         /* On any allowed CPU? */
4738         if (dest_cpu == NR_CPUS)
4739                 dest_cpu = any_online_cpu(tsk->cpus_allowed);
4740
4741         /* No more Mr. Nice Guy. */
4742         if (dest_cpu == NR_CPUS) {
4743                 cpus_setall(tsk->cpus_allowed);
4744                 dest_cpu = any_online_cpu(tsk->cpus_allowed);
4745
4746                 /*
4747                  * Don't tell them about moving exiting tasks or
4748                  * kernel threads (both mm NULL), since they never
4749                  * leave kernel.
4750                  */
4751                 if (tsk->mm && printk_ratelimit())
4752                         printk(KERN_INFO "process %d (%s) no "
4753                                "longer affine to cpu%d\n",
4754                                tsk->pid, tsk->comm, dead_cpu);
4755         }
4756         __migrate_task(tsk, dead_cpu, dest_cpu);
4757 }
4758
4759 /* Run through task list and migrate tasks from the dead cpu. */
4760 static void migrate_live_tasks(int src_cpu)
4761 {
4762         struct task_struct *tsk, *t;
4763
4764         write_lock_irq(&tasklist_lock);
4765
4766         do_each_thread(t, tsk) {
4767                 if (tsk == current)
4768                         continue;
4769
4770                 if (task_cpu(tsk) == src_cpu)
4771                         move_task_off_dead_cpu(src_cpu, tsk);
4772         } while_each_thread(t, tsk);
4773
4774         write_unlock_irq(&tasklist_lock);
4775 }
4776
4777 /* Schedules the idle task to be the next runnable task on the current CPU.
4778  * It does so by boosting its priority to the highest possible and adding it
4779  * to the _front_ of the runqueue. Used by the CPU offline code.
4780  */
4781 void sched_idle_next(void)
4782 {
4783         int cpu = smp_processor_id();
4784         runqueue_t *rq = this_rq();
4785         struct task_struct *p = rq->idle;
4786         unsigned long flags;
4787
4788         /* cpu has to be offline */
4789         BUG_ON(cpu_online(cpu));
4790
4791         /* Strictly not necessary, since the rest of the CPUs are stopped by now
4792          * and interrupts are disabled on the current cpu.
4793          */
4794         spin_lock_irqsave(&rq->lock, flags);
4795
4796         __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
4797         /* Add the idle task to the _front_ of its priority queue */
4798         __activate_idle_task(p, rq);
4799
4800         spin_unlock_irqrestore(&rq->lock, flags);
4801 }
4802
4803 static void migrate_dead(unsigned int dead_cpu, task_t *tsk)
4804 {
4805         struct runqueue *rq = cpu_rq(dead_cpu);
4806
4807         /* Must be exiting, otherwise would be on tasklist. */
4808         BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD);
4809
4810         /* Cannot have done final schedule yet: would have vanished. */
4811         BUG_ON(tsk->flags & PF_DEAD);
4812
4813         get_task_struct(tsk);
4814
4815         /*
4816          * Drop lock around migration; if someone else moves it,
4817          * that's OK.  No task can be added to this CPU, so iteration is
4818          * fine.
4819          */
4820         spin_unlock_irq(&rq->lock);
4821         move_task_off_dead_cpu(dead_cpu, tsk);
4822         spin_lock_irq(&rq->lock);
4823
4824         put_task_struct(tsk);
4825 }
4826
4827 /* release_task() removes task from tasklist, so we won't find dead tasks. */
4828 static void migrate_dead_tasks(unsigned int dead_cpu)
4829 {
4830         unsigned arr, i;
4831         struct runqueue *rq = cpu_rq(dead_cpu);
4832
4833         for (arr = 0; arr < 2; arr++) {
4834                 for (i = 0; i < MAX_PRIO; i++) {
4835                         struct list_head *list = &rq->arrays[arr].queue[i];
4836                         while (!list_empty(list))
4837                                 migrate_dead(dead_cpu,
4838                                              list_entry(list->next, task_t,
4839                                                         run_list));
4840                 }
4841         }
4842 }
4843 #endif /* CONFIG_HOTPLUG_CPU */
4844
4845 /*
4846  * migration_call - callback that gets triggered on CPU hotplug events.
4847  * Here we can start up the necessary migration thread for a new CPU.
4848  */
4849 static int migration_call(struct notifier_block *nfb, unsigned long action,
4850                           void *hcpu)
4851 {
4852         int cpu = (long)hcpu;
4853         struct task_struct *p;
4854         struct runqueue *rq;
4855         unsigned long flags;
4856
4857         switch (action) {
4858         case CPU_UP_PREPARE:
4859                 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
4860                 if (IS_ERR(p))
4861                         return NOTIFY_BAD;
4862                 p->flags |= PF_NOFREEZE;
4863                 kthread_bind(p, cpu);
4864                 /* Must be high prio: stop_machine expects to yield to it. */
4865                 rq = task_rq_lock(p, &flags);
4866                 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
4867                 task_rq_unlock(rq, &flags);
4868                 cpu_rq(cpu)->migration_thread = p;
4869                 break;
4870         case CPU_ONLINE:
4871                 /* Strictly unnecessary, as the first user will wake it. */
4872                 wake_up_process(cpu_rq(cpu)->migration_thread);
4873                 break;
4874 #ifdef CONFIG_HOTPLUG_CPU
4875         case CPU_UP_CANCELED:
4876                 /* Unbind it from the offline cpu so kthread_stop() can run it. */
4877                 kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
4878                 kthread_stop(cpu_rq(cpu)->migration_thread);
4879                 cpu_rq(cpu)->migration_thread = NULL;
4880                 break;
4881         case CPU_DEAD:
4882                 migrate_live_tasks(cpu);
4883                 rq = cpu_rq(cpu);
4884                 kthread_stop(rq->migration_thread);
4885                 rq->migration_thread = NULL;
4886                 /* Idle task back to normal (off runqueue, low prio) */
4887                 rq = task_rq_lock(rq->idle, &flags);
4888                 deactivate_task(rq->idle, rq);
4889                 rq->idle->static_prio = MAX_PRIO;
4890                 __setscheduler(rq->idle, SCHED_NORMAL, 0);
4891                 migrate_dead_tasks(cpu);
4892                 task_rq_unlock(rq, &flags);
4893                 BUG_ON(rq->nr_running != 0);
4894
4895                 /* No need to migrate the tasks: it was best-effort if
4896                  * they didn't do lock_cpu_hotplug().  Just wake up
4897                  * the requestors. */
4898                 spin_lock_irq(&rq->lock);
4899                 while (!list_empty(&rq->migration_queue)) {
4900                         migration_req_t *req;
4901                         req = list_entry(rq->migration_queue.next,
4902                                          migration_req_t, list);
4903                         BUG_ON(req->type != REQ_MOVE_TASK);
4904                         list_del_init(&req->list);
4905                         complete(&req->done);
4906                 }
4907                 spin_unlock_irq(&rq->lock);
4908                 break;
4909 #endif
4910         }
4911         return NOTIFY_OK;
4912 }
4913
4914 /* Register at high priority so that the CPU_DEAD task migration above
4915  * happens before other CPU hotplug notifiers run.
4916  */
4917 static struct notifier_block __devinitdata migration_notifier = {
4918         .notifier_call = migration_call,
4919         .priority = 10
4920 };
4921
4922 int __init migration_init(void)
4923 {
4924         void *cpu = (void *)(long)smp_processor_id();
4925         /* Start one for boot CPU. */
4926         migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
4927         migration_call(&migration_notifier, CPU_ONLINE, cpu);
4928         register_cpu_notifier(&migration_notifier);
4929         return 0;
4930 }
4931 #endif
4932
4933 /*
4934  * The 'big kernel lock'
4935  *
4936  * This spinlock is taken and released recursively by lock_kernel()
4937  * and unlock_kernel().  It is transparently dropped and reacquired
4938  * over schedule().  It is used to protect legacy code that hasn't
4939  * been migrated to a proper locking design yet.
4940  *
4941  * Don't use in new code.
4942  *
4943  * Note: spinlock debugging needs this even on !CONFIG_SMP.
4944  */
4945 spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
4946 EXPORT_SYMBOL(kernel_flag);
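/*
 * Illustrative sketch only (not compiled): the BKL nests, so a function
 * holding it may call another function that takes it again.  The names
 * legacy_outer()/legacy_inner() are hypothetical.
 */
#if 0
static void legacy_inner(void)
{
        lock_kernel();          /* nested: only bumps current->lock_depth */
        /* ... touch BKL-protected legacy state ... */
        unlock_kernel();
}

static void legacy_outer(void)
{
        lock_kernel();          /* outermost call actually takes kernel_flag */
        legacy_inner();
        unlock_kernel();        /* outermost call actually releases it */
}
#endif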
4947
4948 #ifdef CONFIG_SMP
4949 /*
4950  * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
4951  * hold the hotplug lock.
4952  */
4953 static void cpu_attach_domain(struct sched_domain *sd, int cpu)
4954 {
4955         migration_req_t req;
4956         unsigned long flags;
4957         runqueue_t *rq = cpu_rq(cpu);
4958         int local = 1;
4959
4960         spin_lock_irqsave(&rq->lock, flags);
4961
4962         if (cpu == smp_processor_id() || !cpu_online(cpu)) {
4963                 rq->sd = sd;
4964         } else {
4965                 init_completion(&req.done);
4966                 req.type = REQ_SET_DOMAIN;
4967                 req.sd = sd;
4968                 list_add(&req.list, &rq->migration_queue);
4969                 local = 0;
4970         }
4971
4972         spin_unlock_irqrestore(&rq->lock, flags);
4973
4974         if (!local) {
4975                 wake_up_process(rq->migration_thread);
4976                 wait_for_completion(&req.done);
4977         }
4978 }
4979
4980 /*
4981  * To enable disjoint top-level NUMA domains, define SD_NODES_PER_DOMAIN
4982  * in arch code. That defines the number of nearby nodes in a node's top
4983  * level scheduling domain.
4984  */
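/*
 * For example (illustrative value only), an architecture could put
 * something like
 *
 *	#define SD_NODES_PER_DOMAIN 4
 *
 * in its topology header, so that each node's top-level domain spans
 * itself plus its three nearest neighbours.
 */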
4985 #ifdef CONFIG_NUMA
4986 #ifdef SD_NODES_PER_DOMAIN
4987 /**
4988  * find_next_best_node - find the next node to include in a sched_domain
4989  * @node: node whose sched_domain we're building
4990  * @used_nodes: nodes already in the sched_domain
4991  *
4992  * Find the next node to include in a given scheduling domain.  Simply
4993  * finds the closest node not already in the @used_nodes map.
4994  *
4995  * Should use nodemask_t.
4996  */
4997 static int __devinit find_next_best_node(int node, unsigned long *used_nodes)
4998 {
4999         int i, n, val, min_val, best_node = 0;
5000
5001         min_val = INT_MAX;
5002
5003         for (i = 0; i < numnodes; i++) {
5004                 /* Start at @node */
5005                 n = (node + i) % numnodes;
5006
5007                 /* Skip already used nodes */
5008                 if (test_bit(n, used_nodes))
5009                         continue;
5010
5011                 /* Simple min distance search */
5012                 val = node_distance(node, n);
5013
5014                 if (val < min_val) {
5015                         min_val = val;
5016                         best_node = n;
5017                 }
5018         }
5019
5020         set_bit(best_node, used_nodes);
5021         return best_node;
5022 }
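/*
 * Worked example (illustrative distances): on a 4-node machine with
 * node_distance(0, {0,1,2,3}) = {10, 20, 30, 20}, successive calls with
 * @node = 0 pick node 0 first, then node 1, then node 3 (the 20/20 tie is
 * broken by scan order), marking each in @used_nodes as they go.
 */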
5023
5024 /**
5025  * sched_domain_node_span - get a cpumask for a node's sched_domain
5026  * @node: node whose cpumask we're constructing
5027  * The span covers the SD_NODES_PER_DOMAIN closest nodes, @node included.
5028  *
5029  * Given a node, construct a good cpumask for its sched_domain to span.  It
5030  * should be one that prevents unnecessary balancing, but also spreads tasks
5031  * out optimally.
5032  */
5033 static cpumask_t __devinit sched_domain_node_span(int node)
5034 {
5035         int i;
5036         cpumask_t span;
5037         DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
5038
5039         cpus_clear(span);
5040         bitmap_zero(used_nodes, MAX_NUMNODES);
5041
5042         for (i = 0; i < SD_NODES_PER_DOMAIN; i++) {
5043                 int next_node = find_next_best_node(node, used_nodes);
5044                 cpumask_t  nodemask;
5045
5046                 nodemask = node_to_cpumask(next_node);
5047                 cpus_or(span, span, nodemask);
5048         }
5049
5050         return span;
5051 }
5052 #else /* SD_NODES_PER_DOMAIN */
5053 static cpumask_t __devinit sched_domain_node_span(int node)
5054 {
5055         return cpu_possible_map;
5056 }
5057 #endif /* SD_NODES_PER_DOMAIN */
5058 #endif /* CONFIG_NUMA */
5059
5060 #ifdef CONFIG_SCHED_SMT
5061 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
5062 static struct sched_group sched_group_cpus[NR_CPUS];
5063 static int __devinit cpu_to_cpu_group(int cpu)
5064 {
5065         return cpu;
5066 }
5067 #endif
5068
5069 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5070 static struct sched_group sched_group_phys[NR_CPUS];
5071 static int __devinit cpu_to_phys_group(int cpu)
5072 {
5073 #ifdef CONFIG_SCHED_SMT
5074         return first_cpu(cpu_sibling_map[cpu]);
5075 #else
5076         return cpu;
5077 #endif
5078 }
5079
5080 #ifdef CONFIG_NUMA
5081
5082 static DEFINE_PER_CPU(struct sched_domain, node_domains);
5083 static struct sched_group sched_group_nodes[MAX_NUMNODES];
5084 static int __devinit cpu_to_node_group(int cpu)
5085 {
5086         return cpu_to_node(cpu);
5087 }
5088 #endif
5089
5090 /* Groups for isolated scheduling domains */
5091 static struct sched_group sched_group_isolated[NR_CPUS];
5092
5093 /* cpus with isolated domains */
5094 cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
5095
5096 static int __devinit cpu_to_isolated_group(int cpu)
5097 {
5098         return cpu;
5099 }
5100
5101 /* Setup the mask of cpus configured for isolated domains */
5102 static int __init isolated_cpu_setup(char *str)
5103 {
5104         int ints[NR_CPUS], i;
5105
5106         str = get_options(str, ARRAY_SIZE(ints), ints);
5107         cpus_clear(cpu_isolated_map);
5108         for (i = 1; i <= ints[0]; i++)
5109                 cpu_set(ints[i], cpu_isolated_map);
5110         return 1;
5111 }
5112
5113 __setup ("isolcpus=", isolated_cpu_setup);
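/*
 * Example (illustrative): booting with "isolcpus=2,3" removes CPUs 2 and 3
 * from the default scheduling domains.  Each gets a single-CPU isolated
 * domain below, so the load balancer leaves them alone and they generally
 * only run tasks explicitly bound to them (e.g. via sched_setaffinity()).
 */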
5114
5115 /*
5116  * init_sched_build_groups takes an array of groups, the cpumask we wish
5117  * to span, and a pointer to a function which identifies what group a CPU
5118  * belongs to. The return value of group_fn must be a valid index into the
5119  * groups[] array, and must be >= 0 and < NR_CPUS (because covered groups
5120  * are tracked with a cpumask_t).
5121  *
5122  * init_sched_build_groups will build a circular linked list of the groups
5123  * covered by the given span, and will set each group's ->cpumask correctly,
5124  * and ->cpu_power to 0.
5125  */
5126 static void __devinit init_sched_build_groups(struct sched_group groups[],
5127                         cpumask_t span, int (*group_fn)(int cpu))
5128 {
5129         struct sched_group *first = NULL, *last = NULL;
5130         cpumask_t covered = CPU_MASK_NONE;
5131         int i;
5132
5133         for_each_cpu_mask(i, span) {
5134                 int group = group_fn(i);
5135                 struct sched_group *sg = &groups[group];
5136                 int j;
5137
5138                 if (cpu_isset(i, covered))
5139                         continue;
5140
5141                 sg->cpumask = CPU_MASK_NONE;
5142                 sg->cpu_power = 0;
5143
5144                 for_each_cpu_mask(j, span) {
5145                         if (group_fn(j) != group)
5146                                 continue;
5147
5148                         cpu_set(j, covered);
5149                         cpu_set(j, sg->cpumask);
5150                 }
5151                 if (!first)
5152                         first = sg;
5153                 if (last)
5154                         last->next = sg;
5155                 last = sg;
5156         }
5157         last->next = first;
5158 }
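/*
 * Worked example (illustrative topology): with span = {0,1,2,3}, group_fn =
 * cpu_to_phys_group and two 2-thread packages {0,1} and {2,3}, the loop
 * above fills groups[0].cpumask = {0,1} and groups[2].cpumask = {2,3},
 * zeroes their cpu_power, and links them into the circular list
 * groups[0] -> groups[2] -> groups[0].
 */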
5159
5160 /*
5161  * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
5162  */
5163 static void __devinit arch_init_sched_domains(void)
5164 {
5165         int i;
5166         cpumask_t cpu_default_map;
5167         cpumask_t cpu_isolated_online_map;
5168
5169         cpus_and(cpu_isolated_online_map, cpu_isolated_map, cpu_online_map);
5170
5171         /*
5172          * Setup mask for cpus without special case scheduling requirements.
5173          * For now this just excludes isolated cpus, but could be used to
5174          * exclude other special cases in the future.
5175          */
5176         cpus_complement(cpu_default_map, cpu_isolated_map);
5177         cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
5178
5179         /* Set up domains */
5180         for_each_online_cpu(i) {
5181                 int group;
5182                 struct sched_domain *sd = NULL, *p;
5183                 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
5184
5185                 cpus_and(nodemask, nodemask, cpu_default_map);
5186
5187                 /*
5188                  * Set up isolated domains.
5189                  * Unlike those of other cpus, the domains and groups are
5190                  * single level, and span a single cpu.
5191                  */
5192                 if (cpu_isset(i, cpu_isolated_online_map)) {
5193 #ifdef CONFIG_SCHED_SMT
5194                         sd = &per_cpu(cpu_domains, i);
5195 #else
5196                         sd = &per_cpu(phys_domains, i);
5197 #endif
5198                         group = cpu_to_isolated_group(i);
5199                         *sd = SD_CPU_INIT;
5200                         cpu_set(i, sd->span);
5201                         sd->balance_interval = INT_MAX; /* Don't balance */
5202                         sd->flags = 0;                  /* Avoid WAKE_ */
5203                         sd->groups = &sched_group_isolated[group];
5204                         printk(KERN_INFO "Setting up cpu %d isolated.\n", i);
5205                         /* Single level, so continue with next cpu */
5206                         continue;
5207                 }
5208
5209 #ifdef CONFIG_NUMA
5210                 sd = &per_cpu(node_domains, i);
5211                 group = cpu_to_node_group(i);
5212                 *sd = SD_NODE_INIT;
5213                 /* FIXME: should be multilevel, in arch code */
5214                 sd->span = sched_domain_node_span(i);
5215                 cpus_and(sd->span, sd->span, cpu_default_map);
5216                 sd->groups = &sched_group_nodes[group];
5217 #endif
5218
5219                 p = sd;
5220                 sd = &per_cpu(phys_domains, i);
5221                 group = cpu_to_phys_group(i);
5222                 *sd = SD_CPU_INIT;
5223                 sd->span = nodemask;
5224                 sd->parent = p;
5225                 sd->groups = &sched_group_phys[group];
5226
5227 #ifdef CONFIG_SCHED_SMT
5228                 p = sd;
5229                 sd = &per_cpu(cpu_domains, i);
5230                 group = cpu_to_cpu_group(i);
5231                 *sd = SD_SIBLING_INIT;
5232                 sd->span = cpu_sibling_map[i];
5233                 cpus_and(sd->span, sd->span, cpu_default_map);
5234                 sd->parent = p;
5235                 sd->groups = &sched_group_cpus[group];
5236 #endif
5237         }
5238
5239 #ifdef CONFIG_SCHED_SMT
5240         /* Set up CPU (sibling) groups */
5241         for_each_online_cpu(i) {
5242                 cpumask_t this_sibling_map = cpu_sibling_map[i];
5243                 cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
5244                 if (i != first_cpu(this_sibling_map))
5245                         continue;
5246
5247                 init_sched_build_groups(sched_group_cpus, this_sibling_map,
5248                                                 &cpu_to_cpu_group);
5249         }
5250 #endif
5251
5252         /* Set up isolated groups */
5253         for_each_cpu_mask(i, cpu_isolated_online_map) {
5254                 cpumask_t mask = cpumask_of_cpu(i);
5255                 init_sched_build_groups(sched_group_isolated, mask,
5256                                                 &cpu_to_isolated_group);
5257         }
5258
5259         /* Set up physical groups */
5260         for (i = 0; i < MAX_NUMNODES; i++) {
5261                 cpumask_t nodemask = node_to_cpumask(i);
5262
5263                 cpus_and(nodemask, nodemask, cpu_default_map);
5264                 if (cpus_empty(nodemask))
5265                         continue;
5266
5267                 init_sched_build_groups(sched_group_phys, nodemask,
5268                                                 &cpu_to_phys_group);
5269         }
5270
5271
5272 #ifdef CONFIG_NUMA
5273         /* Set up node groups */
5274         init_sched_build_groups(sched_group_nodes, cpu_default_map,
5275                                         &cpu_to_node_group);
5276 #endif
5277
5278
5279         /* Calculate CPU power for physical packages and nodes */
5280         for_each_cpu_mask(i, cpu_default_map) {
5281                 int power;
5282                 struct sched_domain *sd;
5283 #ifdef CONFIG_SCHED_SMT
5284                 sd = &per_cpu(cpu_domains, i);
5285                 power = SCHED_LOAD_SCALE;
5286                 sd->groups->cpu_power = power;
5287 #endif
5288
5289                 sd = &per_cpu(phys_domains, i);
5290                 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5291                                 (cpus_weight(sd->groups->cpumask)-1) / 10;
5292                 sd->groups->cpu_power = power;
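                /*
                 * Worked example: with SCHED_LOAD_SCALE == 128 and a group
                 * spanning a 2-CPU package, power = 128 + 128 * (2-1) / 10
                 * = 140, so a multi-CPU package is weighted slightly higher
                 * than a single CPU but far less than two CPUs.
                 */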
5293
5294
5295 #ifdef CONFIG_NUMA
5296                 if (i == first_cpu(sd->groups->cpumask)) {
5297                         /* Only add "power" once for each physical package. */
5298                         sd = &per_cpu(node_domains, i);
5299                         sd->groups->cpu_power += power;
5300                 }
5301 #endif
5302         }
5303
5304         /* Attach the domains */
5305         for_each_online_cpu(i) {
5306                 struct sched_domain *sd;
5307 #ifdef CONFIG_SCHED_SMT
5308                 sd = &per_cpu(cpu_domains, i);
5309 #else
5310                 sd = &per_cpu(phys_domains, i);
5311 #endif
5312                 cpu_attach_domain(sd, i);
5313         }
5315 }
5316
5317 #ifdef CONFIG_HOTPLUG_CPU
5318 static void __devinit arch_destroy_sched_domains(void)
5319 {
5320         /* Do nothing: everything is statically allocated. */
5321 }
5322 #endif
5323
5324 #undef SCHED_DOMAIN_DEBUG
5325 #ifdef SCHED_DOMAIN_DEBUG
5326 void sched_domain_debug(void)
5327 {
5328         int i;
5329
5330         for_each_online_cpu(i) {
5331                 runqueue_t *rq = cpu_rq(i);
5332                 struct sched_domain *sd;
5333                 int level = 0;
5334
5335                 sd = rq->sd;
5336
5337                 printk(KERN_DEBUG "CPU%d:\n", i);
5338
5339                 do {
5340                         int j;
5341                         char str[NR_CPUS];
5342                         struct sched_group *group = sd->groups;
5343                         cpumask_t groupmask;
5344
5345                         cpumask_scnprintf(str, NR_CPUS, sd->span);
5346                         cpus_clear(groupmask);
5347
5348                         printk(KERN_DEBUG);
5349                         for (j = 0; j < level + 1; j++)
5350                                 printk(" ");
5351                         printk("domain %d: span %s\n", level, str);
5352
5353                         if (!cpu_isset(i, sd->span))
5354                                 printk(KERN_DEBUG "ERROR domain->span does not contain CPU%d\n", i);
5355                         if (!cpu_isset(i, group->cpumask))
5356                                 printk(KERN_DEBUG "ERROR domain->groups does not contain CPU%d\n", i);
5357                         if (!group->cpu_power)
5358                                 printk(KERN_DEBUG "ERROR domain->cpu_power not set\n");
5359
5360                         printk(KERN_DEBUG);
5361                         for (j = 0; j < level + 2; j++)
5362                                 printk(" ");
5363                         printk("groups:");
5364                         do {
5365                                 if (!group) {
5366                                         printk(" ERROR: NULL");
5367                                         break;
5368                                 }
5369
5370                                 if (!cpus_weight(group->cpumask))
5371                                         printk(" ERROR empty group:");
5372
5373                                 if (cpus_intersects(groupmask, group->cpumask))
5374                                         printk(" ERROR repeated CPUs:");
5375
5376                                 cpus_or(groupmask, groupmask, group->cpumask);
5377
5378                                 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5379                                 printk(" %s", str);
5380
5381                                 group = group->next;
5382                         } while (group != sd->groups);
5383                         printk("\n");
5384
5385                         if (!cpus_equal(sd->span, groupmask))
5386                                 printk(KERN_DEBUG "ERROR groups don't span domain->span\n");
5387
5388                         level++;
5389                         sd = sd->parent;
5390
5391                         if (sd) {
5392                                 if (!cpus_subset(groupmask, sd->span))
5393                                         printk(KERN_DEBUG "ERROR parent span is not a superset of domain->span\n");
5394                         }
5395
5396                 } while (sd);
5397         }
5398 }
5399 #else
5400 #define sched_domain_debug() do { } while (0)
5401 #endif
5402
5403 #ifdef CONFIG_SMP
5404 /* Initial dummy domain for early boot and for hotplug cpu */
5405 static __devinitdata struct sched_domain sched_domain_dummy;
5406 static __devinitdata struct sched_group sched_group_dummy;
5407 #endif
5408
5409 #ifdef CONFIG_HOTPLUG_CPU
5410 /*
5411  * Force a reinitialization of the sched domains hierarchy.  The domains
5412  * and groups cannot be updated in place without racing with the balancing
5413  * code, so we temporarily attach all running cpus to a "dummy" domain
5414  * which will prevent rebalancing while the sched domains are recalculated.
5415  */
5416 static int update_sched_domains(struct notifier_block *nfb,
5417                                 unsigned long action, void *hcpu)
5418 {
5419         int i;
5420
5421         switch (action) {
5422         case CPU_UP_PREPARE:
5423         case CPU_DOWN_PREPARE:
5424                 for_each_online_cpu(i)
5425                         cpu_attach_domain(&sched_domain_dummy, i);
5426                 arch_destroy_sched_domains();
5427                 return NOTIFY_OK;
5428
5429         case CPU_UP_CANCELED:
5430         case CPU_DOWN_FAILED:
5431         case CPU_ONLINE:
5432         case CPU_DEAD:
5433                 /*
5434                  * Fall through and re-initialise the domains.
5435                  */
5436                 break;
5437         default:
5438                 return NOTIFY_DONE;
5439         }
5440
5441         /* The hotplug lock is already held by cpu_up/cpu_down */
5442         arch_init_sched_domains();
5443
5444         sched_domain_debug();
5445
5446         return NOTIFY_OK;
5447 }
5448 #endif
5449
5450 void __init sched_init_smp(void)
5451 {
5452         lock_cpu_hotplug();
5453         arch_init_sched_domains();
5454         sched_domain_debug();
5455         unlock_cpu_hotplug();
5456         /* XXX: Theoretical race here - CPU may be hotplugged now */
5457         hotcpu_notifier(update_sched_domains, 0);
5458 }
5459 #else
5460 void __init sched_init_smp(void)
5461 {
5462 }
5463 #endif /* CONFIG_SMP */
5464
5465 int in_sched_functions(unsigned long addr)
5466 {
5467         /* Linker adds these: start and end of __sched functions */
5468         extern char __sched_text_start[], __sched_text_end[];
5469         return in_lock_functions(addr) ||
5470                 (addr >= (unsigned long)__sched_text_start
5471                 && addr < (unsigned long)__sched_text_end);
5472 }
5473
5474 void __init sched_init(void)
5475 {
5476         runqueue_t *rq;
5477         int i;
5478
5479 #ifdef CONFIG_SMP
5480         /* Set up an initial dummy domain for early boot */
5481
5482         memset(&sched_domain_dummy, 0, sizeof(struct sched_domain));
5483         sched_domain_dummy.span = CPU_MASK_ALL;
5484         sched_domain_dummy.groups = &sched_group_dummy;
5485         sched_domain_dummy.last_balance = jiffies;
5486         sched_domain_dummy.balance_interval = INT_MAX; /* Don't balance */
5487         sched_domain_dummy.busy_factor = 1;
5488
5489         memset(&sched_group_dummy, 0, sizeof(struct sched_group));
5490         sched_group_dummy.cpumask = CPU_MASK_ALL;
5491         sched_group_dummy.next = &sched_group_dummy;
5492         sched_group_dummy.cpu_power = SCHED_LOAD_SCALE;
5493 #endif
5494
5495         init_cpu_classes();
5496
5497         for (i = 0; i < NR_CPUS; i++) {
5498 #ifndef CONFIG_CKRM_CPU_SCHEDULE
5499                 int j, k;
5500                 prio_array_t *array;
5501
5502                 rq = cpu_rq(i);
5503                 spin_lock_init(&rq->lock);
5504
5505                 for (j = 0; j < 2; j++) {
5506                         array = rq->arrays + j;
5507                         for (k = 0; k < MAX_PRIO; k++) {
5508                                 INIT_LIST_HEAD(array->queue + k);
5509                                 __clear_bit(k, array->bitmap);
5510                         }
5511                         /* delimiter for bitsearch */
5512                         __set_bit(MAX_PRIO, array->bitmap);
5513                 }
5514
5515                 rq->active = rq->arrays;
5516                 rq->expired = rq->arrays + 1;
5517                 rq->best_expired_prio = MAX_PRIO;
5518
5519 #else
5520                 rq = cpu_rq(i);
5521                 spin_lock_init(&rq->lock);
5522 #endif
5523
5524 #ifdef CONFIG_SMP
5525                 rq->sd = &sched_domain_dummy;
5526                 rq->cpu_load = 0;
5527 #ifdef CONFIG_CKRM_CPU_SCHEDULE
5528                 ckrm_load_init(rq_ckrm_load(rq));
5529 #endif
5530                 rq->active_balance = 0;
5531                 rq->push_cpu = 0;
5532                 rq->migration_thread = NULL;
5533                 INIT_LIST_HEAD(&rq->migration_queue);
5534 #endif
5535 #ifdef CONFIG_VSERVER_HARDCPU
5536                 INIT_LIST_HEAD(&rq->hold_queue);
5537 #endif
5538                 atomic_set(&rq->nr_iowait, 0);
5539
5540         }
5541
5542         /*
5543          * The boot idle thread does lazy MMU switching as well:
5544          */
5545         atomic_inc(&init_mm.mm_count);
5546         enter_lazy_tlb(&init_mm, current);
5547
5548         /*
5549          * Make us the idle thread. Technically, schedule() should not be
5550          * called from this thread; however, somewhere below it might be.
5551          * Because we are the idle thread, we just pick up running again
5552          * when this runqueue becomes "idle".
5553          */
5554         init_idle(current, smp_processor_id());
5555 }
5556
5557 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
5558 void __might_sleep(char *file, int line, int atomic_depth)
5559 {
5560 #if defined(in_atomic)
5561         static unsigned long prev_jiffy;        /* ratelimiting */
5562
5563 #ifndef CONFIG_PREEMPT
5564         atomic_depth = 0;
5565 #endif
5566         if (((in_atomic() != atomic_depth) || irqs_disabled()) &&
5567             system_state == SYSTEM_RUNNING) {
5568                 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
5569                         return;
5570                 prev_jiffy = jiffies;
5571                 printk(KERN_ERR "Debug: sleeping function called from invalid"
5572                                 " context at %s:%d\n", file, line);
5573                 printk("in_atomic():%d[expected: %d], irqs_disabled():%d\n",
5574                         in_atomic(), atomic_depth, irqs_disabled());
5575                 dump_stack();
5576         }
5577 #endif
5578 }
5579 EXPORT_SYMBOL(__might_sleep);
5580 #endif
5581
5582 #ifdef CONFIG_CKRM_CPU_SCHEDULE
5583 /**
5584  * Return the classqueue object of a given processor.
5585  */
5586 struct classqueue_struct * get_cpu_classqueue(int cpu)
5587 {
5588         return &cpu_rq(cpu)->classqueue;
5589 }
5590
5591 /**
5592  * _ckrm_cpu_change_class - change the class of a task
5593  */
5594 void _ckrm_cpu_change_class(task_t *tsk, struct ckrm_cpu_class *newcls)
5595 {
5596         prio_array_t *array;
5597         struct runqueue *rq;
5598         unsigned long flags;
5599
5600         rq = task_rq_lock(tsk, &flags);
5601         array = tsk->array;
5602         if (array) {
5603                 dequeue_task(tsk, array);
5604                 tsk->cpu_class = newcls;
5605                 enqueue_task(tsk, rq_active(tsk, rq));
5606         } else
5607                 tsk->cpu_class = newcls;
5608
5609         task_rq_unlock(rq, &flags);
5610 }
5611 #endif