1 /*
2  *  kernel/sched.c
3  *
4  *  Kernel scheduler and related syscalls
5  *
6  *  Copyright (C) 1991-2002  Linus Torvalds
7  *
8  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
9  *              make semaphores SMP safe
10  *  1998-11-19  Implemented schedule_timeout() and related stuff
11  *              by Andrea Arcangeli
12  *  2002-01-04  New ultra-scalable O(1) scheduler by Ingo Molnar:
13  *              hybrid priority-list and round-robin design with
14  *              an array-switch method of distributing timeslices
15  *              and per-CPU runqueues.  Cleanups and useful suggestions
16  *              by Davide Libenzi, preemptible kernel bits by Robert Love.
17  *  2003-09-03  Interactivity tuning by Con Kolivas.
18  *  2004-04-02  Scheduler domains code by Nick Piggin
19  */
20 #include <linux/mm.h>
21 #include <linux/module.h>
22 #include <linux/nmi.h>
23 #include <linux/init.h>
24 #include <asm/uaccess.h>
25 #include <linux/highmem.h>
26 #include <linux/smp_lock.h>
27 #include <linux/pagemap.h>
28 #include <asm/mmu_context.h>
29 #include <linux/interrupt.h>
30 #include <linux/completion.h>
31 #include <linux/kernel_stat.h>
32 #include <linux/security.h>
33 #include <linux/notifier.h>
34 #include <linux/suspend.h>
35 #include <linux/blkdev.h>
36 #include <linux/delay.h>
37 #include <linux/smp.h>
38 #include <linux/timer.h>
39 #include <linux/rcupdate.h>
40 #include <linux/cpu.h>
41 #include <linux/percpu.h>
42 #include <linux/kthread.h>
43 #include <linux/vserver/sched.h>
44 #include <linux/vs_base.h>
45 #include <asm/tlb.h>
46
47 #include <asm/unistd.h>
48 #include <linux/ckrm_classqueue.h>
49 #include <linux/ckrm_sched.h>
50
51 #ifdef CONFIG_NUMA
52 #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
53 #else
54 #define cpu_to_node_mask(cpu) (cpu_online_map)
55 #endif
56
57 /* used to soft spin in sched while dump is in progress */
58 unsigned long dump_oncpu;
59 EXPORT_SYMBOL(dump_oncpu);
60
61 /*
62  * Convert user-nice values [ -20 ... 0 ... 19 ]
63  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
64  * and back.
65  */
66 #define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
67 #define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)
68 #define TASK_NICE(p)            PRIO_TO_NICE((p)->static_prio)
69
70 /*
71  * 'User priority' is the nice value converted to something we
72  * can work with better when scaling various scheduler parameters,
73  * it's a [ 0 ... 39 ] range.
74  */
75 #define USER_PRIO(p)            ((p)-MAX_RT_PRIO)
76 #define TASK_USER_PRIO(p)       USER_PRIO((p)->static_prio)
77 #define MAX_USER_PRIO           (USER_PRIO(MAX_PRIO))
78 #define AVG_TIMESLICE   (MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) *\
79                         (MAX_PRIO-1-NICE_TO_PRIO(0))/(MAX_USER_PRIO - 1)))
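/*
 * Worked example of the conversions above (assuming the stock 2.6 values
 * MAX_RT_PRIO == 100 and MAX_PRIO == 140, so MAX_USER_PRIO == 40):
 *
 *      NICE_TO_PRIO(0)   == 120   (the default static priority)
 *      NICE_TO_PRIO(-20) == 100,  NICE_TO_PRIO(19) == 139
 *      PRIO_TO_NICE(NICE_TO_PRIO(n)) == n  across the whole nice range
 *      USER_PRIO(120)    == 20
 *
 * With MIN_TIMESLICE/MAX_TIMESLICE from below (10ms/200ms), AVG_TIMESLICE
 * works out to MIN + (MAX - MIN) * 19/39, i.e. roughly 100ms (the nice-0
 * slice).
 */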
80
81 /*
82  * Some helpers for converting nanosecond timing to jiffy resolution
83  */
84 #define NS_TO_JIFFIES(TIME)     ((TIME) / (1000000000 / HZ))
85 #define JIFFIES_TO_NS(TIME)     ((TIME) * (1000000000 / HZ))
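/*
 * For instance, with HZ == 1000 (one possible config value) a jiffy is
 * 1,000,000 ns, so NS_TO_JIFFIES(2500000) == 2 (it truncates) and
 * JIFFIES_TO_NS(5) == 5,000,000.  These helpers are only used for the
 * coarse sleep-average bookkeeping below.
 */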
86
87 /*
88  * These are the 'tuning knobs' of the scheduler:
89  *
90  * Minimum timeslice is 10 msecs, default timeslice is 100 msecs,
91  * maximum timeslice is 200 msecs. Timeslices get refilled after
92  * they expire.
93  */
94 #define MIN_TIMESLICE           ( 10 * HZ / 1000)
95 #define MAX_TIMESLICE           (200 * HZ / 1000)
96 #define ON_RUNQUEUE_WEIGHT       30
97 #define CHILD_PENALTY            95
98 #define PARENT_PENALTY          100
99 #define EXIT_WEIGHT               3
100 #define PRIO_BONUS_RATIO         25
101 #define MAX_BONUS               (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
102 #define INTERACTIVE_DELTA         2
103 #define MAX_SLEEP_AVG           (AVG_TIMESLICE * MAX_BONUS)
104 #define STARVATION_LIMIT        (MAX_SLEEP_AVG)
105 #define NS_MAX_SLEEP_AVG        (JIFFIES_TO_NS(MAX_SLEEP_AVG))
106 #define CREDIT_LIMIT            100
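/*
 * Derived values, as a quick sanity check (assuming MAX_USER_PRIO == 40
 * and HZ == 1000):
 *
 *      MAX_BONUS        == 40 * 25 / 100      == 10
 *      MAX_SLEEP_AVG    == AVG_TIMESLICE * 10 == ~1 second of sleep
 *      NS_MAX_SLEEP_AVG == JIFFIES_TO_NS(MAX_SLEEP_AVG), i.e. ~10^9 ns
 *
 * So a task can bank at most about one second worth of sleep credit,
 * which is what gets mapped onto the +/-5 dynamic priority bonus below.
 */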
107
108 /*
109  * If a task is 'interactive' then we reinsert it in the active
110  * array after it has expired its current timeslice. (it will not
111  * continue to run immediately, it will still roundrobin with
112  * other interactive tasks.)
113  *
114  * This part scales the interactivity limit depending on niceness.
115  *
116  * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
117  * Here are a few examples of different nice levels:
118  *
119  *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
120  *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
121  *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
122  *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
123  *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
124  *
125  * (the X axis represents the possible -5 ... 0 ... +5 dynamic
126  *  priority range a task can explore, a value of '1' means the
127  *  task is rated interactive.)
128  *
129  * I.e. nice +19 tasks can never get 'interactive' enough to be
130  * reinserted into the active array, and only heavily CPU-bound nice -20
131  * tasks will be expired. Default nice 0 tasks are somewhere in between:
132  * it takes some effort for them to get interactive, but it's not
133  * too hard.
134  */
135
136 #define CURRENT_BONUS(p) \
137         (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
138                 MAX_SLEEP_AVG)
139
140 #ifdef CONFIG_SMP
141 #define TIMESLICE_GRANULARITY(p)        (MIN_TIMESLICE * \
142                 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
143                         num_online_cpus())
144 #else
145 #define TIMESLICE_GRANULARITY(p)        (MIN_TIMESLICE * \
146                 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
147 #endif
148
149 #define SCALE(v1,v1_max,v2_max) \
150         (v1) * (v2_max) / (v1_max)
151
152 #define DELTA(p) \
153         (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA)
154
155 #define TASK_INTERACTIVE(p) \
156         ((p)->prio <= (p)->static_prio - DELTA(p))
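/*
 * Example of the interactivity test above, for a nice 0 task (and the
 * assumed MAX_BONUS == 10):  DELTA(p) == SCALE(0, 40, 10) + 2 == 2, so
 * the task counts as interactive once its dynamic priority is at least
 * two levels better than its static priority - which matches the
 * TASK_INTERACTIVE(0) row in the table above.
 */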
157
158 #define INTERACTIVE_SLEEP(p) \
159         (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
160                 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
161
162 #define HIGH_CREDIT(p) \
163         ((p)->interactive_credit > CREDIT_LIMIT)
164
165 #define LOW_CREDIT(p) \
166         ((p)->interactive_credit < -CREDIT_LIMIT)
167
168 #ifdef CONFIG_CKRM_CPU_SCHEDULE
169 /*
170  *  If the tasks belong to different classes, compare class priorities;
171  *  otherwise compare task priorities.
172  */
173 #define TASK_PREEMPTS_CURR(p, rq) \
174         ( ((p)->cpu_class != (rq)->curr->cpu_class) \
175           && ((rq)->curr != (rq)->idle) && ((p) != (rq)->idle )) \
176           ? class_preempts_curr((p),(rq)->curr)  \
177           : ((p)->prio < (rq)->curr->prio)
178 #else
179 #define TASK_PREEMPTS_CURR(p, rq) \
180         ((p)->prio < (rq)->curr->prio)
181 #endif
182
183 /*
184  * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
185  * to time slice values.
186  *
187  * The higher a thread's priority, the bigger timeslices
188  * it gets during one round of execution. But even the lowest
189  * priority thread gets MIN_TIMESLICE worth of execution time.
190  *
191  * task_timeslice() is the interface that is used by the scheduler.
192  */
193
194 #define BASE_TIMESLICE(p) (MIN_TIMESLICE + \
195                 ((MAX_TIMESLICE - MIN_TIMESLICE) * \
196                         (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1)))
197
198 unsigned int task_timeslice(task_t *p)
199 {
200         return BASE_TIMESLICE(p);
201 }
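/*
 * Rough numbers for task_timeslice(), assuming MAX_RT_PRIO == 100,
 * MAX_PRIO == 140 and HZ == 1000:
 *
 *      nice -20 (static_prio 100):  200ms  (MAX_TIMESLICE)
 *      nice   0 (static_prio 120): ~100ms
 *      nice +19 (static_prio 139):   10ms  (MIN_TIMESLICE)
 */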
202
203 #define task_hot(p, now, sd) ((now) - (p)->timestamp < (sd)->cache_hot_time)
204
205 /*
206  * These are the runqueue data structures:
207  */
208
209 typedef struct runqueue runqueue_t;
210
211 /*
212  * This is the main, per-CPU runqueue data structure.
213  *
214  * Locking rule: code that needs to lock multiple runqueues (such as
215  * the load balancing or the thread migration code) must acquire the
216  * locks in ascending &runqueue order.
217  */
218 struct runqueue {
219         spinlock_t lock;
220
221         /*
222          * nr_running and cpu_load should be in the same cacheline because
223          * remote CPUs use both these fields when doing load calculation.
224          */
225         unsigned long nr_running;
226 #if defined(CONFIG_SMP)
227         unsigned long cpu_load;
228 #endif
229         unsigned long long nr_switches, nr_preempt;
230         unsigned long nr_uninterruptible;
231         unsigned long long timestamp_last_tick;
232         task_t *curr, *idle;
233         struct mm_struct *prev_mm;
234 #ifdef CONFIG_CKRM_CPU_SCHEDULE
235         struct classqueue_struct classqueue;   
236         ckrm_load_t ckrm_load;
237         ckrm_lrq_t   dflt_lrq; /* local runqueue of the default class */
238 #else
239         prio_array_t *active, *expired, arrays[2];
240         unsigned long expired_timestamp;
241         int best_expired_prio;
242 #endif
243         atomic_t nr_iowait;
244
245 #ifdef CONFIG_SMP
246         struct sched_domain *sd;
247
248         /* For active balancing */
249         int active_balance;
250         int push_cpu;
251
252         task_t *migration_thread;
253         struct list_head migration_queue;
254 #endif
255
256 #ifdef  CONFIG_VSERVER_HARDCPU          
257         struct list_head hold_queue;
258         int idle_tokens;
259 #endif
260 };
261
262 static DEFINE_PER_CPU(struct runqueue, runqueues);
263
264 #define for_each_domain(cpu, domain) \
265         for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
266
267 #define cpu_rq(cpu)             (&per_cpu(runqueues, (cpu)))
268 #define this_rq()               (&__get_cpu_var(runqueues))
269 #define task_rq(p)              cpu_rq(task_cpu(p))
270 #define cpu_curr(cpu)           (cpu_rq(cpu)->curr)
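/*
 * A minimal usage sketch for the accessors above (mirroring the pattern
 * the balancing code uses later in this file): walk every sched-domain
 * that contains a CPU, from its base domain up to the root, remembering
 * the outermost one carrying a given flag.
 *
 *      struct sched_domain *tmp, *sd = NULL;
 *
 *      for_each_domain(cpu, tmp)
 *              if (tmp->flags & SD_BALANCE_CLONE)
 *                      sd = tmp;
 */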
271
272 /*
273  * Default context-switch locking:
274  */
275 #ifndef prepare_arch_switch
276 # define prepare_arch_switch(rq, next)  do { } while (0)
277 # define finish_arch_switch(rq, next)   spin_unlock_irq(&(rq)->lock)
278 # define task_running(rq, p)            ((rq)->curr == (p))
279 #endif
280
281 /*
282  * task_rq_lock - lock the runqueue a given task resides on and disable
283  * interrupts.  Note the ordering: we can safely lookup the task_rq without
284  * explicitly disabling preemption.
285  */
286 static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
287 {
288         struct runqueue *rq;
289
290 repeat_lock_task:
291         local_irq_save(*flags);
292         rq = task_rq(p);
293         spin_lock(&rq->lock);
294         if (unlikely(rq != task_rq(p))) {
295                 spin_unlock_irqrestore(&rq->lock, *flags);
296                 goto repeat_lock_task;
297         }
298         return rq;
299 }
300
301 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
302 {
303         spin_unlock_irqrestore(&rq->lock, *flags);
304 }
305
306 /*
307  * this_rq_lock - lock this CPU's runqueue and disable interrupts.
308  */
309 static runqueue_t *this_rq_lock(void)
310 {
311         runqueue_t *rq;
312
313         local_irq_disable();
314         rq = this_rq();
315         spin_lock(&rq->lock);
316
317         return rq;
318 }
319
320 static inline void rq_unlock(runqueue_t *rq)
321 {
322         spin_unlock_irq(&rq->lock);
323 }
324
325 static inline void idle_balance(int this_cpu, runqueue_t *this_rq);
326 static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq);
327
328 #ifdef CONFIG_CKRM_CPU_SCHEDULE
329
330 #define ckrm_rq_cpu_disabled(rq) (!rq->classqueue.enabled)
331 #define ckrm_rq_cpu_enabled(rq)  ( rq->classqueue.enabled)
332
333 static inline void class_enqueue_task(struct task_struct *p,
334                                       prio_array_t * array)
335 {
336         ckrm_lrq_t *lrq;
337         int effective_prio;
338         
339         if (ckrm_rq_cpu_disabled(task_rq(p)))
340                 return;
341         
342         lrq = get_task_lrq(p);
343         // BUG_ON(lrq==NULL); 
344         
345         cpu_demand_event(&p->demand_stat,CPU_DEMAND_ENQUEUE,0);
346         lrq->lrq_load += task_load(p);
347         
348         if ((p->prio < lrq->top_priority) && (array == lrq->active))
349                 set_top_priority(lrq, p->prio); 
350         
351         if (! cls_in_classqueue(&lrq->classqueue_linkobj)) {
352                 cpu_demand_event(get_task_lrq_stat(p),CPU_DEMAND_ENQUEUE,0);
353                 effective_prio = get_effective_prio(lrq);
354                 classqueue_enqueue(lrq->classqueue, &lrq->classqueue_linkobj, 
355                                    effective_prio);
356         } 
357         
358 }
359
360 static inline void class_dequeue_task(struct task_struct *p,
361                                       prio_array_t * array)
362 {
363         ckrm_lrq_t *lrq;
364         unsigned long load;
365         
366         if (ckrm_rq_cpu_disabled(task_rq(p)))
367                 return;
368         
369         lrq = get_task_lrq(p);
370         load = task_load(p); 
371         
372         // BUG_ON(lrq->lrq_load < load);        
373         
374         lrq->lrq_load -= load;
375         
376         cpu_demand_event(&p->demand_stat,CPU_DEMAND_DEQUEUE,0);
377         
378         if ((array == lrq->active) && (p->prio == lrq->top_priority)
379             && list_empty(&(array->queue[p->prio])))
380                 set_top_priority(lrq,find_next_bit(array->bitmap, MAX_PRIO,
381                                                    p->prio));
382 }
383
384 static inline ckrm_lrq_t *rq_get_next_class(struct runqueue *rq)
385 {
386         cq_node_t *node;
387
388         if (ckrm_rq_cpu_disabled(rq)) 
389                 return &rq->dflt_lrq;
390         node = classqueue_get_head(&rq->classqueue);
391         return ((node) ? class_list_entry(node) : NULL);
392 }
393
394 /*
395  * return the cvt of the current running class
396  * if no current running class, return 0
397  * assume cpu is valid (cpu_online(cpu) == 1)
398  */
399 CVT_t get_local_cur_cvt(int cpu)
400 {
401         ckrm_lrq_t * lrq = rq_get_next_class(cpu_rq(cpu));
402
403         if (lrq)
404                 return lrq->local_cvt;
405         else    
406                 return 0;
407 }
408
409 static inline struct task_struct * rq_get_next_task(struct runqueue* rq,
410                                                     int cpu) 
411 {
412         prio_array_t               *array;
413         struct task_struct         *next;
414         ckrm_lrq_t *queue;
415         int idx;
416
417         if (ckrm_rq_cpu_disabled(rq)) {
418                 /* original code from schedule(void) 
419                  * see also the code in the non-CKRM configuration
420                  */
421                 struct list_head *array_queue;
422                 ckrm_lrq_t  *lrq = get_ckrm_lrq(get_default_cpu_class(),cpu);
423
424                 if (unlikely(!rq->nr_running)) {
425                         idle_balance(cpu, rq);
426                         if (!rq->nr_running) {
427                                 rq->dflt_lrq.expired_timestamp = 0;
428                                 wake_sleeping_dependent(cpu, rq);
429                                 return NULL;
430                         }
431                 }
432
433                 array = lrq->active;
434                 if (unlikely(!array->nr_active)) {
435                         /*
436                          * Switch the active and expired arrays.
437                          */
438                         lrq->active = lrq->expired;
439                         lrq->expired = array;
440                         array = lrq->active; 
441                         lrq->expired_timestamp = 0;
442                         lrq->best_expired_prio = MAX_PRIO;
443                 }
444
445                 idx = sched_find_first_bit(array->bitmap);
446                 array_queue = array->queue + idx;
447                 next = list_entry(array_queue->next, task_t, run_list);
448                 return next;
449         }
450
451         /*-- CKRM SCHEDULER --*/
452         
453  retry_next_class:
454         /* We can't use (rq->nr_running == 0) to declare idleness:
455          * first we have to make sure that the class runqueue is properly
456          * processed. This is due to two facts/requirements:
457          * (a) when the last task is removed from an lrq we do not remove
458          *     the lrq from the class runqueue. As a result the lrq is
459          *     selected again and we can perform the necessary
460          *     active/expired array switches.
461          * (b) outstanding expired switches must be performed.
462          * 
463          */
464
465         queue = rq_get_next_class(rq);
466         if (unlikely(queue == NULL)) {
467                 idle_balance(cpu, rq);
468                 if (!rq->nr_running) {
469                         rq->dflt_lrq.expired_timestamp = 0;
470                         wake_sleeping_dependent(cpu, rq);
471                         return NULL;
472                 }
473                 goto retry_next_class; // try again
474         }
475
476         array = queue->active;
477         if (unlikely(!array->nr_active)) {
478                 queue->active = queue->expired;
479                 queue->expired = array;
480                 array = queue->active;
481                 queue->expired_timestamp = 0;
482
483                 if (array->nr_active)
484                         set_top_priority(queue,
485                                          find_first_bit(array->bitmap,MAX_PRIO));
486                 else {
487                         /* since we do not dequeue a lrq when it becomes empty
488                         /* since we do not dequeue an lrq when it becomes empty
489                          * at this point
490                          */
491                         classqueue_dequeue(queue->classqueue,
492                                            &queue->classqueue_linkobj);
493                         cpu_demand_event(get_rq_local_stat(queue,cpu),
494                                          CPU_DEMAND_DEQUEUE,0);
495                 }
496                 goto retry_next_class;                          
497         }
498
499         idx = queue->top_priority;
500         //BUG_ON(!array->nr_active);
501         //BUG_ON(idx == MAX_PRIO);
502         //BUG_ON(list_empty(array->queue+idx));
503         next = task_list_entry(array->queue[idx].next);
504         return next;
505 }
506
507 static inline void ckrm_account_task(struct runqueue* rq, 
508                                      struct task_struct *prev, 
509                                      unsigned long long now)
510 {
511         if ((prev != rq->idle) && ckrm_rq_cpu_enabled(rq) ) {
512                 unsigned long long run = now - prev->timestamp;
513                 ckrm_lrq_t * lrq = get_task_lrq(prev);
514
515                 lrq->lrq_load -= task_load(prev);
516                 cpu_demand_event(&prev->demand_stat,CPU_DEMAND_DESCHEDULE,run);
517                 lrq->lrq_load += task_load(prev);
518
519                 cpu_demand_event(get_task_lrq_stat(prev),CPU_DEMAND_DESCHEDULE,run);
520                 update_local_cvt(prev, run);
521         }
522
523 }
524
525 #ifdef CONFIG_SMP
526 #define COND_SMP(dflt,cond) (cond)
527 #else
528 #define COND_SMP(dflt,cond) (dflt)
529 #endif
530
531 static inline void ckrm_sched_tick(unsigned long j,int this_cpu, int idle,
532                                    runqueue_t *rq)
533 {
534         /* first determine whether we have to do anything
535          * without grabbing the global lock
536          */
537
538         int sample, update;
539
540 #ifdef __SIMULATOR__
541         if ((this_cpu == 0) && (j % 1000) == 0) {
542                 ckrm_cpu_monitor(1);
543         }
544 #endif
545         
546         if (ckrm_rq_cpu_disabled(rq))
547                 return;
548         
549         update = (j % CVT_UPDATE_TICK);
550         sample = COND_SMP(1,(j % CPU_PID_CTRL_TICK)); 
551         
552         // avoid taking the global class_list lock on every tick
553         if (likely(update && sample))
554                 return;   // nothing to be done;
555         
556         read_lock(&class_list_lock);
557         
558 #ifdef CONFIG_SMP
559         if (sample==0) {
560                 ckrm_load_sample(rq_ckrm_load(rq),this_cpu);
561         }
562 #endif
563         
564         if (update==0) {
565                 classqueue_update_base(get_cpu_classqueue(this_cpu));
566                 update_class_cputime(this_cpu,idle);
567                 // occasionally we need to call the weight adjustment
568                 // for SMP systems
569                 if (COND_SMP(0,(this_cpu==0)))
570                         adjust_local_weight();   
571         }
572         
573         read_unlock(&class_list_lock);
574 }
575
576 #else /*! CONFIG_CKRM_CPU_SCHEDULE*/
577 static inline struct task_struct * rq_get_next_task(struct runqueue* rq,
578                                                     int cpu) 
579 {
580         prio_array_t *array;
581         struct list_head *queue;
582         int idx;
583
584         if (unlikely(!rq->nr_running)) {
585                 idle_balance(cpu, rq);
586                 if (!rq->nr_running) {
587                         rq->expired_timestamp = 0;
588                         wake_sleeping_dependent(cpu, rq);
589                         return NULL;
590                 }
591         }
592         array = rq->active;
593         if (unlikely(!array->nr_active)) {
594                 /*
595                  * Switch the active and expired arrays.
596                  */
597                 rq->active = rq->expired;
598                 rq->expired = array;
599                 array = rq->active;
600                 rq->expired_timestamp = 0;
601                 rq->best_expired_prio = MAX_PRIO;
602         }
603
604         idx = sched_find_first_bit(array->bitmap);
605         queue = array->queue + idx;
606         return list_entry(queue->next, task_t, run_list);
607 }
608
609 static inline void class_enqueue_task(struct task_struct* p, 
610                                       prio_array_t *array) { }
611 static inline void class_dequeue_task(struct task_struct* p, 
612                                       prio_array_t *array) { }
613 static inline void init_cpu_classes(void) { }
614 static inline void ckrm_sched_tick(int j,int this_cpu,int idle, void* arg) {}
615 static inline void ckrm_account_task(struct runqueue* rq, struct 
616                                      task_struct *prev, 
617                                      unsigned long long now)  { }
618 #define rq_ckrm_load(rq) NULL
619
620 #endif  /* CONFIG_CKRM_CPU_SCHEDULE */
621
622 /*
623  * Adding/removing a task to/from a priority array:
624  */
625 static void dequeue_task(struct task_struct *p, prio_array_t *array)
626 {
627         array->nr_active--;
628         list_del(&p->run_list);
629         if (list_empty(array->queue + p->prio))
630                 __clear_bit(p->prio, array->bitmap);
631         class_dequeue_task(p,array);
632 }
633
634 static void enqueue_task(struct task_struct *p, prio_array_t *array)
635 {
636         list_add_tail(&p->run_list, array->queue + p->prio);
637         __set_bit(p->prio, array->bitmap);
638         array->nr_active++;
639         p->array = array;
640         class_enqueue_task(p,array);
641 }
642
643 /*
644  * Used by the migration code - we pull tasks from the head of the
645  * remote queue so we want these tasks to show up at the head of the
646  * local queue:
647  */
648 static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
649 {
650         list_add(&p->run_list, array->queue + p->prio);
651         __set_bit(p->prio, array->bitmap);
652         array->nr_active++;
653         p->array = array;
654         class_enqueue_task(p,array);
655 }
656
657 /*
658  * effective_prio - return the priority that is based on the static
659  * priority but is modified by bonuses/penalties.
660  *
661  * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
662  * into the -5 ... 0 ... +5 bonus/penalty range.
663  *
664  * We use 25% of the full 0...39 priority range so that:
665  *
666  * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
667  * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
668  *
669  * Both properties are important to certain workloads.
670  */
671 static int effective_prio(task_t *p)
672 {
673         int bonus, prio;
674
675         if (rt_task(p))
676                 return p->prio;
677
678         bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
679
680         prio = p->static_prio - bonus;
681         if (__vx_task_flags(p, VXF_SCHED_PRIO, 0))
682                 prio += effective_vavavoom(p, MAX_USER_PRIO);
683
684         if (prio < MAX_RT_PRIO)
685                 prio = MAX_RT_PRIO;
686         if (prio > MAX_PRIO-1)
687                 prio = MAX_PRIO-1;
688         return prio;
689 }
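/*
 * Example (assuming the usual MAX_BONUS == 10 and static_prio == 120 for
 * nice 0): a task whose sleep_avg sits at the maximum gets bonus
 * 10 - 5 == +5 and runs at prio 115, while a task that never sleeps gets
 * bonus 0 - 5 == -5 and runs at prio 125.  The vserver VXF_SCHED_PRIO
 * adjustment, if enabled, is applied on top before the clamping.
 */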
690
691 /*
692  * __activate_task - move a task to the runqueue.
693  */
694 static inline void __activate_task(task_t *p, runqueue_t *rq)
695 {
696         enqueue_task(p, rq_active(p,rq));
697         rq->nr_running++;
698 }
699
700 /*
701  * __activate_idle_task - move idle task to the _front_ of runqueue.
702  */
703 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
704 {
705         enqueue_task_head(p, rq_active(p,rq));
706         rq->nr_running++;
707 }
708
709 static void recalc_task_prio(task_t *p, unsigned long long now)
710 {
711         unsigned long long __sleep_time = now - p->timestamp;
712         unsigned long sleep_time;
713
714         if (__sleep_time > NS_MAX_SLEEP_AVG)
715                 sleep_time = NS_MAX_SLEEP_AVG;
716         else
717                 sleep_time = (unsigned long)__sleep_time;
718
719         if (likely(sleep_time > 0)) {
720                 /*
721                  * User tasks that sleep a long time are categorised as
722                  * idle and will get just interactive status to stay active &
723                  * prevent them suddenly becoming cpu hogs and starving
724                  * other processes.
725                  */
726                 if (p->mm && p->activated != -1 &&
727                         sleep_time > INTERACTIVE_SLEEP(p)) {
728                                 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
729                                                 AVG_TIMESLICE);
730                                 if (!HIGH_CREDIT(p))
731                                         p->interactive_credit++;
732                 } else {
733                         /*
734                          * The lower the sleep avg a task has the more
735                          * rapidly it will rise with sleep time.
736                          */
737                         sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
738
739                         /*
740                          * Tasks with low interactive_credit are limited to
741                          * one timeslice worth of sleep avg bonus.
742                          */
743                         if (LOW_CREDIT(p) &&
744                             sleep_time > JIFFIES_TO_NS(task_timeslice(p)))
745                                 sleep_time = JIFFIES_TO_NS(task_timeslice(p));
746
747                         /*
748                          * Non high_credit tasks waking from uninterruptible
749                          * sleep are limited in their sleep_avg rise as they
750                          * are likely to be cpu hogs waiting on I/O
751                          */
752                         if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm) {
753                                 if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
754                                         sleep_time = 0;
755                                 else if (p->sleep_avg + sleep_time >=
756                                                 INTERACTIVE_SLEEP(p)) {
757                                         p->sleep_avg = INTERACTIVE_SLEEP(p);
758                                         sleep_time = 0;
759                                 }
760                         }
761
762                         /*
763                          * This code gives a bonus to interactive tasks.
764                          *
765                          * The boost works by updating the 'average sleep time'
766                          * value here, based on ->timestamp. The more time a
767                          * task spends sleeping, the higher the average gets -
768                          * and the higher the priority boost gets as well.
769                          */
770                         p->sleep_avg += sleep_time;
771
772                         if (p->sleep_avg > NS_MAX_SLEEP_AVG) {
773                                 p->sleep_avg = NS_MAX_SLEEP_AVG;
774                                 if (!HIGH_CREDIT(p))
775                                         p->interactive_credit++;
776                         }
777                 }
778         }
779
780         p->prio = effective_prio(p);
781 }
782
783 /*
784  * activate_task - move a task to the runqueue and do priority recalculation
785  *
786  * Update all the scheduling statistics stuff. (sleep average
787  * calculation, priority modifiers, etc.)
788  */
789 static void activate_task(task_t *p, runqueue_t *rq, int local)
790 {
791         unsigned long long now;
792
793         now = sched_clock();
794 #ifdef CONFIG_SMP
795         if (!local) {
796                 /* Compensate for drifting sched_clock */
797                 runqueue_t *this_rq = this_rq();
798                 now = (now - this_rq->timestamp_last_tick)
799                         + rq->timestamp_last_tick;
800         }
801 #endif
802
803         recalc_task_prio(p, now);
804
805         /*
806          * This checks to make sure it's not an uninterruptible task
807          * that is now waking up.
808          */
809         if (!p->activated) {
810                 /*
811                  * Tasks which were woken up by interrupts (i.e. hw events)
812                  * are most likely of interactive nature. So we give them
813                  * the credit of extending their sleep time to the period
814                  * of time they spend on the runqueue, waiting for execution
815                  * on a CPU, first time around:
816                  */
817                 if (in_interrupt())
818                         p->activated = 2;
819                 else {
820                         /*
821                          * Normal first-time wakeups get a credit too for
822                          * on-runqueue time, but it will be weighted down:
823                          */
824                         p->activated = 1;
825                 }
826         }
827         p->timestamp = now;
828
829         __activate_task(p, rq);
830 }
831
832 /*
833  * deactivate_task - remove a task from the runqueue.
834  */
835 static void deactivate_task(struct task_struct *p, runqueue_t *rq)
836 {
837         rq->nr_running--;
838         if (p->state == TASK_UNINTERRUPTIBLE)
839                 rq->nr_uninterruptible++;
840         dequeue_task(p, p->array);
841         p->array = NULL;
842 }
843
844 /*
845  * resched_task - mark a task 'to be rescheduled now'.
846  *
847  * On UP this means the setting of the need_resched flag, on SMP it
848  * might also involve a cross-CPU call to trigger the scheduler on
849  * the target CPU.
850  */
851 #ifdef CONFIG_SMP
852 static void resched_task(task_t *p)
853 {
854         int need_resched, nrpolling;
855
856         preempt_disable();
857         /* minimise the chance of sending an interrupt to poll_idle() */
858         nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
859         need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
860         nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
861
862         if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
863                 smp_send_reschedule(task_cpu(p));
864         preempt_enable();
865 }
866 #else
867 static inline void resched_task(task_t *p)
868 {
869         set_tsk_need_resched(p);
870 }
871 #endif
872
873 /**
874  * task_curr - is this task currently executing on a CPU?
875  * @p: the task in question.
876  */
877 inline int task_curr(const task_t *p)
878 {
879         return cpu_curr(task_cpu(p)) == p;
880 }
881
882 #ifdef CONFIG_SMP
883 enum request_type {
884         REQ_MOVE_TASK,
885         REQ_SET_DOMAIN,
886 };
887
888 typedef struct {
889         struct list_head list;
890         enum request_type type;
891
892         /* For REQ_MOVE_TASK */
893         task_t *task;
894         int dest_cpu;
895
896         /* For REQ_SET_DOMAIN */
897         struct sched_domain *sd;
898
899         struct completion done;
900 } migration_req_t;
901
902 /*
903  * The task's runqueue lock must be held.
904  * Returns true if you have to wait for migration thread.
905  */
906 static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
907 {
908         runqueue_t *rq = task_rq(p);
909
910         /*
911          * If the task is not on a runqueue (and not running), then
912          * it is sufficient to simply update the task's cpu field.
913          */
914         if (!p->array && !task_running(rq, p)) {
915                 set_task_cpu(p, dest_cpu);
916                 return 0;
917         }
918
919         init_completion(&req->done);
920         req->type = REQ_MOVE_TASK;
921         req->task = p;
922         req->dest_cpu = dest_cpu;
923         list_add(&req->list, &rq->migration_queue);
924         return 1;
925 }
926
927 /*
928  * wait_task_inactive - wait for a thread to unschedule.
929  *
930  * The caller must ensure that the task *will* unschedule sometime soon,
931  * else this function might spin for a *long* time. This function can't
932  * be called with interrupts off, or it may introduce deadlock with
933  * smp_call_function() if an IPI is sent by the same process we are
934  * waiting to become inactive.
935  */
936 void wait_task_inactive(task_t * p)
937 {
938         unsigned long flags;
939         runqueue_t *rq;
940         int preempted;
941
942 repeat:
943         rq = task_rq_lock(p, &flags);
944         /* Must be off runqueue entirely, not preempted. */
945         if (unlikely(p->array)) {
946                 /* If it's preempted, we yield.  It could be a while. */
947                 preempted = !task_running(rq, p);
948                 task_rq_unlock(rq, &flags);
949                 cpu_relax();
950                 if (preempted)
951                         yield();
952                 goto repeat;
953         }
954         task_rq_unlock(rq, &flags);
955 }
956
957 /***
958  * kick_process - kick a running thread to enter/exit the kernel
959  * @p: the to-be-kicked thread
960  *
961  * Cause a process which is running on another CPU to enter
962  * kernel-mode, without any delay. (to get signals handled.)
963  */
964 void kick_process(task_t *p)
965 {
966         int cpu;
967
968         preempt_disable();
969         cpu = task_cpu(p);
970         if ((cpu != smp_processor_id()) && task_curr(p))
971                 smp_send_reschedule(cpu);
972         preempt_enable();
973 }
974
975 EXPORT_SYMBOL_GPL(kick_process);
976
977 /*
978  * Return a low guess at the load of a migration-source cpu.
979  *
980  * We want to under-estimate the load of migration sources, to
981  * balance conservatively.
982  */
983 static inline unsigned long source_load(int cpu)
984 {
985         runqueue_t *rq = cpu_rq(cpu);
986         unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
987
988         return min(rq->cpu_load, load_now);
989 }
990
991 /*
992  * Return a high guess at the load of a migration-target cpu
993  */
994 static inline unsigned long target_load(int cpu)
995 {
996         runqueue_t *rq = cpu_rq(cpu);
997         unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
998
999         return max(rq->cpu_load, load_now);
1000 }
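/*
 * For example, with SCHED_LOAD_SCALE == 128 (an assumed value), a runqueue
 * holding 3 tasks has load_now == 384.  If its decayed cpu_load is 200,
 * source_load() reports 200 (the low guess) while target_load() reports
 * 384 (the high guess), biasing the balancer towards leaving tasks where
 * they are.
 */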
1001
1002 #endif
1003
1004 /*
1005  * wake_idle() is useful especially on SMT architectures to wake a
1006  * task onto an idle sibling if we would otherwise wake it onto a
1007  * busy sibling.
1008  *
1009  * Returns the CPU we should wake onto.
1010  */
1011 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1012 static int wake_idle(int cpu, task_t *p)
1013 {
1014         cpumask_t tmp;
1015         runqueue_t *rq = cpu_rq(cpu);
1016         struct sched_domain *sd;
1017         int i;
1018
1019         if (idle_cpu(cpu))
1020                 return cpu;
1021
1022         sd = rq->sd;
1023         if (!(sd->flags & SD_WAKE_IDLE))
1024                 return cpu;
1025
1026         cpus_and(tmp, sd->span, cpu_online_map);
1027         cpus_and(tmp, tmp, p->cpus_allowed);
1028
1029         for_each_cpu_mask(i, tmp) {
1030                 if (idle_cpu(i))
1031                         return i;
1032         }
1033
1034         return cpu;
1035 }
1036 #else
1037 static inline int wake_idle(int cpu, task_t *p)
1038 {
1039         return cpu;
1040 }
1041 #endif
1042
1043 /***
1044  * try_to_wake_up - wake up a thread
1045  * @p: the to-be-woken-up thread
1046  * @state: the mask of task states that can be woken
1047  * @sync: do a synchronous wakeup?
1048  *
1049  * Put it on the run-queue if it's not already there. The "current"
1050  * thread is always on the run-queue (except when the actual
1051  * re-schedule is in progress), and as such you're allowed to do
1052  * the simpler "current->state = TASK_RUNNING" to mark yourself
1053  * runnable without the overhead of this.
1054  *
1055  * returns failure only if the task is already active.
1056  */
1057 static int try_to_wake_up(task_t * p, unsigned int state, int sync)
1058 {
1059         int cpu, this_cpu, success = 0;
1060         unsigned long flags;
1061         long old_state;
1062         runqueue_t *rq;
1063 #ifdef CONFIG_SMP
1064         unsigned long load, this_load;
1065         struct sched_domain *sd;
1066         int new_cpu;
1067 #endif
1068
1069         rq = task_rq_lock(p, &flags);
1070         old_state = p->state;
1071         if (!(old_state & state))
1072                 goto out;
1073
1074         if (p->array)
1075                 goto out_running;
1076
1077         cpu = task_cpu(p);
1078         this_cpu = smp_processor_id();
1079
1080 #ifdef CONFIG_SMP
1081         if (unlikely(task_running(rq, p)))
1082                 goto out_activate;
1083
1084         new_cpu = cpu;
1085
1086         if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1087                 goto out_set_cpu;
1088
1089         load = source_load(cpu);
1090         this_load = target_load(this_cpu);
1091
1092         /*
1093          * If sync wakeup then subtract the (maximum possible) effect of
1094          * the currently running task from the load of the current CPU:
1095          */
1096         if (sync)
1097                 this_load -= SCHED_LOAD_SCALE;
1098
1099         /* Don't pull the task off an idle CPU to a busy one */
1100         if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
1101                 goto out_set_cpu;
1102
1103         new_cpu = this_cpu; /* Wake to this CPU if we can */
1104
1105         /*
1106          * Scan domains for affine wakeup and passive balancing
1107          * possibilities.
1108          */
1109         for_each_domain(this_cpu, sd) {
1110                 unsigned int imbalance;
1111                 /*
1112                  * Start passive balancing when half the imbalance_pct
1113                  * limit is reached.
1114                  */
1115                 imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
1116
1117                 if ( ((sd->flags & SD_WAKE_AFFINE) &&
1118                                 !task_hot(p, rq->timestamp_last_tick, sd))
1119                         || ((sd->flags & SD_WAKE_BALANCE) &&
1120                                 imbalance*this_load <= 100*load) ) {
1121                         /*
1122                          * Now sd has SD_WAKE_AFFINE and p is cache cold in sd
1123                          * or sd has SD_WAKE_BALANCE and there is an imbalance
1124                          */
1125                         if (cpu_isset(cpu, sd->span))
1126                                 goto out_set_cpu;
1127                 }
1128         }
1129
1130         new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1131 out_set_cpu:
1132         new_cpu = wake_idle(new_cpu, p);
1133         if (new_cpu != cpu && cpu_isset(new_cpu, p->cpus_allowed)) {
1134                 set_task_cpu(p, new_cpu);
1135                 task_rq_unlock(rq, &flags);
1136                 /* might preempt at this point */
1137                 rq = task_rq_lock(p, &flags);
1138                 old_state = p->state;
1139                 if (!(old_state & state))
1140                         goto out;
1141                 if (p->array)
1142                         goto out_running;
1143
1144                 this_cpu = smp_processor_id();
1145                 cpu = task_cpu(p);
1146         }
1147
1148 out_activate:
1149 #endif /* CONFIG_SMP */
1150         if (old_state == TASK_UNINTERRUPTIBLE) {
1151                 rq->nr_uninterruptible--;
1152                 /*
1153                  * Tasks on involuntary sleep don't earn
1154                  * sleep_avg beyond just interactive state.
1155                  */
1156                 p->activated = -1;
1157         }
1158
1159         /*
1160          * Sync wakeups (i.e. those types of wakeups where the waker
1161          * has indicated that it will leave the CPU in short order)
1162          * don't trigger a preemption, if the woken up task will run on
1163          * this cpu. (in this case the 'I will reschedule' promise of
1164          * the waker guarantees that the freshly woken up task is going
1165          * to be considered on this CPU.)
1166          */
1167         activate_task(p, rq, cpu == this_cpu);
1168         if (!sync || cpu != this_cpu) {
1169                 if (TASK_PREEMPTS_CURR(p, rq))
1170                         resched_task(rq->curr);
1171         }
1172         success = 1;
1173
1174 out_running:
1175         p->state = TASK_RUNNING;
1176 out:
1177         task_rq_unlock(rq, &flags);
1178
1179         return success;
1180 }
1181
1182 int fastcall wake_up_process(task_t * p)
1183 {
1184         return try_to_wake_up(p, TASK_STOPPED |
1185                                  TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1186 }
1187
1188 EXPORT_SYMBOL(wake_up_process);
1189
1190 int fastcall wake_up_state(task_t *p, unsigned int state)
1191 {
1192         return try_to_wake_up(p, state, 0);
1193 }
1194
1195 /*
1196  * Perform scheduler related setup for a newly forked process p.
1197  * p is forked by current.
1198  */
1199 void fastcall sched_fork(task_t *p)
1200 {
1201         /*
1202          * We mark the process as running here, but have not actually
1203          * inserted it onto the runqueue yet. This guarantees that
1204          * nobody will actually run it, and a signal or other external
1205          * event cannot wake it up and insert it on the runqueue either.
1206          */
1207         p->state = TASK_RUNNING;
1208         INIT_LIST_HEAD(&p->run_list);
1209         p->array = NULL;
1210         spin_lock_init(&p->switch_lock);
1211 #ifdef CONFIG_CKRM_CPU_SCHEDULE
1212         cpu_demand_event(&p->demand_stat,CPU_DEMAND_INIT,0);
1213 #endif
1214
1215 #ifdef CONFIG_PREEMPT
1216         /*
1217          * During context-switch we hold precisely one spinlock, which
1218          * schedule_tail drops. (in the common case it's this_rq()->lock,
1219          * but it also can be p->switch_lock.) So we compensate with a count
1220          * of 1. Also, we want to start with kernel preemption disabled.
1221          */
1222         p->thread_info->preempt_count = 1;
1223 #endif
1224         /*
1225          * Share the timeslice between parent and child, thus the
1226          * total amount of pending timeslices in the system doesn't change,
1227          * resulting in more scheduling fairness.
1228          */
1229         local_irq_disable();
1230         p->time_slice = (current->time_slice + 1) >> 1;
1231         /*
1232          * The remainder of the first timeslice might be recovered by
1233          * the parent if the child exits early enough.
1234          */
1235         p->first_time_slice = 1;
1236         current->time_slice >>= 1;
1237         p->timestamp = sched_clock();
1238         if (!current->time_slice) {
1239                 /*
1240                  * This case is rare, it happens when the parent has only
1241                  * a single jiffy left from its timeslice. Taking the
1242                  * runqueue lock is not a problem.
1243                  */
1244                 current->time_slice = 1;
1245                 preempt_disable();
1246                 scheduler_tick(0, 0);
1247                 local_irq_enable();
1248                 preempt_enable();
1249         } else
1250                 local_irq_enable();
1251 }
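/*
 * Timeslice accounting example: a parent holding 103 ticks forks; the
 * child receives (103 + 1) >> 1 == 52 and the parent keeps 103 >> 1 == 51,
 * so (one tick of rounding aside) the total stays constant.  A parent down
 * to its last tick hands it to the child and is run through
 * scheduler_tick() immediately.
 */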
1252
1253 /*
1254  * wake_up_forked_process - wake up a freshly forked process.
1255  *
1256  * This function will do some initial scheduler statistics housekeeping
1257  * that must be done for every newly created process.
1258  */
1259 void fastcall wake_up_forked_process(task_t * p)
1260 {
1261         unsigned long flags;
1262         runqueue_t *rq = task_rq_lock(current, &flags);
1263
1264         BUG_ON(p->state != TASK_RUNNING);
1265
1266         /*
1267          * We decrease the sleep average of forking parents
1268          * and children as well, to keep max-interactive tasks
1269          * from forking tasks that are max-interactive.
1270          */
1271         current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
1272                 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1273
1274         p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
1275                 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1276
1277         p->interactive_credit = 0;
1278
1279         p->prio = effective_prio(p);
1280         set_task_cpu(p, smp_processor_id());
1281
1282         if (unlikely(!current->array))
1283                 __activate_task(p, rq);
1284         else {
1285                 p->prio = current->prio;
1286                 list_add_tail(&p->run_list, &current->run_list);
1287                 p->array = current->array;
1288                 p->array->nr_active++;
1289                 rq->nr_running++;
1290                 class_enqueue_task(p,p->array);
1291         }
1292         task_rq_unlock(rq, &flags);
1293 }
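/*
 * With PARENT_PENALTY == 100 and CHILD_PENALTY == 95, a maximally
 * interactive parent (bonus 10) keeps the equivalent of its full
 * sleep_avg, while the child starts at 9/10 of the maximum
 * (10 * 95 / 100 truncates to 9) - enough to stay interactive, but it
 * must earn the last step itself.
 */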
1294
1295 /*
1296  * Potentially available exiting-child timeslices are
1297  * retrieved here - this way the parent does not get
1298  * penalized for creating too many threads.
1299  *
1300  * (this cannot be used to 'generate' timeslices
1301  * artificially, because any timeslice recovered here
1302  * was given away by the parent in the first place.)
1303  */
1304 void fastcall sched_exit(task_t * p)
1305 {
1306         unsigned long flags;
1307         runqueue_t *rq;
1308
1309         local_irq_save(flags);
1310         if (p->first_time_slice) {
1311                 p->parent->time_slice += p->time_slice;
1312                 if (unlikely(p->parent->time_slice > MAX_TIMESLICE))
1313                         p->parent->time_slice = MAX_TIMESLICE;
1314         }
1315         local_irq_restore(flags);
1316         /*
1317          * If the child was a (relative-) CPU hog then decrease
1318          * the sleep_avg of the parent as well.
1319          */
1320         rq = task_rq_lock(p->parent, &flags);
1321         if (p->sleep_avg < p->parent->sleep_avg)
1322                 p->parent->sleep_avg = p->parent->sleep_avg /
1323                 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
1324                 (EXIT_WEIGHT + 1);
1325         task_rq_unlock(rq, &flags);
1326 }
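/*
 * The EXIT_WEIGHT blend above, with EXIT_WEIGHT == 3: if the child was a
 * bigger CPU hog than the parent, the parent's sleep_avg becomes 3/4 of
 * its own value plus 1/4 of the child's, nudging the parent's
 * interactivity estimate down without erasing it.
 */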
1327
1328 /**
1329  * finish_task_switch - clean up after a task-switch
1330  * @prev: the thread we just switched away from.
1331  *
1332  * We enter this with the runqueue still locked, and finish_arch_switch()
1333  * will unlock it along with doing any other architecture-specific cleanup
1334  * actions.
1335  *
1336  * Note that we may have delayed dropping an mm in context_switch(). If
1337  * so, we finish that here outside of the runqueue lock.  (Doing it
1338  * with the lock held can cause deadlocks; see schedule() for
1339  * details.)
1340  */
1341 static void finish_task_switch(task_t *prev)
1342 {
1343         runqueue_t *rq = this_rq();
1344         struct mm_struct *mm = rq->prev_mm;
1345         unsigned long prev_task_flags;
1346
1347         rq->prev_mm = NULL;
1348
1349         /*
1350          * A task struct has one reference for the use as "current".
1351          * If a task dies, then it sets TASK_ZOMBIE in tsk->state and calls
1352          * schedule one last time. The schedule call will never return,
1353          * and the scheduled task must drop that reference.
1354          * The test for TASK_ZOMBIE must occur while the runqueue locks are
1355          * still held, otherwise prev could be scheduled on another cpu, die
1356          * there before we look at prev->state, and then the reference would
1357          * be dropped twice.
1358          *              Manfred Spraul <manfred@colorfullife.com>
1359          */
1360         prev_task_flags = prev->flags;
1361         finish_arch_switch(rq, prev);
1362         if (mm)
1363                 mmdrop(mm);
1364         if (unlikely(prev_task_flags & PF_DEAD))
1365                 put_task_struct(prev);
1366 }
1367
1368 /**
1369  * schedule_tail - first thing a freshly forked thread must call.
1370  * @prev: the thread we just switched away from.
1371  */
1372 asmlinkage void schedule_tail(task_t *prev)
1373 {
1374         finish_task_switch(prev);
1375
1376         if (current->set_child_tid)
1377                 put_user(current->pid, current->set_child_tid);
1378 }
1379
1380 /*
1381  * context_switch - switch to the new MM and the new
1382  * thread's register state.
1383  */
1384 static inline
1385 task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
1386 {
1387         struct mm_struct *mm = next->mm;
1388         struct mm_struct *oldmm = prev->active_mm;
1389
1390         if (unlikely(!mm)) {
1391                 next->active_mm = oldmm;
1392                 atomic_inc(&oldmm->mm_count);
1393                 enter_lazy_tlb(oldmm, next);
1394         } else
1395                 switch_mm(oldmm, mm, next);
1396
1397         if (unlikely(!prev->mm)) {
1398                 prev->active_mm = NULL;
1399                 WARN_ON(rq->prev_mm);
1400                 rq->prev_mm = oldmm;
1401         }
1402
1403         /* Here we just switch the register state and the stack. */
1404         switch_to(prev, next, prev);
1405
1406         return prev;
1407 }
1408
1409 /*
1410  * nr_running, nr_uninterruptible and nr_context_switches:
1411  *
1412  * externally visible scheduler statistics: current number of runnable
1413  * threads, current number of uninterruptible-sleeping threads, total
1414  * number of context switches performed since bootup.
1415  */
1416 unsigned long nr_running(void)
1417 {
1418         unsigned long i, sum = 0;
1419
1420         for_each_cpu(i)
1421                 sum += cpu_rq(i)->nr_running;
1422
1423         return sum;
1424 }
1425
1426 unsigned long nr_uninterruptible(void)
1427 {
1428         unsigned long i, sum = 0;
1429
1430         for_each_cpu(i)
1431                 sum += cpu_rq(i)->nr_uninterruptible;
1432
1433         return sum;
1434 }
1435
1436 unsigned long long nr_context_switches(void)
1437 {
1438         unsigned long long i, sum = 0;
1439
1440         for_each_cpu(i)
1441                 sum += cpu_rq(i)->nr_switches;
1442
1443         return sum;
1444 }
1445
1446 unsigned long nr_iowait(void)
1447 {
1448         unsigned long i, sum = 0;
1449
1450         for_each_cpu(i)
1451                 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1452
1453         return sum;
1454 }
1455
1456 /*
1457  * double_rq_lock - safely lock two runqueues
1458  *
1459  * Note this does not disable interrupts like task_rq_lock,
1460  * you need to do so manually before calling.
1461  */
1462 static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1463 {
1464         if (rq1 == rq2)
1465                 spin_lock(&rq1->lock);
1466         else {
1467                 if (rq1 < rq2) {
1468                         spin_lock(&rq1->lock);
1469                         spin_lock(&rq2->lock);
1470                 } else {
1471                         spin_lock(&rq2->lock);
1472                         spin_lock(&rq1->lock);
1473                 }
1474         }
1475 }
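/*
 * Why the address ordering above avoids deadlock: if one CPU needs
 * (rq_a, rq_b) while another needs (rq_b, rq_a), both try to take the
 * lower-addressed runqueue first, so one of them always acquires both
 * locks and the classic AB-BA lockup cannot occur.
 */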
1476
1477 /*
1478  * double_rq_unlock - safely unlock two runqueues
1479  *
1480  * Note this does not restore interrupts like task_rq_unlock,
1481  * you need to do so manually after calling.
1482  */
1483 static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
1484 {
1485         spin_unlock(&rq1->lock);
1486         if (rq1 != rq2)
1487                 spin_unlock(&rq2->lock);
1488 }
1489
1490 unsigned long long nr_preempt(void)
1491 {
1492         unsigned long long i, sum = 0;
1493
1494         for_each_online_cpu(i)
1495                 sum += cpu_rq(i)->nr_preempt;
1496
1497         return sum;
1498 }
1499
1500 enum idle_type
1501 {
1502         IDLE,
1503         NOT_IDLE,
1504         NEWLY_IDLE,
1505 };
1506
1507 #ifdef CONFIG_SMP
1508
1509 /*
1510  * find_idlest_cpu - find the least busy runqueue.
1511  */
1512 static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1513                            struct sched_domain *sd)
1514 {
1515         unsigned long load, min_load, this_load;
1516         int i, min_cpu;
1517         cpumask_t mask;
1518
1519         min_cpu = UINT_MAX;
1520         min_load = ULONG_MAX;
1521
1522         cpus_and(mask, sd->span, cpu_online_map);
1523         cpus_and(mask, mask, p->cpus_allowed);
1524
1525         for_each_cpu_mask(i, mask) {
1526                 load = target_load(i);
1527
1528                 if (load < min_load) {
1529                         min_cpu = i;
1530                         min_load = load;
1531
1532                         /* break out early on an idle CPU: */
1533                         if (!min_load)
1534                                 break;
1535                 }
1536         }
1537
1538         /* add +1 to account for the new task */
1539         this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
1540
1541         /*
1542          * Would adding the new task to the current CPU
1543          * create an imbalance between this CPU and the
1544          * idlest CPU?
1545          *
1546          * Use half of the balancing threshold - new-context is
1547          * a good opportunity to balance.
1548          */
1549         if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
1550                 return min_cpu;
1551
1552         return this_cpu;
1553 }
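/*
 * Example of the half-threshold test above: with an assumed imbalance_pct
 * of 125, the factor is 100 + (125 - 100) / 2 == 112, so the task is
 * placed on the idlest CPU only when min_load * 112 < this_load * 100,
 * i.e. when the idlest CPU is more than about 10% less loaded than this
 * CPU would be with the new task added.
 */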
1554
1555 /*
1556  * wake_up_forked_thread - wake up a freshly forked thread.
1557  *
1558  * This function will do some initial scheduler statistics housekeeping
1559  * that must be done for every newly created context, and it also does
1560  * runqueue balancing.
1561  */
1562 void fastcall wake_up_forked_thread(task_t * p)
1563 {
1564         unsigned long flags;
1565         int this_cpu = get_cpu(), cpu;
1566         struct sched_domain *tmp, *sd = NULL;
1567         runqueue_t *this_rq = cpu_rq(this_cpu), *rq;
1568
1569         /*
1570          * Find the largest domain that this CPU is part of that
1571          * is willing to balance on clone:
1572          */
1573         for_each_domain(this_cpu, tmp)
1574                 if (tmp->flags & SD_BALANCE_CLONE)
1575                         sd = tmp;
1576         if (sd)
1577                 cpu = find_idlest_cpu(p, this_cpu, sd);
1578         else
1579                 cpu = this_cpu;
1580
1581         local_irq_save(flags);
1582 lock_again:
1583         rq = cpu_rq(cpu);
1584         double_rq_lock(this_rq, rq);
1585
1586         BUG_ON(p->state != TASK_RUNNING);
1587
1588         /*
1589          * We did find_idlest_cpu() unlocked, so in theory
1590          * the mask could have changed - just don't migrate
1591          * in this case:
1592          */
1593         if (unlikely(!cpu_isset(cpu, p->cpus_allowed))) {
1594                 cpu = this_cpu;
1595                 double_rq_unlock(this_rq, rq);
1596                 goto lock_again;
1597         }
1598         /*
1599          * We decrease the sleep average of forking parents
1600          * and children as well, to keep max-interactive tasks
1601          * from forking tasks that are max-interactive.
1602          */
1603         current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
1604                 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1605
1606         p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
1607                 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
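        /*
         * Sketch of the scaling above, assuming the tunables defined
         * earlier in this file (PARENT_PENALTY == 100, CHILD_PENALTY
         * == 95): the parent's sleep_avg is re-derived from its
         * interactivity bonus with no extra penalty, while the child
         * starts at ~95% of its bonus-derived value, so a chain of
         * forks cannot stay max-interactive indefinitely.
         */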
1608
1609         p->interactive_credit = 0;
1610
1611         p->prio = effective_prio(p);
1612         set_task_cpu(p, cpu);
1613
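        /*
         * Local fast path: if the parent is still queued, the child
         * inherits the parent's dynamic priority and is linked straight
         * into the parent's priority list, avoiding a full
         * __activate_task().
         */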
1614         if (cpu == this_cpu) {
1615                 if (unlikely(!current->array))
1616                         __activate_task(p, rq);
1617                 else {
1618                         p->prio = current->prio;
1619                         list_add_tail(&p->run_list, &current->run_list);
1620                         p->array = current->array;
1621                         p->array->nr_active++;
1622                         rq->nr_running++;
1623                         class_enqueue_task(p,p->array);
1624                 }
1625         } else {
1626                 /* Not the local CPU - must adjust timestamp */
1627                 p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
1628                                         + rq->timestamp_last_tick;
1629                 __activate_task(p, rq);
1630                 if (TASK_PREEMPTS_CURR(p, rq))
1631                         resched_task(rq->curr);
1632         }
1633
1634         double_rq_unlock(this_rq, rq);
1635         local_irq_restore(flags);
1636         put_cpu();
1637 }
1638
1639 /*
1640  * If dest_cpu is allowed for this process, migrate the task to it.
1641  * This is accomplished by forcing the cpu_allowed mask to only
1642  * allow dest_cpu, which will force the task onto dest_cpu.  Then
1643  * the cpu_allowed mask is restored.
1644  */
1645 static void sched_migrate_task(task_t *p, int dest_cpu)
1646 {
1647         migration_req_t req;
1648         runqueue_t *rq;
1649         unsigned long flags;
1650
1651         rq = task_rq_lock(p, &flags);
1652         if (!cpu_isset(dest_cpu, p->cpus_allowed)
1653             || unlikely(cpu_is_offline(dest_cpu)))
1654                 goto out;
1655
1656         /* force the process onto the specified CPU */
1657         if (migrate_task(p, dest_cpu, &req)) {
1658                 /* Need to wait for migration thread (might exit: take ref). */
1659                 struct task_struct *mt = rq->migration_thread;
1660                 get_task_struct(mt);
1661                 task_rq_unlock(rq, &flags);
1662                 wake_up_process(mt);
1663                 put_task_struct(mt);
1664                 wait_for_completion(&req.done);
1665                 return;
1666         }
1667 out:
1668         task_rq_unlock(rq, &flags);
1669 }
1670
1671 /*
1672  * sched_balance_exec(): find the highest-level, exec-balance-capable
1673  * domain and try to migrate the task to the least loaded CPU.
1674  *
1675  * execve() is a valuable balancing opportunity, because at this point
1676  * the task has the smallest effective memory and cache footprint.
1677  */
1678 void sched_balance_exec(void)
1679 {
1680         struct sched_domain *tmp, *sd = NULL;
1681         int new_cpu, this_cpu = get_cpu();
1682
1683         /* Prefer the current CPU if there's only this task running */
1684         if (this_rq()->nr_running <= 1)
1685                 goto out;
1686
1687         for_each_domain(this_cpu, tmp)
1688                 if (tmp->flags & SD_BALANCE_EXEC)
1689                         sd = tmp;
1690
1691         if (sd) {
1692                 new_cpu = find_idlest_cpu(current, this_cpu, sd);
1693                 if (new_cpu != this_cpu) {
1694                         put_cpu();
1695                         sched_migrate_task(current, new_cpu);
1696                         return;
1697                 }
1698         }
1699 out:
1700         put_cpu();
1701 }
1702
1703 /*
1704  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1705  */
1706 static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1707 {
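        /*
         * If the trylock fails, take both locks in address order
         * (lowest runqueue address first) so that two CPUs balancing
         * against each other cannot deadlock on each other's locks.
         */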
1708         if (unlikely(!spin_trylock(&busiest->lock))) {
1709                 if (busiest < this_rq) {
1710                         spin_unlock(&this_rq->lock);
1711                         spin_lock(&busiest->lock);
1712                         spin_lock(&this_rq->lock);
1713                 } else
1714                         spin_lock(&busiest->lock);
1715         }
1716 }
1717
1718 /*
1719  * pull_task - move a task from a remote runqueue to the local runqueue.
1720  * Both runqueues must be locked.
1721  */
1722 static inline
1723 void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1724                runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1725 {
1726         dequeue_task(p, src_array);
1727         src_rq->nr_running--;
1728         set_task_cpu(p, this_cpu);
1729         this_rq->nr_running++;
1730         enqueue_task(p, this_array);
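        /*
         * sched_clock() need not be synchronized across CPUs, so
         * rebase the task's timestamp from the source runqueue's
         * timebase to this runqueue's timebase.
         */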
1731         p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1732                                 + this_rq->timestamp_last_tick;
1733         /*
1734          * Note that idle threads have a prio of MAX_PRIO, so this
1735          * test is always true for them.
1736          */
1737         if (TASK_PREEMPTS_CURR(p, this_rq))
1738                 resched_task(this_rq->curr);
1739 }
1740
1741 /*
1742  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1743  */
1744 static inline
1745 int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1746                      struct sched_domain *sd, enum idle_type idle)
1747 {
1748         /*
1749          * We do not migrate tasks that are:
1750          * 1) running (obviously), or
1751          * 2) not allowed to run on this CPU (see cpus_allowed), or
1752          * 3) cache-hot on their current CPU.
1753          */
1754         if (task_running(rq, p))
1755                 return 0;
1756         if (!cpu_isset(this_cpu, p->cpus_allowed))
1757                 return 0;
1758
1759         /* Aggressive migration if we've failed balancing */
1760         if (idle == NEWLY_IDLE ||
1761                         sd->nr_balance_failed < sd->cache_nice_tries) {
1762                 if (task_hot(p, rq->timestamp_last_tick, sd))
1763                         return 0;
1764         }
1765
1766         return 1;
1767 }
1768
1769 /*
1770  * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
1771  * as part of a balancing operation within "domain". Returns the number of
1772  * tasks moved.
1773  *
1774  * Called with both runqueues locked.
1775  */
1776 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
1777                       unsigned long max_nr_move, struct sched_domain *sd,
1778                       enum idle_type idle)
1779 {
1780         prio_array_t *array, *dst_array;
1781         struct list_head *head, *curr;
1782         int idx, pulled = 0;
1783         task_t *tmp;
1784 #ifdef CONFIG_CKRM_CPU_SCHEDULE
1785         /* We need to distinguish between the per-CPU runqueues and
1786          * the class-local runqueues; we can only get here if the
1787          * default class is present.
1788          */
1789         ckrm_lrq_t *l_this_rq = &this_rq->dflt_lrq;
1790         ckrm_lrq_t *l_busiest = &busiest->dflt_lrq;
1791 #else
1792 #define l_busiest busiest
1793 #define l_this_rq this_rq
1794 #endif
1795
1796         if (max_nr_move <= 0 || busiest->nr_running <= 1)
1797                 goto out;
1798
1799         /*
1800          * We first consider expired tasks. Those will likely not be
1801          * executed in the near future, and they are most likely to
1802          * be cache-cold, thus switching CPUs has the least effect
1803          * on them.
1804          */
1805         if (l_busiest->expired->nr_active) {
1806                 array = l_busiest->expired;
1807                 dst_array = l_this_rq->expired;
1808         } else {
1809                 array = l_busiest->active;
1810                 dst_array = l_this_rq->active;
1811         }
1812
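        /*
         * Scan the non-empty priority levels starting at index 0;
         * within each level the list is walked from the tail, and
         * tasks rejected by can_migrate_task() are skipped.
         */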
1813 new_array:
1814         /* Start searching at priority 0: */
1815         idx = 0;
1816 skip_bitmap:
1817         if (!idx)
1818                 idx = sched_find_first_bit(array->bitmap);
1819         else
1820                 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1821         if (idx >= MAX_PRIO) {
1822                 if (array == l_busiest->expired && l_busiest->active->nr_active) {
1823                         array = l_busiest->active;
1824                         dst_array = l_this_rq->active;
1825                         goto new_array;
1826                 }
1827                 goto out;
1828         }
1829
1830         head = array->queue + idx;
1831         curr = head->prev;
1832 skip_queue:
1833         tmp = list_entry(curr, task_t, run_list);
1834
1835         curr = curr->prev;
1836
1837         if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
1838                 if (curr != head)
1839                         goto skip_queue;
1840                 idx++;
1841                 goto skip_bitmap;
1842         }
1843         pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
1844         pulled++;
1845
1846         /* We only want to steal up to the prescribed number of tasks. */
1847         if (pulled < max_nr_move) {
1848                 if (curr != head)
1849                         goto skip_queue;
1850                 idx++;
1851                 goto skip_bitmap;
1852         }
1853 out:
1854         return pulled;
1855 }
1856
1857 /*
1858  * find_busiest_group finds and returns the busiest CPU group within the
1859  * domain. It calculates and returns the number of tasks which should be
1860  * moved to restore balance via the imbalance parameter.
1861  */
1862 static struct sched_group *
1863 find_busiest_group(struct sched_domain *sd, int this_cpu,
1864                    unsigned long *imbalance, enum idle_type idle)
1865 {
1866         struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1867         unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1868
1869         max_load = this_load = total_load = total_pwr = 0;
1870
1871         do {
1872                 cpumask_t tmp;
1873                 unsigned long load;
1874                 int local_group;
1875                 int i, nr_cpus = 0;
1876
1877                 local_group = cpu_isset(this_cpu, group->cpumask);
1878
1879                 /* Tally up the load of all CPUs in the group */
1880                 avg_load = 0;
1881                 cpus_and(tmp, group->cpumask, cpu_online_map);
1882                 if (unlikely(cpus_empty(tmp)))
1883                         goto nextgroup;
1884
1885                 for_each_cpu_mask(i, tmp) {
1886                         /* Bias balancing toward cpus of our domain */
1887                         if (local_group)
1888                                 load = target_load(i);
1889                         else
1890                                 load = source_load(i);
1891
1892                         nr_cpus++;
1893                         avg_load += load;
1894                 }
1895
1896                 if (!nr_cpus)
1897                         goto nextgroup;
1898
1899                 total_load += avg_load;
1900                 total_pwr += group->cpu_power;
1901
1902                 /* Adjust by relative CPU power of the group */
1903                 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1904
1905                 if (local_group) {
1906                         this_load = avg_load;
1907                         this = group;
1908                         goto nextgroup;
1909                 } else if (avg_load > max_load) {
1910                         max_load = avg_load;
1911                         busiest = group;
1912                 }
1913 nextgroup:
1914                 group = group->next;
1915         } while (group != sd->groups);
1916
1917         if (!busiest || this_load >= max_load)
1918                 goto out_balanced;
1919
1920         avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
1921
1922         if (this_load >= avg_load ||
1923                         100*max_load <= sd->imbalance_pct*this_load)
1924                 goto out_balanced;
1925
1926         /*
1927          * We're trying to get all the cpus to the average_load, so we don't
1928          * want to push ourselves above the average load, nor do we wish to
1929          * reduce the max loaded cpu below the average load, as either of these
1930          * actions would just result in more rebalancing later, and ping-pong
1931          * tasks around. Thus we look for the minimum possible imbalance.
1932          * Negative imbalances (*we* are more loaded than anyone else) will
1933          * be counted as no imbalance for these purposes -- we can't fix that
1934          * by pulling tasks to us.  Be careful of negative numbers as they'll
1935          * appear as very large values with unsigned longs.
1936          */
1937         *imbalance = min(max_load - avg_load, avg_load - this_load);
1938
1939         /* How much load to actually move to equalise the imbalance */
1940         *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
1941                                 / SCHED_LOAD_SCALE;
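        /*
         * Illustrative numbers only: if, in SCHED_LOAD_SCALE units,
         * this_load == 1, avg_load == 2 and max_load == 4, the raw
         * imbalance is min(4 - 2, 2 - 1) == 1, i.e. about one task's
         * worth of load to pull (the scaling factor is divided back
         * out further down).
         */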
1942
1943         if (*imbalance < SCHED_LOAD_SCALE - 1) {
1944                 unsigned long pwr_now = 0, pwr_move = 0;
1945                 unsigned long tmp;
1946
1947                 if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
1948                         *imbalance = 1;
1949                         return busiest;
1950                 }
1951
1952                 /*
1953                  * OK, we don't have enough imbalance to justify moving tasks,
1954                  * however we may be able to increase total CPU power used by
1955                  * moving them.
1956                  */
1957
1958                 pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
1959                 pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
1960                 pwr_now /= SCHED_LOAD_SCALE;
1961
1962                 /* Amount of load we'd subtract */
1963                 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
1964                 if (max_load > tmp)
1965                         pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
1966                                                         max_load - tmp);
1967
1968                 /* Amount of load we'd add */
1969                 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
1970                 if (max_load < tmp)
1971                         tmp = max_load;
1972                 pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
1973                 pwr_move /= SCHED_LOAD_SCALE;
1974
1975                 /* Move if we gain another 8th of a CPU worth of throughput */
1976                 if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
1977                         goto out_balanced;
1978
1979                 *imbalance = 1;
1980                 return busiest;
1981         }
1982
1983         /* Get rid of the scaling factor, rounding down as we divide */
1984         *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE;
1985
1986         return busiest;
1987
1988 out_balanced:
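        /*
         * Even if the group averages look balanced, a newly-idle CPU
         * may still pull one task, and a fully idle CPU may pull one
         * if the busiest group carries more than one task's worth of
         * load (max_load > SCHED_LOAD_SCALE).
         */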
1989         if (busiest && (idle == NEWLY_IDLE ||
1990                         (idle == IDLE && max_load > SCHED_LOAD_SCALE)) ) {
1991                 *imbalance = 1;
1992                 return busiest;
1993         }
1994
1995         *imbalance = 0;
1996         return NULL;
1997 }
1998
1999 /*
2000  * find_busiest_queue - find the busiest runqueue among the cpus in group.
2001  */
2002 static runqueue_t *find_busiest_queue(struct sched_group *group)
2003 {
2004         cpumask_t tmp;
2005         unsigned long load, max_load = 0;
2006         runqueue_t *busiest = NULL;
2007         int i;
2008
2009         cpus_and(tmp, group->cpumask, cpu_online_map);
2010         for_each_cpu_mask(i, tmp) {
2011                 load = source_load(i);
2012
2013                 if (load > max_load) {
2014                         max_load = load;
2015                         busiest = cpu_rq(i);
2016                 }
2017         }
2018
2019         return busiest;
2020 }
2021
2022 /*
2023  * Check this_cpu to ensure it is balanced within domain. Attempt to move
2024  * tasks if there is an imbalance.
2025  *
2026  * Called with this_rq unlocked.
2027  */
2028
2029 static inline int ckrm_load_balance(int this_cpu, runqueue_t *this_rq,
2030                                     struct sched_domain *sd, 
2031                                     enum idle_type idle)
2032 #ifndef CONFIG_CKRM_CPU_SCHEDULE
2033 {
2034         return -1;
2035 }
2036 #endif
2037 ;
2038
2039 static int load_balance(int this_cpu, runqueue_t *this_rq,
2040                         struct sched_domain *sd, enum idle_type idle)
2041 {
2042         struct sched_group *group;
2043         runqueue_t *busiest;
2044         unsigned long imbalance;
2045         int nr_moved;
2046
2047         spin_lock(&this_rq->lock);
2048
2049         if ((nr_moved = ckrm_load_balance(this_cpu,this_rq,sd,idle)) != -1)
2050                 goto out_balanced;
2051
2052         group = find_busiest_group(sd, this_cpu, &imbalance, idle);
2053         if (!group)
2054                 goto out_balanced;
2055
2056         busiest = find_busiest_queue(group);
2057         if (!busiest)
2058                 goto out_balanced;
2059         /*
2060          * This should be "impossible", but since load
2061          * balancing is inherently racy and statistical,
2062          * it could happen in theory.
2063          */
2064         if (unlikely(busiest == this_rq)) {
2065                 WARN_ON(1);
2066                 goto out_balanced;
2067         }
2068
2069         nr_moved = 0;
2070         if (busiest->nr_running > 1) {
2071                 /*
2072                  * Attempt to move tasks. If find_busiest_group has found
2073                  * an imbalance but busiest->nr_running <= 1, the group is
2074                  * still unbalanced. nr_moved simply stays zero, so it is
2075                  * correctly treated as an imbalance.
2076                  */
2077                 double_lock_balance(this_rq, busiest);
2078                 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2079                                                 imbalance, sd, idle);
2080                 spin_unlock(&busiest->lock);
2081         }
2082         spin_unlock(&this_rq->lock);
2083
2084         if (!nr_moved) {
2085                 sd->nr_balance_failed++;
2086
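                /*
                 * Passive balancing keeps failing (most likely every
                 * candidate task was running or cache-hot), so ask the
                 * busiest CPU's migration thread to actively push a
                 * running task off to an idle CPU instead
                 * (see active_load_balance()).
                 */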
2087                 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2088                         int wake = 0;
2089
2090                         spin_lock(&busiest->lock);
2091                         if (!busiest->active_balance) {
2092                                 busiest->active_balance = 1;
2093                                 busiest->push_cpu = this_cpu;
2094                                 wake = 1;
2095                         }
2096                         spin_unlock(&busiest->lock);
2097                         if (wake)
2098                                 wake_up_process(busiest->migration_thread);
2099
2100                         /*
2101                          * We've kicked active balancing, reset the failure
2102                          * counter.
2103                          */
2104                         sd->nr_balance_failed = sd->cache_nice_tries;
2105                 }
2106         } else
2107                 sd->nr_balance_failed = 0;
2108
2109         /* We were unbalanced, so reset the balancing interval */
2110         sd->balance_interval = sd->min_interval;
2111
2112         return nr_moved;
2113
2114 out_balanced:
2115         spin_unlock(&this_rq->lock);
2116
2117         /* tune up the balancing interval */
2118         if (sd->balance_interval < sd->max_interval)
2119                 sd->balance_interval *= 2;
2120
2121         return 0;
2122 }
2123
2124 /*
2125  * Check this_cpu to ensure it is balanced within domain. Attempt to move
2126  * tasks if there is an imbalance.
2127  *
2128  * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
2129  * this_rq is locked.
2130  */
2131 static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2132                                 struct sched_domain *sd)
2133 {
2134         struct sched_group *group;
2135         runqueue_t *busiest = NULL;
2136         unsigned long imbalance;
2137         int nr_moved;
2138
2139         if ((nr_moved = ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE)) != -1)
2140                 goto out;
2141
2142         nr_moved = 0;
2143         group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
2144         if (!group)
2145                 goto out;
2146
2147         busiest = find_busiest_queue(group);
2148         if (!busiest || busiest == this_rq)
2149                 goto out;
2150
2151         /* Attempt to move tasks */
2152         double_lock_balance(this_rq, busiest);
2153
2154         nr_moved = move_tasks(this_rq, this_cpu, busiest,
2155                                         imbalance, sd, NEWLY_IDLE);
2156
2157         spin_unlock(&busiest->lock);
2158
2159 out:
2160         return nr_moved;
2161 }
2162
2163 /*
2164  * idle_balance is called by schedule() if this_cpu is about to become
2165  * idle. Attempts to pull tasks from other CPUs.
2166  */
2167 static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
2168 {
2169         struct sched_domain *sd;
2170
2171         for_each_domain(this_cpu, sd) {
2172                 if (sd->flags & SD_BALANCE_NEWIDLE) {
2173                         if (load_balance_newidle(this_cpu, this_rq, sd)) {
2174                                 /* We've pulled tasks over so stop searching */
2175                                 break;
2176                         }
2177                 }
2178         }
2179 }
2180
2181 /*
2182  * active_load_balance is run by migration threads. It pushes a running
2183  * task off the cpu. It can be needed so that, where possible, at
2184  * least one task runs on each physical CPU and there is no
2185  * physical/logical imbalance.
2186  *
2187  * Called with busiest locked.
2188  */
2189 static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
2190 {
2191         struct sched_domain *sd;
2192         struct sched_group *group, *busy_group;
2193         int i;
2194
2195         if (busiest->nr_running <= 1)
2196                 return;
2197
2198         for_each_domain(busiest_cpu, sd)
2199                 if (cpu_isset(busiest->push_cpu, sd->span))
2200                         break;
2201         if (!sd) {
2202                 WARN_ON(1);
2203                 return;
2204         }
2205
2206         group = sd->groups;
2207         while (!cpu_isset(busiest_cpu, group->cpumask))
2208                 group = group->next;
2209         busy_group = group;
2210
2211         group = sd->groups;
2212         do {
2213                 cpumask_t tmp;
2214                 runqueue_t *rq;
2215                 int push_cpu = 0;
2216
2217                 if (group == busy_group)
2218                         goto next_group;
2219
2220                 cpus_and(tmp, group->cpumask, cpu_online_map);
2221                 if (!cpus_weight(tmp))
2222                         goto next_group;
2223
2224                 for_each_cpu_mask(i, tmp) {
2225                         if (!idle_cpu(i))
2226                                 goto next_group;
2227                         push_cpu = i;
2228                 }
2229
2230                 rq = cpu_rq(push_cpu);
2231
2232                 /*
2233                  * This condition is "impossible", but since load
2234                  * balancing is inherently a bit racy and statistical,
2235                  * it can trigger.. Reported by Bjorn Helgaas on a
2236                  * 128-cpu setup.
2237                  */
2238                 if (unlikely(busiest == rq))
2239                         goto next_group;
2240                 double_lock_balance(busiest, rq);
2241                 move_tasks(rq, push_cpu, busiest, 1, sd, IDLE);
2242                 spin_unlock(&rq->lock);
2243 next_group:
2244                 group = group->next;
2245         } while (group != sd->groups);
2246 }
2247
2248 /*
2249  * rebalance_tick will get called every timer tick, on every CPU.
2250  *
2251  * It checks each scheduling domain to see if it is due to be balanced,
2252  * and initiates a balancing operation if so.
2253  *
2254  * Balancing parameters are set up in arch_init_sched_domains.
2255  */
2256
2257 /* Don't have all balancing operations going off at once */
2258 #define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
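/*
 * For example (illustrative values only): with HZ == 1000 and
 * NR_CPUS == 128, consecutive CPUs check their balancing intervals
 * roughly 7 jiffies apart instead of all on the same tick.
 */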
2259
2260 static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2261                            enum idle_type idle)
2262 {
2263         unsigned long old_load, this_load;
2264         unsigned long j = jiffies + CPU_OFFSET(this_cpu);
2265         struct sched_domain *sd;
2266
2267         ckrm_sched_tick(j,this_cpu,(idle != NOT_IDLE),this_rq);
2268
2269         /* Update our load */
2270         old_load = this_rq->cpu_load;
2271         this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
2272         /*
2273          * Round up the averaging division if load is increasing. This
2274          * prevents us from getting stuck on 9 if the load is 10, for
2275          * example.
2276          */
2277         if (this_load > old_load)
2278                 old_load++;
2279         this_rq->cpu_load = (old_load + this_load) / 2;
2280
2281         for_each_domain(this_cpu, sd) {
2282                 unsigned long interval = sd->balance_interval;
2283
2284                 if (idle != IDLE)
2285                         interval *= sd->busy_factor;
2286
2287                 /* scale ms to jiffies */
2288                 interval = msecs_to_jiffies(interval);
2289                 if (unlikely(!interval))
2290                         interval = 1;
2291
2292                 if (j - sd->last_balance >= interval) {
2293                         if (load_balance(this_cpu, this_rq, sd, idle)) {
2294                                 /* We've pulled tasks over so no longer idle */
2295                                 idle = NOT_IDLE;
2296                         }
2297                         sd->last_balance += interval;
2298                 }
2299         }
2300 }
2301 #else /* SMP*/
2302 /*
2303  * on UP we do not need to balance between CPUs:
2304  */
2305 static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
2306 {
2307         ckrm_sched_tick(jiffies,cpu,(idle != NOT_IDLE),rq);
2308 }
2309
2310 static inline void idle_balance(int cpu, runqueue_t *rq)
2311 {
2312 }
2313 #endif
2314
2315 static inline int wake_priority_sleeper(runqueue_t *rq)
2316 {
2317 #ifdef CONFIG_SCHED_SMT
2318         /*
2319          * If an SMT sibling task has been put to sleep for priority
2320          * reasons, reschedule the idle task to see if it can now run.
2321          */
2322         if (rq->nr_running) {
2323                 resched_task(rq->idle);
2324                 return 1;
2325         }
2326 #endif
2327         return 0;
2328 }
2329
2330 DEFINE_PER_CPU(struct kernel_stat, kstat);
2331 EXPORT_PER_CPU_SYMBOL(kstat);
2332
2333 /*
2334  * We place interactive tasks back into the active array, if possible.
2335  *
2336  * To guarantee that this does not starve expired tasks we ignore the
2337  * interactivity of a task if the first expired task had to wait more
2338  * than a 'reasonable' amount of time. This deadline timeout is
2339  * load-dependent, as the frequency of array switches decreases with
2340  * increasing number of running tasks. We also ignore the interactivity
2341  * if a better static_prio task has expired:
2342  */
2343
2344 #ifndef CONFIG_CKRM_CPU_SCHEDULE
2345 #define EXPIRED_STARVING(rq) \
2346                 ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
2347                 (jiffies - (rq)->expired_timestamp >= \
2348                         STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
2349                         ((rq)->curr->static_prio > (rq)->best_expired_prio))
2350 #else
2351 /* We need to scale the starvation limit by class weight:
2352  * classes with a small weight get a longer expiration-starvation window.
2353  */
2354 #define EXPIRED_STARVING(rq) \
2355                 ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
2356                 (jiffies - (rq)->expired_timestamp >= \
2357                         (((STARVATION_LIMIT * (lrq_nr_running(rq)) + 1)*CKRM_MAX_WEIGHT)/rq->local_weight)))) || \
2358                         (this_rq()->curr->static_prio > (rq)->best_expired_prio))
2359 #endif
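/*
 * Note on the CKRM variant above: the starvation deadline is scaled by
 * CKRM_MAX_WEIGHT / local_weight, so a class with half of the maximum
 * weight waits roughly twice as long before its expired array is
 * considered starved.
 */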
2360
2361 /*
2362  * This function gets called by the timer code, with HZ frequency.
2363  * We call it with interrupts disabled.
2364  *
2365  * It also gets called by the fork code, when changing the parent's
2366  * timeslices.
2367  */
2368 void scheduler_tick(int user_ticks, int sys_ticks)
2369 {
2370         int cpu = smp_processor_id();
2371         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2372         runqueue_t *rq = this_rq();
2373         task_t *p = current;
2374
2375         rq->timestamp_last_tick = sched_clock();
2376
2377         if (rcu_pending(cpu))
2378                 rcu_check_callbacks(cpu, user_ticks);
2379
2380         /* note: this timer irq context must be accounted for as well */
2381         if (hardirq_count() - HARDIRQ_OFFSET) {
2382                 cpustat->irq += sys_ticks;
2383                 sys_ticks = 0;
2384         } else if (softirq_count()) {
2385                 cpustat->softirq += sys_ticks;
2386                 sys_ticks = 0;
2387         }
2388
2389         if (p == rq->idle) {
2390 #ifdef  CONFIG_VSERVER_HARDCPU
2391                 if (!--rq->idle_tokens && !list_empty(&rq->hold_queue))
2392                         set_need_resched();     
2393 #endif
2394
2395                 if (atomic_read(&rq->nr_iowait) > 0)
2396                         cpustat->iowait += sys_ticks;
2397                 else
2398                         cpustat->idle += sys_ticks;
2399                 if (wake_priority_sleeper(rq))
2400                         goto out;
2401                 rebalance_tick(cpu, rq, IDLE);
2402                 return;
2403         }
2404         if (TASK_NICE(p) > 0)
2405                 cpustat->nice += user_ticks;
2406         else
2407                 cpustat->user += user_ticks;
2408         cpustat->system += sys_ticks;
2409
2410         /* Task might have expired already, but not scheduled off yet */
2411         if (p->array != rq_active(p,rq)) {
2412                 set_tsk_need_resched(p);
2413                 goto out;
2414         }
2415         spin_lock(&rq->lock);
2416         /*
2417          * The task was running during this tick - update the
2418          * time slice counter. Note: we do not update a thread's
2419          * priority until it either goes to sleep or uses up its
2420          * timeslice. This makes it possible for interactive tasks
2421          * to use up their timeslices at their highest priority levels.
2422          */
2423         if (unlikely(rt_task(p))) {
2424                 /*
2425                  * RR tasks need a special form of timeslice management.
2426                  * FIFO tasks have no timeslices.
2427                  */
2428                 if ((p->policy == SCHED_RR) && !--p->time_slice) {
2429                         p->time_slice = task_timeslice(p);
2430                         p->first_time_slice = 0;
2431                         set_tsk_need_resched(p);
2432
2433                         /* put it at the end of the queue: */
2434                         dequeue_task(p, rq_active(p,rq));
2435                         enqueue_task(p, rq_active(p,rq));
2436                 }
2437                 goto out_unlock;
2438         }
2439         if (vx_need_resched(p)) {
2440 #ifdef CONFIG_CKRM_CPU_SCHEDULE
2441                 /* we redefine RQ to be a local runqueue */
2442                 ckrm_lrq_t* rq;
2443                 runqueue_t *cpu_rq = this_rq();
2444                 rq = ckrm_rq_cpu_enabled(cpu_rq) ? get_task_lrq(p) 
2445                                                  : &(cpu_rq->dflt_lrq);
2446 #endif
2447                 dequeue_task(p, rq->active);
2448                 set_tsk_need_resched(p);
2449                 p->prio = effective_prio(p);
2450                 p->time_slice = task_timeslice(p);
2451                 p->first_time_slice = 0;
2452
2453                 if (!rq->expired_timestamp)
2454                         rq->expired_timestamp = jiffies;
2455                 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
2456                         enqueue_task(p, rq->expired);
2457                         if (p->static_prio < rq->best_expired_prio)
2458                                 rq->best_expired_prio = p->static_prio;
2459                 } else
2460                         enqueue_task(p, rq->active);
2461         } else {
2462                 /*
2463                  * Prevent a too long timeslice allowing a task to monopolize
2464                  * the CPU. We do this by splitting up the timeslice into
2465                  * smaller pieces.
2466                  *
2467                  * Note: this does not mean the task's timeslices expire or
2468                  * get lost in any way, they just might be preempted by
2469                  * another task of equal priority. (one with higher
2470                  * priority would have preempted this task already.) We
2471                  * requeue this task to the end of the list on this priority
2472                  * level, which is in essence a round-robin of tasks with
2473                  * equal priority.
2474                  *
2475                  * This only applies to tasks in the interactive
2476                  * delta range with at least TIMESLICE_GRANULARITY to requeue.
2477                  */
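                /*
                 * Illustrative numbers only: a task with a 100 ms
                 * timeslice and a 20 ms granularity is requeued each
                 * time another 20 ms of the slice has been used, as
                 * long as at least one full granule of the slice
                 * remains.
                 */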
2478                 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
2479                         p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
2480                         (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
2481                         (p->array == rq_active(p,rq))) {
2482
2483                         dequeue_task(p, rq_active(p,rq));
2484                         set_tsk_need_resched(p);
2485                         p->prio = effective_prio(p);
2486                         enqueue_task(p, rq_active(p,rq));
2487                 }
2488         }
2489 out_unlock:
2490         spin_unlock(&rq->lock);
2491 out:
2492         rebalance_tick(cpu, rq, NOT_IDLE);
2493 }
2494
2495 #ifdef CONFIG_SCHED_SMT
2496 static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq)
2497 {
2498         int i;
2499         struct sched_domain *sd = rq->sd;
2500         cpumask_t sibling_map;
2501
2502         if (!(sd->flags & SD_SHARE_CPUPOWER))
2503                 return;
2504
2505         cpus_and(sibling_map, sd->span, cpu_online_map);
2506         for_each_cpu_mask(i, sibling_map) {
2507                 runqueue_t *smt_rq;
2508
2509                 if (i == cpu)
2510                         continue;
2511
2512                 smt_rq = cpu_rq(i);
2513
2514                 /*
2515                  * If an SMT sibling task is sleeping due to priority
2516                  * reasons, wake it up now.
2517                  */
2518                 if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
2519                         resched_task(smt_rq->idle);
2520         }
2521 }
2522
2523 static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p)
2524 {
2525         struct sched_domain *sd = rq->sd;
2526         cpumask_t sibling_map;
2527         int ret = 0, i;
2528
2529         if (!(sd->flags & SD_SHARE_CPUPOWER))
2530                 return 0;
2531
2532         cpus_and(sibling_map, sd->span, cpu_online_map);
2533         for_each_cpu_mask(i, sibling_map) {
2534                 runqueue_t *smt_rq;
2535                 task_t *smt_curr;
2536
2537                 if (i == cpu)
2538                         continue;
2539
2540                 smt_rq = cpu_rq(i);
2541                 smt_curr = smt_rq->curr;
2542
2543                 /*
2544                  * If a user task with lower static priority than the
2545                  * running task on the SMT sibling is trying to schedule,
2546                  * delay it till there is proportionately less timeslice
2547                  * left of the sibling task to prevent a lower priority
2548                  * task from using an unfair proportion of the
2549                  * physical cpu's resources. -ck
2550                  */
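                /*
                 * E.g. assuming an SMT per_cpu_gain of 25: the new task
                 * is held off while 75% of the sibling's remaining
                 * timeslice still exceeds the new task's full timeslice
                 * (illustrative value; per_cpu_gain is set per-domain).
                 */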
2551                 if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) >
2552                         task_timeslice(p) || rt_task(smt_curr)) &&
2553                         p->mm && smt_curr->mm && !rt_task(p))
2554                                 ret = 1;
2555
2556                 /*
2557                  * Reschedule a lower priority task on the SMT sibling,
2558                  * or wake it up if it has been put to sleep for priority
2559                  * reasons.
2560                  */
2561                 if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) >
2562                         task_timeslice(smt_curr) || rt_task(p)) &&
2563                         smt_curr->mm && p->mm && !rt_task(smt_curr)) ||
2564                         (smt_curr == smt_rq->idle && smt_rq->nr_running))
2565                                 resched_task(smt_curr);
2566         }
2567         return ret;
2568 }
2569 #else
2570 static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq)
2571 {
2572 }
2573
2574 static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p)
2575 {
2576         return 0;
2577 }
2578 #endif
2579
2580 /*
2581  * schedule() is the main scheduler function.
2582  */
2583 asmlinkage void __sched schedule(void)
2584 {
2585         long *switch_count;
2586         task_t *prev, *next;
2587         runqueue_t *rq;
2588         prio_array_t *array;
2589         unsigned long long now;
2590         unsigned long run_time;
2591         int cpu;
2592
2593
2594         /*
2595          * If a crash dump is in progress, the other CPUs
2596          * need to wait until it completes.
2597          * NB: this code is optimized away for kernels without
2598          * dumping enabled.
2599          */
2600          if (unlikely(dump_oncpu))
2601                  goto dump_scheduling_disabled;
2602
2603         /*
2604          * Test if we are atomic.  Since do_exit() needs to call into
2605          * schedule() atomically, we ignore that path for now.
2606          * Otherwise, whine if we are scheduling when we should not be.
2607          */
2608         if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) {
2609                 if (unlikely(in_atomic())) {
2610                         printk(KERN_ERR "bad: scheduling while atomic!\n");
2611                         dump_stack();
2612                 }
2613         }
2614
2615 need_resched:
2616         preempt_disable();
2617         prev = current;
2618         rq = this_rq();
2619
2620         release_kernel_lock(prev);
2621         now = sched_clock();
2622         if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
2623                 run_time = now - prev->timestamp;
2624         else
2625                 run_time = NS_MAX_SLEEP_AVG;
2626
2627         /*
2628          * Tasks with interactive credits get charged less run_time
2629          * at high sleep_avg to delay them losing their interactive
2630          * status
2631          */
2632         if (HIGH_CREDIT(prev))
2633                 run_time /= (CURRENT_BONUS(prev) ? : 1);
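        /*
         * With the default MAX_BONUS of 10 (an assumption; the tunable
         * is defined elsewhere in this file), a high-credit task at the
         * maximum bonus is charged only about a tenth of its real run
         * time by the division above.
         */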
2634
2635         spin_lock_irq(&rq->lock);
2636
2637         ckrm_account_task(rq,prev,now);
2638
2639         /*
2640          * if entering off of a kernel preemption go straight
2641          * to picking the next task.
2642          */
2643         switch_count = &prev->nivcsw;
2644         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2645                 switch_count = &prev->nvcsw;
2646                 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
2647                                 unlikely(signal_pending(prev))))
2648                         prev->state = TASK_RUNNING;
2649                 else
2650                         deactivate_task(prev, rq);
2651         }
2652
2653         cpu = smp_processor_id();
2654
2655 #ifdef  CONFIG_VSERVER_HARDCPU          
2656         if (!list_empty(&rq->hold_queue)) {
2657                 struct list_head *l, *n;
2658                 int ret;
2659
2660                 vxi = NULL;
2661                 list_for_each_safe(l, n, &rq->hold_queue) {
2662                         next = list_entry(l, task_t, run_list);
2663                         if (vxi == next->vx_info)
2664                                 continue;
2665                         
2666                         vxi = next->vx_info;
2667                         ret = vx_tokens_recalc(vxi);
2668                         // tokens = vx_tokens_avail(next);
2669
2670                         if (ret > 0) {
2671                                 list_del(&next->run_list);
2672                                 next->state &= ~TASK_ONHOLD;
2673                                 recalc_task_prio(next, now);
2674                                 __activate_task(next, rq);
2675                                 // printk("unhold %p\n", next);
2676                                 break;
2677                         }
2678                         if ((ret < 0) && (maxidle < ret))
2679                                 maxidle = ret;
2680                 }
2681         }
2682         rq->idle_tokens = -maxidle;
2683         
2684  pick_next:
2685 #endif
2686         next = rq_get_next_task(rq,cpu);
2687         if (unlikely(next == NULL)) {
2688                 next = rq->idle;
2689                 goto switch_tasks;
2690         }
2691
2692         if (dependent_sleeper(cpu, rq, next)) {
2693                 next = rq->idle;
2694                 goto switch_tasks;
2695         }
2696
2697 #ifdef  CONFIG_VSERVER_HARDCPU          
2698         vxi = next->vx_info;
2699         if (vxi && __vx_flags(vxi->vx_flags,
2700                               VXF_SCHED_PAUSE|VXF_SCHED_HARD, 0)) {
2701                 int ret = vx_tokens_recalc(vxi);
2702                 
2703                 if (unlikely(ret <= 0)) {
2704                         if (ret && (rq->idle_tokens > -ret))
2705                                 rq->idle_tokens = -ret;
2706                         deactivate_task(next, rq);
2707                         list_add_tail(&next->run_list, &rq->hold_queue);
2708                         next->state |= TASK_ONHOLD;                     
2709                         goto pick_next;
2710                 }
2711         }
2712 #endif
2713
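        /*
         * Time spent runnable but waiting on the runqueue is partially
         * credited back as sleep before the priority is recalculated:
         * for activated == 1 wakeups only ON_RUNQUEUE_WEIGHT percent
         * (30% with the default tunable) of the wait is credited; the
         * *128/100/128 below is just fixed-point arithmetic for that.
         */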
2714         if (!rt_task(next) && next->activated > 0) {
2715                 unsigned long long delta = now - next->timestamp;
2716
2717                 if (next->activated == 1)
2718                         delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
2719
2720                 array = next->array;
2721                 dequeue_task(next, array);
2722                 recalc_task_prio(next, next->timestamp + delta);
2723                 enqueue_task(next, array);
2724         }
2725         next->activated = 0;
2726 switch_tasks:
2727         prefetch(next);
2728         if (test_and_clear_tsk_thread_flag(prev,TIF_NEED_RESCHED))
2729                 rq->nr_preempt++;
2730         RCU_qsctr(task_cpu(prev))++;
2731
2732         prev->sleep_avg -= run_time;
2733         if ((long)prev->sleep_avg <= 0) {
2734                 prev->sleep_avg = 0;
2735                 if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev)))
2736                         prev->interactive_credit--;
2737         }
2738         add_delay_ts(prev,runcpu_total,prev->timestamp,now);
2739         prev->timestamp = now;
2740
2741         if (likely(prev != next)) {
2742                 add_delay_ts(next,waitcpu_total,next->timestamp,now);
2743                 inc_delay(next,runs);
2744                 next->timestamp = now;
2745                 rq->nr_switches++;
2746                 rq->curr = next;
2747                 ++*switch_count;
2748
2749                 prepare_arch_switch(rq, next);
2750                 prev = context_switch(rq, prev, next);
2751                 barrier();
2752
2753                 finish_task_switch(prev);
2754         } else
2755                 spin_unlock_irq(&rq->lock);
2756
2757         reacquire_kernel_lock(current);
2758         preempt_enable_no_resched();
2759         if (test_thread_flag(TIF_NEED_RESCHED))
2760                 goto need_resched;
2761
2762         
2763         return;
2764         
2765  dump_scheduling_disabled:
2766         /* allow scheduling only if this is the dumping cpu */
2767         if (dump_oncpu != smp_processor_id()+1) {
2768                 while (dump_oncpu)
2769                         cpu_relax();
2770         }
2771         return;
2772 }
2773
2774 EXPORT_SYMBOL(schedule);
2775 #ifdef CONFIG_PREEMPT
2776 /*
2777  * this is the entry point to schedule() from in-kernel preemption
2778  * off of preempt_enable.  Kernel preemptions off return from interrupt
2779  * occur there and call schedule directly.
2780  */
2781 asmlinkage void __sched preempt_schedule(void)
2782 {
2783         struct thread_info *ti = current_thread_info();
2784
2785         /*
2786          * If there is a non-zero preempt_count or interrupts are disabled,
2787          * we do not want to preempt the current task.  Just return..
2788          */
2789         if (unlikely(ti->preempt_count || irqs_disabled()))
2790                 return;
2791
2792 need_resched:
2793         ti->preempt_count = PREEMPT_ACTIVE;
2794         schedule();
2795         ti->preempt_count = 0;
2796
2797         /* we could miss a preemption opportunity between schedule and now */
2798         barrier();
2799         if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
2800                 goto need_resched;
2801 }
2802
2803 EXPORT_SYMBOL(preempt_schedule);
2804 #endif /* CONFIG_PREEMPT */
2805
2806 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
2807 {
2808         task_t *p = curr->task;
2809         return try_to_wake_up(p, mode, sync);
2810 }
2811
2812 EXPORT_SYMBOL(default_wake_function);
2813
2814 /*
2815  * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
2816  * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small +ve
2817  * number) then we wake all the non-exclusive tasks and one exclusive task.
2818  *
2819  * There are circumstances in which we can try to wake a task which has already
2820  * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
2821  * zero in this (rare) case, and we handle it by continuing to scan the queue.
2822  */
2823 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
2824                              int nr_exclusive, int sync, void *key)
2825 {
2826         struct list_head *tmp, *next;
2827
2828         list_for_each_safe(tmp, next, &q->task_list) {
2829                 wait_queue_t *curr;
2830                 unsigned flags;
2831                 curr = list_entry(tmp, wait_queue_t, task_list);
2832                 flags = curr->flags;
2833                 if (curr->func(curr, mode, sync, key) &&
2834                     (flags & WQ_FLAG_EXCLUSIVE) &&
2835                     !--nr_exclusive)
2836                         break;
2837         }
2838 }
2839
2840 /**
2841  * __wake_up - wake up threads blocked on a waitqueue.
2842  * @q: the waitqueue
2843  * @mode: which threads
2844  * @nr_exclusive: how many wake-one or wake-many threads to wake up
2845  */
2846 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
2847                                 int nr_exclusive, void *key)
2848 {
2849         unsigned long flags;
2850
2851         spin_lock_irqsave(&q->lock, flags);
2852         __wake_up_common(q, mode, nr_exclusive, 0, key);
2853         spin_unlock_irqrestore(&q->lock, flags);
2854 }
2855
2856 EXPORT_SYMBOL(__wake_up);
2857
2858 /*
2859  * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
2860  */
2861 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
2862 {
2863         __wake_up_common(q, mode, 1, 0, NULL);
2864 }
2865
2866 /**
2867  * __wake_up_sync - wake up threads blocked on a waitqueue.
2868  * @q: the waitqueue
2869  * @mode: which threads
2870  * @nr_exclusive: how many wake-one or wake-many threads to wake up
2871  *
2872  * The sync wakeup differs in that the waker knows that it will schedule
2873  * away soon, so while the target thread will be woken up, it will not
2874  * be migrated to another CPU - ie. the two threads are 'synchronized'
2875  * with each other. This can prevent needless bouncing between CPUs.
2876  *
2877  * On UP it can prevent extra preemption.
2878  */
2879 void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
2880 {
2881         unsigned long flags;
2882         int sync = 1;
2883
2884         if (unlikely(!q))
2885                 return;
2886
2887         if (unlikely(!nr_exclusive))
2888                 sync = 0;
2889
2890         spin_lock_irqsave(&q->lock, flags);
2891         __wake_up_common(q, mode, nr_exclusive, sync, NULL);
2892         spin_unlock_irqrestore(&q->lock, flags);
2893 }
2894 EXPORT_SYMBOL_GPL(__wake_up_sync);      /* For internal use only */
2895
2896 void fastcall complete(struct completion *x)
2897 {
2898         unsigned long flags;
2899
2900         spin_lock_irqsave(&x->wait.lock, flags);
2901         x->done++;
2902         __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
2903                          1, 0, NULL);
2904         spin_unlock_irqrestore(&x->wait.lock, flags);
2905 }
2906 EXPORT_SYMBOL(complete);
2907
2908 void fastcall complete_all(struct completion *x)
2909 {
2910         unsigned long flags;
2911
2912         spin_lock_irqsave(&x->wait.lock, flags);
2913         x->done += UINT_MAX/2;
2914         __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
2915                          0, 0, NULL);
2916         spin_unlock_irqrestore(&x->wait.lock, flags);
2917 }
2918 EXPORT_SYMBOL(complete_all);
2919
2920 void fastcall __sched wait_for_completion(struct completion *x)
2921 {
2922         might_sleep();
2923         spin_lock_irq(&x->wait.lock);
2924         if (!x->done) {
2925                 DECLARE_WAITQUEUE(wait, current);
2926
2927                 wait.flags |= WQ_FLAG_EXCLUSIVE;
2928                 __add_wait_queue_tail(&x->wait, &wait);
2929                 do {
2930                         __set_current_state(TASK_UNINTERRUPTIBLE);
2931                         spin_unlock_irq(&x->wait.lock);
2932                         schedule();
2933                         spin_lock_irq(&x->wait.lock);
2934                 } while (!x->done);
2935                 __remove_wait_queue(&x->wait, &wait);
2936         }
2937         x->done--;
2938         spin_unlock_irq(&x->wait.lock);
2939 }
2940 EXPORT_SYMBOL(wait_for_completion);
2941
2942 #define SLEEP_ON_VAR                                    \
2943         unsigned long flags;                            \
2944         wait_queue_t wait;                              \
2945         init_waitqueue_entry(&wait, current);
2946
2947 #define SLEEP_ON_HEAD                                   \
2948         spin_lock_irqsave(&q->lock,flags);              \
2949         __add_wait_queue(q, &wait);                     \
2950         spin_unlock(&q->lock);
2951
2952 #define SLEEP_ON_TAIL                                   \
2953         spin_lock_irq(&q->lock);                        \
2954         __remove_wait_queue(q, &wait);                  \
2955         spin_unlock_irqrestore(&q->lock, flags);
2956
2957 #define SLEEP_ON_BKLCHECK                               \
2958         if (unlikely(!kernel_locked()) &&               \
2959             sleep_on_bkl_warnings < 10) {               \
2960                 sleep_on_bkl_warnings++;                \
2961                 WARN_ON(1);                             \
2962         }
2963
2964 static int sleep_on_bkl_warnings;
2965
2966 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
2967 {
2968         SLEEP_ON_VAR
2969
2970         SLEEP_ON_BKLCHECK
2971
2972         current->state = TASK_INTERRUPTIBLE;
2973
2974         SLEEP_ON_HEAD
2975         schedule();
2976         SLEEP_ON_TAIL
2977 }
2978
2979 EXPORT_SYMBOL(interruptible_sleep_on);
2980
2981 long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
2982 {
2983         SLEEP_ON_VAR
2984
2985         SLEEP_ON_BKLCHECK
2986
2987         current->state = TASK_INTERRUPTIBLE;
2988
2989         SLEEP_ON_HEAD
2990         timeout = schedule_timeout(timeout);
2991         SLEEP_ON_TAIL
2992
2993         return timeout;
2994 }
2995
2996 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
2997
2998 void fastcall __sched sleep_on(wait_queue_head_t *q)
2999 {
3000         SLEEP_ON_VAR
3001
3002         SLEEP_ON_BKLCHECK
3003
3004         current->state = TASK_UNINTERRUPTIBLE;
3005
3006         SLEEP_ON_HEAD
3007         schedule();
3008         SLEEP_ON_TAIL
3009 }
3010
3011 EXPORT_SYMBOL(sleep_on);
3012
3013 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3014 {
3015         SLEEP_ON_VAR
3016
3017         SLEEP_ON_BKLCHECK
3018
3019         current->state = TASK_UNINTERRUPTIBLE;
3020
3021         SLEEP_ON_HEAD
3022         timeout = schedule_timeout(timeout);
3023         SLEEP_ON_TAIL
3024
3025         return timeout;
3026 }
3027
3028 EXPORT_SYMBOL(sleep_on_timeout);
3029
3030 void set_user_nice(task_t *p, long nice)
3031 {
3032         unsigned long flags;
3033         prio_array_t *array;
3034         runqueue_t *rq;
3035         int old_prio, new_prio, delta;
3036
3037         if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3038                 return;
3039         /*
3040          * We have to be careful, if called from sys_setpriority(),
3041          * the task might be in the middle of scheduling on another CPU.
3042          */
3043         rq = task_rq_lock(p, &flags);
3044         /*
3045          * The RT priorities are set via setscheduler(), but we still
3046          * allow the 'normal' nice value to be set - but as expected
3047          * it won't have any effect on scheduling until the task
3048          * becomes SCHED_NORMAL again:
3049          */
3050         if (rt_task(p)) {
3051                 p->static_prio = NICE_TO_PRIO(nice);
3052                 goto out_unlock;
3053         }
3054         array = p->array;
3055         if (array)
3056                 dequeue_task(p, array);
3057
3058         old_prio = p->prio;
3059         new_prio = NICE_TO_PRIO(nice);
3060         delta = new_prio - old_prio;
3061         p->static_prio = NICE_TO_PRIO(nice);
3062         p->prio += delta;
3063
3064         if (array) {
3065                 enqueue_task(p, array);
3066                 /*
3067                  * If the task increased its priority or is running and
3068                  * lowered its priority, then reschedule its CPU:
3069                  */
3070                 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3071                         resched_task(rq->curr);
3072         }
3073 out_unlock:
3074         task_rq_unlock(rq, &flags);
3075 }
3076
3077 EXPORT_SYMBOL(set_user_nice);
3078
3079 #ifdef __ARCH_WANT_SYS_NICE
3080
3081 /*
3082  * sys_nice - change the priority of the current process.
3083  * @increment: priority increment
3084  *
3085  * sys_setpriority is a more generic, but much slower function that
3086  * does similar things.
3087  */
3088 asmlinkage long sys_nice(int increment)
3089 {
3090         int retval;
3091         long nice;
3092
3093         /*
3094          * Setpriority might change our priority at the same moment.
3095          * We don't have to worry. Conceptually one call occurs first
3096          * and we have a single winner.
3097          */
3098         if (increment < 0) {
3099                 if (!capable(CAP_SYS_NICE))
3100                         return -EPERM;
3101                 if (increment < -40)
3102                         increment = -40;
3103         }
3104         if (increment > 40)
3105                 increment = 40;
3106
3107         nice = PRIO_TO_NICE(current->static_prio) + increment;
3108         if (nice < -20)
3109                 nice = -20;
3110         if (nice > 19)
3111                 nice = 19;
3112
3113         retval = security_task_setnice(current, nice);
3114         if (retval)
3115                 return retval;
3116
3117         set_user_nice(current, nice);
3118         return 0;
3119 }
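
/*
 * Worked example of the clamping above: a task at nice 10 calling
 * nice(-50) needs CAP_SYS_NICE (negative increment) and has the
 * increment limited to -40; the resulting nice value 10 + (-40) = -30
 * is then clamped to -20 before security_task_setnice() and
 * set_user_nice() are invoked.
 */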
3120
3121 #endif
3122
3123 /**
3124  * task_prio - return the priority value of a given task.
3125  * @p: the task in question.
3126  *
3127  * This is the priority value as seen by users in /proc.
3128  * RT tasks are offset by -200. Normal tasks are centered
3129  * around 0, value goes from -16 to +15.
3130  */
3131 int task_prio(const task_t *p)
3132 {
3133         return p->prio - MAX_RT_PRIO;
3134 }
3135
3136 /**
3137  * task_nice - return the nice value of a given task.
3138  * @p: the task in question.
3139  */
3140 int task_nice(const task_t *p)
3141 {
3142         return TASK_NICE(p);
3143 }
3144 EXPORT_SYMBOL(task_nice);
3145
3146 /**
3147  * idle_cpu - is a given cpu idle currently?
3148  * @cpu: the processor in question.
3149  */
3150 int idle_cpu(int cpu)
3151 {
3152         return cpu_curr(cpu) == cpu_rq(cpu)->idle;
3153 }
3154
3155 EXPORT_SYMBOL_GPL(idle_cpu);
3156
3157 /**
3158  * find_process_by_pid - find a process with a matching PID value.
3159  * @pid: the pid in question.
3160  */
3161 static inline task_t *find_process_by_pid(pid_t pid)
3162 {
3163         return pid ? find_task_by_pid(pid) : current;
3164 }
3165
3166 /* Actually do priority change: must hold rq lock. */
3167 static void __setscheduler(struct task_struct *p, int policy, int prio)
3168 {
3169         BUG_ON(p->array);
3170         p->policy = policy;
3171         p->rt_priority = prio;
3172         if (policy != SCHED_NORMAL)
3173                 p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
3174         else
3175                 p->prio = p->static_prio;
3176 }
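
/*
 * Illustrative mapping, assuming MAX_USER_RT_PRIO is 100: an RT task
 * with rt_priority 1 ends up with p->prio = 98, while rt_priority 99
 * maps to p->prio = 0, the most urgent slot.  A SCHED_NORMAL task
 * simply runs at its nice-derived p->static_prio.
 */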
3177
3178 /*
3179  * setscheduler - change the scheduling policy and/or RT priority of a thread.
3180  */
3181 static int setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3182 {
3183         struct sched_param lp;
3184         int retval = -EINVAL;
3185         int oldprio;
3186         prio_array_t *array;
3187         unsigned long flags;
3188         runqueue_t *rq;
3189         task_t *p;
3190
3191         if (!param || pid < 0)
3192                 goto out_nounlock;
3193
3194         retval = -EFAULT;
3195         if (copy_from_user(&lp, param, sizeof(struct sched_param)))
3196                 goto out_nounlock;
3197
3198         /*
3199          * We play safe to avoid deadlocks.
3200          */
3201         read_lock_irq(&tasklist_lock);
3202
3203         p = find_process_by_pid(pid);
3204
3205         retval = -ESRCH;
3206         if (!p)
3207                 goto out_unlock_tasklist;
3208
3209         /*
3210          * To be able to change p->policy safely, the appropriate
3211          * runqueue lock must be held.
3212          */
3213         rq = task_rq_lock(p, &flags);
3214
3215         if (policy < 0)
3216                 policy = p->policy;
3217         else {
3218                 retval = -EINVAL;
3219                 if (policy != SCHED_FIFO && policy != SCHED_RR &&
3220                                 policy != SCHED_NORMAL)
3221                         goto out_unlock;
3222         }
3223
3224         /*
3225          * Valid priorities for SCHED_FIFO and SCHED_RR are
3226          * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
3227          */
3228         retval = -EINVAL;
3229         if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1)
3230                 goto out_unlock;
3231         if ((policy == SCHED_NORMAL) != (lp.sched_priority == 0))
3232                 goto out_unlock;
3233
3234         retval = -EPERM;
3235         if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
3236             !capable(CAP_SYS_NICE))
3237                 goto out_unlock;
3238         if ((current->euid != p->euid) && (current->euid != p->uid) &&
3239             !capable(CAP_SYS_NICE))
3240                 goto out_unlock;
3241
3242         retval = security_task_setscheduler(p, policy, &lp);
3243         if (retval)
3244                 goto out_unlock;
3245
3246         array = p->array;
3247         if (array)
3248                 deactivate_task(p, task_rq(p));
3249         retval = 0;
3250         oldprio = p->prio;
3251         __setscheduler(p, policy, lp.sched_priority);
3252         if (array) {
3253                 __activate_task(p, task_rq(p));
3254                 /*
3255                  * Reschedule if we are currently running on this runqueue and
3256                  * our priority decreased, or if we are not currently running on
3257                  * this runqueue and our priority is higher than the current's
3258                  */
3259                 if (task_running(rq, p)) {
3260                         if (p->prio > oldprio)
3261                                 resched_task(rq->curr);
3262                 } else if (TASK_PREEMPTS_CURR(p, rq))
3263                         resched_task(rq->curr);
3264         }
3265
3266 out_unlock:
3267         task_rq_unlock(rq, &flags);
3268 out_unlock_tasklist:
3269         read_unlock_irq(&tasklist_lock);
3270
3271 out_nounlock:
3272         return retval;
3273 }
3274
3275 /**
3276  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
3277  * @pid: the pid in question.
3278  * @policy: new policy
3279  * @param: structure containing the new RT priority.
3280  */
3281 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
3282                                        struct sched_param __user *param)
3283 {
3284         return setscheduler(pid, policy, param);
3285 }
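
/*
 * Minimal user-space sketch of driving this syscall through the POSIX
 * wrapper (illustrative only, error handling trimmed, needs <sched.h>):
 *
 *	struct sched_param sp = { .sched_priority = 10 };
 *
 *	if (sched_setscheduler(0, SCHED_FIFO, &sp) < 0)
 *		perror("sched_setscheduler");
 *
 * The priority must lie in 1..MAX_USER_RT_PRIO-1 and the caller needs
 * CAP_SYS_NICE, exactly as enforced in setscheduler() above.
 */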
3286
3287 /**
3288  * sys_sched_setparam - set/change the RT priority of a thread
3289  * @pid: the pid in question.
3290  * @param: structure containing the new RT priority.
3291  */
3292 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
3293 {
3294         return setscheduler(pid, -1, param);
3295 }
3296
3297 /**
3298  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
3299  * @pid: the pid in question.
3300  */
3301 asmlinkage long sys_sched_getscheduler(pid_t pid)
3302 {
3303         int retval = -EINVAL;
3304         task_t *p;
3305
3306         if (pid < 0)
3307                 goto out_nounlock;
3308
3309         retval = -ESRCH;
3310         read_lock(&tasklist_lock);
3311         p = find_process_by_pid(pid);
3312         if (p) {
3313                 retval = security_task_getscheduler(p);
3314                 if (!retval)
3315                         retval = p->policy;
3316         }
3317         read_unlock(&tasklist_lock);
3318
3319 out_nounlock:
3320         return retval;
3321 }
3322
3323 /**
3324  * sys_sched_getparam - get the RT priority of a thread
3325  * @pid: the pid in question.
3326  * @param: structure containing the RT priority.
3327  */
3328 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
3329 {
3330         struct sched_param lp;
3331         int retval = -EINVAL;
3332         task_t *p;
3333
3334         if (!param || pid < 0)
3335                 goto out_nounlock;
3336
3337         read_lock(&tasklist_lock);
3338         p = find_process_by_pid(pid);
3339         retval = -ESRCH;
3340         if (!p)
3341                 goto out_unlock;
3342
3343         retval = security_task_getscheduler(p);
3344         if (retval)
3345                 goto out_unlock;
3346
3347         lp.sched_priority = p->rt_priority;
3348         read_unlock(&tasklist_lock);
3349
3350         /*
3351          * This one might sleep; we cannot do it with a spinlock held ...
3352          */
3353         retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
3354
3355 out_nounlock:
3356         return retval;
3357
3358 out_unlock:
3359         read_unlock(&tasklist_lock);
3360         return retval;
3361 }
3362
3363 /**
3364  * sys_sched_setaffinity - set the cpu affinity of a process
3365  * @pid: pid of the process
3366  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3367  * @user_mask_ptr: user-space pointer to the new cpu mask
3368  */
3369 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
3370                                       unsigned long __user *user_mask_ptr)
3371 {
3372         cpumask_t new_mask;
3373         int retval;
3374         task_t *p;
3375
3376         if (len < sizeof(new_mask))
3377                 return -EINVAL;
3378
3379         if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask)))
3380                 return -EFAULT;
3381
3382         lock_cpu_hotplug();
3383         read_lock(&tasklist_lock);
3384
3385         p = find_process_by_pid(pid);
3386         if (!p) {
3387                 read_unlock(&tasklist_lock);
3388                 unlock_cpu_hotplug();
3389                 return -ESRCH;
3390         }
3391
3392         /*
3393          * It is not safe to call set_cpus_allowed with the
3394          * tasklist_lock held.  We will bump the task_struct's
3395          * usage count and then drop tasklist_lock.
3396          */
3397         get_task_struct(p);
3398         read_unlock(&tasklist_lock);
3399
3400         retval = -EPERM;
3401         if ((current->euid != p->euid) && (current->euid != p->uid) &&
3402                         !capable(CAP_SYS_NICE))
3403                 goto out_unlock;
3404
3405         retval = set_cpus_allowed(p, new_mask);
3406
3407 out_unlock:
3408         put_task_struct(p);
3409         unlock_cpu_hotplug();
3410         return retval;
3411 }
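
/*
 * Illustrative user-space call, assuming a glibc that exposes the
 * three-argument sched_setaffinity() wrapper and the CPU_* macros:
 *
 *	cpu_set_t mask;
 *
 *	CPU_ZERO(&mask);
 *	CPU_SET(1, &mask);
 *	if (sched_setaffinity(pid, sizeof(mask), &mask) < 0)
 *		perror("sched_setaffinity");
 *
 * pins the given pid to CPU 1.  The kernel side rejects buffers shorter
 * than its cpumask_t and, for other users' tasks, callers lacking
 * CAP_SYS_NICE.
 */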
3412
3413 /*
3414  * Represents all CPUs present in the system.
3415  * In systems capable of hotplug, this map can grow dynamically
3416  * as new CPUs are detected in the system via any platform-specific
3417  * method, such as ACPI.
3418  */
3419
3420 cpumask_t cpu_present_map;
3421 EXPORT_SYMBOL(cpu_present_map);
3422
3423 #ifndef CONFIG_SMP
3424 cpumask_t cpu_online_map = CPU_MASK_ALL;
3425 cpumask_t cpu_possible_map = CPU_MASK_ALL;
3426 #endif
3427
3428 /**
3429  * sys_sched_getaffinity - get the cpu affinity of a process
3430  * @pid: pid of the process
3431  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3432  * @user_mask_ptr: user-space pointer to hold the current cpu mask
3433  */
3434 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
3435                                       unsigned long __user *user_mask_ptr)
3436 {
3437         unsigned int real_len;
3438         cpumask_t mask;
3439         int retval;
3440         task_t *p;
3441
3442         real_len = sizeof(mask);
3443         if (len < real_len)
3444                 return -EINVAL;
3445
3446         lock_cpu_hotplug();
3447         read_lock(&tasklist_lock);
3448
3449         retval = -ESRCH;
3450         p = find_process_by_pid(pid);
3451         if (!p)
3452                 goto out_unlock;
3453
3454         retval = 0;
3455         cpus_and(mask, p->cpus_allowed, cpu_possible_map);
3456
3457 out_unlock:
3458         read_unlock(&tasklist_lock);
3459         unlock_cpu_hotplug();
3460         if (retval)
3461                 return retval;
3462         if (copy_to_user(user_mask_ptr, &mask, real_len))
3463                 return -EFAULT;
3464         return real_len;
3465 }
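
/*
 * Note that on success this syscall returns the number of bytes copied
 * (sizeof(cpumask_t)) rather than 0, so callers can learn the kernel's
 * mask size.  Illustrative user-space query, assuming the glibc
 * wrapper and CPU_* macros:
 *
 *	cpu_set_t mask;
 *
 *	if (sched_getaffinity(0, sizeof(mask), &mask) >= 0)
 *		printf("may run on CPU0: %d\n", CPU_ISSET(0, &mask));
 */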
3466
3467 /**
3468  * sys_sched_yield - yield the current processor to other threads.
3469  *
3470  * this function yields the current CPU by moving the calling thread
3471  * to the expired array. If there are no other threads running on this
3472  * CPU then this function will return.
3473  */
3474 asmlinkage long sys_sched_yield(void)
3475 {
3476         runqueue_t *rq = this_rq_lock();
3477         prio_array_t *array = current->array;
3478         prio_array_t *target = rq_expired(current,rq);
3479
3480         /*
3481          * We implement yielding by moving the task into the expired
3482          * queue.
3483          *
3484          * (special rule: RT tasks will just roundrobin in the active
3485          *  array.)
3486          */
3487         if (unlikely(rt_task(current)))
3488                 target = rq_active(current,rq);
3489
3490         dequeue_task(current, array);
3491         enqueue_task(current, target);
3492
3493         /*
3494          * Since we are going to call schedule() anyway, there's
3495          * no need to preempt or enable interrupts:
3496          */
3497         _raw_spin_unlock(&rq->lock);
3498         preempt_enable_no_resched();
3499
3500         schedule();
3501
3502         return 0;
3503 }
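
/*
 * Typical user-space use is a polite busy-wait, e.g.:
 *
 *	while (!work_available())
 *		sched_yield();
 *
 * where work_available() stands in for whatever condition the caller
 * polls.  For SCHED_NORMAL tasks every iteration drops the caller into
 * the expired array as described above, so runnable peers get the CPU
 * first.
 */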
3504
3505 void __sched __cond_resched(void)
3506 {
3507 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
3508         __might_sleep(__FILE__, __LINE__, 0);
3509 #endif
3510         /*
3511          * The system_state check is somewhat ugly but we might be
3512          * called during early boot when we are not yet ready to reschedule.
3513          */
3514         if (need_resched() && system_state >= SYSTEM_BOOTING_SCHEDULER_OK) {
3515                 set_current_state(TASK_RUNNING);
3516                 schedule();
3517         }
3518 }
3519
3520 EXPORT_SYMBOL(__cond_resched);
3521
3522 void __sched __cond_resched_lock(spinlock_t * lock)
3523 {
3524         if (need_resched()) {
3525                 _raw_spin_unlock(lock);
3526                 preempt_enable_no_resched();
3527                 set_current_state(TASK_RUNNING);
3528                 schedule();
3529                 spin_lock(lock);
3530         }
3531 }
3532
3533 EXPORT_SYMBOL(__cond_resched_lock);
3534
3535 /**
3536  * yield - yield the current processor to other threads.
3537  *
3538  * this is a shortcut for kernel-space yielding - it marks the
3539  * thread runnable and calls sys_sched_yield().
3540  */
3541 void __sched yield(void)
3542 {
3543         set_current_state(TASK_RUNNING);
3544         sys_sched_yield();
3545 }
3546
3547 EXPORT_SYMBOL(yield);
3548
3549 /*
3550  * This task is about to go to sleep on IO.  Increment rq->nr_iowait so
3551  * that process accounting knows that this is a task in IO wait state.
3552  *
3553  * But don't do that if it is a deliberate, throttling IO wait (this task
3554  * has set its backing_dev_info: the queue against which it should throttle)
3555  */
3556 void __sched io_schedule(void)
3557 {
3558         struct runqueue *rq = this_rq();
3559         def_delay_var(dstart);
3560
3561         start_delay_set(dstart,PF_IOWAIT);
3562         atomic_inc(&rq->nr_iowait);
3563         schedule();
3564         atomic_dec(&rq->nr_iowait);
3565         add_io_delay(dstart);
3566 }
3567
3568 EXPORT_SYMBOL(io_schedule);
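
/*
 * A typical call pattern (sketch; wait_condition() is a stand-in for
 * whatever the caller is blocking on) is the usual wait loop with the
 * task state set before each check:
 *
 *	for (;;) {
 *		set_current_state(TASK_UNINTERRUPTIBLE);
 *		if (wait_condition())
 *			break;
 *		io_schedule();
 *	}
 *	set_current_state(TASK_RUNNING);
 */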
3569
3570 long __sched io_schedule_timeout(long timeout)
3571 {
3572         struct runqueue *rq = this_rq();
3573         long ret;
3574         def_delay_var(dstart);
3575
3576         start_delay_set(dstart,PF_IOWAIT);
3577         atomic_inc(&rq->nr_iowait);
3578         ret = schedule_timeout(timeout);
3579         atomic_dec(&rq->nr_iowait);
3580         add_io_delay(dstart);
3581         return ret;
3582 }
3583
3584 /**
3585  * sys_sched_get_priority_max - return maximum RT priority.
3586  * @policy: scheduling class.
3587  *
3588  * this syscall returns the maximum rt_priority that can be used
3589  * by a given scheduling class.
3590  */
3591 asmlinkage long sys_sched_get_priority_max(int policy)
3592 {
3593         int ret = -EINVAL;
3594
3595         switch (policy) {
3596         case SCHED_FIFO:
3597         case SCHED_RR:
3598                 ret = MAX_USER_RT_PRIO-1;
3599                 break;
3600         case SCHED_NORMAL:
3601                 ret = 0;
3602                 break;
3603         }
3604         return ret;
3605 }
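
/*
 * With the usual MAX_USER_RT_PRIO of 100 this reports 99 for SCHED_FIFO
 * and SCHED_RR, 0 for SCHED_NORMAL, and -EINVAL for any other policy;
 * sys_sched_get_priority_min() below is the matching lower bound
 * (1 for the RT classes, 0 for SCHED_NORMAL).
 */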
3606
3607 /**
3608  * sys_sched_get_priority_min - return minimum RT priority.
3609  * @policy: scheduling class.
3610  *
3611  * this syscall returns the minimum rt_priority that can be used
3612  * by a given scheduling class.
3613  */
3614 asmlinkage long sys_sched_get_priority_min(int policy)
3615 {
3616         int ret = -EINVAL;
3617
3618         switch (policy) {
3619         case SCHED_FIFO:
3620         case SCHED_RR:
3621                 ret = 1;
3622                 break;
3623         case SCHED_NORMAL:
3624                 ret = 0;
3625         }
3626         return ret;
3627 }
3628
3629 /**
3630  * sys_sched_rr_get_interval - return the default timeslice of a process.
3631  * @pid: pid of the process.
3632  * @interval: userspace pointer to the timeslice value.
3633  *
3634  * this syscall writes the default timeslice value of a given process
3635  * into the user-space timespec buffer. A value of '0' means infinity.
3636  */
3637 asmlinkage
3638 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
3639 {
3640         int retval = -EINVAL;
3641         struct timespec t;
3642         task_t *p;
3643
3644         if (pid < 0)
3645                 goto out_nounlock;
3646
3647         retval = -ESRCH;
3648         read_lock(&tasklist_lock);
3649         p = find_process_by_pid(pid);
3650         if (!p)
3651                 goto out_unlock;
3652
3653         retval = security_task_getscheduler(p);
3654         if (retval)
3655                 goto out_unlock;
3656
3657         jiffies_to_timespec(p->policy & SCHED_FIFO ?
3658                                 0 : task_timeslice(p), &t);
3659         read_unlock(&tasklist_lock);
3660         retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
3661 out_nounlock:
3662         return retval;
3663 out_unlock:
3664         read_unlock(&tasklist_lock);
3665         return retval;
3666 }
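
/*
 * Illustrative user-space query of the slice (needs <sched.h>):
 *
 *	struct timespec ts;
 *
 *	if (sched_rr_get_interval(0, &ts) == 0)
 *		printf("timeslice: %ld.%09ld s\n",
 *		       (long)ts.tv_sec, ts.tv_nsec);
 *
 * SCHED_FIFO tasks report 0, i.e. they run until they block or yield.
 */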
3667
3668 static inline struct task_struct *eldest_child(struct task_struct *p)
3669 {
3670         if (list_empty(&p->children)) return NULL;
3671         return list_entry(p->children.next,struct task_struct,sibling);
3672 }
3673
3674 static inline struct task_struct *older_sibling(struct task_struct *p)
3675 {
3676         if (p->sibling.prev==&p->parent->children) return NULL;
3677         return list_entry(p->sibling.prev,struct task_struct,sibling);
3678 }
3679
3680 static inline struct task_struct *younger_sibling(struct task_struct *p)
3681 {
3682         if (p->sibling.next==&p->parent->children) return NULL;
3683         return list_entry(p->sibling.next,struct task_struct,sibling);
3684 }
3685
3686 static void show_task(task_t * p)
3687 {
3688         task_t *relative;
3689         unsigned state;
3690         unsigned long free = 0;
3691         static const char *stat_nam[] = { "R", "S", "D", "T", "Z", "W" };
3692
3693         printk("%-13.13s ", p->comm);
3694         state = p->state ? __ffs(p->state) + 1 : 0;
3695         if (state < ARRAY_SIZE(stat_nam))
3696                 printk(stat_nam[state]);
3697         else
3698                 printk("?");
3699 #if (BITS_PER_LONG == 32)
3700         if (state == TASK_RUNNING)
3701                 printk(" running ");
3702         else
3703                 printk(" %08lX ", thread_saved_pc(p));
3704 #else
3705         if (state == TASK_RUNNING)
3706                 printk("  running task   ");
3707         else
3708                 printk(" %016lx ", thread_saved_pc(p));
3709 #endif
3710 #ifdef CONFIG_DEBUG_STACK_USAGE
3711         {
3712                 unsigned long * n = (unsigned long *) (p->thread_info+1);
3713                 while (!*n)
3714                         n++;
3715                 free = (unsigned long) n - (unsigned long)(p->thread_info+1);
3716         }
3717 #endif
3718         printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
3719         if ((relative = eldest_child(p)))
3720                 printk("%5d ", relative->pid);
3721         else
3722                 printk("      ");
3723         if ((relative = younger_sibling(p)))
3724                 printk("%7d", relative->pid);
3725         else
3726                 printk("       ");
3727         if ((relative = older_sibling(p)))
3728                 printk(" %5d", relative->pid);
3729         else
3730                 printk("      ");
3731         if (!p->mm)
3732                 printk(" (L-TLB)\n");
3733         else
3734                 printk(" (NOTLB)\n");
3735
3736         if (state != TASK_RUNNING)
3737                 show_stack(p, NULL);
3738 }
3739
3740 void show_state(void)
3741 {
3742         task_t *g, *p;
3743
3744 #if (BITS_PER_LONG == 32)
3745         printk("\n"
3746                "                                               sibling\n");
3747         printk("  task             PC      pid father child younger older\n");
3748 #else
3749         printk("\n"
3750                "                                                       sibling\n");
3751         printk("  task                 PC          pid father child younger older\n");
3752 #endif
3753         read_lock(&tasklist_lock);
3754         do_each_thread(g, p) {
3755                 /*
3756                  * reset the NMI-timeout, listing all tasks on a slow
3757                  * console might take a lot of time:
3758                  */
3759                 touch_nmi_watchdog();
3760                 show_task(p);
3761         } while_each_thread(g, p);
3762
3763         read_unlock(&tasklist_lock);
3764 }
3765
3766 void __devinit init_idle(task_t *idle, int cpu)
3767 {
3768         runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle));
3769         unsigned long flags;
3770
3771         local_irq_save(flags);
3772         double_rq_lock(idle_rq, rq);
3773
3774         idle_rq->curr = idle_rq->idle = idle;
3775         deactivate_task(idle, rq);
3776         idle->array = NULL;
3777         idle->prio = MAX_PRIO;
3778         idle->state = TASK_RUNNING;
3779         set_task_cpu(idle, cpu);
3780         double_rq_unlock(idle_rq, rq);
3781         set_tsk_need_resched(idle);
3782         local_irq_restore(flags);
3783
3784         /* Set the preempt count _outside_ the spinlocks! */
3785 #ifdef CONFIG_PREEMPT
3786         idle->thread_info->preempt_count = (idle->lock_depth >= 0);
3787 #else
3788         idle->thread_info->preempt_count = 0;
3789 #endif
3790 }
3791
3792 /*
3793  * In a system that switches off the HZ timer nohz_cpu_mask
3794  * indicates which cpus entered this state. This is used
3795  * in the rcu update to wait only for active cpus. For system
3796  * which do not switch off the HZ timer nohz_cpu_mask should
3797  * always be CPU_MASK_NONE.
3798  */
3799 cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
3800
3801 #ifdef CONFIG_SMP
3802 /*
3803  * This is how migration works:
3804  *
3805  * 1) we queue a migration_req_t structure in the source CPU's
3806  *    runqueue and wake up that CPU's migration thread.
3807  * 2) we wait on the request's completion => thread blocks.
3808  * 3) migration thread wakes up (implicitly it forces the migrated
3809  *    thread off the CPU)
3810  * 4) it gets the migration request and checks whether the migrated
3811  *    task is still in the wrong runqueue.
3812  * 5) if it's in the wrong runqueue then the migration thread removes
3813  *    it and puts it into the right queue.
3814  * 6) the migration thread completes the request.
3815  * 7) we wake up and the migration is done.
3816  */
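
/*
 * In the code below, set_cpus_allowed() performs steps 1 and 2 by
 * calling migrate_task() to queue a migration_req_t, waking
 * rq->migration_thread and then sleeping in
 * wait_for_completion(&req.done); migration_thread() carries out steps
 * 3-6 via __migrate_task() and complete(), which wakes the original
 * caller for step 7.
 */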
3817
3818 /*
3819  * Change a given task's CPU affinity. Migrate the thread to a
3820  * proper CPU and schedule it away if the CPU it's executing on
3821  * is removed from the allowed bitmask.
3822  *
3823  * NOTE: the caller must have a valid reference to the task, the
3824  * task must not exit() & deallocate itself prematurely.  The
3825  * call is not atomic; no spinlocks may be held.
3826  */
3827 int set_cpus_allowed(task_t *p, cpumask_t new_mask)
3828 {
3829         unsigned long flags;
3830         int ret = 0;
3831         migration_req_t req;
3832         runqueue_t *rq;
3833
3834         rq = task_rq_lock(p, &flags);
3835         if (!cpus_intersects(new_mask, cpu_online_map)) {
3836                 ret = -EINVAL;
3837                 goto out;
3838         }
3839
3840         p->cpus_allowed = new_mask;
3841         /* Can the task run on the task's current CPU? If so, we're done */
3842         if (cpu_isset(task_cpu(p), new_mask))
3843                 goto out;
3844
3845         if (migrate_task(p, any_online_cpu(new_mask), &req)) {
3846                 /* Need help from migration thread: drop lock and wait. */
3847                 task_rq_unlock(rq, &flags);
3848                 wake_up_process(rq->migration_thread);
3849                 wait_for_completion(&req.done);
3850                 tlb_migrate_finish(p->mm);
3851                 return 0;
3852         }
3853 out:
3854         task_rq_unlock(rq, &flags);
3855         return ret;
3856 }
3857
3858 EXPORT_SYMBOL_GPL(set_cpus_allowed);
3859
3860 /*
3861  * Move (not current) task off this cpu, onto dest cpu.  We're doing
3862  * this because either it can't run here any more (set_cpus_allowed()
3863  * away from this CPU, or CPU going down), or because we're
3864  * attempting to rebalance this task on exec (sched_balance_exec).
3865  *
3866  * So we race with normal scheduler movements, but that's OK, as long
3867  * as the task is no longer on this CPU.
3868  */
3869 static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
3870 {
3871         runqueue_t *rq_dest, *rq_src;
3872
3873         if (unlikely(cpu_is_offline(dest_cpu)))
3874                 return;
3875
3876         rq_src  = cpu_rq(src_cpu);
3877         rq_dest = cpu_rq(dest_cpu);
3878
3879         double_rq_lock(rq_src, rq_dest);
3880         /* Already moved. */
3881         if (task_cpu(p) != src_cpu)
3882                 goto out;
3883         /* Affinity changed (again). */
3884         if (!cpu_isset(dest_cpu, p->cpus_allowed))
3885                 goto out;
3886
3887         if (p->array) {
3888                 /*
3889                  * Sync timestamp with rq_dest's before activating.
3890                  * The same thing could be achieved by doing this step
3891                  * afterwards, and pretending it was a local activate.
3892                  * This way is cleaner and logically correct.
3893                  */
3894                 p->timestamp = p->timestamp - rq_src->timestamp_last_tick
3895                                 + rq_dest->timestamp_last_tick;
3896                 deactivate_task(p, rq_src);
3897                 set_task_cpu(p, dest_cpu);
3898                 activate_task(p, rq_dest, 0);
3899                 if (TASK_PREEMPTS_CURR(p, rq_dest))
3900                         resched_task(rq_dest->curr);
3901         } else
3902                 set_task_cpu(p, dest_cpu);
3903
3904 out:
3905         double_rq_unlock(rq_src, rq_dest);
3906 }
3907
3908 /*
3909  * migration_thread - this is a highprio system thread that performs
3910  * thread migration by bumping thread off CPU then 'pushing' onto
3911  * another runqueue.
3912  */
3913 static int migration_thread(void * data)
3914 {
3915         runqueue_t *rq;
3916         int cpu = (long)data;
3917
3918         rq = cpu_rq(cpu);
3919         BUG_ON(rq->migration_thread != current);
3920
3921         set_current_state(TASK_INTERRUPTIBLE);
3922         while (!kthread_should_stop()) {
3923                 struct list_head *head;
3924                 migration_req_t *req;
3925
3926                 if (current->flags & PF_FREEZE)
3927                         refrigerator(PF_FREEZE);
3928
3929                 spin_lock_irq(&rq->lock);
3930
3931                 if (cpu_is_offline(cpu)) {
3932                         spin_unlock_irq(&rq->lock);
3933                         goto wait_to_die;
3934                 }
3935
3936                 if (rq->active_balance) {
3937                         active_load_balance(rq, cpu);
3938                         rq->active_balance = 0;
3939                 }
3940
3941                 head = &rq->migration_queue;
3942
3943                 if (list_empty(head)) {
3944                         spin_unlock_irq(&rq->lock);
3945                         schedule();
3946                         set_current_state(TASK_INTERRUPTIBLE);
3947                         continue;
3948                 }
3949                 req = list_entry(head->next, migration_req_t, list);
3950                 list_del_init(head->next);
3951
3952                 if (req->type == REQ_MOVE_TASK) {
3953                         spin_unlock(&rq->lock);
3954                         __migrate_task(req->task, smp_processor_id(),
3955                                         req->dest_cpu);
3956                         local_irq_enable();
3957                 } else if (req->type == REQ_SET_DOMAIN) {
3958                         rq->sd = req->sd;
3959                         spin_unlock_irq(&rq->lock);
3960                 } else {
3961                         spin_unlock_irq(&rq->lock);
3962                         WARN_ON(1);
3963                 }
3964
3965                 complete(&req->done);
3966         }
3967         __set_current_state(TASK_RUNNING);
3968         return 0;
3969
3970 wait_to_die:
3971         /* Wait for kthread_stop */
3972         set_current_state(TASK_INTERRUPTIBLE);
3973         while (!kthread_should_stop()) {
3974                 schedule();
3975                 set_current_state(TASK_INTERRUPTIBLE);
3976         }
3977         __set_current_state(TASK_RUNNING);
3978         return 0;
3979 }
3980
3981 #ifdef CONFIG_HOTPLUG_CPU
3982 /* migrate_all_tasks - function to migrate all tasks from the dead cpu.  */
3983 static void migrate_all_tasks(int src_cpu)
3984 {
3985         struct task_struct *tsk, *t;
3986         int dest_cpu;
3987         unsigned int node;
3988
3989         write_lock_irq(&tasklist_lock);
3990
3991         /* watch out for per-node tasks, let's stay on this node */
3992         node = cpu_to_node(src_cpu);
3993
3994         do_each_thread(t, tsk) {
3995                 cpumask_t mask;
3996                 if (tsk == current)
3997                         continue;
3998
3999                 if (task_cpu(tsk) != src_cpu)
4000                         continue;
4001
4002                 /* Figure out where this task should go (attempting to
4003                  * keep it on-node), and check if it can be migrated
4004                  * as-is.  NOTE that kernel threads bound to more than
4005                  * one online cpu will be migrated. */
4006                 mask = node_to_cpumask(node);
4007                 cpus_and(mask, mask, tsk->cpus_allowed);
4008                 dest_cpu = any_online_cpu(mask);
4009                 if (dest_cpu == NR_CPUS)
4010                         dest_cpu = any_online_cpu(tsk->cpus_allowed);
4011                 if (dest_cpu == NR_CPUS) {
4012                         cpus_setall(tsk->cpus_allowed);
4013                         dest_cpu = any_online_cpu(tsk->cpus_allowed);
4014
4015                         /* Don't tell them about moving exiting tasks
4016                            or kernel threads (both mm NULL), since
4017                            they never leave kernel. */
4018                         if (tsk->mm && printk_ratelimit())
4019                                 printk(KERN_INFO "process %d (%s) no "
4020                                        "longer affine to cpu%d\n",
4021                                        tsk->pid, tsk->comm, src_cpu);
4022                 }
4023
4024                 __migrate_task(tsk, src_cpu, dest_cpu);
4025         } while_each_thread(t, tsk);
4026
4027         write_unlock_irq(&tasklist_lock);
4028 }
4029
4030 /* Schedules idle task to be the next runnable task on current CPU.
4031  * It does so by boosting its priority to highest possible and adding it to
4032  * the _front_ of runqueue. Used by CPU offline code.
4033  */
4034 void sched_idle_next(void)
4035 {
4036         int cpu = smp_processor_id();
4037         runqueue_t *rq = this_rq();
4038         struct task_struct *p = rq->idle;
4039         unsigned long flags;
4040
4041         /* cpu has to be offline */
4042         BUG_ON(cpu_online(cpu));
4043
4044         /* Strictly not necessary since the rest of the CPUs are stopped by now
4045          * and interrupts are disabled on the current cpu.
4046          */
4047         spin_lock_irqsave(&rq->lock, flags);
4048
4049         __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
4050         /* Add the idle task to the _front_ of its priority queue */
4051         __activate_idle_task(p, rq);
4052
4053         spin_unlock_irqrestore(&rq->lock, flags);
4054 }
4055 #endif /* CONFIG_HOTPLUG_CPU */
4056
4057 /*
4058  * migration_call - callback that gets triggered when a CPU is added.
4059  * Here we can start up the necessary migration thread for the new CPU.
4060  */
4061 static int migration_call(struct notifier_block *nfb, unsigned long action,
4062                           void *hcpu)
4063 {
4064         int cpu = (long)hcpu;
4065         struct task_struct *p;
4066         struct runqueue *rq;
4067         unsigned long flags;
4068
4069         switch (action) {
4070         case CPU_UP_PREPARE:
4071                 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
4072                 if (IS_ERR(p))
4073                         return NOTIFY_BAD;
4074                 p->flags |= PF_NOFREEZE;
4075                 kthread_bind(p, cpu);
4076                 /* Must be high prio: stop_machine expects to yield to it. */
4077                 rq = task_rq_lock(p, &flags);
4078                 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
4079                 task_rq_unlock(rq, &flags);
4080                 cpu_rq(cpu)->migration_thread = p;
4081                 break;
4082         case CPU_ONLINE:
4083                 /* Strictly unnecessary, as the first user will wake it. */
4084                 wake_up_process(cpu_rq(cpu)->migration_thread);
4085                 break;
4086 #ifdef CONFIG_HOTPLUG_CPU
4087         case CPU_UP_CANCELED:
4088                 /* Unbind it from offline cpu so it can run.  Fall thru. */
4089                 kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
4090                 kthread_stop(cpu_rq(cpu)->migration_thread);
4091                 cpu_rq(cpu)->migration_thread = NULL;
4092                 break;
4093         case CPU_DEAD:
4094                 migrate_all_tasks(cpu);
4095                 rq = cpu_rq(cpu);
4096                 kthread_stop(rq->migration_thread);
4097                 rq->migration_thread = NULL;
4098                 /* Idle task back to normal (off runqueue, low prio) */
4099                 rq = task_rq_lock(rq->idle, &flags);
4100                 deactivate_task(rq->idle, rq);
4101                 rq->idle->static_prio = MAX_PRIO;
4102                 __setscheduler(rq->idle, SCHED_NORMAL, 0);
4103                 task_rq_unlock(rq, &flags);
4104                 BUG_ON(rq->nr_running != 0);
4105
4106                 /* No need to migrate the tasks: it was best-effort if
4107                  * they didn't do lock_cpu_hotplug().  Just wake up
4108                  * the requestors. */
4109                 spin_lock_irq(&rq->lock);
4110                 while (!list_empty(&rq->migration_queue)) {
4111                         migration_req_t *req;
4112                         req = list_entry(rq->migration_queue.next,
4113                                          migration_req_t, list);
4114                         BUG_ON(req->type != REQ_MOVE_TASK);
4115                         list_del_init(&req->list);
4116                         complete(&req->done);
4117                 }
4118                 spin_unlock_irq(&rq->lock);
4119                 break;
4120 #endif
4121         }
4122         return NOTIFY_OK;
4123 }
4124
4125 /* Register at highest priority so that task migration (migrate_all_tasks)
4126  * happens before everything else.
4127  */
4128 static struct notifier_block __devinitdata migration_notifier = {
4129         .notifier_call = migration_call,
4130         .priority = 10
4131 };
4132
4133 int __init migration_init(void)
4134 {
4135         void *cpu = (void *)(long)smp_processor_id();
4136         /* Start one for boot CPU. */
4137         migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
4138         migration_call(&migration_notifier, CPU_ONLINE, cpu);
4139         register_cpu_notifier(&migration_notifier);
4140         return 0;
4141 }
4142 #endif
4143
4144 /*
4145  * The 'big kernel lock'
4146  *
4147  * This spinlock is taken and released recursively by lock_kernel()
4148  * and unlock_kernel().  It is transparently dropped and reacquired
4149  * over schedule().  It is used to protect legacy code that hasn't
4150  * been migrated to a proper locking design yet.
4151  *
4152  * Don't use in new code.
4153  *
4154  * Note: spinlock debugging needs this even on !CONFIG_SMP.
4155  */
4156 spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
4157 EXPORT_SYMBOL(kernel_flag);
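
/*
 * Legacy users simply bracket their critical sections, for example
 * (legacy_ioctl_handler() is a placeholder for BKL-dependent code):
 *
 *	lock_kernel();
 *	legacy_ioctl_handler();
 *	unlock_kernel();
 *
 * As noted above, the lock is dropped while the holder sleeps in
 * schedule() and re-taken before it runs again.
 */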
4158
4159 #ifdef CONFIG_SMP
4160 /* Attach the domain 'sd' to 'cpu' as its base domain */
4161 void cpu_attach_domain(struct sched_domain *sd, int cpu)
4162 {
4163         migration_req_t req;
4164         unsigned long flags;
4165         runqueue_t *rq = cpu_rq(cpu);
4166         int local = 1;
4167
4168         lock_cpu_hotplug();
4169
4170         spin_lock_irqsave(&rq->lock, flags);
4171
4172         if (cpu == smp_processor_id() || !cpu_online(cpu)) {
4173                 rq->sd = sd;
4174         } else {
4175                 init_completion(&req.done);
4176                 req.type = REQ_SET_DOMAIN;
4177                 req.sd = sd;
4178                 list_add(&req.list, &rq->migration_queue);
4179                 local = 0;
4180         }
4181
4182         spin_unlock_irqrestore(&rq->lock, flags);
4183
4184         if (!local) {
4185                 wake_up_process(rq->migration_thread);
4186                 wait_for_completion(&req.done);
4187         }
4188
4189         unlock_cpu_hotplug();
4190 }
4191
4192 #ifdef ARCH_HAS_SCHED_DOMAIN
4193 extern void __init arch_init_sched_domains(void);
4194 #else
4195 static struct sched_group sched_group_cpus[NR_CPUS];
4196 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
4197 #ifdef CONFIG_NUMA
4198 static struct sched_group sched_group_nodes[MAX_NUMNODES];
4199 static DEFINE_PER_CPU(struct sched_domain, node_domains);
4200 static void __init arch_init_sched_domains(void)
4201 {
4202         int i;
4203         struct sched_group *first_node = NULL, *last_node = NULL;
4204
4205         /* Set up domains */
4206         for_each_cpu(i) {
4207                 int node = cpu_to_node(i);
4208                 cpumask_t nodemask = node_to_cpumask(node);
4209                 struct sched_domain *node_sd = &per_cpu(node_domains, i);
4210                 struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
4211
4212                 *node_sd = SD_NODE_INIT;
4213                 node_sd->span = cpu_possible_map;
4214                 node_sd->groups = &sched_group_nodes[cpu_to_node(i)];
4215
4216                 *cpu_sd = SD_CPU_INIT;
4217                 cpus_and(cpu_sd->span, nodemask, cpu_possible_map);
4218                 cpu_sd->groups = &sched_group_cpus[i];
4219                 cpu_sd->parent = node_sd;
4220         }
4221
4222         /* Set up groups */
4223         for (i = 0; i < MAX_NUMNODES; i++) {
4224                 cpumask_t tmp = node_to_cpumask(i);
4225                 cpumask_t nodemask;
4226                 struct sched_group *first_cpu = NULL, *last_cpu = NULL;
4227                 struct sched_group *node = &sched_group_nodes[i];
4228                 int j;
4229
4230                 cpus_and(nodemask, tmp, cpu_possible_map);
4231
4232                 if (cpus_empty(nodemask))
4233                         continue;
4234
4235                 node->cpumask = nodemask;
4236                 node->cpu_power = SCHED_LOAD_SCALE * cpus_weight(node->cpumask);
4237
4238                 for_each_cpu_mask(j, node->cpumask) {
4239                         struct sched_group *cpu = &sched_group_cpus[j];
4240
4241                         cpus_clear(cpu->cpumask);
4242                         cpu_set(j, cpu->cpumask);
4243                         cpu->cpu_power = SCHED_LOAD_SCALE;
4244
4245                         if (!first_cpu)
4246                                 first_cpu = cpu;
4247                         if (last_cpu)
4248                                 last_cpu->next = cpu;
4249                         last_cpu = cpu;
4250                 }
4251                 last_cpu->next = first_cpu;
4252
4253                 if (!first_node)
4254                         first_node = node;
4255                 if (last_node)
4256                         last_node->next = node;
4257                 last_node = node;
4258         }
4259         last_node->next = first_node;
4260
4261         mb();
4262         for_each_cpu(i) {
4263                 struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
4264                 cpu_attach_domain(cpu_sd, i);
4265         }
4266 }
4267
4268 #else /* !CONFIG_NUMA */
4269 static void __init arch_init_sched_domains(void)
4270 {
4271         int i;
4272         struct sched_group *first_cpu = NULL, *last_cpu = NULL;
4273
4274         /* Set up domains */
4275         for_each_cpu(i) {
4276                 struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
4277
4278                 *cpu_sd = SD_CPU_INIT;
4279                 cpu_sd->span = cpu_possible_map;
4280                 cpu_sd->groups = &sched_group_cpus[i];
4281         }
4282
4283         /* Set up CPU groups */
4284         for_each_cpu_mask(i, cpu_possible_map) {
4285                 struct sched_group *cpu = &sched_group_cpus[i];
4286
4287                 cpus_clear(cpu->cpumask);
4288                 cpu_set(i, cpu->cpumask);
4289                 cpu->cpu_power = SCHED_LOAD_SCALE;
4290
4291                 if (!first_cpu)
4292                         first_cpu = cpu;
4293                 if (last_cpu)
4294                         last_cpu->next = cpu;
4295                 last_cpu = cpu;
4296         }
4297         last_cpu->next = first_cpu;
4298
4299         mb(); /* domains were modified outside the lock */
4300         for_each_cpu(i) {
4301                 struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
4302                 cpu_attach_domain(cpu_sd, i);
4303         }
4304 }
4305
4306 #endif /* CONFIG_NUMA */
4307 #endif /* ARCH_HAS_SCHED_DOMAIN */
4308
4309 #define SCHED_DOMAIN_DEBUG
4310 #ifdef SCHED_DOMAIN_DEBUG
4311 void sched_domain_debug(void)
4312 {
4313         int i;
4314
4315         for_each_cpu(i) {
4316                 runqueue_t *rq = cpu_rq(i);
4317                 struct sched_domain *sd;
4318                 int level = 0;
4319
4320                 sd = rq->sd;
4321
4322                 printk(KERN_DEBUG "CPU%d: %s\n",
4323                                 i, (cpu_online(i) ? " online" : "offline"));
4324
4325                 do {
4326                         int j;
4327                         char str[NR_CPUS];
4328                         struct sched_group *group = sd->groups;
4329                         cpumask_t groupmask;
4330
4331                         cpumask_scnprintf(str, NR_CPUS, sd->span);
4332                         cpus_clear(groupmask);
4333
4334                         printk(KERN_DEBUG);
4335                         for (j = 0; j < level + 1; j++)
4336                                 printk(" ");
4337                         printk("domain %d: span %s\n", level, str);
4338
4339                         if (!cpu_isset(i, sd->span))
4340                                 printk(KERN_DEBUG "ERROR domain->span does not contain CPU%d\n", i);
4341                         if (!cpu_isset(i, group->cpumask))
4342                                 printk(KERN_DEBUG "ERROR domain->groups does not contain CPU%d\n", i);
4343                         if (!group->cpu_power)
4344                                 printk(KERN_DEBUG "ERROR domain->cpu_power not set\n");
4345
4346                         printk(KERN_DEBUG);
4347                         for (j = 0; j < level + 2; j++)
4348                                 printk(" ");
4349                         printk("groups:");
4350                         do {
4351                                 if (!group) {
4352                                         printk(" ERROR: NULL");
4353                                         break;
4354                                 }
4355
4356                                 if (!cpus_weight(group->cpumask))
4357                                         printk(" ERROR empty group:");
4358
4359                                 if (cpus_intersects(groupmask, group->cpumask))
4360                                         printk(" ERROR repeated CPUs:");
4361
4362                                 cpus_or(groupmask, groupmask, group->cpumask);
4363
4364                                 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
4365                                 printk(" %s", str);
4366
4367                                 group = group->next;
4368                         } while (group != sd->groups);
4369                         printk("\n");
4370
4371                         if (!cpus_equal(sd->span, groupmask))
4372                                 printk(KERN_DEBUG "ERROR groups don't span domain->span\n");
4373
4374                         level++;
4375                         sd = sd->parent;
4376
4377                         if (sd) {
4378                                 if (!cpus_subset(groupmask, sd->span))
4379                                         printk(KERN_DEBUG "ERROR parent span is not a superset of domain->span\n");
4380                         }
4381
4382                 } while (sd);
4383         }
4384 }
4385 #else
4386 #define sched_domain_debug() {}
4387 #endif
4388
4389 void __init sched_init_smp(void)
4390 {
4391         arch_init_sched_domains();
4392         sched_domain_debug();
4393 }
4394 #else
4395 void __init sched_init_smp(void)
4396 {
4397 }
4398 #endif /* CONFIG_SMP */
4399
4400 int in_sched_functions(unsigned long addr)
4401 {
4402         /* Linker adds these: start and end of __sched functions */
4403         extern char __sched_text_start[], __sched_text_end[];
4404         return addr >= (unsigned long)__sched_text_start
4405                 && addr < (unsigned long)__sched_text_end;
4406 }
4407
4408 void __init sched_init(void)
4409 {
4410         runqueue_t *rq;
4411         int i;
4412
4413 #ifdef CONFIG_SMP
4414         /* Set up an initial dummy domain for early boot */
4415         static struct sched_domain sched_domain_init;
4416         static struct sched_group sched_group_init;
4417
4418         memset(&sched_domain_init, 0, sizeof(struct sched_domain));
4419         sched_domain_init.span = CPU_MASK_ALL;
4420         sched_domain_init.groups = &sched_group_init;
4421         sched_domain_init.last_balance = jiffies;
4422         sched_domain_init.balance_interval = INT_MAX; /* Don't balance */
4423         sched_domain_init.busy_factor = 1;
4424
4425         memset(&sched_group_init, 0, sizeof(struct sched_group));
4426         sched_group_init.cpumask = CPU_MASK_ALL;
4427         sched_group_init.next = &sched_group_init;
4428         sched_group_init.cpu_power = SCHED_LOAD_SCALE;
4429 #endif
4430         init_cpu_classes();
4431
4432         for (i = 0; i < NR_CPUS; i++) {
4433 #ifndef CONFIG_CKRM_CPU_SCHEDULE
4434                 int j, k;
4435                 prio_array_t *array;
4436
4437                 rq = cpu_rq(i);
4438                 spin_lock_init(&rq->lock);
4439
4440                 for (j = 0; j < 2; j++) {
4441                         array = rq->arrays + j;
4442                         for (k = 0; k < MAX_PRIO; k++) {
4443                                 INIT_LIST_HEAD(array->queue + k);
4444                                 __clear_bit(k, array->bitmap);
4445                         }
4446                         // delimiter for bitsearch
4447                         __set_bit(MAX_PRIO, array->bitmap);
4448                 }
4449
4450                 rq->active = rq->arrays;
4451                 rq->expired = rq->arrays + 1;
4452                 rq->best_expired_prio = MAX_PRIO;
4453 #else
4454                 rq = cpu_rq(i);
4455                 spin_lock_init(&rq->lock);
4456 #endif
4457
4458 #ifdef CONFIG_SMP
4459                 rq->sd = &sched_domain_init;
4460                 rq->cpu_load = 0;
4461 #ifdef CONFIG_CKRM_CPU_SCHEDULE
4462                 ckrm_load_init(rq_ckrm_load(rq));
4463 #endif
4464                 rq->active_balance = 0;
4465                 rq->push_cpu = 0;
4466                 rq->migration_thread = NULL;
4467                 INIT_LIST_HEAD(&rq->migration_queue);
4468 #endif
4469 #ifdef  CONFIG_VSERVER_HARDCPU          
4470                 INIT_LIST_HEAD(&rq->hold_queue);
4471 #endif
4472                 atomic_set(&rq->nr_iowait, 0);
4473         }
4474
4475         /*
4476          * We have to do a little magic to get the first
4477          * thread right in SMP mode.
4478          */
4479         rq = this_rq();
4480         rq->curr = current;
4481         rq->idle = current;
4482         set_task_cpu(current, smp_processor_id());
4483 #ifdef CONFIG_CKRM_CPU_SCHEDULE
4484         cpu_demand_event(&(current)->demand_stat,CPU_DEMAND_INIT,0);
4485         current->cpu_class = get_default_cpu_class();
4486         current->array = NULL;
4487 #endif
4488         wake_up_forked_process(current);
4489
4490         /*
4491          * The boot idle thread does lazy MMU switching as well:
4492          */
4493         atomic_inc(&init_mm.mm_count);
4494         enter_lazy_tlb(&init_mm, current);
4495 }
4496
4497 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4498 void __might_sleep(char *file, int line, int atomic_depth)
4499 {
4500 #if defined(in_atomic)
4501         static unsigned long prev_jiffy;        /* ratelimiting */
4502
4503 #ifndef CONFIG_PREEMPT
4504         atomic_depth = 0;
4505 #endif
4506         if ((in_atomic() || irqs_disabled()) &&
4507             system_state == SYSTEM_RUNNING) {
4508                 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
4509                         return;
4510                 prev_jiffy = jiffies;
4511                 printk(KERN_ERR "Debug: sleeping function called from invalid"
4512                                 " context at %s:%d\n", file, line);
4513                 printk("in_atomic():%d, irqs_disabled():%d\n",
4514                         in_atomic(), irqs_disabled());
4515                 dump_stack();
4516         }
4517 #endif
4518 }
4519 EXPORT_SYMBOL(__might_sleep);
4520 #endif
4521
4522
4523 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
4524 /*
4525  * This could be a long-held lock.  If another CPU holds it for a long time,
4526  * and that CPU is not asked to reschedule then *this* CPU will spin on the
4527  * lock for a long time, even if *this* CPU is asked to reschedule.
4528  *
4529  * So what we do here, in the slow (contended) path is to spin on the lock by
4530  * hand while permitting preemption.
4531  *
4532  * Called inside preempt_disable().
4533  */
4534 void __sched __preempt_spin_lock(spinlock_t *lock)
4535 {
4536         if (preempt_count() > 1) {
4537                 _raw_spin_lock(lock);
4538                 return;
4539         }
4540         do {
4541                 preempt_enable();
4542                 while (spin_is_locked(lock))
4543                         cpu_relax();
4544                 preempt_disable();
4545         } while (!_raw_spin_trylock(lock));
4546 }
4547
4548 EXPORT_SYMBOL(__preempt_spin_lock);
4549
4550 void __sched __preempt_write_lock(rwlock_t *lock)
4551 {
4552         if (preempt_count() > 1) {
4553                 _raw_write_lock(lock);
4554                 return;
4555         }
4556
4557         do {
4558                 preempt_enable();
4559                 while (rwlock_is_locked(lock))
4560                         cpu_relax();
4561                 preempt_disable();
4562         } while (!_raw_write_trylock(lock));
4563 }
4564
4565 EXPORT_SYMBOL(__preempt_write_lock);
4566 #endif /* defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) */
4567
4568 #ifdef CONFIG_DELAY_ACCT
4569 int task_running_sys(struct task_struct *p)
4570 {
4571        return task_running(task_rq(p),p);
4572 }
4573 EXPORT_SYMBOL(task_running_sys);
4574 #endif
4575
4576 #ifdef CONFIG_CKRM_CPU_SCHEDULE
4577
4578 /********************************************************************
4579  *
4580  *  CKRM Scheduler additions
4581  * 
4582  *  (a) helper functions
4583  *  (b) load balancing code
4584  *
4585  *  These are required here to avoid having to externalize many
4586  *  of the definitions in sched.c
4587  *
4588  * 
4589  ********************************************************************/
4590
4591 /**
4592  * return the classqueue object of a certain processor
4593  */
4594 struct classqueue_struct * get_cpu_classqueue(int cpu)
4595 {
4596         return (& (cpu_rq(cpu)->classqueue) );
4597 }
4598
4599 /**
4600  * _ckrm_cpu_change_class - change the class of a task
4601  */
4602 void _ckrm_cpu_change_class(task_t *tsk, struct ckrm_cpu_class *newcls)
4603 {
4604         prio_array_t *array;
4605         struct runqueue *rq;
4606         unsigned long flags;
4607
4608         rq = task_rq_lock(tsk,&flags); 
4609         array = tsk->array;
4610         if (array) {
4611                 dequeue_task(tsk,array);
4612                 tsk->cpu_class = newcls;
4613                 enqueue_task(tsk,rq_active(tsk,rq));
4614         } else
4615                 tsk->cpu_class = newcls;
4616
4617         task_rq_unlock(rq,&flags);
4618 }
4619
4620 /**
4621  * get_min_cvt_locking - get the minimum cvt on a particular cpu under rqlock
4622  */
4623
4624 CVT_t get_min_cvt(int cpu);
4625
4626 CVT_t get_min_cvt_locking(int cpu)
4627 {
4628         CVT_t cvt;
4629         struct runqueue *rq = cpu_rq(cpu);
4630         spin_lock(&rq->lock);
4631         cvt = get_min_cvt(cpu);
4632         spin_unlock(&rq->lock);
4633         return cvt;
4634 }
4635
4636 ckrm_lrq_t *rq_get_dflt_lrq(int cpu)
4637 {
4638         return &(cpu_rq(cpu)->dflt_lrq);
4639 }
4640
4641 #ifdef CONFIG_SMP
4642
4643 /**************  CKRM Load Balancing code ************************/
4644
4645 static inline int ckrm_preferred_task(task_t *tmp,long min, long max, 
4646                                       int phase, enum idle_type idle)
4647 {
4648         long pressure = task_load(tmp);
4649         
4650         if (pressure > max) 
4651                 return 0;
4652
4653         if ((idle == NOT_IDLE) && ! phase && (pressure <= min))
4654                 return 0;
4655         return 1;
4656 }
4657
4658 /*
4659  * move tasks for a specific local class
4660  * return number of tasks pulled
4661  */
4662 static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq,
4663                                       runqueue_t *this_rq,
4664                                       runqueue_t *busiest,
4665                                       struct sched_domain *sd,
4666                                       int this_cpu,
4667                                       enum idle_type idle,
4668                                       long* pressure_imbalance) 
4669 {
4670         prio_array_t *array, *dst_array;
4671         struct list_head *head, *curr;
4672         task_t *tmp;
4673         int idx;
4674         int pulled = 0;
4675         int phase = -1;
4676         long pressure_min, pressure_max;
4677         /*hzheng: magic : 90% balance is enough*/
4678         long balance_min = *pressure_imbalance / 10; 
4679 /*
4680  * we don't want to migrate tasks that will reverse the balance
4681  *     or tasks that make too small a difference
4682  */
4683 #define CKRM_BALANCE_MAX_RATIO  100
4684 #define CKRM_BALANCE_MIN_RATIO  1
4685  start:
4686         phase ++;
4687         /*
4688          * We first consider expired tasks. Those will likely not be
4689          * executed in the near future, and they are most likely to
4690          * be cache-cold, thus switching CPUs has the least effect
4691          * on them.
4692          */
4693         if (src_lrq->expired->nr_active) {
4694                 array = src_lrq->expired;
4695                 dst_array = dst_lrq->expired;
4696         } else {
4697                 array = src_lrq->active;
4698                 dst_array = dst_lrq->active;
4699         }
4700         
4701  new_array:
4702         /* Start searching at priority 0: */
4703         idx = 0;
4704  skip_bitmap:
4705         if (!idx)
4706                 idx = sched_find_first_bit(array->bitmap);
4707         else
4708                 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
4709         if (idx >= MAX_PRIO) {
4710                 if (array == src_lrq->expired && src_lrq->active->nr_active) {
4711                         array = src_lrq->active;
4712                         dst_array = dst_lrq->active;
4713                         goto new_array;
4714                 }
4715                 if ((! phase) && (! pulled) && (idle != IDLE))
4716                         goto start; //try again
4717                 else 
4718                         goto out; //finished search for this lrq
4719         }
4720         
4721         head = array->queue + idx;
4722         curr = head->prev;
4723  skip_queue:
4724         tmp = list_entry(curr, task_t, run_list);
4725         
4726         curr = curr->prev;
4727         
4728         if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
4729                 if (curr != head)
4730                         goto skip_queue;
4731                 idx++;
4732                 goto skip_bitmap;
4733         }
4734
4735         pressure_min = *pressure_imbalance * CKRM_BALANCE_MIN_RATIO/100;
4736         pressure_max = *pressure_imbalance * CKRM_BALANCE_MAX_RATIO/100;
4737         /*
4738          * skip the tasks that will reverse the balance too much
4739          */
4740         if (ckrm_preferred_task(tmp,pressure_min,pressure_max,phase,idle)) {
4741                 *pressure_imbalance -= task_load(tmp);
4742                 pull_task(busiest, array, tmp, 
4743                           this_rq, dst_array, this_cpu);
4744                 pulled++;
4745
4746                 if (*pressure_imbalance <= balance_min)
4747                         goto out;
4748         }
4749                 
4750         if (curr != head)
4751                 goto skip_queue;
4752         idx++;
4753         goto skip_bitmap;
4754  out:          
4755         return pulled;
4756 }
4757
4758 static inline long ckrm_rq_imbalance(runqueue_t *this_rq,runqueue_t *dst_rq)
4759 {
4760         long imbalance;
4761         /*
4762          * make sure that after balancing, imbalance' > -imbalance/2:
4763          * we don't want the imbalance to be reversed too much
4764          */
4765         imbalance = ckrm_get_pressure(rq_ckrm_load(dst_rq),0) 
4766                 - ckrm_get_pressure(rq_ckrm_load(this_rq),1);
4767         imbalance /= 2;
4768         return imbalance;
4769 }
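/*
 * Worked example (illustrative, assuming comparable pressure readings
 * on both sides): if the destination (busiest) runqueue reports a
 * pressure of 800 and this runqueue reports 400, the value returned is
 * (800 - 400) / 2 = 200.  Moving roughly 200 units of pressure brings
 * both sides toward ~600, so the imbalance cannot be reversed past the
 * midpoint.
 */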
4770
4771 /*
4772  * try to balance the two runqueues
4773  *
4774  * Called with both runqueues locked.
4775  * If ckrm_cls_move_tasks() is called, it will try to move at least one task over.
4776  */
4777 static int ckrm_move_tasks(runqueue_t *this_rq, int this_cpu, 
4778                            runqueue_t *busiest,
4779                            unsigned long max_nr_move, struct sched_domain *sd,
4780                            enum idle_type idle)
4781 {
4782         struct ckrm_cpu_class *clsptr,*vip_cls = NULL;
4783         ckrm_lrq_t* src_lrq,*dst_lrq;
4784         long pressure_imbalance, pressure_imbalance_old;
4785         int src_cpu = task_cpu(busiest->curr);
4786         struct list_head *list;
4787         int pulled = 0;
4788         long imbalance;
4789
4790         imbalance =  ckrm_rq_imbalance(this_rq,busiest);
4791
4792         if ((idle == NOT_IDLE && imbalance <= 0) || busiest->nr_running <= 1)
4793                 goto out;
4794
4795         //find the vip class: the class with the largest cpu weight that has runnable tasks
4796         list_for_each_entry(clsptr,&active_cpu_classes,links) {
4797                 src_lrq = get_ckrm_lrq(clsptr,src_cpu);
4798
4799                 if (! lrq_nr_running(src_lrq))
4800                         continue;
4801
4802                 if (!vip_cls || cpu_class_weight(vip_cls) < cpu_class_weight(clsptr))
4803                         vip_cls = clsptr;
4806         }
4807
4808         /*
4809          * search starting from the most significant class;
4810          * hopefully, fewer tasks will be migrated this way
4811          */
4812         clsptr = vip_cls;
4813
4814  move_class:
4815         if (! clsptr)
4816                 goto out;
4817         
4818
4819         src_lrq = get_ckrm_lrq(clsptr,src_cpu);
4820         if (! lrq_nr_running(src_lrq))
4821                 goto other_class;
4822         
4823         dst_lrq = get_ckrm_lrq(clsptr,this_cpu);
4824
4825         //how much pressure should be transferred for this class
4826         pressure_imbalance = (src_lrq->lrq_load * imbalance)/WEIGHT_TO_SHARE(src_lrq->local_weight);
4827         if (pulled && ! pressure_imbalance) 
4828                 goto other_class;
4829         
4830         pressure_imbalance_old = pressure_imbalance;
4831         
4832         //move tasks
4833         pulled += 
4834                 ckrm_cls_move_tasks(src_lrq,dst_lrq,
4835                                     this_rq,
4836                                     busiest,
4837                                     sd,this_cpu,idle,
4838                                     &pressure_imbalance);
4839
4840         /* 
4841          * hzheng: the shift by 2 is another magic number:
4842          * stop balancing once the remaining imbalance is at most 25% of the original
4843          */
4844         if (pressure_imbalance <= (pressure_imbalance_old >> 2))
4845                 goto out;
4846                 
4847         //update imbalance
4848         imbalance *= pressure_imbalance / pressure_imbalance_old;
4849  other_class:
4850         //who is next?
4851         list = clsptr->links.next;
4852         if (list == &active_cpu_classes)
4853                 list = list->next;
4854         clsptr = list_entry(list, typeof(*clsptr), links);
4855         if (clsptr != vip_cls)
4856                 goto move_class;
4857  out:
4858         return pulled;
4859 }
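/*
 * Notes on the class walk above (descriptive):
 *  - classes are visited starting from vip_cls (the class with the
 *    largest cpu weight that has runnable tasks on src_cpu) and then
 *    follow the circular active_cpu_classes list, skipping the list
 *    head, until vip_cls comes around again;
 *  - for each class the runqueue-wide imbalance is scaled into a
 *    per-class pressure target via lrq_load and
 *    WEIGHT_TO_SHARE(local_weight) (defined elsewhere);
 *  - after each class the remaining imbalance is rescaled by the
 *    fraction of that per-class target still unresolved:
 *    imbalance *= pressure_imbalance / pressure_imbalance_old.
 */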
4860
4861 /**
4862  * ckrm_check_balance - is load balancing necessary?
4863  * Return 0 if load balancing is not necessary;
4864  * otherwise return the average load of the system.
4865  * Also updates nr_group.
4866  *
4867  * heuristics:
4868  *   no load balancing if its load is over the average
4869  *   no load balancing if its load is already far more than the min
4870  * task:
4871  *   read the status of all the runqueues
4872  */
4873 static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu,
4874                                              enum idle_type idle, int* nr_group)
4875 {
4876         struct sched_group *group = sd->groups;
4877         unsigned long min_load, max_load, avg_load;
4878         unsigned long total_load, this_load, total_pwr;
4879
4880         max_load = this_load = total_load = total_pwr = 0;
4881         min_load = 0xFFFFFFFF;
4882         *nr_group = 0;
4883
4884         do {
4885                 cpumask_t tmp;
4886                 unsigned long load;
4887                 int local_group;
4888                 int i, nr_cpus = 0;
4889
4890                 /* Tally up the load of all CPUs in the group */
4891                 cpus_and(tmp, group->cpumask, cpu_online_map);
4892                 if (unlikely(cpus_empty(tmp)))
4893                         goto nextgroup;
4894
4895                 avg_load = 0;
4896                 local_group = cpu_isset(this_cpu, group->cpumask);
4897
4898                 for_each_cpu_mask(i, tmp) {
4899                         load = ckrm_get_pressure(rq_ckrm_load(cpu_rq(i)),local_group);
4900                         nr_cpus++;
4901                         avg_load += load;
4902                 }
4903
4904                 if (!nr_cpus)
4905                         goto nextgroup;
4906
4907                 total_load += avg_load;
4908                 total_pwr += group->cpu_power;
4909
4910                 /* Adjust by relative CPU power of the group */
4911                 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
4912
4913                 if (local_group) {
4914                         this_load = avg_load;
4915                         goto nextgroup;
4916                 } else if (avg_load > max_load) {
4917                         max_load = avg_load;
4918                 }      
4919                 if (avg_load < min_load) {
4920                         min_load = avg_load;
4921                 }
4922 nextgroup:
4923                 group = group->next;
4924                 *nr_group = *nr_group + 1;
4925         } while (group != sd->groups);
4926
4927         if (!max_load || this_load >= max_load)
4928                 goto out_balanced;
4929
4930         avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
4931
4932         /* hzheng: debugging: 105 is a magic number;
4933          * the check should be 100*max_load <= sd->imbalance_pct*this_load,
4934          * i.e. imbalance_pct should be used instead of the hard-coded value
4935          */
4936         if (this_load > avg_load 
4937             || 100*max_load < 105*this_load
4938             || 100*min_load < 70*this_load
4939             )
4940                 goto out_balanced;
4941
4942         return avg_load;
4943  out_balanced:
4944         return 0;
4945 }
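/*
 * Worked example of the thresholds above (illustrative): with
 * this_load == 400, the function reports "balanced" (returns 0) unless
 * this_load is no higher than the power-adjusted average, some remote
 * group carries at least 5% more load (max_load >= 420 with the
 * hard-coded 105), and the least loaded group carries at least 70% of
 * this one (min_load >= 280).  When all of that holds, the system-wide
 * average load is returned for ckrm_find_busy_queue() to use.
 */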
4946
4947 /**
4948  * ckrm_find_busy_queue - any group with above-average load is considered busy;
4949  * find the busiest runqueue from one of the busy groups
4950  */
4951 static runqueue_t *
4952 ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu,
4953                      unsigned long avg_load, enum idle_type idle,
4954                      int nr_group)
4955 {
4956         struct sched_group *group;
4957         runqueue_t * busiest=NULL;
4958         unsigned long rand;
4959         
4960         group = sd->groups;
4961         rand = get_ckrm_rand(nr_group);
4962         nr_group = 0;
4963
4964         do {
4965                 unsigned long load,total_load,max_load;
4966                 cpumask_t tmp;
4967                 int i;
4968                 runqueue_t * grp_busiest;
4969
4970                 cpus_and(tmp, group->cpumask, cpu_online_map);
4971                 if (unlikely(cpus_empty(tmp)))
4972                         goto find_nextgroup;
4973
4974                 total_load = 0;
4975                 max_load = 0;
4976                 grp_busiest = NULL;
4977                 for_each_cpu_mask(i, tmp) {
4978                         load = ckrm_get_pressure(rq_ckrm_load(cpu_rq(i)),0);
4979                         total_load += load;
4980                         if (load > max_load) {
4981                                 max_load = load;
4982                                 grp_busiest = cpu_rq(i);
4983                         }                               
4984                 }
4985
4986                 total_load = (total_load * SCHED_LOAD_SCALE) / group->cpu_power;
4987                 if (total_load > avg_load) {
4988                         busiest = grp_busiest;
4989                         if (nr_group >= rand)
4990                                 break;
4991                 }
4992         find_nextgroup:         
4993                 group = group->next;
4994                 nr_group ++;
4995         } while (group != sd->groups);
4996
4997         return busiest;
4998 }
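/*
 * Note on the randomization above: get_ckrm_rand(nr_group) presumably
 * yields an index in [0, nr_group).  The scan remembers the busiest
 * runqueue of each group whose power-adjusted load exceeds avg_load and
 * stops at the first such group at or beyond the random index, so
 * successive balance attempts spread their pulls across different busy
 * groups rather than always picking the same one.
 */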
4999
5000 /**
5001  * ckrm_load_balance_locked - pressure-based load balancing algorithm used by CKRM
5002  */
5003 static int ckrm_load_balance_locked(int this_cpu, runqueue_t *this_rq,
5004                                     struct sched_domain *sd, 
5005                                     enum idle_type idle)
5006 {
5007         runqueue_t *busiest;
5008         unsigned long avg_load;
5009         int nr_moved,nr_group;
5010
5011         avg_load = ckrm_check_balance(sd, this_cpu, idle, &nr_group);
5012         if (! avg_load)
5013                 goto out_balanced;
5014
5015         busiest = ckrm_find_busy_queue(sd,this_cpu,avg_load,idle,nr_group);
5016         if (! busiest)
5017                 goto out_balanced;
5018         /*
5019          * This should be "impossible", but since load
5020          * balancing is inherently racy and statistical,
5021          * it could happen in theory.
5022          */
5023         if (unlikely(busiest == this_rq)) {
5024                 WARN_ON(1);
5025                 goto out_balanced;
5026         }
5027
5028         nr_moved = 0;
5029         if (busiest->nr_running > 1) {
5030                 /*
5031                  * Attempt to move tasks. If find_busiest_group has found
5032                  * an imbalance but busiest->nr_running <= 1, the group is
5033                  * still unbalanced. nr_moved simply stays zero, so it is
5034                  * correctly treated as an imbalance.
5035                  */
5036                 double_lock_balance(this_rq, busiest);
5037                 nr_moved = ckrm_move_tasks(this_rq, this_cpu, busiest,
5038                                            0,sd, idle);         
5039                 spin_unlock(&busiest->lock);
5040                 if (nr_moved) {
5041                         adjust_local_weight();
5042                 }
5043         }
5044
5045         if (!nr_moved) 
5046                 sd->nr_balance_failed++;
5047         else
5048                 sd->nr_balance_failed = 0;
5049
5050         /* We were unbalanced, so reset the balancing interval */
5051         sd->balance_interval = sd->min_interval;
5052
5053         return nr_moved;
5054
5055 out_balanced:
5056         /* tune up the balancing interval */
5057         if (sd->balance_interval < sd->max_interval)
5058                 sd->balance_interval *= 2;
5059
5060         return 0;
5061 }
5062
5063 static inline int ckrm_load_balance(int this_cpu, runqueue_t *this_rq,
5064                                     struct sched_domain *sd, 
5065                                     enum idle_type idle)
5066 {
5067         int ret;
5068
5069         if (ckrm_rq_cpu_disabled(this_rq)) 
5070                 return -1;
5071         //spin_lock(&this_rq->lock);
5072         read_lock(&class_list_lock);
5073         ret = ckrm_load_balance_locked(this_cpu,this_rq,sd,idle);
5074         // ret = ckrm_load_balance_locked(this_cpu,this_rq,sd,NEWLY_IDLE);
5075         read_unlock(&class_list_lock);
5076         //spin_unlock(&this_rq->lock);
5077         return ret;
5078 }
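/*
 * Locking note (descriptive): ckrm_load_balance_locked() uses
 * double_lock_balance(), which expects this_rq->lock to be held
 * already, so this wrapper is presumably entered with the runqueue
 * lock taken by its caller; class_list_lock is read-taken here to keep
 * the active_cpu_classes walk in ckrm_move_tasks() stable.
 */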
5079
5080 #endif   // CONFIG_SMP
5081
5082
5083 void ckrm_cpu_class_queue_update(int on)
5084 {
5085         /* This is called when the mode changes from disabled
5086          * to enabled (on=1) or vice versa (on=0).
5087          * We make sure that all classqueues on all cpus
5088          * either have the default class enqueued (on=1) or
5089          * all classes dequeued (on=0).
5090          * If this is not done, a race condition will persist
5091          * when flipping ckrm_sched_mode, and it would lead to
5092          * more complicated code in rq_get_next_task, where,
5093          * despite knowing of runnable tasks, we could not find
5094          * an enqueued class.
5095          */
5096
5097         int i;
5098         runqueue_t *rq;
5099         ckrm_lrq_t *lrq;
5100         struct ckrm_cpu_class *clsptr;
5101
5102         if (on) {       
5103                 BUG_ON(ckrm_cpu_enabled());
5104                 for_each_cpu(i) {
5105                         rq = cpu_rq(i);
5106                         BUG_ON(ckrm_rq_cpu_enabled(rq));
5107                         lrq = &rq->dflt_lrq;
5108                         spin_lock(&rq->lock);
5109
5110                         BUG_ON(cls_in_classqueue(&lrq->classqueue_linkobj));
5111
5112                         classqueue_init(&rq->classqueue,1);
5113                         lrq->top_priority = find_first_bit(lrq->active->bitmap,
5114                                                            MAX_PRIO);
5115                         classqueue_enqueue(lrq->classqueue, 
5116                                            &lrq->classqueue_linkobj, 0);
5117                         spin_unlock(&rq->lock);
5118 #if 0
5119                         printk("UPDATE(%d) run=%lu:%d:%d %d:%d->%d\n", i,
5120                                 rq->nr_running,lrq->active->nr_active,
5121                                 lrq->expired->nr_active,
5122                                 find_first_bit(lrq->active->bitmap,MAX_PRIO),
5123                                 find_first_bit(lrq->expired->bitmap,MAX_PRIO),
5124                                 lrq->top_priority);
5125 #endif
5126                 }
5127         } else {
5128                 for_each_cpu(i) {
5129                         rq = cpu_rq(i);
5130                         spin_lock(&rq->lock);
5131
5132                         /* walk through all classes and make sure they
5133                          * are not enqueued
5134                          */
5135                         write_lock(&class_list_lock);
5136                         list_for_each_entry(clsptr,&active_cpu_classes,links) {
5137                                 lrq = get_ckrm_lrq(clsptr,i);
5138                                 BUG_ON((lrq != &rq->dflt_lrq) && lrq_nr_running(lrq));  // must be empty
5139                                 if (cls_in_classqueue(&lrq->classqueue_linkobj)) 
5140                                         classqueue_dequeue(lrq->classqueue,
5141                                                         &lrq->classqueue_linkobj);
5142                         }
5143                         rq->classqueue.enabled = 0;
5144                         write_unlock(&class_list_lock);
5145                         spin_unlock(&rq->lock);
5146                 }
5147         }
5148 }
5149
5150 /*
5151  * callback when a class is being deleted:
5152  * it needs to be removed from the class runqueue; see ckrm_cpu_class_queue_update()
5153  */
5154
5155 void ckrm_cpu_class_queue_delete_sync(struct ckrm_cpu_class *clsptr)
5156 {
5157         int i;
5158         
5159         for_each_cpu(i) {
5160                 runqueue_t *rq = cpu_rq(i);
5161                 ckrm_lrq_t *lrq = get_ckrm_lrq(clsptr,i);
5162
5163                 spin_lock(&rq->lock);
5164                 write_lock(&class_list_lock);
5165                 BUG_ON(lrq_nr_running(lrq));  // must be empty
5166                 if (cls_in_classqueue(&lrq->classqueue_linkobj)) 
5167                         classqueue_dequeue(lrq->classqueue,
5168                                            &lrq->classqueue_linkobj);
5169                 write_unlock(&class_list_lock);
5170                 spin_unlock(&rq->lock);
5171         }
5172 }
5173
5174 #endif  // CONFIG_CKRM_CPU_SCHEDULE