/* ckrm_cpu_monitor.c - Hierarchical CKRM CPU Resource Monitor
 *
 * Copyright (C) Haoqiang Zheng,  IBM Corp. 2004
 *           (C) Hubertus Franke, IBM Corp. 2004
 *
 * Latest version, more details at http://ckrm.sf.net
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

/* Changes
 *
 * 23 June 2004: Created
 *
 */
#include <linux/module.h>
#include <linux/init.h>
#include <asm/errno.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/ckrm.h>
#include <linux/ckrm_rc.h>
#include <linux/ckrm_tc.h>
#include <asm/div64.h>
#include <linux/ckrm_sched.h>

#warning MEF I cannot believe that vserver changes force the following include statement: FIX THIS!
#include <linux/vs_cvirt.h>
#define CPU_MONITOR_INTERVAL (HZ) /*how often do we adjust the shares*/
#define CKRM_SHARE_MAX (1<<CKRM_SHARE_ACCURACY)

#define CKRM_CPU_DEMAND_RUN 0
#define CKRM_CPU_DEMAND_SLEEP 1
/* recalculate task cpu demand every 64ms, class cpu demand every 256ms */
#define CPU_DEMAND_TASK_RECALC  (64000000LL)
#define CPU_DEMAND_CLASS_RECALC (256000000LL)
#define CPU_DEMAND_TP_CLASS 0
#define CPU_DEMAND_TP_TASK 1
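
/*
 * Note: shares and demands in this file are kept as fixed-point fractions of
 * CKRM_SHARE_MAX (1 << CKRM_SHARE_ACCURACY); a value of CKRM_SHARE_MAX stands
 * for the whole CPU resource, CKRM_SHARE_MAX/2 for half of it.
 */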
extern struct ckrm_cpu_class *ckrm_get_cpu_class(struct ckrm_core_class *core);
void update_ckrm_idle(unsigned long surplus);

/*interface to share definition*/
static inline int get_soft_limit(struct ckrm_cpu_class *cls)
{
        return cls->shares.my_limit;
}

static inline int get_mysoft_limit(struct ckrm_cpu_class *cls)
{
        return cls->shares.total_guarantee;
}

static inline int get_hard_limit(struct ckrm_cpu_class *cls)
{
        return cls->shares.total_guarantee;
}

static inline int get_myhard_limit(struct ckrm_cpu_class *cls)
{
        return cls->shares.total_guarantee;
}

static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, int type)
{
        unsigned long long now = sched_clock();

        local_stat->run = 0;
        local_stat->total = 0;
        local_stat->last_sleep = now;
        switch (type) {
        case CPU_DEMAND_TP_CLASS:
                local_stat->recalc_interval = CPU_DEMAND_CLASS_RECALC;
                local_stat->cpu_demand = 0;
                break;
        case CPU_DEMAND_TP_TASK:
                local_stat->recalc_interval = CPU_DEMAND_TASK_RECALC;
                //for a task, the initial cpu_demand is copied from its parent
                break;
        default:
                BUG();
        }
}

void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat)
{
        int i;

        stat->stat_lock = SPIN_LOCK_UNLOCKED;
        stat->total_ns = 0;
        stat->max_demand = 0;

        for (i=0; i<NR_CPUS; i++) {
                cpu_demand_stat_init(&stat->local_stats[i],CPU_DEMAND_TP_CLASS);
        }

        stat->egrt = 0;
        stat->megrt = 0;
        stat->ehl = CKRM_SHARE_MAX; /*default: no limit*/
        stat->mehl = CKRM_SHARE_MAX; /*default: no limit*/

        stat->eshare = CKRM_SHARE_MAX;
        stat->meshare = CKRM_SHARE_MAX;
}

/**********************************************/
/*          cpu demand                        */
/**********************************************/

/*
 * How CPU demand is calculated:
 * consider the class local runqueue (clr) first
 * at any time, a clr is in one of three states
 * -- run: a task belonging to this class is running on this cpu
 * -- wait: at least one of its tasks is runnable, but the class is not running
 * -- sleep: none of the tasks of this class is runnable
 *
 * over an interval (t1,t2), with r = time spent running and s = time spent
 * sleeping:
 *    cpu_demand(t1,t2) = r(t1,t2) / (r(t1,t2) + s(t1,t2))
 *
 * the cpu_demand of a class is the average of the cpu_demand of all its
 * class local runqueues (see get_self_cpu_demand())
 */
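
/*
 * Worked example (based on the formula above): if, over a sampling period,
 * a class local runqueue ran for 40ms and slept for 60ms, then
 * cpu_demand = 40 / (40 + 60) = 40% of the CPU.  Time spent waiting on the
 * runqueue (runnable but not running) counts in neither r nor s, so the
 * demand reflects what the class would consume if it never had to wait.
 */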

/**
 * update_cpu_demand_stat - update the cpu demand record of a local queue
 *
 * should be called whenever the state of a task/class local queue changes
 * -- on deschedule: report how long the task ran
 * -- on enqueue: report how long the queue slept
 *
 * recalc_interval controls how often the cpu demand is recalculated
 * (the value is in ns)
 */
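/*
 * Note on the fixed-point arithmetic below: total is shifted right by
 * CKRM_SHARE_ACCURACY before the divide, so run/total comes out already
 * scaled to CKRM_SHARE_MAX (e.g. run = 40ms and total = 100ms yield roughly
 * 0.4 * CKRM_SHARE_MAX).  The new sample is then averaged with the previous
 * value, i.e. cpu_demand is an exponential moving average with weight 1/2.
 * The 0xFFFFFFFF clamps guard against overflow after very long run/sleep
 * periods; do_div() requires a 32-bit divisor, hence the cast on total.
 */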
static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat,int state, unsigned long long len)
{
        local_stat->total += len;
        if (state == CKRM_CPU_DEMAND_RUN)
                local_stat->run += len;

        if (local_stat->total >= local_stat->recalc_interval) {
                local_stat->total >>= CKRM_SHARE_ACCURACY;
                if (unlikely(local_stat->run > 0xFFFFFFFF))
                        local_stat->run = 0xFFFFFFFF;

                if (local_stat->total > 0xFFFFFFFF)
                        local_stat->total = 0xFFFFFFFF;

                do_div(local_stat->run,(unsigned long)local_stat->total);

                if (local_stat->total > 0xFFFFFFFF) //happens after very long sleep
                        local_stat->cpu_demand = local_stat->run;
                else {
                        local_stat->cpu_demand += local_stat->run;
                        local_stat->cpu_demand >>= 1;
                }
                local_stat->total = 0;
                local_stat->run = 0;
        }
}

/**
 * cpu_demand_event - a cpu_demand event occurred
 * @event: one of the following events:
 *   CPU_DEMAND_ENQUEUE: local class enqueue
 *   CPU_DEMAND_DEQUEUE: local class dequeue
 *   CPU_DEMAND_DESCHEDULE: a task belonging to this local class is descheduled
 *   CPU_DEMAND_INIT: initialization (tasks only)
 * @len: valid only for CPU_DEMAND_DESCHEDULE; how long the task has run
 */
void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len)
{
        switch (event) {
        case CPU_DEMAND_ENQUEUE:
                len = sched_clock() - local_stat->last_sleep;
                local_stat->last_sleep = 0;
                update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,len);
                break;
        case CPU_DEMAND_DEQUEUE:
                if (! local_stat->last_sleep) {
                        local_stat->last_sleep = sched_clock();
                }
                break;
        case CPU_DEMAND_DESCHEDULE:
                update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_RUN,len);
                break;
        case CPU_DEMAND_INIT: //for task init only
                cpu_demand_stat_init(local_stat,CPU_DEMAND_TP_TASK);
                break;
        default:
                BUG();
        }
}

/**
 * check all the class local queues
 *
 * deals with excessively long run/sleep states:
 * whenever ckrm_cpu_monitor is called, check if the class is in the sleep
 * state; if so, update the sleep record
 */
static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu)
{
        struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu];
        unsigned long long sleep,now;
        if (local_stat->last_sleep) {
                now = sched_clock();
                sleep = now - local_stat->last_sleep;
                local_stat->last_sleep = now;
                update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep);
        }
}

/**
 * get_self_cpu_demand - get cpu demand of the class itself (excluding children)
 *
 * self_cpu_demand = average(cpu demand of all local queues)
 */
static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat)
{
        int cpu_demand = 0;
        int i;
        int cpuonline = 0;

        for_each_online_cpu(i) {
                cpu_demand_check_sleep(stat,i);
                cpu_demand += stat->local_stats[i].cpu_demand;
                cpuonline ++;
        }

        return (cpu_demand/cpuonline);
}

/*
 * my max demand = min(cpu_demand, my effective hard limit)
 */
static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat)
{
        unsigned long mmax_demand = get_self_cpu_demand(stat);
        if (mmax_demand > stat->mehl)
                mmax_demand = stat->mehl;

        return mmax_demand;
}

/**
 * update_max_demand: update effective cpu demand for each class
 * return -1 on error
 *
 * Assume: the root_core->parent == NULL
 */
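/*
 * Traversal note: the walk below is an iterative depth-first traversal of
 * the class tree; child_core acts as the cursor and hnode.parent is used to
 * climb back up.  A class's max_demand ends up as its own demand plus the
 * sum of its children's max_demand, capped by its effective hard limit.
 */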
static int update_max_demand(struct ckrm_core_class *root_core)
{
        struct ckrm_core_class *cur_core, *child_core;
        struct ckrm_cpu_class *cls,*c_cls;
        int ret = -1;

        cur_core = root_core;
        child_core = NULL;

 repeat:
        if (!cur_core) { //normal exit
                ret = 0;
                goto out;
        }

        cls = ckrm_get_cpu_class(cur_core);
        if (! cls) //invalid cls, abort
                goto out;

        if (!child_core)        //first child
                cls->stat.max_demand = get_mmax_demand(&cls->stat);
        else {
                c_cls = ckrm_get_cpu_class(child_core);
                if (c_cls)
                        cls->stat.max_demand += c_cls->stat.max_demand;
                else //invalid c_cls, abort
                        goto out;
        }

        //check class hard limit
        if (cls->stat.max_demand > cls->stat.ehl)
                cls->stat.max_demand = cls->stat.ehl;

        //next child
        child_core = ckrm_get_next_child(cur_core, child_core);
        if (child_core) {
                //go down
                cur_core = child_core;
                child_core = NULL;
                goto repeat;
        } else {                //no more children, go back up
                child_core = cur_core;
                cur_core = child_core->hnode.parent;
        }
        goto repeat;
 out:
        return ret;
}

/**********************************************/
/*          effective guarantee & limit       */
/**********************************************/
static inline void set_eshare(struct ckrm_cpu_class_stat *stat,
                                       int new_share)
{
        if (!new_share)
                new_share = 1;

        BUG_ON(new_share < 0);
        stat->eshare = new_share;
}

static inline void set_meshare(struct ckrm_cpu_class_stat *stat,
                                            int new_share)
{
        if (!new_share)
                new_share = 1;

        BUG_ON(new_share < 0);
        stat->meshare = new_share;
}

/**
 * update_child_effective - update egrt, ehl, mehl for all children of parent
 * @parent: the parent node
 * return -1 if anything goes wrong
 */
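/*
 * Example of the arithmetic below: if the parent's egrt is CKRM_SHARE_MAX
 * and a child holds my_guarantee = 30 out of the parent's
 * total_guarantee = 100, the child's egrt becomes 30% of CKRM_SHARE_MAX;
 * its megrt is the fraction of that egrt corresponding to its own
 * unused_guarantee, and ehl/mehl are derived the same way from the hard
 * limits.
 */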
static int update_child_effective(struct ckrm_core_class *parent)
{
        struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent);
        struct ckrm_core_class *child_core;
        int ret = -1;

        if (! p_cls)
                return ret;

        child_core = ckrm_get_next_child(parent, NULL);
        while (child_core) {
                struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core);
                if (! c_cls)
                        return ret;

                c_cls->stat.egrt =
                    p_cls->stat.egrt *
                    c_cls->shares.my_guarantee / p_cls->shares.total_guarantee;

                c_cls->stat.megrt = c_cls->stat.egrt * c_cls->shares.unused_guarantee
                        / c_cls->shares.total_guarantee;

                c_cls->stat.ehl =
                    p_cls->stat.ehl *
                    get_hard_limit(c_cls) / p_cls->shares.total_guarantee;

                c_cls->stat.mehl =
                    c_cls->stat.ehl *
                    get_myhard_limit(c_cls) / c_cls->shares.total_guarantee;

                set_eshare(&c_cls->stat,c_cls->stat.egrt);
                set_meshare(&c_cls->stat,c_cls->stat.megrt);

                child_core = ckrm_get_next_child(parent, child_core);
        }
        return 0;
}

/**
 * update_effectives: update egrt, ehl, mehl for the whole tree
 * should be called only when the class structure has changed
 *
 * return -1 if anything goes wrong (e.g. the structure changed during the process)
 */
static int update_effectives(struct ckrm_core_class *root_core)
{
        struct ckrm_core_class *cur_core, *child_core;
        struct ckrm_cpu_class *cls;
        int ret = -1;

        cur_core = root_core;
        child_core = NULL;
        cls = ckrm_get_cpu_class(cur_core);

        //initialize the effectives for root
        cls->stat.egrt = CKRM_SHARE_MAX; /*egrt of the root is always 100% */
        cls->stat.megrt = cls->stat.egrt * cls->shares.unused_guarantee
                / cls->shares.total_guarantee;
        cls->stat.ehl = CKRM_SHARE_MAX * get_hard_limit(cls)
                / cls->shares.total_guarantee;
        cls->stat.mehl = cls->stat.ehl * get_myhard_limit(cls)
                / cls->shares.total_guarantee;
        set_eshare(&cls->stat,cls->stat.egrt);
        set_meshare(&cls->stat,cls->stat.megrt);

 repeat:
        //check exit
        if (!cur_core)
                return 0;

        //visit this node only once
        if (! child_core)
                if (update_child_effective(cur_core) < 0)
                        return ret; //invalid cur_core node

        //next child
        child_core = ckrm_get_next_child(cur_core, child_core);

        if (child_core) {
                //go down to the next hier
                cur_core = child_core;
                child_core = NULL;
        } else { //no more child, go back
                child_core = cur_core;
                cur_core = child_core->hnode.parent;
        }
        goto repeat;
}

/**********************************************/
/*          surplus allocation                */
/**********************************************/

/*
 * surplus = egrt - demand
 * if surplus < 0, surplus = 0
 */
static inline int get_node_surplus(struct ckrm_cpu_class *cls)
{
        int surplus = cls->stat.egrt - cls->stat.max_demand;

        if (surplus < 0)
                surplus = 0;

        return surplus;
}

static inline int get_my_node_surplus(struct ckrm_cpu_class *cls)
{
        int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat);

        if (surplus < 0)
                surplus = 0;

        return surplus;
}

/**
 * consume_surplus: decides how much surplus a node can consume
 * @check_sl: if set, also check the soft limit
 * return how much is consumed
 *
 * implements the CKRM scheduling requirements
 * assumes c_cls is valid
 */
static inline int consume_surplus(int surplus,
                                       struct ckrm_cpu_class *c_cls,
                                       struct ckrm_cpu_class *p_cls,
                                       int check_sl
                                       )
{
        int consumed = 0;
        int inc_limit;
        int total_grt = p_cls->shares.total_guarantee;

        BUG_ON(surplus < 0);

        /*can't consume more than demand or hard limit*/
        if (c_cls->stat.eshare >= c_cls->stat.max_demand)
                goto out;

        //the surplus allocation is proportional to grt
        consumed =
                surplus * c_cls->shares.my_guarantee / total_grt;

        if (! consumed) //no more share
                goto out;

        //hard limit and demand limit
        inc_limit = c_cls->stat.max_demand - c_cls->stat.eshare;

        if (check_sl) {
                int esl = p_cls->stat.eshare * get_soft_limit(c_cls)
                        /total_grt;
                if (esl < c_cls->stat.max_demand)
                        inc_limit = esl - c_cls->stat.eshare;
        }

        if (consumed > inc_limit)
                consumed = inc_limit;

        BUG_ON(consumed < 0);
 out:
        return consumed;
}
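
/*
 * Example of the allocation above: with a surplus of 1000 share units and a
 * child holding my_guarantee = 25 out of total_guarantee = 100, the child is
 * offered 250 units, but never more than its remaining demand headroom
 * (max_demand - eshare), or, when check_sl is set, its effective soft limit.
 */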

/*
 * how much can a node consume for itself?
 */
static inline int consume_self_surplus(int surplus,
                                       struct ckrm_cpu_class *p_cls,
                                       int check_sl
                                       )
{
        int consumed = 0;
        int inc_limit;
        int total_grt = p_cls->shares.total_guarantee;
        int max_demand = get_mmax_demand(&p_cls->stat);

        BUG_ON(surplus < 0);

        /*can't consume more than demand or hard limit*/
        if (p_cls->stat.meshare >= max_demand)
                goto out;

        //the surplus allocation is proportional to grt
        consumed =
                surplus * p_cls->shares.unused_guarantee / total_grt;

        if (! consumed) //no more share
                goto out;

        //hard limit and demand limit
        inc_limit = max_demand - p_cls->stat.meshare;

        if (check_sl) {
                int mesl = p_cls->stat.eshare * get_mysoft_limit(p_cls)
                        /total_grt;
                if (mesl < max_demand)
                        inc_limit = mesl - p_cls->stat.meshare;
        }

        if (consumed > inc_limit)
                consumed = inc_limit;

        BUG_ON(consumed < 0);
 out:
        return consumed;
}

/*
 * allocate surplus to all its children and also its default class
 */
static int alloc_surplus_single_round(
                                      int surplus,
                                      struct ckrm_core_class *parent,
                                      struct ckrm_cpu_class *p_cls,
                                      int check_sl)
{
        struct ckrm_cpu_class *c_cls;
        struct ckrm_core_class *child_core = NULL;
        int total_consumed = 0,consumed;

        //first allocate to the default class
        consumed  =
                consume_self_surplus(surplus,p_cls,check_sl);

        if (consumed > 0) {
                set_meshare(&p_cls->stat,p_cls->stat.meshare + consumed);
                total_consumed += consumed;
        }

        do {
                child_core = ckrm_get_next_child(parent, child_core);
                if (child_core)  {
                        c_cls = ckrm_get_cpu_class(child_core);
                        if (! c_cls)
                                return -1;

                        consumed    =
                                consume_surplus(surplus, c_cls,
                                                     p_cls,check_sl);
                        if (consumed > 0) {
                                set_eshare(&c_cls->stat,c_cls->stat.eshare + consumed);
                                total_consumed += consumed;
                        }
                }
        } while (child_core);

        return total_consumed;
}

/**
 * alloc_surplus_node: re-allocate the shares for children under parent
 * @parent: parent node
 * return 0 on success, -1 on error
 *
 * tasks:
 *  1. get the total surplus
 *  2. allocate the surplus
 *  3. set the effective_share of each node
 */
static int alloc_surplus_node(struct ckrm_core_class *parent)
{
        struct ckrm_cpu_class *p_cls,*c_cls;
        int total_surplus,consumed;
        int check_sl;
        int ret = -1;
        struct ckrm_core_class *child_core = NULL;

        p_cls = ckrm_get_cpu_class(parent);
        if (! p_cls)
                goto realloc_out;

        /*
         * get total surplus
         */
        total_surplus = p_cls->stat.eshare - p_cls->stat.egrt;
        BUG_ON(total_surplus < 0);
        total_surplus += get_my_node_surplus(p_cls);

        do {
                child_core = ckrm_get_next_child(parent, child_core);
                if (child_core) {
                        c_cls = ckrm_get_cpu_class(child_core);
                        if (! c_cls)
                                goto realloc_out;

                        total_surplus += get_node_surplus(c_cls);
                }
        } while (child_core);

        if (! total_surplus) {
                ret = 0;
                goto realloc_out;
        }

        /*
         * distribute the surplus:
         * first with check_sl enabled;
         * once all classes have reached their soft limits, disable check_sl
         * and try again
         */

        check_sl = 1;
        do {
                consumed = alloc_surplus_single_round(total_surplus,parent,p_cls,check_sl);
                if (consumed < 0) //something is wrong
                        goto realloc_out;

                if (! consumed)
                        check_sl = 0;
                else
                        total_surplus -= consumed;

        } while ((total_surplus > 0) && (consumed || check_sl) );

        ret = 0;

 realloc_out:
        return ret;
}

/**
 * alloc_surplus - reallocate unused shares
 *
 * a class's unused share should be allocated to its siblings
 * the re-allocation goes downward from the top
 */
static int alloc_surplus(struct ckrm_core_class *root_core)
{
        struct ckrm_core_class *cur_core, *child_core;
        //      struct ckrm_cpu_class *cls;
        int ret = -1;

        /*initialize*/
        cur_core = root_core;
        child_core = NULL;
        //      cls = ckrm_get_cpu_class(cur_core);

        /*the ckrm idle tasks get all that remains*/
        /*hzheng: uncomment the following line for hard limit support */
        //      update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand);

 repeat:
        //check exit
        if (!cur_core)
                return 0;

        //visit this node only once
        if (! child_core)
                if ( alloc_surplus_node(cur_core) < 0 )
                        return ret;

        //next child
        child_core = ckrm_get_next_child(cur_core, child_core);
        if (child_core) {
                //go down
                cur_core = child_core;
                child_core = NULL;
                goto repeat;
        } else {                //no more children, go back up
                child_core = cur_core;
                cur_core = child_core->hnode.parent;
        }
        goto repeat;
}

/**********************************************/
/*           CKRM Idle Tasks                  */
/**********************************************/
struct ckrm_cpu_class ckrm_idle_class_obj, *ckrm_idle_class;
struct task_struct* ckrm_idle_tasks[NR_CPUS];

/*how many ckrm idle tasks should I wakeup*/
static inline int get_nr_idle(unsigned long surplus)
{
        int cpu_online = cpus_weight(cpu_online_map);
        int nr_idle = 0;

        nr_idle = surplus * cpu_online;
        nr_idle >>= CKRM_SHARE_ACCURACY;

        if (surplus)
                nr_idle ++;

        if (nr_idle > cpu_online)
                nr_idle = cpu_online;

        return nr_idle;
}
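
/*
 * Example of the rounding above: with 4 CPUs online and a surplus of half
 * of CKRM_SHARE_MAX, nr_idle = 0.5 * 4 = 2, plus one for any non-zero
 * surplus, giving 3 idle tasks to wake; the result never exceeds the number
 * of online CPUs.
 */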

/**
 * update_ckrm_idle: update the status of the idle class according to the new surplus
 * surplus: new system surplus
 *
 * Task:
 * -- update share of the idle class
 * -- wakeup idle tasks according to surplus
 */
void update_ckrm_idle(unsigned long surplus)
{
        int nr_idle = get_nr_idle(surplus);
        int i;
        struct task_struct* idle_task;

        set_eshare(&ckrm_idle_class->stat,surplus);
        set_meshare(&ckrm_idle_class->stat,surplus);
        /*wake up nr_idle idle tasks*/
        for_each_online_cpu(i) {
                idle_task = ckrm_idle_tasks[i];
                if (! idle_task)
                        continue;
                if (unlikely(idle_task->cpu_class != ckrm_idle_class)) {
                        ckrm_cpu_change_class(idle_task,
                                              idle_task->cpu_class,
                                              ckrm_idle_class);
                }
                if (i < nr_idle) {
                        //activate it
                        wake_up_process(idle_task);
                } else {
                        //deactivate it
                        idle_task->state = TASK_INTERRUPTIBLE;
                        set_tsk_need_resched(idle_task);
                }
        }
}

static int ckrm_cpu_idled(void *nothing)
{
        set_user_nice(current,19);
        daemonize("ckrm_idle_task");

        //deactivate it, it will be awakened by ckrm_cpu_monitor
        current->state = TASK_INTERRUPTIBLE;
        schedule();

        /*similar to cpu_idle */
        while (1) {
                while (!need_resched()) {
                        ckrm_cpu_monitor(1);
                        if (current_cpu_data.hlt_works_ok) {
                                local_irq_disable();
                                if (!need_resched()) {
                                        set_tsk_need_resched(current);
                                        safe_halt();
                                } else
                                        local_irq_enable();
                        }
                }
                schedule();
        }
        return 0;
}

/**
 * ckrm_start_ckrm_idle:
 *  create the ckrm_idle_class and start the idle tasks
 */
void ckrm_start_ckrm_idle(void)
{
        int i;
        int ret;
        ckrm_shares_t shares;

        ckrm_idle_class = &ckrm_idle_class_obj;
        memset(ckrm_idle_class,0,sizeof(*ckrm_idle_class));
        /*don't care about the shares */
        init_cpu_class(ckrm_idle_class,&shares);
        printk(KERN_INFO "ckrm idle class %p created\n",ckrm_idle_class);

        for_each_online_cpu(i) {
                ret = kernel_thread(ckrm_cpu_idled, NULL, CLONE_KERNEL);

                /*warn on error, but the system should still work without it*/
                if (ret < 0)
                        printk(KERN_ERR "Warning: can't start ckrm idle tasks\n");
                else {
                        ckrm_idle_tasks[i] = find_task_by_pid(ret);
                        if (!ckrm_idle_tasks[i])
                                printk(KERN_ERR "Warning: can't find ckrm idle task %d\n",ret);
                }
        }
}


/**********************************************/
/*          Local Weight                      */
/**********************************************/
/**
 * adjust_lrq_weight: adjust the local weight for each cpu
 *
 * lrq->local_weight = lrq->lrq_load * class_weight / total_pressure
 */
static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online)
{
        unsigned long total_pressure = 0;
        ckrm_lrq_t* lrq;
        int i;
        unsigned long class_weight;
        unsigned long long lw;

        //get total pressure
        for_each_online_cpu(i) {
                lrq = get_ckrm_lrq(clsptr,i);
                total_pressure += lrq->lrq_load;
        }

#if 1
#warning "ACB taking out suspicious early return"
#else
        if (! total_pressure)
                return;
#endif

        class_weight = cpu_class_weight(clsptr) * cpu_online;

        /*
         * update the weight for each cpu, minimum is 1
         */
        for_each_online_cpu(i) {
                lrq = get_ckrm_lrq(clsptr,i);
                if (! lrq->lrq_load)
                        /*give an idle class a high share to boost interactiveness */
                        lw = cpu_class_weight(clsptr);
                else {
                        lw = lrq->lrq_load * class_weight;
                        do_div(lw,total_pressure);
                        if (!lw)
                                lw = 1;
                        else if (lw > CKRM_SHARE_MAX)
                                lw = CKRM_SHARE_MAX;
                }

                lrq->local_weight = lw;
        }
}
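
/*
 * Example of the weighting above: class_weight is the per-cpu class weight
 * times the number of online cpus, so with 2 cpus online and the class's
 * load split evenly between them, each local runqueue ends up with
 * local_weight equal to cpu_class_weight(clsptr); a cpu carrying all of the
 * load would get twice that, clamped to CKRM_SHARE_MAX.
 */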

/*
 * assume called with class_list_lock read lock held
 */
void adjust_local_weight(void)
{
        static spinlock_t lock = SPIN_LOCK_UNLOCKED;
        struct ckrm_cpu_class *clsptr;
        int cpu_online;

        //do nothing if someone is already holding the lock
        if (! spin_trylock(&lock))
                return;

        cpu_online = cpus_weight(cpu_online_map);

        //class status: demand, share, total_ns, prio, index
        list_for_each_entry(clsptr,&active_cpu_classes,links) {
                adjust_lrq_weight(clsptr,cpu_online);
        }

        spin_unlock(&lock);
}

/**********************************************/
/*          Main                              */
/**********************************************/
/**
 * ckrm_cpu_monitor - adjust relative shares of the classes based on their progress
 * @check_min: if set, the call is skipped if it is within 100ms of the last call
 *
 * this function is called every CPU_MONITOR_INTERVAL
 * it computes the cpu demand of each class
 * and re-allocates the unused shares to other classes
 */
void ckrm_cpu_monitor(int check_min)
{
        static spinlock_t lock = SPIN_LOCK_UNLOCKED;
        static unsigned long long last_check = 0;
        struct ckrm_core_class *root_core = get_default_cpu_class()->core;
        unsigned long long now;
#define MIN_CPU_MONITOR_INTERVAL 100000000UL

        if (!root_core)
                return;

        //do nothing if someone is already holding the lock
        if (! spin_trylock(&lock))
                return;

        read_lock(&class_list_lock);

        now = sched_clock();

        //consecutive checks should be at least 100ms apart
        if (check_min && ((now - last_check) < MIN_CPU_MONITOR_INTERVAL))
                goto outunlock;

        last_check = now;

        if (update_effectives(root_core) != 0)
                goto outunlock;

        if (update_max_demand(root_core) != 0)
                goto outunlock;

#ifndef ALLOC_SURPLUS_SUPPORT
#warning "MEF taking out alloc_surplus"
#else
        if (alloc_surplus(root_core) != 0)
                goto outunlock;
#endif

        adjust_local_weight();

 outunlock:
        read_unlock(&class_list_lock);
        spin_unlock(&lock);
}

/*****************************************************/
/*            Supporting Functions                   */
/*****************************************************/
static pid_t cpu_monitor_pid = -1;
static int thread_exit = 0;

static int ckrm_cpu_monitord(void *nothing)
{
        daemonize("ckrm_cpu_ctrld");
        for (;;) {
                /*sleep for some time before the next try*/
                set_current_state(TASK_INTERRUPTIBLE);
                schedule_timeout(CPU_MONITOR_INTERVAL);
                ckrm_cpu_monitor(1);
                if (thread_exit) {
                        break;
                }
        }
        cpu_monitor_pid = -1;
        thread_exit = 2;
        printk(KERN_DEBUG "cpu_monitord exit\n");
        return 0;
}

void ckrm_start_monitor(void)
{
        cpu_monitor_pid = kernel_thread(ckrm_cpu_monitord, NULL, CLONE_KERNEL);
        if (cpu_monitor_pid < 0) {
                printk(KERN_DEBUG "starting ckrm_cpu_monitord failed\n");
        }
}

void ckrm_kill_monitor(void)
{
        printk(KERN_DEBUG "killing process %d\n", cpu_monitor_pid);
        if (cpu_monitor_pid > 0) {
                thread_exit = 1;
                while (thread_exit != 2) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        schedule_timeout(CPU_MONITOR_INTERVAL);
                }
        }
}

int ckrm_cpu_monitor_init(void)
{
        ckrm_start_monitor();
        /*hzheng: uncomment the following line for hard limit support */
        //      ckrm_start_ckrm_idle();
        return 0;
}

void ckrm_cpu_monitor_exit(void)
{
        ckrm_kill_monitor();
}

module_init(ckrm_cpu_monitor_init);
module_exit(ckrm_cpu_monitor_exit);

MODULE_AUTHOR("Haoqiang Zheng <hzheng@cs.columbia.edu>");
MODULE_DESCRIPTION("Hierarchical CKRM CPU Resource Monitor");
MODULE_LICENSE("GPL");