/* ckrm_cpu_monitor.c - Hierarchical CKRM CPU Resource Monitor
 *
 * Copyright (C) Haoqiang Zheng,  IBM Corp. 2004
 *           (C) Hubertus Franke, IBM Corp. 2004
 *
 * Latest version, more details at http://ckrm.sf.net
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

/* Changes
 *
 * 23 June 2004: Created
 *
 */
#include <linux/module.h>
#include <linux/init.h>
#include <asm/errno.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/ckrm.h>
#include <linux/ckrm_rc.h>
#include <linux/ckrm_tc.h>
#include <asm/div64.h>
#include <linux/ckrm_sched.h>

#define CPU_MONITOR_INTERVAL (HZ) /*how often do we adjust the shares*/
#define CKRM_SHARE_MAX (1<<CKRM_SHARE_ACCURACY)

#define CKRM_CPU_DEMAND_RUN 0
#define CKRM_CPU_DEMAND_SLEEP 1
//cpu demand recalc intervals, in ns: every 64ms for a task, every 256ms for a class
#define CPU_DEMAND_TASK_RECALC  (64000000LL)
#define CPU_DEMAND_CLASS_RECALC (256000000LL)
#define CPU_DEMAND_TP_CLASS 0
#define CPU_DEMAND_TP_TASK 1
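/*
 * Units: shares and cpu demand below are fixed-point values with
 * CKRM_SHARE_ACCURACY fractional bits, so CKRM_SHARE_MAX
 * (1 << CKRM_SHARE_ACCURACY) stands for 100% of the cpus (it is what the
 * root class's effective guarantee is initialized to in update_effectives()).
 */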

extern struct ckrm_cpu_class *ckrm_get_cpu_class(struct ckrm_core_class *core);
void update_ckrm_idle(unsigned long surplus);

/*interface to share definition*/
static inline int get_soft_limit(struct ckrm_cpu_class *cls)
{
        return cls->shares.my_limit;
}

static inline int get_mysoft_limit(struct ckrm_cpu_class *cls)
{
        return cls->shares.total_guarantee;
}

static inline int get_hard_limit(struct ckrm_cpu_class *cls)
{
        return cls->shares.total_guarantee;
}

static inline int get_myhard_limit(struct ckrm_cpu_class *cls)
{
        return cls->shares.total_guarantee;
}

static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, int type)
{
        unsigned long long now = sched_clock();

        local_stat->run = 0;
        local_stat->total = 0;
        local_stat->last_sleep = now;
        switch (type) {
        case CPU_DEMAND_TP_CLASS:
                local_stat->recalc_interval = CPU_DEMAND_CLASS_RECALC;
                local_stat->cpu_demand = 0;
                break;
        case CPU_DEMAND_TP_TASK:
                local_stat->recalc_interval = CPU_DEMAND_TASK_RECALC;
                //for a task, the initial cpu_demand is copied from its parent
                break;
        default:
                BUG();
        }
}

void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat)
{
        int i;

        stat->stat_lock = SPIN_LOCK_UNLOCKED;
        stat->total_ns = 0;
        stat->max_demand = 0;

        for (i = 0; i < NR_CPUS; i++) {
                cpu_demand_stat_init(&stat->local_stats[i], CPU_DEMAND_TP_CLASS);
        }

        stat->egrt = 0;
        stat->megrt = 0;
        stat->ehl = CKRM_SHARE_MAX; /*default: no limit*/
        stat->mehl = CKRM_SHARE_MAX; /*default: no limit*/

        stat->eshare = CKRM_SHARE_MAX;
        stat->meshare = CKRM_SHARE_MAX;
}

/**********************************************/
/*          cpu demand                        */
/**********************************************/

/*
 * How CPU demand is calculated:
 * consider the class local runqueue (clr) first
 * at any time, a clr can be in one of the following three states
 * -- run: a task belonging to this class is running on this cpu
 * -- wait: at least one of its tasks is runnable, but the class is not running
 * -- sleep: none of the tasks of this class are runnable
 *
 * cpu_demand(t1,t2) = r(t1,t2)/(r(t1,t2)+s(t1,t2))
 *
 * the cpu_demand of a class =
 *    sum of cpu_demand of all the class local runqueues
 */
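/*
 * Worked example: over one 256ms class recalc window a clr runs for 64ms and
 * sleeps for 192ms, so cpu_demand = 64/(64+192) = 25%, which the code stores
 * as CKRM_SHARE_MAX/4 (see update_cpu_demand_stat below); the per-class
 * figure is the sum over the online cpus divided by their count
 * (get_self_cpu_demand()).
 */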

/**
 * update_cpu_demand_stat - accumulate run/sleep time and recompute cpu demand
 *
 * should be called whenever the state of a task / class local queue changes
 * -- on deschedule: report how long the task ran
 * -- on enqueue: report how long it slept
 *
 * lengths are in ns; the demand is recalculated once recalc_interval ns
 * have been accumulated
 */
static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat, int state, unsigned long long len)
{
        local_stat->total += len;
        if (state == CKRM_CPU_DEMAND_RUN)
                local_stat->run += len;

        if (local_stat->total >= local_stat->recalc_interval) {
                local_stat->total >>= CKRM_SHARE_ACCURACY;
                if (unlikely(local_stat->run > 0xFFFFFFFF))
                        local_stat->run = 0xFFFFFFFF;

                if (local_stat->total > 0xFFFFFFFF)
                        local_stat->total = 0xFFFFFFFF;

                do_div(local_stat->run, (unsigned long)local_stat->total);

                if (local_stat->total > 0xFFFFFFFF) //happens after a very long sleep
                        local_stat->cpu_demand = local_stat->run;
                else {
                        local_stat->cpu_demand += local_stat->run;
                        local_stat->cpu_demand >>= 1;
                }
                local_stat->total = 0;
                local_stat->run = 0;
        }
}
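/*
 * How the fixed point works above: shifting total right by
 * CKRM_SHARE_ACCURACY before the divide makes run/total come out already
 * scaled by 2^CKRM_SHARE_ACCURACY, i.e. as a fraction of CKRM_SHARE_MAX
 * (run and total are clamped to 32 bits first since do_div() takes a 32-bit
 * divisor).  The new sample is folded into the previous estimate as
 * (old + new)/2, a simple moving average: e.g. a 64ms-run/256ms-window
 * sample of CKRM_SHARE_MAX/4 combined with a previous estimate of
 * CKRM_SHARE_MAX/2 yields 3*CKRM_SHARE_MAX/8.
 */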

/**
 * cpu_demand_event - a cpu_demand event occurred
 * @event: one of the following events:
 *   CPU_DEMAND_ENQUEUE: local class enqueue
 *   CPU_DEMAND_DEQUEUE: local class dequeue
 *   CPU_DEMAND_DESCHEDULE: a task belonging to this local class was descheduled
 * @len: valid only for CPU_DEMAND_DESCHEDULE, how long the task ran
 */
void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len)
{
        switch (event) {
        case CPU_DEMAND_ENQUEUE:
                len = sched_clock() - local_stat->last_sleep;
                local_stat->last_sleep = 0;
                update_cpu_demand_stat(local_stat, CKRM_CPU_DEMAND_SLEEP, len);
                break;
        case CPU_DEMAND_DEQUEUE:
                if (!local_stat->last_sleep) {
                        local_stat->last_sleep = sched_clock();
                }
                break;
        case CPU_DEMAND_DESCHEDULE:
                update_cpu_demand_stat(local_stat, CKRM_CPU_DEMAND_RUN, len);
                break;
        case CPU_DEMAND_INIT: //for task init only
                cpu_demand_stat_init(local_stat, CPU_DEMAND_TP_TASK);
                break;
        default:
                BUG();
        }
}

/**
 * cpu_demand_check_sleep - check one class local queue for a long sleep
 *
 * deals with excessively long run/sleep states
 * -- whenever ckrm_cpu_monitor is called, check if the class is in the sleep
 *    state; if so, update the sleep record
 */
static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu)
{
        struct ckrm_cpu_demand_stat *local_stat = &stat->local_stats[cpu];
        unsigned long long sleep, now;
        if (local_stat->last_sleep) {
                now = sched_clock();
                sleep = now - local_stat->last_sleep;
                local_stat->last_sleep = now;
                update_cpu_demand_stat(local_stat, CKRM_CPU_DEMAND_SLEEP, sleep);
        }
}

/**
 * get_self_cpu_demand - get cpu demand of the class itself (excluding children)
 *
 * self_cpu_demand = sum(cpu demand of all local queues) / nr_online_cpus
 */
static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat)
{
        int cpu_demand = 0;
        int i;
        int cpuonline = 0;

        for_each_online_cpu(i) {
                cpu_demand_check_sleep(stat, i);
                cpu_demand += stat->local_stats[i].cpu_demand;
                cpuonline++;
        }

        return (cpu_demand / cpuonline);
}

/*
 * my max demand = min(cpu_demand, my effective hard limit)
 */
static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat)
{
        unsigned long mmax_demand = get_self_cpu_demand(stat);
        if (mmax_demand > stat->mehl)
                mmax_demand = stat->mehl;

        return mmax_demand;
}

/**
 * update_max_demand: update effective cpu demand for each class
 * return -1 on error
 *
 * Assume: the root_core->parent == NULL
 */
static int update_max_demand(struct ckrm_core_class *root_core)
{
        struct ckrm_core_class *cur_core, *child_core;
        struct ckrm_cpu_class *cls, *c_cls;
        int ret = -1;

        cur_core = root_core;
        child_core = NULL;

 repeat:
        if (!cur_core) { //normal exit
                ret = 0;
                goto out;
        }

        cls = ckrm_get_cpu_class(cur_core);
        if (!cls) //invalid cls, abort
                goto out;

        if (!child_core)        //first child
                cls->stat.max_demand = get_mmax_demand(&cls->stat);
        else {
                c_cls = ckrm_get_cpu_class(child_core);
                if (c_cls)
                        cls->stat.max_demand += c_cls->stat.max_demand;
                else //invalid c_cls, abort
                        goto out;
        }

        //check class hard limit
        if (cls->stat.max_demand > cls->stat.ehl)
                cls->stat.max_demand = cls->stat.ehl;

        //next child
        child_core = ckrm_get_next_child(cur_core, child_core);
        if (child_core) {
                //go down
                cur_core = child_core;
                child_core = NULL;
                goto repeat;
        } else {                //no more child, go back
                child_core = cur_core;
                cur_core = child_core->hnode.parent;
        }
        goto repeat;
 out:
        return ret;
}
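/*
 * Traversal note: update_max_demand() (like update_effectives() and
 * alloc_surplus() below) walks the class tree iteratively with
 * ckrm_get_next_child()/hnode.parent rather than recursing.  On the way back
 * up, each child's already-accumulated max_demand is added to its parent, so
 * a class ends up with its own demand plus that of all its descendants,
 * capped by its effective hard limit (ehl).
 */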

/**********************************************/
/*          effective guarantee & limit       */
/**********************************************/
static inline void set_eshare(struct ckrm_cpu_class_stat *stat,
                              int new_share)
{
        if (!new_share)
                new_share = 1;

        BUG_ON(new_share < 0);
        stat->eshare = new_share;
}

static inline void set_meshare(struct ckrm_cpu_class_stat *stat,
                               int new_share)
{
        if (!new_share)
                new_share = 1;

        BUG_ON(new_share < 0);
        stat->meshare = new_share;
}

/**
 * update_child_effective - update egrt, ehl, mehl for all children of parent
 * @parent: the parent node
 * return -1 if anything goes wrong
 *
 */
static int update_child_effective(struct ckrm_core_class *parent)
{
        struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent);
        struct ckrm_core_class *child_core;
        int ret = -1;

        if (!p_cls)
                return ret;

        child_core = ckrm_get_next_child(parent, NULL);
        while (child_core) {
                struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core);
                if (!c_cls)
                        return ret;

                c_cls->stat.egrt =
                    p_cls->stat.egrt *
                    c_cls->shares.my_guarantee / p_cls->shares.total_guarantee;

                c_cls->stat.megrt = c_cls->stat.egrt * c_cls->shares.unused_guarantee
                        / c_cls->shares.total_guarantee;

                c_cls->stat.ehl =
                    p_cls->stat.ehl *
                    get_hard_limit(c_cls) / p_cls->shares.total_guarantee;

                c_cls->stat.mehl =
                    c_cls->stat.ehl *
                    get_myhard_limit(c_cls) / c_cls->shares.total_guarantee;

                set_eshare(&c_cls->stat, c_cls->stat.egrt);
                set_meshare(&c_cls->stat, c_cls->stat.megrt);

                child_core = ckrm_get_next_child(parent, child_core);
        }
        return 0;
}
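/*
 * Example of the propagation above: if the parent's egrt is CKRM_SHARE_MAX
 * (100%) with total_guarantee 100, a child with my_guarantee 30 gets
 * egrt = 30% of CKRM_SHARE_MAX; if that child's own shares have
 * unused_guarantee 50 out of total_guarantee 100, its megrt (effectively the
 * portion reserved for the class's own tasks) is half of that again, i.e.
 * 15%.  ehl and mehl follow the same pattern using the hard limit accessors.
 */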

/**
 * update_effectives: update egrt, ehl, mehl for the whole tree
 * should be called only when the class structure has changed
 *
 * return -1 if anything goes wrong (eg: the structure changed during the process)
 */
static int update_effectives(struct ckrm_core_class *root_core)
{
        struct ckrm_core_class *cur_core, *child_core;
        struct ckrm_cpu_class *cls;
        int ret = -1;

        cur_core = root_core;
        child_core = NULL;
        cls = ckrm_get_cpu_class(cur_core);

        //initialize the effectives for root
        cls->stat.egrt = CKRM_SHARE_MAX; /*egrt of the root is always 100% */
        cls->stat.megrt = cls->stat.egrt * cls->shares.unused_guarantee
                / cls->shares.total_guarantee;
        cls->stat.ehl = CKRM_SHARE_MAX * get_hard_limit(cls)
                / cls->shares.total_guarantee;
        cls->stat.mehl = cls->stat.ehl * get_myhard_limit(cls)
                / cls->shares.total_guarantee;
        set_eshare(&cls->stat, cls->stat.egrt);
        set_meshare(&cls->stat, cls->stat.megrt);

 repeat:
        //check exit
        if (!cur_core)
                return 0;

        //visit this node only once
        if (!child_core)
                if (update_child_effective(cur_core) < 0)
                        return ret; //invalid cur_core node

        //next child
        child_core = ckrm_get_next_child(cur_core, child_core);

        if (child_core) {
                //go down to the next hier
                cur_core = child_core;
                child_core = NULL;
        } else { //no more child, go back
                child_core = cur_core;
                cur_core = child_core->hnode.parent;
        }
        goto repeat;
}

/**********************************************/
/*          surplus allocation                */
/**********************************************/

/*
 * surplus = egrt - demand
 * if surplus < 0, surplus = 0
 */
static inline int get_node_surplus(struct ckrm_cpu_class *cls)
{
        int surplus = cls->stat.egrt - cls->stat.max_demand;

        if (surplus < 0)
                surplus = 0;

        return surplus;
}

static inline int get_my_node_surplus(struct ckrm_cpu_class *cls)
{
        int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat);

        if (surplus < 0)
                surplus = 0;

        return surplus;
}

/**
 * consume_surplus: decides how much surplus a node can consume
 * @check_sl: if set, also check the soft limit
 * return how much was consumed
 *
 * implements the CKRM scheduling requirements
 * assume c_cls is valid
 */
static inline int consume_surplus(int surplus,
                                  struct ckrm_cpu_class *c_cls,
                                  struct ckrm_cpu_class *p_cls,
                                  int check_sl
                                  )
{
        int consumed = 0;
        int inc_limit;
        int total_grt = p_cls->shares.total_guarantee;

        BUG_ON(surplus < 0);

        /*can't consume more than demand or hard limit*/
        if (c_cls->stat.eshare >= c_cls->stat.max_demand)
                goto out;

        //the surplus allocation is proportional to grt
        consumed =
                surplus * c_cls->shares.my_guarantee / total_grt;

        if (!consumed) //no more share
                goto out;

        //hard limit and demand limit
        inc_limit = c_cls->stat.max_demand - c_cls->stat.eshare;

        if (check_sl) {
                int esl = p_cls->stat.eshare * get_soft_limit(c_cls)
                        / total_grt;
                if (esl < c_cls->stat.max_demand)
                        inc_limit = esl - c_cls->stat.eshare;
        }

        if (consumed > inc_limit)
                consumed = inc_limit;

        BUG_ON(consumed < 0);
 out:
        return consumed;
}
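/*
 * Example: with surplus S to hand out, a child holding my_guarantee 25 out
 * of the parent's total_guarantee 100 is offered S/4; the grant is then
 * clipped so eshare never exceeds the child's max_demand (or, while check_sl
 * is set, its effective soft limit p_eshare * my_limit / total_guarantee).
 */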

/*
 * how much can a node consume for itself?
 */
static inline int consume_self_surplus(int surplus,
                                       struct ckrm_cpu_class *p_cls,
                                       int check_sl
                                       )
{
        int consumed = 0;
        int inc_limit;
        int total_grt = p_cls->shares.total_guarantee;
        int max_demand = get_mmax_demand(&p_cls->stat);

        BUG_ON(surplus < 0);

        /*can't consume more than demand or hard limit*/
        if (p_cls->stat.meshare >= max_demand)
                goto out;

        //the surplus allocation is proportional to grt
        consumed =
                surplus * p_cls->shares.unused_guarantee / total_grt;

        if (!consumed) //no more share
                goto out;

        //hard limit and demand limit
        inc_limit = max_demand - p_cls->stat.meshare;

        if (check_sl) {
                int mesl = p_cls->stat.eshare * get_mysoft_limit(p_cls)
                        / total_grt;
                if (mesl < max_demand)
                        inc_limit = mesl - p_cls->stat.meshare;
        }

        if (consumed > inc_limit)
                consumed = inc_limit;

        BUG_ON(consumed < 0);
 out:
        return consumed;
}

/*
 * allocate surplus to all its children and also its default class
 */
static int alloc_surplus_single_round(
                                      int surplus,
                                      struct ckrm_core_class *parent,
                                      struct ckrm_cpu_class *p_cls,
                                      int check_sl)
{
        struct ckrm_cpu_class *c_cls;
        struct ckrm_core_class *child_core = NULL;
        int total_consumed = 0, consumed;

        //first allocate to the default class
        consumed =
                consume_self_surplus(surplus, p_cls, check_sl);

        if (consumed > 0) {
                set_meshare(&p_cls->stat, p_cls->stat.meshare + consumed);
                total_consumed += consumed;
        }

        do {
                child_core = ckrm_get_next_child(parent, child_core);
                if (child_core) {
                        c_cls = ckrm_get_cpu_class(child_core);
                        if (!c_cls)
                                return -1;

                        consumed =
                                consume_surplus(surplus, c_cls,
                                                p_cls, check_sl);
                        if (consumed > 0) {
                                set_eshare(&c_cls->stat, c_cls->stat.eshare + consumed);
                                total_consumed += consumed;
                        }
                }
        } while (child_core);

        return total_consumed;
}

/**
 * alloc_surplus_node: re-allocate the shares for children under parent
 * @parent: parent node
 * return 0 on success, -1 on error
 *
 * tasks:
 *  1. get total surplus
 *  2. allocate surplus
 *  3. set the effective_share of each node
 */
static int alloc_surplus_node(struct ckrm_core_class *parent)
{
        struct ckrm_cpu_class *p_cls, *c_cls;
        int total_surplus, consumed;
        int check_sl;
        int ret = -1;
        struct ckrm_core_class *child_core = NULL;

        p_cls = ckrm_get_cpu_class(parent);
        if (!p_cls)
                goto realloc_out;

        /*
         * get total surplus
         */
        total_surplus = p_cls->stat.eshare - p_cls->stat.egrt;
        BUG_ON(total_surplus < 0);
        total_surplus += get_my_node_surplus(p_cls);

        do {
                child_core = ckrm_get_next_child(parent, child_core);
                if (child_core) {
                        c_cls = ckrm_get_cpu_class(child_core);
                        if (!c_cls)
                                goto realloc_out;

                        total_surplus += get_node_surplus(c_cls);
                }
        } while (child_core);

        if (!total_surplus) {
                ret = 0;
                goto realloc_out;
        }

        /*
         * distribute the surplus
         * first with check_sl enabled
         * once all the classes have reached their soft limit, disable check_sl and try again
         */

        check_sl = 1;
        do {
                consumed = alloc_surplus_single_round(total_surplus, parent, p_cls, check_sl);
                if (consumed < 0) //something is wrong
                        goto realloc_out;

                if (!consumed)
                        check_sl = 0;
                else
                        total_surplus -= consumed;

        } while ((total_surplus > 0) && (consumed || check_sl));

        ret = 0;

 realloc_out:
        return ret;
}
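/*
 * The distribution above runs in rounds, subtracting whatever each round
 * hands out from total_surplus.  While check_sl is set every grant is capped
 * by the soft limits; a round that consumes nothing clears check_sl.  Note
 * that the while condition then sees both consumed and check_sl as zero and
 * exits, so any surplus still left at that point stays unallocated until the
 * next monitor pass recomputes it.
 */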

/**
 * alloc_surplus - reallocate unused shares
 *
 * class A's unused share should be allocated to its siblings
 * the re-allocation goes downward from the top
 */
static int alloc_surplus(struct ckrm_core_class *root_core)
{
        struct ckrm_core_class *cur_core, *child_core;
        //      struct ckrm_cpu_class *cls;
        int ret = -1;

        /*initialize*/
        cur_core = root_core;
        child_core = NULL;
        //      cls = ckrm_get_cpu_class(cur_core);

        /*the ckrm idle tasks get all that remains*/
        /*hzheng: uncomment the following line for hard limit support */
        //      update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand);

 repeat:
        //check exit
        if (!cur_core)
                return 0;

        //visit this node only once
        if (!child_core)
                if (alloc_surplus_node(cur_core) < 0)
                        return ret;

        //next child
        child_core = ckrm_get_next_child(cur_core, child_core);
        if (child_core) {
                //go down
                cur_core = child_core;
                child_core = NULL;
                goto repeat;
        } else {                //no more child, go back
                child_core = cur_core;
                cur_core = child_core->hnode.parent;
        }
        goto repeat;
}

/**********************************************/
/*           CKRM Idle Tasks                  */
/**********************************************/
struct ckrm_cpu_class ckrm_idle_class_obj, *ckrm_idle_class;
struct task_struct* ckrm_idle_tasks[NR_CPUS];

/*how many ckrm idle tasks should be woken up*/
static inline int get_nr_idle(unsigned long surplus)
{
        int cpu_online = cpus_weight(cpu_online_map);
        int nr_idle = 0;

        nr_idle = surplus * cpu_online;
        nr_idle >>= CKRM_SHARE_ACCURACY;

        if (surplus)
                nr_idle++;

        if (nr_idle > cpu_online)
                nr_idle = cpu_online;

        return nr_idle;
}
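/*
 * Example: surplus is a CKRM_SHARE_ACCURACY fixed-point fraction, so on a
 * 4-cpu box a surplus of CKRM_SHARE_MAX/2 gives nr_idle = 2, plus one
 * because some surplus exists, i.e. 3 idle tasks are woken; the result is
 * always clamped to the number of online cpus.
 */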

/**
 * update_ckrm_idle: update the status of the idle class according to the new surplus
 * surplus: new system surplus
 *
 * Tasks:
 * -- update the share of the idle class
 * -- wake up idle tasks according to the surplus
 */
void update_ckrm_idle(unsigned long surplus)
{
        int nr_idle = get_nr_idle(surplus);
        int i;
        struct task_struct* idle_task;

        set_eshare(&ckrm_idle_class->stat, surplus);
        set_meshare(&ckrm_idle_class->stat, surplus);
        /*wake up nr_idle idle tasks*/
        for_each_online_cpu(i) {
                idle_task = ckrm_idle_tasks[i];
                if (!idle_task)
                        continue;
                if (unlikely(idle_task->cpu_class != ckrm_idle_class)) {
                        ckrm_cpu_change_class(idle_task,
                                              idle_task->cpu_class,
                                              ckrm_idle_class);
                }
                if (i < nr_idle) {
                        //activate it
                        wake_up_process(idle_task);
                } else {
                        //deactivate it
                        idle_task->state = TASK_INTERRUPTIBLE;
                        set_tsk_need_resched(idle_task);
                }
        }
}

static int ckrm_cpu_idled(void *nothing)
{
        set_user_nice(current, 19);
        daemonize("ckrm_idle_task");

        //deactivate it, it will be awakened by ckrm_cpu_monitor
        current->state = TASK_INTERRUPTIBLE;
        schedule();

        /*similar to cpu_idle */
        while (1) {
                while (!need_resched()) {
                        ckrm_cpu_monitor(1);
                        if (current_cpu_data.hlt_works_ok) {
                                local_irq_disable();
                                if (!need_resched()) {
                                        set_tsk_need_resched(current);
                                        safe_halt();
                                } else
                                        local_irq_enable();
                        }
                }
                schedule();
        }
        return 0;
}

/**
 * ckrm_start_ckrm_idle:
 *  create the ckrm_idle_class and start the idle tasks
 *
 */
void ckrm_start_ckrm_idle(void)
{
        int i;
        int ret;
        ckrm_shares_t shares;

        ckrm_idle_class = &ckrm_idle_class_obj;
        memset(ckrm_idle_class, 0, sizeof(*ckrm_idle_class));
        /*don't care about the shares */
        init_cpu_class(ckrm_idle_class, &shares);
        printk(KERN_INFO "ckrm idle class %p created\n", ckrm_idle_class);

        for_each_online_cpu(i) {
                ret = kernel_thread(ckrm_cpu_idled, NULL, CLONE_KERNEL);

                /*warn on error, but the system should still work without it*/
                if (ret < 0)
                        printk(KERN_ERR "Warn: can't start ckrm idle tasks\n");
                else {
                        ckrm_idle_tasks[i] = find_task_by_pid(ret);
                        if (!ckrm_idle_tasks[i])
                                printk(KERN_ERR "Warn: can't find ckrm idle task %d\n", ret);
                }
        }
}

/**********************************************/
/*          Local Weight                      */
/**********************************************/
/**
 * adjust_lrq_weight: adjust the local weight for each cpu
 *
 * lrq->local_weight = lrq->lrq_load * class_weight / total_pressure
 * (class_weight is the class's weight scaled by the number of online cpus)
 */
static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online)
{
        unsigned long total_pressure = 0;
        ckrm_lrq_t* lrq;
        int i;
        unsigned long class_weight;
        unsigned long long lw;

        //get total pressure
        for_each_online_cpu(i) {
                lrq = get_ckrm_lrq(clsptr, i);
                total_pressure += lrq->lrq_load;
        }

        if (!total_pressure)
                return;

        class_weight = cpu_class_weight(clsptr) * cpu_online;

        /*
         * update the weight for each cpu, minimum is 1
         */
        for_each_online_cpu(i) {
                lrq = get_ckrm_lrq(clsptr, i);
                if (!lrq->lrq_load)
                        /*give an idle class a high share to boost interactiveness */
                        lw = cpu_class_weight(clsptr);
                else {
                        lw = lrq->lrq_load * class_weight;
                        do_div(lw, total_pressure);
                        if (!lw)
                                lw = 1;
                        else if (lw > CKRM_SHARE_MAX)
                                lw = CKRM_SHARE_MAX;
                }

                lrq->local_weight = lw;
        }
}
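/*
 * Example: with 4 cpus online, a class of weight W whose load on cpu0 is 300
 * out of a total pressure of 1000 gets local_weight = 300 * (W * 4) / 1000,
 * i.e. 1.2 * W on cpu0, clamped to the range [1, CKRM_SHARE_MAX]; cpus where
 * the class currently has no load keep W so an idle class stays responsive.
 */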

/*
 * assume called with the class_list_lock read lock held
 */
void adjust_local_weight(void)
{
        static spinlock_t lock = SPIN_LOCK_UNLOCKED;
        struct ckrm_cpu_class *clsptr;
        int cpu_online;

        //do nothing if someone is already holding the lock
        if (!spin_trylock(&lock))
                return;

        cpu_online = cpus_weight(cpu_online_map);

        //class status: demand, share, total_ns, prio, index
        list_for_each_entry(clsptr, &active_cpu_classes, links) {
                adjust_lrq_weight(clsptr, cpu_online);
        }

        spin_unlock(&lock);
}

/**********************************************/
/*          Main                              */
/**********************************************/
/**
 * ckrm_cpu_monitor - adjust relative shares of the classes based on their progress
 * @check_min: if check_min is set, the call can't be within 100ms of the last call
 *
 * this function is called every CPU_MONITOR_INTERVAL
 * it computes the cpu demand of each class
 * and re-allocates the unused shares to other classes
 */
void ckrm_cpu_monitor(int check_min)
{
        static spinlock_t lock = SPIN_LOCK_UNLOCKED;
        static unsigned long long last_check = 0;
        struct ckrm_core_class *root_core = get_default_cpu_class()->core;
        unsigned long long now;
#define MIN_CPU_MONITOR_INTERVAL 100000000UL

        if (!root_core)
                return;

        //do nothing if someone is already holding the lock
        if (!spin_trylock(&lock))
                return;

        read_lock(&class_list_lock);

        now = sched_clock();

        //consecutive checks should be at least 100ms apart
        if (check_min && ((now - last_check) < MIN_CPU_MONITOR_INTERVAL))
                goto outunlock;

        last_check = now;

        if (update_effectives(root_core) != 0)
                goto outunlock;

        if (update_max_demand(root_core) != 0)
                goto outunlock;

#ifndef ALLOC_SURPLUS_SUPPORT
#warning "MEF taking out alloc_surplus"
#else
        if (alloc_surplus(root_core) != 0)
                goto outunlock;
#endif

        adjust_local_weight();

 outunlock:
        read_unlock(&class_list_lock);
        spin_unlock(&lock);
}
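/*
 * Per-pass order of operations: update_effectives() refreshes the per-class
 * guarantees and limits, update_max_demand() refreshes measured demand,
 * alloc_surplus() (only when ALLOC_SURPLUS_SUPPORT is defined) redistributes
 * unused guarantees, and adjust_local_weight() finally turns the class
 * shares into per-cpu run-queue weights.  MIN_CPU_MONITOR_INTERVAL is
 * 100000000 ns, i.e. the 100ms spacing mentioned above.
 */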

/*****************************************************/
/*            Supporting Functions                   */
/*****************************************************/
static pid_t cpu_monitor_pid = -1;
static int thread_exit = 0;

static int ckrm_cpu_monitord(void *nothing)
{
        daemonize("ckrm_cpu_ctrld");
        for (;;) {
                /*sleep for some time before the next try*/
                set_current_state(TASK_INTERRUPTIBLE);
                schedule_timeout(CPU_MONITOR_INTERVAL);
                ckrm_cpu_monitor(1);
                if (thread_exit) {
                        break;
                }
        }
        cpu_monitor_pid = -1;
        thread_exit = 2;
        printk(KERN_DEBUG "cpu_monitord exit\n");
        return 0;
}

void ckrm_start_monitor(void)
{
        cpu_monitor_pid = kernel_thread(ckrm_cpu_monitord, NULL, CLONE_KERNEL);
        if (cpu_monitor_pid < 0) {
                printk(KERN_DEBUG "ckrm_cpu_monitord failed\n");
        }
}

void ckrm_kill_monitor(void)
{
        printk(KERN_DEBUG "killing process %d\n", cpu_monitor_pid);
        if (cpu_monitor_pid > 0) {
                thread_exit = 1;
                while (thread_exit != 2) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        schedule_timeout(CPU_MONITOR_INTERVAL);
                }
        }
}

int ckrm_cpu_monitor_init(void)
{
        ckrm_start_monitor();
        /*hzheng: uncomment the following line for hard limit support */
        //      ckrm_start_ckrm_idle();
        return 0;
}

void ckrm_cpu_monitor_exit(void)
{
        ckrm_kill_monitor();
}

module_init(ckrm_cpu_monitor_init);
module_exit(ckrm_cpu_monitor_exit);

MODULE_AUTHOR("Haoqiang Zheng <hzheng@cs.columbia.edu>");
MODULE_DESCRIPTION("Hierarchical CKRM CPU Resource Monitor");
MODULE_LICENSE("GPL");