1 /* ckrm_cpu_monitor.c - Hierarchical CKRM CPU Resource Monitor
2  *
3  * Copyright (C) Haoqiang Zheng,  IBM Corp. 2004
4  *           (C) Hubertus Franke, IBM Corp. 2004
5  * 
6  * Latest version, more details at http://ckrm.sf.net
7  * 
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  */
14
15 /* Changes
16  * 
17  * 23 June 2004: Created
18  * 
19  */
20 #include <linux/module.h>
21 #include <linux/init.h>
22 #include <asm/errno.h>
23 #include <linux/list.h>
24 #include <linux/spinlock.h>
25 #include <linux/ckrm.h>
26 #include <linux/ckrm_rc.h>
27 #include <linux/ckrm_tc.h>
28 #include <asm/div64.h>
29 #include <linux/ckrm_sched.h>
30
31 #define CPU_MONITOR_INTERVAL (HZ) /*how often do we adjust the shares*/
32 #define CKRM_SHARE_MAX (1<<CKRM_SHARE_ACCURACY)
33
34 #define CKRM_CPU_DEMAND_RUN 0
35 #define CKRM_CPU_DEMAND_SLEEP 1
36 //sample task cpu demand every 64ms
37 #define CPU_DEMAND_TASK_RECALC  (64000000LL)
38 #define CPU_DEMAND_CLASS_RECALC (256000000LL)
39 #define CPU_DEMAND_TP_CLASS 0
40 #define CPU_DEMAND_TP_TASK 1
41
42 extern struct ckrm_cpu_class *ckrm_get_cpu_class(struct ckrm_core_class *core);
43 void update_ckrm_idle(unsigned long surplus);
44
45 /*interface to share definition*/
46 static inline int get_soft_limit(struct ckrm_cpu_class *cls)
47 {
48         return cls->shares.my_limit;
49 }
50
51 static inline int get_mysoft_limit(struct ckrm_cpu_class *cls)
52 {
53         return cls->shares.total_guarantee;
54 }
55
56 static inline int get_hard_limit(struct ckrm_cpu_class *cls)
57 {
58         return cls->shares.total_guarantee;
59 }
60
61 static inline int get_myhard_limit(struct ckrm_cpu_class *cls)
62 {
63         return cls->shares.total_guarantee;
64 }
65
66
67 static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, int type)
68 {
69         unsigned long long now = sched_clock();
70
71         local_stat->run = 0;
72         local_stat->total = 0;
73         local_stat->last_sleep = now;
74         switch (type) {
75         case CPU_DEMAND_TP_CLASS:
76                 local_stat->recalc_interval = CPU_DEMAND_CLASS_RECALC;
77                 local_stat->cpu_demand = 0; 
78                 break;
79         case CPU_DEMAND_TP_TASK:
80                 local_stat->recalc_interval = CPU_DEMAND_TASK_RECALC;
81                 //for task, the init cpu_demand is copied from its parent
82                 break;
83         default:
84                 BUG();
85         }
86 }
87
88 void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat)
89 {
90         int i;
91
92         stat->stat_lock = SPIN_LOCK_UNLOCKED;
93         stat->total_ns = 0;
94         stat->max_demand = 0;
95
96         for (i=0; i< NR_CPUS; i++) {
97                 cpu_demand_stat_init(&stat->local_stats[i],CPU_DEMAND_TP_CLASS);
98         }
99
100         stat->egrt = 0;
101         stat->megrt = 0;
102         stat->ehl = CKRM_SHARE_MAX; /*default: no limit*/
103         stat->mehl = CKRM_SHARE_MAX; /*default: no limit */
104
105         stat->eshare = CKRM_SHARE_MAX;
106         stat->meshare = CKRM_SHARE_MAX;
107 }
108
109 /**********************************************/
110 /*          cpu demand                        */
111 /**********************************************/
112
/*
 * How CPU demand is calculated:
 * consider the class local runqueue (clr) first
 * at any time, a clr is in one of three states
 * -- run: a task belonging to this class is running on this cpu
 * -- wait: at least one of its tasks is runnable, but the class is not running
 * -- sleep: none of the tasks of this class is runnable
 *
 * cpu_demand(t1,t2) = r(t1,t2)/(r(t1,t2)+s(t1,t2))
 *
 * the cpu_demand of a class is the average of the cpu_demand of all
 * its class local runqueues (see get_self_cpu_demand())
 */
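
/*
 * Worked example (illustrative only, numbers are hypothetical): if a clr
 * runs for 40ms and then sleeps for 60ms within one recalc interval,
 *     cpu_demand = 40ms / (40ms + 60ms) = 0.4,
 * i.e. this queue wants roughly 40% of one cpu; internally the value is
 * stored scaled to CKRM_SHARE_MAX.
 */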
126
/**
 * update_cpu_demand_stat - record a run or sleep interval for a local queue
 *
 * should be called whenever the state of a task or class local queue changes
 * -- on deschedule: report how long the task ran
 * -- on enqueue: report how long it slept
 *
 * the cpu demand is recalculated once every recalc_interval ns
 */
137 static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat,int state, unsigned long long len)
138 {       
139         local_stat->total += len;
140         if (state == CKRM_CPU_DEMAND_RUN)
141                 local_stat->run += len;
142
        if (local_stat->total >= local_stat->recalc_interval) {
                int very_long_sleep;

                /* scale total down so that run/total yields the demand
                 * in share units without overflowing 64 bits */
                local_stat->total >>= CKRM_SHARE_ACCURACY;
                if (unlikely(local_stat->run > 0xFFFFFFFF))
                        local_stat->run = 0xFFFFFFFF;

                /* total only overflows 32 bits after a very long sleep;
                 * remember that before clamping, the history is then stale */
                very_long_sleep = (local_stat->total > 0xFFFFFFFF);
                if (very_long_sleep)
                        local_stat->total = 0xFFFFFFFF;

                do_div(local_stat->run,(unsigned long)local_stat->total);

                if (very_long_sleep)
                        local_stat->cpu_demand = local_stat->run;
                else {
                        local_stat->cpu_demand += local_stat->run;
                        local_stat->cpu_demand >>= 1;
                }
                local_stat->total = 0;
                local_stat->run = 0;
        }
162 }
163
/**
 * cpu_demand_event - a cpu_demand event occurred
 * @event: one of the following events:
 *   CPU_DEMAND_ENQUEUE: local class enqueue
 *   CPU_DEMAND_DEQUEUE: local class dequeue
 *   CPU_DEMAND_DESCHEDULE: a task belonging to this local class is descheduled
 *   CPU_DEMAND_INIT: (re)initialize the stats, used for task init only
 * @len: valid only for CPU_DEMAND_DESCHEDULE; how long the task has run
 */
172 void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len) 
173 {       
174         switch (event) {
175         case CPU_DEMAND_ENQUEUE: 
176                 len = sched_clock() - local_stat->last_sleep;
177                 local_stat->last_sleep = 0;
178                 update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,len);
179                 break;
180         case CPU_DEMAND_DEQUEUE:
181                 if (! local_stat->last_sleep) {
182                         local_stat->last_sleep = sched_clock();
183                 }
184                 break;
185         case CPU_DEMAND_DESCHEDULE:
186                 update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_RUN,len);
187                 break;
188         case CPU_DEMAND_INIT: //for task init only
189                 cpu_demand_stat_init(local_stat,CPU_DEMAND_TP_TASK);
190                 break;
191         default:
192                 BUG();
193         }
194 }
195
/**
 * cpu_demand_check_sleep - check a class local queue for a long sleep
 *
 * deals with excessively long sleep states:
 * whenever ckrm_cpu_monitor is called, check if the local queue is in the
 * sleep state; if so, fold the accumulated sleep time into the sleep record
 */
202 static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu)
203 {
204         struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu];
205         unsigned long long sleep,now;
206         if (local_stat->last_sleep) {
207                 now = sched_clock();
208                 sleep = now - local_stat->last_sleep;
209                 local_stat->last_sleep = now;
210                 update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep);
211         }
212 }
213
/**
 * get_self_cpu_demand - get the cpu demand of the class itself (excluding children)
 *
 * self_cpu_demand = average cpu demand over all online local queues
 */
219 static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat)
220 {
221         int cpu_demand = 0;
222         int i;
223         int cpuonline = 0;
224
225         for_each_online_cpu(i) {
226                 cpu_demand_check_sleep(stat,i);
227                 cpu_demand += stat->local_stats[i].cpu_demand;
228                 cpuonline ++;
229         }
230
231         return (cpu_demand/cpuonline);
232 }
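
/*
 * Illustrative example (hypothetical numbers): on a 4-cpu system where the
 * per-cpu demands of a class are 80%, 20%, 0% and 0% of CKRM_SHARE_MAX,
 * get_self_cpu_demand() returns (80 + 20 + 0 + 0) / 4 = 25%, i.e. the class
 * needs about a quarter of a cpu on average.
 */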
233
234 /*
235  * my max demand = min(cpu_demand, my effective hard limit)
236  */
237 static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat) 
238 {
239         unsigned long mmax_demand = get_self_cpu_demand(stat);
240         if (mmax_demand > stat->mehl)
241                 mmax_demand = stat->mehl;
242
243         return mmax_demand;
244 }
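
/*
 * e.g. (illustrative): if the class itself demands 70% of a cpu but its
 * effective my-hard-limit (mehl) is only 50%, mmax_demand is capped at 50%.
 */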
245
/**
 * update_max_demand - update the effective cpu demand (max_demand) of each class
 * return -1 on error
 *
 * assumes root_core->parent == NULL
 */
252 static int update_max_demand(struct ckrm_core_class *root_core)
253 {
254         struct ckrm_core_class *cur_core, *child_core;
255         struct ckrm_cpu_class *cls,*c_cls;
256         int ret = -1;
257
258         cur_core = root_core;
259         child_core = NULL;
260         
261  repeat:
262         if (!cur_core) { //normal exit
263                 ret = 0;
264                 goto out;
265         }
266
267         cls = ckrm_get_cpu_class(cur_core);
        if (! cls) //invalid cls, abort
269                 goto out;
270
271         if (!child_core)        //first child
272                 cls->stat.max_demand = get_mmax_demand(&cls->stat);
273         else {
274                 c_cls = ckrm_get_cpu_class(child_core);
275                 if (c_cls)
276                         cls->stat.max_demand += c_cls->stat.max_demand;
277                 else //invalid c_cls, abort
278                         goto out;
279         }
280
281         //check class hard limit
282         if (cls->stat.max_demand > cls->stat.ehl)
283                 cls->stat.max_demand = cls->stat.ehl;
284
285         //next child
286         child_core = ckrm_get_next_child(cur_core, child_core);
287         if (child_core) {
288                 //go down
289                 cur_core = child_core;
290                 child_core = NULL;
291                 goto repeat;
292         } else {                //no more child, go back
293                 child_core = cur_core;
294                 cur_core = child_core->hnode.parent;
295         }
296         goto repeat;
297  out:
298         return ret;
299 }
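
/*
 * Illustrative example (hypothetical shares): a class with ehl = 60% of
 * CKRM_SHARE_MAX, its own mmax_demand of 10% and two children whose
 * max_demand came out as 30% and 50% ends up with
 *     max_demand = min(10% + 30% + 50%, ehl) = min(90%, 60%) = 60%,
 * i.e. the effective hard limit caps the demand propagated further up.
 */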
300
301 /**********************************************/
302 /*          effective guarantee & limit       */
303 /**********************************************/
304 static inline void set_eshare(struct ckrm_cpu_class_stat *stat,
305                                        int new_share)
306 {
307         if (!new_share)
308                 new_share = 1;
309
310         BUG_ON(new_share < 0);
311         stat->eshare = new_share;
312 }
313
314 static inline void set_meshare(struct ckrm_cpu_class_stat *stat,
315                                             int new_share)
316 {
317         if (!new_share)
318                 new_share = 1;
319
320         BUG_ON(new_share < 0);
321         stat->meshare = new_share;
322 }
323
/**
 * update_child_effective - update egrt, megrt, ehl and mehl for all children of parent
 * @parent: the parent node
 * return -1 if anything goes wrong
 */
330 static int update_child_effective(struct ckrm_core_class *parent)
331 {
332         struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent);
333         struct ckrm_core_class *child_core;     
334         int ret = -1;
335
336         if (! p_cls)
337                 return ret;
338
339         child_core = ckrm_get_next_child(parent, NULL);
340         while (child_core) {
341                 struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core);
342                 if (! c_cls)
343                         return ret;
344
345                 c_cls->stat.egrt =
346                     p_cls->stat.egrt *
347                     c_cls->shares.my_guarantee / p_cls->shares.total_guarantee;
348
349                 c_cls->stat.megrt = c_cls->stat.egrt * c_cls->shares.unused_guarantee
350                         / c_cls->shares.total_guarantee;
351                 
352                 c_cls->stat.ehl =
353                     p_cls->stat.ehl *
354                     get_hard_limit(c_cls) / p_cls->shares.total_guarantee;
355
356                 c_cls->stat.mehl =
357                     c_cls->stat.ehl *
358                     get_myhard_limit(c_cls) / c_cls->shares.total_guarantee;
359
360                 child_core = ckrm_get_next_child(parent, child_core);
        }
362         return 0;
363 }
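
/*
 * Worked example (hypothetical values): a parent with egrt = 80% of
 * CKRM_SHARE_MAX, ehl = 100% and total_guarantee = 100, and a child with
 * my_guarantee = 50, unused_guarantee = 20 and total_guarantee = 100 gets
 *     child egrt  = 80%  * 50  / 100 = 40%
 *     child megrt = 40%  * 20  / 100 = 8%
 *     child ehl   = 100% * 100 / 100 = 100%
 *     child mehl  = 100% * 100 / 100 = 100%
 * (get_hard_limit()/get_myhard_limit() return total_guarantee above, so the
 * child's limits stay effectively unlimited here).
 */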
364
/**
 * update_effectives - update egrt, megrt, ehl and mehl for the whole tree
 * should be called only when the class structure has changed
 *
 * return -1 if anything goes wrong (e.g. the structure changed during the walk)
 */
371 static int update_effectives(struct ckrm_core_class *root_core)
372 {
373         struct ckrm_core_class *cur_core, *child_core;
374         struct ckrm_cpu_class *cls;
375         int ret = -1;
376
377         cur_core = root_core;
378         child_core = NULL;
379         cls = ckrm_get_cpu_class(cur_core);
380
381         //initialize the effectives for root 
382         cls->stat.egrt = CKRM_SHARE_MAX; /*egrt of the root is always 100% */
383         cls->stat.megrt = cls->stat.egrt * cls->shares.unused_guarantee
384                 / cls->shares.total_guarantee;
385         cls->stat.ehl = CKRM_SHARE_MAX * get_hard_limit(cls)
386                 / cls->shares.total_guarantee;
387         cls->stat.mehl = cls->stat.ehl * get_myhard_limit(cls)
388                 / cls->shares.total_guarantee;
389         
390  repeat:
391         //check exit
392         if (!cur_core)
393                 return 0;
394
395         //visit this node
396         if (update_child_effective(cur_core) < 0)
397                 return ret; //invalid cur_core node
398         
399         //next child
400         child_core = ckrm_get_next_child(cur_core, child_core);
401
402         if (child_core) {
403                 //go down to the next hier
404                 cur_core = child_core;
405                 child_core = NULL;
406         } else { //no more child, go back
407                 child_core = cur_core;
408                 cur_core = child_core->hnode.parent;
409         }
410         goto repeat;
411 }
412
413 /**********************************************/
414 /*          surplus allocation                */
415 /**********************************************/
416
417 /*
418  * surplus = egrt - demand
419  * if surplus < 0, surplus = 0 
420  */
421 static inline int get_node_surplus(struct ckrm_cpu_class *cls)
422 {
423         int surplus = cls->stat.egrt - cls->stat.max_demand;
424
425         if (surplus < 0)
426                 surplus = 0;
427
428         return surplus;
429 }
430
431 static inline int get_my_node_surplus(struct ckrm_cpu_class *cls)
432 {
433         int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat);
434
435         if (surplus < 0)
436                 surplus = 0;
437
438         return surplus;
439 }
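
/*
 * e.g. (illustrative): a class with egrt = 40% and max_demand = 25% has a
 * node surplus of 15%; a class whose demand meets or exceeds its guarantee
 * has a surplus of 0.
 */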
440
/**
 * node_surplus_consume - let one child consume part of the surplus
 * @surplus: the surplus available in this round
 * @child_core: the child core class being considered
 * @p_cls: the parent cpu class
 * @check_sl: if set, also enforce the child's soft limit
 * return how much was consumed (0 if nothing could be consumed)
 *
 * the child receives a share of the surplus proportional to its guarantee,
 * capped so that its eshare does not exceed its max demand, hard limit or
 * (if check_sl) its soft limit
 */
451 static inline int node_surplus_consume(int surplus,
452                                        struct ckrm_core_class *child_core,
453                                        struct ckrm_cpu_class *p_cls,
454                                        int check_sl
455                                        )
456 {
        int consumed = 0;
        int inc_limit;
460
461         struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core);
462         int total_grt = p_cls->shares.total_guarantee;
463
464         BUG_ON(surplus < 0);
465
466         if (! c_cls || ! total_grt)
467                 goto out;
468
469         /*can't consume more than demand or hard limit*/
470         if (c_cls->stat.eshare >= c_cls->stat.max_demand)
471                 goto out;
472
473         consumed =
474                 surplus * c_cls->shares.my_guarantee / total_grt;
475
476         if (! consumed) //no more share
477                 goto out;
478
479         //hard limit and demand limit
480         inc_limit = c_cls->stat.max_demand - c_cls->stat.eshare;
481
482         if (check_sl) {
483                 int esl = p_cls->stat.eshare * get_soft_limit(c_cls)
484                         /p_cls->shares.total_guarantee;
485                 if (esl < c_cls->stat.max_demand)
486                         inc_limit = esl - c_cls->stat.eshare;
487         }
488
489
        if (consumed > inc_limit)
                consumed = inc_limit;
494
495         BUG_ON(consumed < 0);
496         set_eshare(&c_cls->stat,c_cls->stat.eshare + consumed);
497         BUG_ON(c_cls->stat.eshare < 0);
498
499  out:           
500         return consumed;
501 }
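
/*
 * Illustrative round (hypothetical values): with surplus = 30% of
 * CKRM_SHARE_MAX, total_grt = 100 and a child with my_guarantee = 40,
 * eshare = 20% and max_demand = 35%, the child is offered
 * 30% * 40 / 100 = 12%; the demand cap allows inc_limit = 35% - 20% = 15%,
 * so all 12% is consumed and the child's eshare becomes 32%.  With check_sl
 * set, a lower effective soft limit would shrink inc_limit further.
 */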
502
/**
 * alloc_surplus_node - re-allocate the shares for the children under parent
 * @parent: the parent node
 * return 0 on success, -1 on error
 *
 * steps:
 *  1. collect the total surplus of the parent and its children
 *  2. distribute the surplus to the children, honoring demand and limits
 *  3. give whatever is left to the parent's own (default) share
 */
513 static int alloc_surplus_node(struct ckrm_core_class *parent)
514 {
515         int total_surplus , old_surplus;
516         struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent);
517         struct ckrm_core_class *child_core = NULL;
518         int self_share;
519         int check_sl;
520         int ret = -1;
521
522         if (! p_cls)
523                 return ret;
524
525         total_surplus = get_my_node_surplus(p_cls);
526
527         /*
528          * initialize effective_share
529          */
530         do {
531                 child_core = ckrm_get_next_child(parent, child_core);
532                 if (child_core) {
533                         struct ckrm_cpu_class *c_cls;
534
535                         c_cls = ckrm_get_cpu_class(child_core);                         
536                         if (! c_cls)
537                                 return ret; 
538
539                         total_surplus += get_node_surplus(c_cls);
540
541                         set_eshare(&c_cls->stat, c_cls->stat.egrt);
542                 }
543         } while (child_core);
544
545         if (! total_surplus)
546                 goto realloc_out;
547
548         /* distribute the surplus */
549         child_core = NULL;
550         check_sl = 1;
551         old_surplus = 0;
552         do {
553                 if (!child_core) {//start a new round
554
555                         //ok, everybody reached the soft limit
556                         if (old_surplus == total_surplus) 
557                                 check_sl = 0;
558                         old_surplus = total_surplus;
559                 }
560
561                 child_core = ckrm_get_next_child(parent, child_core);
562                 if (child_core)  {
                        int consumed =
                                node_surplus_consume(old_surplus, child_core,
                                                     p_cls,check_sl);
                        if (consumed >= 0)
                                total_surplus -= consumed;
                        else
                                return ret;
571                 }
572                 //start a new round if something is allocated in the last round
573         } while (child_core || check_sl || total_surplus != old_surplus);
574
575  realloc_out:
576         /*how much for itself*/
577         self_share = p_cls->stat.eshare *
578             p_cls->shares.unused_guarantee / p_cls->shares.total_guarantee;
579
580         if (self_share < p_cls->stat.max_demand) {
581                 /*any remaining surplus goes to the default class*/
582                 self_share += total_surplus;    
583                 if (self_share > p_cls->stat.max_demand)
584                         self_share = p_cls->stat.max_demand;
585         }
586         
587         set_meshare(&p_cls->stat, self_share);
588         return 0;
589 }
590
/**
 * alloc_surplus - reallocate unused shares
 *
 * a class's unused share is allocated to its siblings
 * the re-allocation walks the hierarchy downward from the top
 */
597 static int alloc_surplus(struct ckrm_core_class *root_core)
598 {
599         struct ckrm_core_class *cur_core, *child_core;
600         struct ckrm_cpu_class *cls;
601         int ret = -1;
602
603         /*initialize*/
604         cur_core = root_core;
605         child_core = NULL;
606         cls = ckrm_get_cpu_class(cur_core);
607
608         //set root eshare
609         set_eshare(&cls->stat, cls->stat.egrt);
610
        /*the ckrm idle tasks get whatever is remaining*/
        /*hzheng: uncomment the following line for hard limit support */
        //      update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand);
614         
615       repeat:
616         //check exit
617         if (!cur_core)
618                 return 0;
619
620         //visit this node
621         if ( alloc_surplus_node(cur_core) < 0 )
622                 return ret;
623
624         //next child
625         child_core = ckrm_get_next_child(cur_core, child_core);
626         if (child_core) {
627                 //go down
628                 cur_core = child_core;
629                 child_core = NULL;
630                 goto repeat;
631         } else {                //no more child, go back
632                 child_core = cur_core;
633                 cur_core = child_core->hnode.parent;
634         }
635         goto repeat;
636 }
637
638 /**********************************************/
639 /*           CKRM Idle Tasks                  */
640 /**********************************************/
641 struct ckrm_cpu_class ckrm_idle_class_obj, *ckrm_idle_class;
642 struct task_struct* ckrm_idle_tasks[NR_CPUS];
643
644 /*how many ckrm idle tasks should I wakeup*/
645 static inline int get_nr_idle(unsigned long surplus)
646 {
647         int cpu_online = cpus_weight(cpu_online_map);   
648         int nr_idle = 0; 
649         
650         nr_idle = surplus * cpu_online;
651         nr_idle >>= CKRM_SHARE_ACCURACY;
652
653         if (surplus) 
654                 nr_idle ++;
655
656         if (nr_idle > cpu_online)  
657                 nr_idle = cpu_online;
658
659         return nr_idle;
660 }
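
/*
 * Example (illustrative): with 4 cpus online and a system surplus of half
 * of CKRM_SHARE_MAX, nr_idle = 4 * 1/2 = 2, plus one because the surplus
 * is non-zero, giving 3 idle tasks (never more than the online cpu count).
 */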
661
/**
 * update_ckrm_idle - update the status of the idle class according to the new surplus
 * @surplus: new system surplus
 *
 * tasks:
 * -- update the share of the idle class
 * -- wake up idle tasks according to the surplus
 */
670 void update_ckrm_idle(unsigned long surplus)
671 {
672         int nr_idle = get_nr_idle(surplus);
673         int i;
674         struct task_struct* idle_task;
675
676         set_eshare(&ckrm_idle_class->stat,surplus);
677         set_meshare(&ckrm_idle_class->stat,surplus);
678         /*wake up nr_idle idle tasks*/
679         for_each_online_cpu(i) {
                idle_task = ckrm_idle_tasks[i];
                if (! idle_task)
                        continue;
                if (unlikely(idle_task->cpu_class != ckrm_idle_class)) {
                        ckrm_cpu_change_class(idle_task,
                                              idle_task->cpu_class,
                                              ckrm_idle_class);
                }
688                 if (i < nr_idle) {
689                         //activate it
690                         wake_up_process(idle_task);
691                 } else {
692                         //deactivate it
693                         idle_task->state = TASK_INTERRUPTIBLE;
694                         set_tsk_need_resched(idle_task);
695                 }
696         }
697 }
698
699 static int ckrm_cpu_idled(void *nothing)
700 {
701         set_user_nice(current,19);
702         daemonize("ckrm_idle_task");
703
        //deactivate it, it will be woken up by ckrm_cpu_monitor
705         current->state = TASK_INTERRUPTIBLE;
706         schedule();             
707
708         /*similar to cpu_idle */
709         while (1) {
710                 while (!need_resched()) {
711                         ckrm_cpu_monitor();
712                         if (current_cpu_data.hlt_works_ok) {
713                                 local_irq_disable();
714                                 if (!need_resched()) {
715                                         set_tsk_need_resched(current);
716                                         safe_halt();
717                                 } else
718                                         local_irq_enable();
719                         }
720                 }
721                 schedule();             
722         }
723         return 0;
724 }
725
/**
 * ckrm_start_ckrm_idle -
 *  create the ckrm_idle_class and start the idle tasks
 */
731 void ckrm_start_ckrm_idle(void)
732 {
733         int i;
734         int ret;
735         ckrm_shares_t shares;
736         
737         ckrm_idle_class = &ckrm_idle_class_obj; 
        memset(ckrm_idle_class, 0, sizeof(*ckrm_idle_class));
        /*don't care about the shares */
        init_cpu_class(ckrm_idle_class,&shares);
        printk(KERN_INFO "ckrm idle class %p created\n", ckrm_idle_class);
742         
743         for_each_online_cpu(i) {
744                 ret = kernel_thread(ckrm_cpu_idled, 0, CLONE_KERNEL);
745                 
746                 /*warn on error, but the system should still work without it*/
747                 if (ret < 0)
748                         printk(KERN_ERR"Warn: can't start ckrm idle tasks\n");
749                 else {
750                         ckrm_idle_tasks[i] = find_task_by_pid(ret);
751                         if (!ckrm_idle_tasks[i])
752                                 printk(KERN_ERR"Warn: can't find ckrm idle tasks %d\n",ret);
753                 }
754         }
755 }
756
757 /**********************************************/
758 /*          Local Weight                      */
759 /**********************************************/
/**
 * adjust_lrq_weight - adjust the local weight of each cpu's local runqueue
 *
 * lrq->local_weight = lrq->lrq_load * class_weight / total_pressure
 */
765 static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online)
766 {
767         unsigned long total_pressure = 0;
768         ckrm_lrq_t* lrq;
769         int i;
770         unsigned long class_weight;
771         unsigned long long lw;  
772
773         //get total pressure
774         for_each_online_cpu(i) {
775                 lrq = get_ckrm_lrq(clsptr,i);
776                 total_pressure += lrq->lrq_load;
777         }
778
779         if (! total_pressure)
780                 return;
781         
782         class_weight = cpu_class_weight(clsptr) * cpu_online;
783
        /*
         * update the weight for each cpu, the minimum is 1
         */
787         for_each_online_cpu(i) {
788                 lrq = get_ckrm_lrq(clsptr,i);
789                 if (! lrq->lrq_load)
790                         /*give idle class a high share to boost interactiveness */
791                         lw = cpu_class_weight(clsptr); 
792                 else {
793                         lw = lrq->lrq_load * class_weight;
794                         do_div(lw,total_pressure);
795                         if (!lw)
796                                 lw = 1;
797                         else if (lw > CKRM_SHARE_MAX)
798                                 lw = CKRM_SHARE_MAX;
799                 }
800                 
801                 lrq->local_weight = lw;
802         }
803 }
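
/*
 * Worked example (hypothetical loads, all below CKRM_SHARE_MAX): a class
 * with weight 200 on a 2-cpu system whose local queues carry loads of 300
 * and 100 gets class_weight = 200 * 2 = 400 and total_pressure = 400, so
 *     lrq0->local_weight = 300 * 400 / 400 = 300
 *     lrq1->local_weight = 100 * 400 / 400 = 100
 * i.e. the per-cpu weights follow the load distribution while summing to
 * the class weight scaled by the number of online cpus.
 */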
804
805 /*
806  * assume called with class_list_lock read lock held
807  */
808 void adjust_local_weight(void)
809 {
810         static spinlock_t lock = SPIN_LOCK_UNLOCKED; 
811         struct ckrm_cpu_class *clsptr;
812         int cpu_online;
813
        //do nothing if someone is already holding the lock
815         if (! spin_trylock(&lock))
816                 return;
817
818         cpu_online = cpus_weight(cpu_online_map);       
819
820         //class status: demand, share,total_ns prio, index
821         list_for_each_entry(clsptr,&active_cpu_classes,links) {
822                 adjust_lrq_weight(clsptr,cpu_online);
823         }
824
825         spin_unlock(&lock);
826 }
827
828 /**********************************************/
829 /*          Main                              */
830 /**********************************************/
/**
 * ckrm_cpu_monitor - adjust the relative shares of the classes based on their progress
 *
 * this function is called every CPU_MONITOR_INTERVAL
 * it computes the cpu demand of each class
 * and re-allocates the unused shares to other classes
 */
838 void ckrm_cpu_monitor(void)
839 {
840         static spinlock_t lock = SPIN_LOCK_UNLOCKED; 
841         static unsigned long long last_check = 0;
842         struct ckrm_core_class *root_core = get_default_cpu_class()->core;
843         unsigned long long now; 
844 #define MIN_CPU_MONITOR_INTERVAL 100000000UL
845
846         if (!root_core)
847                 return;
848
        //do nothing if someone is already holding the lock
850         if (! spin_trylock(&lock))
851                 return;
852
853         read_lock(&class_list_lock);
854
855         now = sched_clock();
856
857         //consecutive check should be at least 100ms apart
858         if (now - last_check < MIN_CPU_MONITOR_INTERVAL) {
859                 goto outunlock;
860         }
861         last_check = now;
862
863         if (update_effectives(root_core) != 0)
864                 goto outunlock;
865         
866         if (update_max_demand(root_core) != 0)
867                 goto outunlock;
868         
869         if (alloc_surplus(root_core) != 0)
870                 goto outunlock;
871         
872         adjust_local_weight();
873
874  outunlock:     
875         read_unlock(&class_list_lock);
876         spin_unlock(&lock);
877 }
878
879 /*****************************************************/
880 /*            Supporting Functions                   */
881 /*****************************************************/
882 static pid_t cpu_monitor_pid = -1;
883 static int thread_exit = 0;
884
885 static int ckrm_cpu_monitord(void *nothing)
886 {
887         daemonize("ckrm_cpu_ctrld");
888         for (;;) {
                /*sleep for some time before the next try*/
890                 set_current_state(TASK_INTERRUPTIBLE);
891                 schedule_timeout(CPU_MONITOR_INTERVAL);
892                 ckrm_cpu_monitor();
893                 if (thread_exit) {
894                         break;
895                 }
896         }
897         cpu_monitor_pid = -1;
898         thread_exit = 2;
899         printk("cpu_monitord exit\n");
900         return 0;
901 }
902
903 void ckrm_start_monitor(void)
904 {
905         cpu_monitor_pid = kernel_thread(ckrm_cpu_monitord, 0, CLONE_KERNEL);
906         if (cpu_monitor_pid < 0) {
                printk("ckrm_cpu_monitord failed to start\n");
908         }
909 }
910
911 void ckrm_kill_monitor(void)
912 {
914
915         printk("killing process %d\n", cpu_monitor_pid);
916         if (cpu_monitor_pid > 0) {
917                 thread_exit = 1;
918                 while (thread_exit != 2) {
919                         set_current_state(TASK_INTERRUPTIBLE);
920                         schedule_timeout(CPU_MONITOR_INTERVAL);
921                 }
922         }
923 }
924
925 int ckrm_cpu_monitor_init(void)
926 {
927         ckrm_start_monitor();
        /*hzheng: uncomment the following line for hard limit support */
929         //      ckrm_start_ckrm_idle();
930         return 0;
931 }
932
933 void ckrm_cpu_monitor_exit(void)
934 {
935         ckrm_kill_monitor();
936 }
937
938 module_init(ckrm_cpu_monitor_init);
939 module_exit(ckrm_cpu_monitor_exit);
940
941 MODULE_AUTHOR("Haoqiang Zheng <hzheng@cs.columbia.edu>");
942 MODULE_DESCRIPTION("Hierarchical CKRM CPU Resource Monitor");
943 MODULE_LICENSE("GPL");