various fixes to ckrm core and the cpu controller
1 /* ckrm_cpu_monitor.c - Hierarchical CKRM CPU Resource Monitor
2  *
3  * Copyright (C) Haoqiang Zheng,  IBM Corp. 2004
4  *           (C) Hubertus Franke, IBM Corp. 2004
5  * 
6  * Latest version, more details at http://ckrm.sf.net
7  * 
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  */
14
15 /* Changes
16  * 
17  * 23 June 2004: Created
18  * 
19  */
20 #include <linux/module.h>
21 #include <linux/init.h>
22 #include <asm/errno.h>
23 #include <linux/list.h>
24 #include <linux/spinlock.h>
25 #include <linux/ckrm.h>
26 #include <linux/ckrm_rc.h>
27 #include <linux/ckrm_tc.h>
28 #include <asm/div64.h>
29 #include <linux/ckrm_sched.h>
30
31 #define CPU_MONITOR_INTERVAL (2*HZ) /* how often we adjust the shares */
32 #define CKRM_SHARE_ACCURACY 10
33 #define CKRM_SHARE_MAX (1<<CKRM_SHARE_ACCURACY)
34
35 #define CKRM_CPU_DEMAND_RUN 0
36 #define CKRM_CPU_DEMAND_SLEEP 1
37 //sample task cpu demand every 64ms
38 #define CPU_DEMAND_TASK_RECALC  (64000000LL)
39 #define CPU_DEMAND_CLASS_RECALC (256000000LL)
40 #define CPU_DEMAND_TP_CLASS 0
41 #define CPU_DEMAND_TP_TASK 1
42
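/*
 * Illustrative sketch (not part of the original file): shares and demand
 * are kept in fixed point with CKRM_SHARE_ACCURACY fractional bits, so
 * CKRM_SHARE_MAX (1 << 10 == 1024) stands for 100% of the cpu.  The
 * hypothetical helper below just shows the conversion from a percentage.
 */
static inline int percent_to_share(int percent)
{
        /* 100% maps to CKRM_SHARE_MAX == 1024, 25% to 256, and so on */
        return percent * CKRM_SHARE_MAX / 100;
}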
43 extern struct ckrm_cpu_class *ckrm_get_cpu_class(struct ckrm_core_class *core);
44 void update_ckrm_idle(unsigned long surplus);
45
46 /*interface to share definition*/
47 static inline int get_soft_limit(struct ckrm_cpu_class *cls)
48 {
49         return cls->shares.my_limit;
50 }
51
52 static inline int get_mysoft_limit(struct ckrm_cpu_class *cls)
53 {
54         return cls->shares.total_guarantee;
55 }
56
57 static inline int get_hard_limit(struct ckrm_cpu_class *cls)
58 {
59         return cls->shares.total_guarantee;
60 }
61
62 static inline int get_myhard_limit(struct ckrm_cpu_class *cls)
63 {
64         return cls->shares.total_guarantee;
65 }
66
67
68 static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, int type)
69 {
70         unsigned long long now = sched_clock();
71
72         local_stat->run = 0;
73         local_stat->total = 0;
74         local_stat->last_sleep = now;
75         switch (type) {
76         case CPU_DEMAND_TP_CLASS:
77                 local_stat->recalc_interval = CPU_DEMAND_CLASS_RECALC;
78                 local_stat->cpu_demand = 0; 
79                 break;
80         case CPU_DEMAND_TP_TASK:
81                 local_stat->recalc_interval = CPU_DEMAND_TASK_RECALC;
82                 //for task, the init cpu_demand is copied from its parent
83                 break;
84         default:
85                 BUG();
86         }
87 }
88
89 void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat)
90 {
91         int i;
92
93         stat->stat_lock = SPIN_LOCK_UNLOCKED;
94         stat->total_ns = 0;
95         stat->max_demand = 0;
96
97         for (i=0; i< NR_CPUS; i++) {
98                 cpu_demand_stat_init(&stat->local_stats[i],CPU_DEMAND_TP_CLASS);
99         }
100
101         stat->egrt = 0;
102         stat->megrt = 0;
103         stat->ehl = CKRM_SHARE_MAX; /*default: no limit*/
104         stat->mehl = CKRM_SHARE_MAX; /*default: no limit */
105
106         stat->eshare = CKRM_SHARE_MAX;
107         stat->meshare = CKRM_SHARE_MAX;
108 }
109
110 /**********************************************/
111 /*          cpu demand                        */
112 /**********************************************/
113
114 /*
115  * How CPU demand is calculated:
116  * consider the class local runqueue (clr) first
117  * at any time, a clr can be in one of the following three states
118  * -- run: a task belonging to this class is running on this cpu
119  * -- wait: at least one of its tasks is runnable, but the class is not running
120  * -- sleep: none of the tasks of this class is runnable
121  *
122  * cpu_demand(t1,t2) = r(t1,t2)/(r(t1,t2)+s(t1,t2))
123  * 
124  * the cpu_demand of a class = 
125  *    sum of cpu_demand of all the class local runqueues
126  */
127
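/*
 * Illustrative sketch (not from the original code): the demand of one
 * local runqueue over a window is run/(run+sleep), expressed in the same
 * 10-bit fixed point as shares, so a class that ran 30ms out of a 120ms
 * window gets a demand of about 256 (25% of CKRM_SHARE_MAX).  The helper
 * name is hypothetical; the real code below folds this computation into
 * update_cpu_demand_stat() and uses do_div() for the 64-bit division.
 */
static inline unsigned long window_cpu_demand(unsigned long long run_ns,
                                              unsigned long long sleep_ns)
{
        unsigned long long total = run_ns + sleep_ns;

        if (!total)
                return 0;
        return (unsigned long)((run_ns << CKRM_SHARE_ACCURACY) / total);
}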
128 /**
129  * update_cpu_demand_stat - update the cpu demand statistics of a local queue
130  * 
131  * should be called whenever the state of a task or class local queue changes
132  * -- on deschedule: report how long it ran
133  * -- on enqueue: report how long it slept
134  *
135  * the demand is recalculated once every recalc_interval
136  * all lengths are in ns
137  */
138 static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat,int state, unsigned long long len)
139 {       
140         local_stat->total += len;
141         if (state == CKRM_CPU_DEMAND_RUN)
142                 local_stat->run += len;
143
144         if (local_stat->total >= local_stat->recalc_interval) {
145                 local_stat->total >>= CKRM_SHARE_ACCURACY;
146                 if (unlikely(local_stat->run > 0xFFFFFFFF))
147                         local_stat->run = 0xFFFFFFFF;
148
149                 if (local_stat->total > 0xFFFFFFFF) 
150                         local_stat->total = 0xFFFFFFFF;
151                         
152                 do_div(local_stat->run,(unsigned long)local_stat->total);
153
154                 if (local_stat->total > 0xFFFFFFFF) //happens after very long sleep
155                         local_stat->cpu_demand = local_stat->run;
156                 else {
157                         local_stat->cpu_demand += local_stat->run;
158                         local_stat->cpu_demand >>= 1;
159                 }
160                 local_stat->total = 0;
161                 local_stat->run = 0;
162         }
163 }
164
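/*
 * Illustrative sketch (not from the original code): once a recalc window
 * has elapsed, update_cpu_demand_stat() above folds the new window's
 * demand into the stored estimate as a simple running average, which is
 * what the ">>= 1" implements.  Hypothetical helper for clarity:
 */
static inline unsigned long fold_demand(unsigned long old_estimate,
                                        unsigned long window_demand)
{
        /* new_estimate = (old_estimate + window_demand) / 2 */
        return (old_estimate + window_demand) >> 1;
}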
165 /**
166  * cpu_demand_event - a cpu_demand event occurred
167  * @event: one of the following three events:
168  *   CPU_DEMAND_ENQUEUE: local class enqueue
169  *   CPU_DEMAND_DEQUEUE: local class dequeue
170  *   CPU_DEMAND_DESCHEDULE: a task belonging to this local class was descheduled
171  * @len: valid only for CPU_DEMAND_DESCHEDULE, how long the task has run
172  */
173 void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len) 
174 {       
175         switch (event) {
176         case CPU_DEMAND_ENQUEUE: 
177                 len = sched_clock() - local_stat->last_sleep;
178                 local_stat->last_sleep = 0;
179                 update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,len);
180                 break;
181         case CPU_DEMAND_DEQUEUE:
182                 if (! local_stat->last_sleep) {
183                         local_stat->last_sleep = sched_clock();
184                 }
185                 break;
186         case CPU_DEMAND_DESCHEDULE:
187                 update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_RUN,len);
188                 break;
189         case CPU_DEMAND_INIT: //for task init only
190                 cpu_demand_stat_init(local_stat,CPU_DEMAND_TP_TASK);
191                 break;
192         default:
193                 BUG();
194         }
195 }
196
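/*
 * Illustrative sketch (hypothetical calling sequence, not part of the
 * original file): how a class local queue that goes idle, wakes up and
 * then runs for 5ms would drive cpu_demand_event().
 */
static void example_local_queue_cycle(struct ckrm_cpu_demand_stat *stat)
{
        /* the queue goes idle: remember when it fell asleep */
        cpu_demand_event(stat, CPU_DEMAND_DEQUEUE, 0);
        /* it becomes runnable again: the elapsed sleep time is charged */
        cpu_demand_event(stat, CPU_DEMAND_ENQUEUE, 0);
        /* one of its tasks is descheduled after running for 5ms */
        cpu_demand_event(stat, CPU_DEMAND_DESCHEDULE, 5000000ULL);
}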
197 /** 
198  * check all the class local queues
199  * 
200  * to deal with excessively long run/sleep states
201  * -- whenever ckrm_cpu_monitor() is called, check if the class is in the sleep state; if so, update the sleep record
202  */
203 static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu)
204 {
205         struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu];
206         unsigned long long sleep,now;
207         if (local_stat->last_sleep) {
208                 now = sched_clock();
209                 sleep = now - local_stat->last_sleep;
210                 local_stat->last_sleep = now;
211                 update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep);
212         }
213 }
214
215 /**
216  * get_self_cpu_demand - get the cpu demand of the class itself (excluding children)
217  *
218  * self_cpu_demand = average cpu demand of all (online) local queues
219  */
220 static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat)
221 {
222         int cpu_demand = 0;
223         int i;
224         int cpuonline = 0;
225
226         for_each_online_cpu(i) {
227                 cpu_demand_check_sleep(stat,i);
228                 cpu_demand += stat->local_stats[i].cpu_demand;
229                 cpuonline ++;
230         }
231
232         return (cpu_demand/cpuonline);
233 }
234
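/*
 * Illustrative sketch (not from the original code): averaging over the
 * online cpus keeps the result on the 0..CKRM_SHARE_MAX scale.  On a
 * 2-cpu box a class that saturates one cpu (1024) and is idle on the
 * other (0) ends up with a self demand of 512, i.e. half the machine.
 * The helper name is hypothetical.
 */
static inline unsigned long average_demand(const unsigned long *per_cpu_demand,
                                           int nr_cpus)
{
        unsigned long sum = 0;
        int i;

        for (i = 0; i < nr_cpus; i++)
                sum += per_cpu_demand[i];
        return nr_cpus ? sum / nr_cpus : 0;
}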
235 /*
236  * my max demand = min(cpu_demand, my effective hard limit)
237  */
238 static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat) 
239 {
240         unsigned long mmax_demand = get_self_cpu_demand(stat);
241         if (mmax_demand > stat->mehl)
242                 mmax_demand = stat->mehl;
243
244         return mmax_demand;
245 }
246
247 /**
248  * update_max_demand: update effective cpu demand for each class
249  * return -1 on error
250  * 
251  * Assumes root_core->parent == NULL
252  */
253 static int update_max_demand(struct ckrm_core_class *root_core)
254 {
255         struct ckrm_core_class *cur_core, *child_core;
256         struct ckrm_cpu_class *cls,*c_cls;
257         int ret = -1;
258
259         cur_core = root_core;
260         child_core = NULL;
261         
262  repeat:
263         if (!cur_core) { //normal exit
264                 ret = 0;
265                 goto out;
266         }
267
268         cls = ckrm_get_cpu_class(cur_core);
269         if (! cls) //invalid cls, abort
270                 goto out;
271
272         if (!child_core)        //first child
273                 cls->stat.max_demand = get_mmax_demand(&cls->stat);
274         else {
275                 c_cls = ckrm_get_cpu_class(child_core);
276                 if (c_cls)
277                         cls->stat.max_demand += c_cls->stat.max_demand;
278                 else //invalid c_cls, abort
279                         goto out;
280         }
281
282         //check class hard limit
283         if (cls->stat.max_demand > cls->stat.ehl)
284                 cls->stat.max_demand = cls->stat.ehl;
285
286         //next child
287         child_core = ckrm_get_next_child(cur_core, child_core);
288         if (child_core) {
289                 //go down
290                 cur_core = child_core;
291                 child_core = NULL;
292                 goto repeat;
293         } else {                //no more child, go back
294                 child_core = cur_core;
295                 cur_core = child_core->hnode.parent;
296         }
297         goto repeat;
298  out:
299         return ret;
300 }
301
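/*
 * Illustrative sketch (not from the original code): stripped of the
 * traversal mechanics, the walk above computes for every class
 *   max_demand = min(self demand + sum of children's max_demand, ehl).
 * Hypothetical helper showing just that arithmetic:
 */
static inline unsigned long class_max_demand(unsigned long self_demand,
                                             unsigned long children_demand,
                                             unsigned long ehl)
{
        unsigned long demand = self_demand + children_demand;

        return demand > ehl ? ehl : demand;
}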
302 /**********************************************/
303 /*          effective guarantee & limit       */
304 /**********************************************/
305 static inline void set_eshare(struct ckrm_cpu_class_stat *stat,
306                                        int new_share)
307 {
308         if (!new_share)
309                 new_share = 1;
310         stat->eshare = new_share;
311 }
312
313 static inline void set_meshare(struct ckrm_cpu_class_stat *stat,
314                                             int new_share)
315 {
316         if (!new_share)
317                 new_share = 1;
318         stat->meshare = new_share;
319 }
320
321 /**
322  * update_child_effective - update egrt, ehl, mehl for all children of parent
323  * @parent: the parent node
324  * returns -1 if anything goes wrong
325  *
326  */
327 static int update_child_effective(struct ckrm_core_class *parent)
328 {
329         struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent);
330         struct ckrm_core_class *child_core;
331
332         if (! p_cls)
333                 return -1;
334
335         child_core = ckrm_get_next_child(parent, NULL);
336         while (child_core) {
337                 struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core);
338                 if (! c_cls)
339                         return -1;
340
341                 c_cls->stat.egrt =
342                     p_cls->stat.egrt *
343                     c_cls->shares.my_guarantee / p_cls->shares.total_guarantee;
344
345                 c_cls->stat.megrt = c_cls->stat.egrt * c_cls->shares.unused_guarantee
346                         / c_cls->shares.total_guarantee;
347                 
348                 c_cls->stat.ehl =
349                     p_cls->stat.ehl *
350                     get_hard_limit(c_cls) / p_cls->shares.total_guarantee;
351
352                 c_cls->stat.mehl =
353                     c_cls->stat.ehl *
354                     get_myhard_limit(c_cls) / c_cls->shares.total_guarantee;
355
356                 child_core = ckrm_get_next_child(parent, child_core);
357         }
358         return 0;
359 }
360
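/*
 * Illustrative numeric sketch (not from the original code): with the
 * parent at egrt = 1024 (100%), a child holding my_guarantee = 30 out of
 * a parent total_guarantee = 100 gets egrt = 1024 * 30 / 100 = 307; if
 * 50 of the child's own 100 guarantee units are unused, its megrt is
 * 307 * 50 / 100 = 153.  The helper name is hypothetical.
 */
static inline int effective_guarantee(int parent_egrt, int my_guarantee,
                                      int parent_total_guarantee)
{
        return parent_egrt * my_guarantee / parent_total_guarantee;
}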
361 /**
362  * update_effectives: update egrt, ehl, mehl for the whole tree
363  * should be called only when the class structure has changed
364  *
365  * returns -1 if anything went wrong (e.g. the structure changed during the traversal)
366  */
367 static int update_effectives(struct ckrm_core_class *root_core)
368 {
369         struct ckrm_core_class *cur_core, *child_core;
370         struct ckrm_cpu_class *cls;
371
372         cur_core = root_core;
373         child_core = NULL;
374         cls = ckrm_get_cpu_class(cur_core);
375
376         //initialize the effectives for root 
377         cls->stat.egrt = CKRM_SHARE_MAX; /*egrt of the root is always 100% */
378         cls->stat.megrt = cls->stat.egrt * cls->shares.unused_guarantee
379                 / cls->shares.total_guarantee;
380         cls->stat.ehl = CKRM_SHARE_MAX * get_hard_limit(cls)
381                 / cls->shares.total_guarantee;
382         cls->stat.mehl = cls->stat.ehl * get_myhard_limit(cls)
383                 / cls->shares.total_guarantee;
384         
385  repeat:
386         //check exit
387         if (!cur_core)
388                 return 0;
389
390         //visit this node
391         if (update_child_effective(cur_core) == -1) {
392                 return -1; //invalid cur_core node
393         }
394
395         //next child
396         child_core = ckrm_get_next_child(cur_core, child_core);
397
398         if (child_core) {
399                 //go down to the next hier
400                 cur_core = child_core;
401                 child_core = NULL;
402         } else { //no more child, go back
403                 child_core = cur_core;
404                 cur_core = child_core->hnode.parent;
405         }
406         goto repeat;
407 }
408
409 /**********************************************/
410 /*          surplus allocation                */
411 /**********************************************/
412
413 /*
414  * surplus = egrt - demand
415  * if surplus < 0, surplus = 0 
416  */
417 static inline int get_node_surplus(struct ckrm_cpu_class *cls)
418 {
419         int surplus = cls->stat.egrt - cls->stat.max_demand;
420
421         if (surplus < 0)
422                 surplus = 0;
423
424         return surplus;
425 }
426
427 static inline int get_my_node_surplus(struct ckrm_cpu_class *cls)
428 {
429         int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat);
430
431         if (surplus < 0)
432                 surplus = 0;
433
434         return surplus;
435 }
436
437 /**
438  * node_surplus_consume: consume the surplus
439  * @check_sl: if set, enforce the soft limit
440  * @total_grt: total guarantee of the classes still competing for surplus
441  * returns how much was consumed
442  *
443  * implements the CKRM scheduling requirements
444  * updates total_grt if necessary
445  */
446 static inline int node_surplus_consume(int surplus,
447                                        struct ckrm_core_class *child_core,
448                                        struct ckrm_cpu_class *p_cls,
449                                        int check_sl,
450                                        int *total_grt
451                                        )
452 {
453         int consumed = 0;
454         int inc_limit;
455         int glut = 1;
456
457         struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core);
458
459         if (! c_cls || ! *total_grt) //bail out early: the out: path would dereference c_cls and drive *total_grt negative
460                 return consumed;
461
462         /*can't consume more than demand or hard limit*/
463         if (c_cls->stat.eshare >= c_cls->stat.max_demand)
464                 goto out;
465
466         consumed =
467                 surplus * c_cls->shares.my_guarantee / *total_grt;
468
469         if (! consumed) //no more share
470                 goto out;
471
472         //hard limit and demand limit
473         inc_limit = c_cls->stat.max_demand - c_cls->stat.eshare;
474
475         if (check_sl) {
476                 int esl = p_cls->stat.eshare * get_soft_limit(c_cls)
477                         /p_cls->shares.total_guarantee;
478                 if (esl < c_cls->stat.max_demand)
479                         inc_limit = esl - c_cls->stat.eshare;
480         }
481
482
483         if (consumed > inc_limit)
484                 consumed = inc_limit;
485         else
486                 glut = 0;
487
488         c_cls->stat.eshare += consumed;
489
490  out:
491         if (glut) 
492                 *total_grt -= c_cls->shares.my_guarantee;
493
494         return consumed;
495 }
496
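/*
 * Illustrative sketch (not from the original code): each child's slice of
 * the surplus is proportional to its guarantee among the classes still
 * competing, capped by the headroom left below its demand (or, when
 * check_sl is set, its effective soft limit).  Hypothetical helper:
 */
static inline int surplus_slice(int surplus, int my_guarantee,
                                int competing_guarantee, int headroom)
{
        int slice = surplus * my_guarantee / competing_guarantee;

        return slice > headroom ? headroom : slice;
}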
497 /**
498  * alloc_surplus_node: re-allocate the shares for children under parent
499  * @parent: parent node
500  * any remaining surplus goes to the parent's own (default) share
501  *
502  * task:
503  *  1. get total surplus
504  *  2. allocate surplus
505  *  3. set the effective_share of each node
506  */
507 static void alloc_surplus_node(struct ckrm_core_class *parent)
508 {
509         int total_surplus, old_surplus;
510         struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent);
511         struct ckrm_core_class *child_core = NULL;
512         int self_share;
513         int total_grt;
514         int check_sl;
515
516         if (! p_cls) //must check before p_cls is dereferenced
517                 return;
518         total_grt = p_cls->shares.total_guarantee;
519         total_surplus = get_my_node_surplus(p_cls);
520         /*
521          * initialize effective_share
522          */
523         do {
524                 child_core = ckrm_get_next_child(parent, child_core);
525                 if (child_core) {
526                         struct ckrm_cpu_class *c_cls;
527
528                         c_cls = ckrm_get_cpu_class(child_core);
529                         if (! c_cls)
530                                 return;
531                         total_surplus += get_node_surplus(c_cls);
532
533                         set_eshare(&c_cls->stat, c_cls->stat.egrt);
534                 }
535         } while (child_core);
536
537         if (! total_surplus)
538                 goto realloc_out;
539
540         /* distribute the surplus */
541         child_core = NULL;
542         check_sl = 1;
543         old_surplus = 0;
544         do {
545                 if (!child_core) {//start a new round
546
547                         //ok, everybody reached the soft limit
548                         if (old_surplus == total_surplus) 
549                                 check_sl = 0;
550
551                         old_surplus = total_surplus;
552                 }
553
554                 child_core = ckrm_get_next_child(parent, child_core);
555                 if (child_core) 
556                         total_surplus -=
557                                 node_surplus_consume(old_surplus, child_core,
558                                                      p_cls,check_sl,&total_grt);
559                 //start a new round if something was allocated in the last round
560         } while (child_core || check_sl || total_surplus != old_surplus);
561
562  realloc_out:
563         /*how much for itself*/
564         self_share = p_cls->stat.eshare *
565             p_cls->shares.unused_guarantee / p_cls->shares.total_guarantee;
566
567         if (self_share < p_cls->stat.max_demand) {
568                 /*any remaining surplus goes to the default class*/
569                 self_share += total_surplus;    
570                 if (self_share > p_cls->stat.max_demand)
571                         self_share = p_cls->stat.max_demand;
572         }
573         
574         set_meshare(&p_cls->stat, self_share);
575 }
576
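/*
 * Illustrative sketch (not from the original code): the surplus is handed
 * out in repeated rounds; as long as a round manages to place some share
 * another round is started, and once a round with soft limits enforced
 * places nothing the soft-limit check is dropped.  The simplified helper
 * below (hypothetical, using an even split instead of the
 * guarantee-proportional split used above) shows the round structure;
 * small remainders are simply returned.
 */
static int distribute_rounds(int surplus, int *eshare, const int *cap, int n)
{
        int placed;
        int i;

        do {
                placed = 0;
                for (i = 0; i < n && surplus; i++) {
                        int room = cap[i] - eshare[i];
                        int give = surplus / n;

                        if (give > room)
                                give = room;
                        if (give > 0) {
                                eshare[i] += give;
                                surplus -= give;
                                placed += give;
                        }
                }
        } while (placed && surplus);

        return surplus; /* whatever could not be placed */
}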
577 /**
578  * alloc_surplus - reallocate unused shares
579  *
580  * a class's unused share should be allocated to its siblings
581  * the re-allocation goes downward from the top
582  */
583 static int alloc_surplus(struct ckrm_core_class *root_core)
584 {
585         struct ckrm_core_class *cur_core, *child_core;
586         struct ckrm_cpu_class *cls;
587         int ret = 0;
588
589         /*initialize*/
590         cur_core = root_core;
591         child_core = NULL;
592         cls = ckrm_get_cpu_class(cur_core);
593         set_eshare(&cls->stat, cls->stat.egrt);
594         /*the ckrm idle tasks get whatever cpu time is remaining*/
595         /*hzheng: uncomment the following line for hard limit support */
596         //      update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand);
597         
598       repeat:
599         //check exit
600         if (!cur_core)
601                 return ret;
602
603         //visit this node
604         alloc_surplus_node(cur_core);
605         //next child
606         child_core = ckrm_get_next_child(cur_core, child_core);
607         if (child_core) {
608                 //go down
609                 cur_core = child_core;
610                 child_core = NULL;
611                 goto repeat;
612         } else {                //no more child, go back
613                 child_core = cur_core;
614                 cur_core = child_core->hnode.parent;
615         }
616         goto repeat;
617 }
618
619 /**********************************************/
620 /*           CKRM Idle Tasks                  */
621 /**********************************************/
622 struct ckrm_cpu_class ckrm_idle_class_obj, *ckrm_idle_class;
623 struct task_struct* ckrm_idle_tasks[NR_CPUS];
624
625 /*how many ckrm idle tasks should we wake up*/
626 static inline int get_nr_idle(unsigned long surplus)
627 {
628         int cpu_online = cpus_weight(cpu_online_map);   
629         int nr_idle = 0; 
630         
631         nr_idle = surplus * cpu_online;
632         nr_idle >>= CKRM_SHARE_ACCURACY;
633
634         if (surplus) 
635                 nr_idle ++;
636
637         if (nr_idle > cpu_online)  
638                 nr_idle = cpu_online;
639
640         return nr_idle;
641 }
642
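/*
 * Illustrative worked example (not from the original code): with a
 * surplus of 512 (half of CKRM_SHARE_MAX) on a 4-cpu box, get_nr_idle()
 * computes 512 * 4 >> 10 = 2, bumps it to 3 because any non-zero surplus
 * adds one more idle task, and finally clamps the result to the number
 * of online cpus.
 */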
643 /**
644  * update_ckrm_idle: update the status of the idle class according to the new surplus
645  * @surplus: new system surplus
646  *
647  * Task:
648  * -- update the share of the idle class
649  * -- wake up idle tasks according to the surplus
650  */
651 void update_ckrm_idle(unsigned long surplus)
652 {
653         int nr_idle = get_nr_idle(surplus);
654         int i;
655         struct task_struct* idle_task;
656
657         set_eshare(&ckrm_idle_class->stat,surplus);
658         set_meshare(&ckrm_idle_class->stat,surplus);
659         /*wake up nr_idle idle tasks*/
660         for_each_online_cpu(i) {
661                 idle_task = ckrm_idle_tasks[i];
662                 if (! idle_task) //check before dereferencing idle_task below
663                         continue;
664                 if (unlikely(idle_task->cpu_class != ckrm_idle_class)) {
665                         ckrm_cpu_change_class(idle_task,
666                                               idle_task->cpu_class,
667                                               ckrm_idle_class);
668                 }
669                 if (i < nr_idle) {
670                         //activate it
671                         wake_up_process(idle_task);
672                 } else {
673                         //deactivate it
674                         idle_task->state = TASK_INTERRUPTIBLE;
675                         set_tsk_need_resched(idle_task);
676                 }
677         }
678 }
679
680 static int ckrm_cpu_idled(void *nothing)
681 {
682         set_user_nice(current,19);
683         daemonize("ckrm_idle_task");
684
685         //deactivate it, it will be woken up by ckrm_cpu_monitor
686         current->state = TASK_INTERRUPTIBLE;
687         schedule();             
688
689         /*similar to cpu_idle */
690         while (1) {
691                 while (!need_resched()) {
692                         ckrm_cpu_monitor();
693                         if (current_cpu_data.hlt_works_ok) {
694                                 local_irq_disable();
695                                 if (!need_resched()) {
696                                         set_tsk_need_resched(current);
697                                         safe_halt();
698                                 } else
699                                         local_irq_enable();
700                         }
701                 }
702                 schedule();             
703         }
704         return 0;
705 }
706
707 /**
708  * ckrm_start_ckrm_idle: 
709  *  create the ckrm_idle_class and starts the idle tasks
710  *
711  */
712 void ckrm_start_ckrm_idle(void)
713 {
714         int i;
715         int ret;
716         ckrm_shares_t shares;
717         
718         ckrm_idle_class = &ckrm_idle_class_obj; 
719         memset(ckrm_idle_class,0,sizeof(ckrm_idle_class_obj));
720         /*don't care about the shares */
721         init_cpu_class(ckrm_idle_class,&shares);
722         printk(KERN_INFO"ckrm idle class %x created\n",(int)ckrm_idle_class);
723         
724         for_each_online_cpu(i) {
725                 ret = kernel_thread(ckrm_cpu_idled, 0, CLONE_KERNEL);
726                 
727                 /*warn on error, but the system should still work without it*/
728                 if (ret < 0)
729                         printk(KERN_ERR"Warn: can't start ckrm idle tasks\n");
730                 else {
731                         ckrm_idle_tasks[i] = find_task_by_pid(ret);
732                         if (!ckrm_idle_tasks[i])
733                                 printk(KERN_ERR"Warn: can't find ckrm idle tasks %d\n",ret);
734                 }
735         }
736 }
737
738 /**********************************************/
739 /*          Local Weight                      */
740 /**********************************************/
741 /**
742  * adjust_lrq_weight: adjust the local weight of each per-cpu local runqueue
743  *
744  * lrq->local_weight = lrq->lrq_load * class_weight / total_pressure
745  */
746 static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online)
747 {
748         unsigned long total_pressure = 0;
749         ckrm_lrq_t* lrq;
750         int i;
751         unsigned long class_weight;
752         unsigned long long lw;  
753
754         //get total pressure
755         for_each_online_cpu(i) {
756                 lrq = get_ckrm_lrq(clsptr,i);
757                 total_pressure += lrq->lrq_load;
758         }
759
760         if (! total_pressure)
761                 return;
762         
763         class_weight = cpu_class_weight(clsptr) * cpu_online;
764
765         /*
766          * update weight for each cpu, minimum is 1
767          */
768         for_each_online_cpu(i) {
769                 lrq = get_ckrm_lrq(clsptr,i);
770                 if (! lrq->lrq_load)
771                         /*an lrq with no load gets the full class weight to boost interactivity */
772                         lw = cpu_class_weight(clsptr); 
773                 else {
774                         lw = lrq->lrq_load * class_weight;
775                         do_div(lw,total_pressure);
776                         if (!lw)
777                                 lw = 1;
778                         else if (lw > CKRM_SHARE_MAX)
779                                 lw = CKRM_SHARE_MAX;
780                 }
781                 
782                 lrq->local_weight = lw;
783         }
784 }
785
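/*
 * Illustrative sketch (not from the original code): the class weight,
 * scaled by the number of online cpus, is split across the cpus in
 * proportion to each local runqueue's share of the class's total load,
 * clamped to [1, CKRM_SHARE_MAX].  The helper name is hypothetical; the
 * real code above uses do_div() for the division.
 */
static inline unsigned long split_weight(unsigned long lrq_load,
                                         unsigned long total_load,
                                         unsigned long class_weight,
                                         int cpu_online)
{
        unsigned long long lw;

        if (!total_load)
                return class_weight;
        lw = (unsigned long long)lrq_load * class_weight * cpu_online;
        lw /= total_load;
        if (!lw)
                lw = 1;
        else if (lw > CKRM_SHARE_MAX)
                lw = CKRM_SHARE_MAX;
        return (unsigned long)lw;
}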
786 /*
787  * assume called with class_list_lock read lock held
788  */
789 void adjust_local_weight(void)
790 {
791         static spinlock_t lock = SPIN_LOCK_UNLOCKED; 
792         struct ckrm_cpu_class *clsptr;
793         int cpu_online;
794
795         //do nothing if someone is already holding the lock
796         if (! spin_trylock(&lock))
797                 return;
798
799         cpu_online = cpus_weight(cpu_online_map);       
800
801         //class status: demand, share,total_ns prio, index
802         list_for_each_entry(clsptr,&active_cpu_classes,links) {
803                 adjust_lrq_weight(clsptr,cpu_online);
804         }
805
806         spin_unlock(&lock);
807 }
808
809 /**********************************************/
810 /*          Main                              */
811 /**********************************************/
812 /**
813  *ckrm_cpu_monitor - adjust relative shares of the classes based on their progress
814  *
815  * this function is called every CPU_MONITOR_INTERVAL
816  * it computes the cpu demand of each class
817  * and re-allocate the un-used shares to other classes
818  */
819 void ckrm_cpu_monitor(void)
820 {
821         static spinlock_t lock = SPIN_LOCK_UNLOCKED; 
822         static unsigned long long last_check = 0;
823         struct ckrm_core_class *root_core = get_default_cpu_class()->core;
824         unsigned long long now; 
825 #define MIN_CPU_MONITOR_INTERVAL 100000000UL
826
827         if (!root_core)
828                 return;
829
830         //do nothing if someone is already holding the lock
831         if (! spin_trylock(&lock))
832                 return;
833
834         read_lock(&class_list_lock);
835
836         now = sched_clock();
837
838         //consecutive checks should be at least 100ms apart
839         if (now - last_check < MIN_CPU_MONITOR_INTERVAL) {
840                 goto outunlock;
841         }
842         last_check = now;
843
844
845         if (update_effectives(root_core) != 0)
846                 goto outunlock;
847         
848         if (update_max_demand(root_core) != 0)
849                 goto outunlock;
850         
851         if (alloc_surplus(root_core) != 0)
852                 goto outunlock;
853         
854         adjust_local_weight();
855  outunlock:     
856         read_unlock(&class_list_lock);
857         spin_unlock(&lock);
858 }
859
860 /*****************************************************/
861 /*            Supporting Functions                   */
862 /*****************************************************/
863 static pid_t cpu_monitor_pid = -1;
864 static int thread_exit = 0;
865
866 static int ckrm_cpu_monitord(void *nothing)
867 {
868         wait_queue_head_t wait;
869
870         init_waitqueue_head(&wait);
871
872         daemonize("ckrm_cpu_ctrld");
873         for (;;) {
874                 /*sleep for some time before the next try*/
875                 interruptible_sleep_on_timeout(&wait, CPU_MONITOR_INTERVAL);
876                 ckrm_cpu_monitor();
877                 if (thread_exit) {
878                         break;
879                 }
880         }
881         cpu_monitor_pid = -1;
882         thread_exit = 2;
883         printk("cpu_monitord exit\n");
884         return 0;
885 }
886
887 void ckrm_start_monitor(void)
888 {
889         cpu_monitor_pid = kernel_thread(ckrm_cpu_monitord, 0, CLONE_KERNEL);
890         if (cpu_monitor_pid < 0) {
891                 printk("ckrm_cpu_monitord for failed\n");
892         }
893 }
894
895 void ckrm_kill_monitor(void)
896 {
897         wait_queue_head_t wait;
898         int interval = HZ;
899         init_waitqueue_head(&wait);
900
901         printk("killing process %d\n", cpu_monitor_pid);
902         if (cpu_monitor_pid > 0) {
903                 thread_exit = 1;
904                 while (thread_exit != 2) {
905                         interruptible_sleep_on_timeout(&wait, interval);
906                 }
907         }
908 }
909
910 int ckrm_cpu_monitor_init(void)
911 {
912         ckrm_start_monitor();
913         /*hzheng: uncomment the following line for hard limit support */
914         //      ckrm_start_ckrm_idle();
915         return 0;
916 }
917
918 void ckrm_cpu_monitor_exit(void)
919 {
920         ckrm_kill_monitor();
921 }
922
923 module_init(ckrm_cpu_monitor_init);
924 module_exit(ckrm_cpu_monitor_exit);
925
926 MODULE_AUTHOR("Haoqiang Zheng <hzheng@cs.columbia.edu>");
927 MODULE_DESCRIPTION("Hierarchical CKRM CPU Resource Monitor");
928 MODULE_LICENSE("GPL");