/* ckrm_cpu_monitor.c - Hierarchical CKRM CPU Resource Monitor
 *
 * Copyright (C) Haoqiang Zheng,  IBM Corp. 2004
 *           (C) Hubertus Franke, IBM Corp. 2004
 *
 * Latest version, more details at http://ckrm.sf.net
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

/* Changes
 *
 * 23 June 2004: Created
 *
 */
#include <linux/module.h>
#include <linux/init.h>
#include <asm/errno.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/ckrm.h>
#include <linux/ckrm_rc.h>
#include <linux/ckrm_tc.h>
#include <asm/div64.h>
#include <linux/ckrm_sched.h>

#define CPU_MONITOR_INTERVAL (4*HZ) /* how often do we adjust the shares */
#define CKRM_SHARE_ACCURACY 7
#define CKRM_SHARE_MAX (1<<CKRM_SHARE_ACCURACY)
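
/*
 * Note: cpu_demand and the effective guarantee/limit/share values below are
 * kept on the 0..CKRM_SHARE_MAX (1 << 7 = 128) fixed-point scale;
 * CKRM_SHARE_MAX is what the root class starts from and the cap applied to
 * aggregated demand.
 */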

extern struct ckrm_cpu_class *ckrm_get_cpu_class(struct ckrm_core_class *core);

void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat)
{
	int i;
	struct ckrm_cpu_class_local_stat *local_stat;
	unsigned long long now = sched_clock();

	stat->stat_lock = SPIN_LOCK_UNLOCKED;
	stat->total_ns = 0;
	stat->cpu_demand = 0;

	for (i = 0; i < NR_CPUS; i++) {
		local_stat = &stat->local_stats[i];
		local_stat->run = 0;
		local_stat->total = 0;
		local_stat->last_sleep = now;
		local_stat->cpu_demand = 0;
	}

	stat->effective_guarantee = 0;
	stat->effective_limit = 0;
	stat->glut = 0;
	stat->effective_share = 100;
	stat->self_effective_share = 100;
}

/**********************************************/
/*          cpu demand                        */
/**********************************************/

/*
 * How CPU demand is calculated:
 * consider the class local runqueue (clr) first
 * at any time, a clr can be in one of the following three states
 * -- run: a task belonging to this class is running on this cpu
 * -- wait: at least one of its tasks is runnable, but the class is not running
 * -- sleep: none of the tasks of this class is runnable
 *
 * cpu_demand(t1,t2) = r(t1,t2)/(r(t1,t2)+s(t1,t2))
 *
 * the cpu_demand of a class =
 *    sum of the cpu_demand of all its class local runqueues
 */
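
/*
 * For example (illustrative numbers only): a class local runqueue that over
 * some window ran for 250ms and slept for 750ms has
 * cpu_demand = 250/(250+750) = 0.25 of that cpu, i.e. 32 on the
 * 0..CKRM_SHARE_MAX (128) fixed-point scale used below.
 */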

/*
 * Updating cpu demand on a state change:
 *
 * the local demand should be updated whenever the state of a local queue
 * changes
 * -- on deschedule: report how long the class has run
 * -- on enqueue: report how long the class has slept
 *
 * to deal with excessively long run/sleep states
 * -- whenever ckrm_cpu_monitor is called, check whether the class is in the
 *    sleep state; if so, update its sleep record (see cpu_demand_check_sleep)
 */
#define CKRM_CPU_DEMAND_RUN 0
#define CKRM_CPU_DEMAND_SLEEP 1
//how often should we recalculate the cpu demand, in ns
#define CPU_DEMAND_CAL_THRESHOLD (1000000000LL)
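
/*
 * Arithmetic note (explanatory addition): update_local_cpu_demand()
 * accumulates run and total time; once total reaches
 * CPU_DEMAND_CAL_THRESHOLD (1s) it folds the sample into cpu_demand.
 * total is pre-shifted by CKRM_SHARE_ACCURACY (and clamped to fit the
 * 32-bit divisor of do_div()), so run/(total >> 7) approximates
 * (run << 7)/total, i.e. the run fraction scaled to 0..CKRM_SHARE_MAX;
 * the result is then averaged 50/50 into the previous cpu_demand.
 */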
static inline void update_local_cpu_demand(struct ckrm_cpu_class_local_stat *local_stat, int state, unsigned long long len)
{
	local_stat->total += len;
	if (state == CKRM_CPU_DEMAND_RUN)
		local_stat->run += len;

	if (local_stat->total >= CPU_DEMAND_CAL_THRESHOLD) {
		local_stat->total >>= CKRM_SHARE_ACCURACY;
		if (local_stat->total > 0xFFFFFFFF)
			local_stat->total = 0xFFFFFFFF;

		do_div(local_stat->run, (unsigned long)local_stat->total);
		local_stat->cpu_demand += local_stat->run;
		local_stat->cpu_demand >>= 1;
		local_stat->total = 0;
		local_stat->run = 0;
	}
}

static inline void cpu_demand_update_run(struct ckrm_cpu_class_local_stat *local_stat, unsigned long long len)
{
	update_local_cpu_demand(local_stat, CKRM_CPU_DEMAND_RUN, len);
}

static inline void cpu_demand_update_sleep(struct ckrm_cpu_class_local_stat *local_stat, unsigned long long len)
{
	update_local_cpu_demand(local_stat, CKRM_CPU_DEMAND_SLEEP, len);
}

#define CPU_DEMAND_ENQUEUE 0
#define CPU_DEMAND_DEQUEUE 1
#define CPU_DEMAND_DESCHEDULE 2

/**
 * cpu_demand_event - a cpu_demand event occurred
 * @event: one of the following three events:
 *   CPU_DEMAND_ENQUEUE: local class enqueue
 *   CPU_DEMAND_DEQUEUE: local class dequeue
 *   CPU_DEMAND_DESCHEDULE: a task belonging to this local class is descheduled
 * @len: valid only for CPU_DEMAND_DESCHEDULE, how long the task has run
 */
void cpu_demand_event(struct ckrm_cpu_class_local_stat *local_stat, int event, unsigned long long len)
{
	switch (event) {
	case CPU_DEMAND_ENQUEUE:
		len = sched_clock() - local_stat->last_sleep;
		local_stat->last_sleep = 0;
		cpu_demand_update_sleep(local_stat, len);
		break;
	case CPU_DEMAND_DEQUEUE:
		local_stat->last_sleep = sched_clock();
		break;
	case CPU_DEMAND_DESCHEDULE:
		cpu_demand_update_run(local_stat, len);
		break;
	default:
		BUG();
	}
}
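
/*
 * Illustrative sketch only (nothing in this file calls these): how a
 * scheduler hook could feed cpu_demand_event().  The local stat lookup
 * (&cls->stat.local_stats[cpu]) follows ckrm_cpu_stat_init() above; the
 * hook names themselves are hypothetical.
 */
static inline void example_report_enqueue(struct ckrm_cpu_class *cls, int cpu)
{
	/* the class local runqueue became runnable: close the sleep period */
	cpu_demand_event(&cls->stat.local_stats[cpu], CPU_DEMAND_ENQUEUE, 0);
}

static inline void example_report_dequeue(struct ckrm_cpu_class *cls, int cpu)
{
	/* the class local runqueue went empty: start the sleep clock */
	cpu_demand_event(&cls->stat.local_stats[cpu], CPU_DEMAND_DEQUEUE, 0);
}

static inline void example_report_deschedule(struct ckrm_cpu_class *cls, int cpu,
					     unsigned long long ran_ns)
{
	/* a task of this class came off the cpu after running for ran_ns */
	cpu_demand_event(&cls->stat.local_stats[cpu], CPU_DEMAND_DESCHEDULE, ran_ns);
}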

/**
 * cpu_demand_check_sleep - account outstanding sleep time
 *
 * check the class local queue on this cpu: if it is currently in the sleep
 * state (last_sleep is set), account the time slept since last_sleep and
 * restart the sleep period from now
 */
static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu)
{
	struct ckrm_cpu_class_local_stat *local_stat = &stat->local_stats[cpu];
	unsigned long long sleep, now;
	if (local_stat->last_sleep) {
		now = sched_clock();
		sleep = now - local_stat->last_sleep;
		local_stat->last_sleep = now;
		cpu_demand_update_sleep(local_stat, sleep);
	}
}

/**
 * get_self_cpu_demand - get cpu demand of the class itself (excluding children)
 *
 * self_cpu_demand = sum(cpu demand of all local queues)
 */
static unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat)
{
	int cpu_demand = 0;
	int i;

	for_each_online_cpu(i) {
		cpu_demand_check_sleep(stat, i);
		cpu_demand += stat->local_stats[i].cpu_demand;
	}

	if (cpu_demand > CKRM_SHARE_MAX)
		cpu_demand = CKRM_SHARE_MAX;
	return cpu_demand;
}

/*
 * update the aggregated cpu demand of each class in the hierarchy
 * assumes root_core->parent == NULL
 */
static void update_cpu_demand(struct ckrm_core_class *root_core)
{
	struct ckrm_core_class *cur_core, *child_core;
	struct ckrm_cpu_class *cls;

	cur_core = root_core;
	child_core = NULL;
	/*
	 * iterate the tree
	 * update cpu_demand of each node
	 */
      repeat:
	if (!cur_core)
		return;

	cls = ckrm_get_cpu_class(cur_core);
	if (!child_core)	//first visit: start from the self demand
		cls->stat.cpu_demand = get_self_cpu_demand(&cls->stat);
	else {			//coming back up: add the child's demand
		cls->stat.cpu_demand +=
		    ckrm_get_cpu_class(child_core)->stat.cpu_demand;
		if (cls->stat.cpu_demand > CKRM_SHARE_MAX)
			cls->stat.cpu_demand = CKRM_SHARE_MAX;
	}

	//next child
	child_core = ckrm_get_next_child(cur_core, child_core);
	if (child_core) {
		//go down
		cur_core = child_core;
		child_core = NULL;
		goto repeat;
	} else {		//no more child, go back
		child_core = cur_core;
		cur_core = child_core->hnode.parent;
	}
	goto repeat;
}
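
/*
 * Note on the traversal above: update_cpu_demand(), and the other tree
 * walkers below (update_effective_guarantee_limit(), alloc_surplus()), use
 * the same goto-based depth-first walk: child_core == NULL means we are
 * going down into cur_core, a non-NULL child_core means we have just come
 * back up from that child, and ckrm_get_next_child() supplies the next
 * sibling to visit.
 */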

/**********************************************/
/*          effective guarantee & limit       */
/**********************************************/
static inline void set_effective_share(struct ckrm_cpu_class_stat *stat,
				       int new_share)
{
	if (!new_share)
		new_share = 1;
	stat->effective_share = new_share;
}

static inline void set_self_effective_share(struct ckrm_cpu_class_stat *stat,
					    int new_share)
{
	if (!new_share)
		new_share = 1;
	stat->self_effective_share = new_share;
}

static inline void update_child_effective(struct ckrm_core_class *parent)
{
	struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent);
	struct ckrm_core_class *child_core = ckrm_get_next_child(parent, NULL);

	while (child_core) {
		struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core);

		c_cls->stat.effective_guarantee =
		    p_cls->stat.effective_guarantee *
		    c_cls->shares.my_guarantee / p_cls->shares.total_guarantee;
		c_cls->stat.effective_limit =
		    p_cls->stat.effective_guarantee * c_cls->shares.my_limit /
		    p_cls->shares.total_guarantee;

		child_core = ckrm_get_next_child(parent, child_core);
	}
}
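
/*
 * Worked example (illustrative numbers only): if the parent's
 * effective_guarantee is 128 (CKRM_SHARE_MAX) and its total_guarantee is
 * 100, a child with my_guarantee = 30 and my_limit = 50 gets
 * effective_guarantee = 128 * 30 / 100 = 38 and
 * effective_limit = 128 * 50 / 100 = 64 (integer arithmetic).
 */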

/*
 * update effective guarantee and effective limit
 * -- effective_guarantee = parent's effective_guarantee * my_guarantee / parent's total_guarantee
 * -- effective_limit = parent's effective_guarantee * my_limit / parent's total_guarantee
 * should be called only when the class structure has changed
 */
static void update_effective_guarantee_limit(struct ckrm_core_class *root_core)
{
	struct ckrm_core_class *cur_core, *child_core = NULL;
	struct ckrm_cpu_class *cls;

	cur_core = root_core;
	cls = ckrm_get_cpu_class(cur_core);
	cls->stat.effective_guarantee = CKRM_SHARE_MAX;
	cls->stat.effective_limit = cls->stat.effective_guarantee;

      repeat:
	//check exit
	if (!cur_core)
		return;

	//visit this node
	update_child_effective(cur_core);
	//next child
	child_core = ckrm_get_next_child(cur_core, child_core);
	if (child_core) {
		//go down
		cur_core = child_core;
		child_core = NULL;
		goto repeat;
	} else {		//no more child, go back
		child_core = cur_core;
		cur_core = child_core->hnode.parent;
	}
	goto repeat;
}

/**********************************************/
/*          surplus allocation                */
/**********************************************/

/*
 * surplus = effective_guarantee - cpu_demand
 * if surplus < 0, surplus = 0
 */
static inline int get_node_surplus(struct ckrm_cpu_class *cls)
{
	int surplus = cls->stat.effective_guarantee - cls->stat.cpu_demand;

	if (surplus < 0)
		surplus = 0;

	return surplus;
}

/*
 * consume a portion of the parent's surplus
 * returns how much was consumed
 * sets the glut flag when the class cannot take any more
 */
static inline int node_surplus_consume(int old_surplus,
				       struct ckrm_core_class *child_core,
				       struct ckrm_cpu_class *p_cls)
{
	int consumed = 0;
	int inc_limit;

	struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core);

	if (c_cls->stat.glut)
		goto out;

	//check demand
	if (c_cls->stat.effective_share >= c_cls->stat.cpu_demand) {
		c_cls->stat.glut = 1;
		goto out;
	}

	consumed =
	    old_surplus * c_cls->shares.my_guarantee /
	    p_cls->shares.total_guarantee;

	//check limit
	inc_limit = c_cls->stat.effective_limit - c_cls->stat.effective_share;
	if (inc_limit <= consumed) {
		c_cls->stat.glut = 1;
		consumed = inc_limit;
	}

	c_cls->stat.effective_share += consumed;
      out:
	return consumed;
}
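
/*
 * Worked example (illustrative numbers only): with old_surplus = 20, a
 * child holding my_guarantee = 30 out of the parent's total_guarantee = 100
 * is offered 20 * 30 / 100 = 6 extra share units; if its effective_limit
 * leaves room for only 4 more, it takes 4 and its glut flag is set so later
 * rounds skip it.
 */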

/*
 * re-allocate the shares for all the children under this node
 * tasks:
 *  1. get the total surplus
 *  2. allocate the surplus
 *  3. set the effective_share of each node
 */
static void alloc_surplus_node(struct ckrm_core_class *parent)
{
	int total_surplus = 0, old_surplus = 0;
	struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent);
	struct ckrm_core_class *child_core = NULL;
	int self_share;

	/*
	 * calculate surplus
	 * total_surplus = sum(child_surplus)
	 * reset glut flag
	 * initialize effective_share
	 */
	do {
		child_core = ckrm_get_next_child(parent, child_core);
		if (child_core) {
			struct ckrm_cpu_class *c_cls =
			    ckrm_get_cpu_class(child_core);
			ckrm_stat_t *stat = &c_cls->stat;

			total_surplus += get_node_surplus(c_cls);
			stat->glut = 0;
			set_effective_share(stat, stat->effective_guarantee);
		}
	} while (child_core);

	/* distribute the surplus */
	child_core = NULL;
	do {
		if (!child_core)	//remember the surplus at the start of this round
			old_surplus = total_surplus;

		child_core = ckrm_get_next_child(parent, child_core);
		if (child_core) {
			total_surplus -=
			    node_surplus_consume(old_surplus, child_core,
						 p_cls);
		}
		//start a new round if something was allocated in the last round
	} while (child_core || (total_surplus != old_surplus));

	//any remaining surplus goes to the default class
	self_share = p_cls->stat.effective_share *
	    p_cls->shares.unused_guarantee / p_cls->shares.total_guarantee;
	self_share += total_surplus;

	set_self_effective_share(&p_cls->stat, self_share);
}

/**
 * alloc_surplus - reallocate unused shares
 *
 * a class's unused share should be allocated to its siblings
 */
static void alloc_surplus(struct ckrm_core_class *root_core)
{
	struct ckrm_core_class *cur_core, *child_core = NULL;
	struct ckrm_cpu_class *cls;

	cur_core = root_core;
	cls = ckrm_get_cpu_class(cur_core);
	cls->stat.glut = 0;
	set_effective_share(&cls->stat, cls->stat.effective_guarantee);
      repeat:
	//check exit
	if (!cur_core)
		return;

	//visit this node
	alloc_surplus_node(cur_core);
	//next child
	child_core = ckrm_get_next_child(cur_core, child_core);
	if (child_core) {
		//go down
		cur_core = child_core;
		child_core = NULL;
		goto repeat;
	} else {		//no more child, go back
		child_core = cur_core;
		cur_core = child_core->hnode.parent;
	}
	goto repeat;
}

/**
 * ckrm_cpu_monitor - adjust relative shares of the classes based on their progress
 *
 * this function is called every CPU_MONITOR_INTERVAL
 * it computes the cpu demand of each class
 * and re-allocates the unused shares to other classes
 */
void ckrm_cpu_monitor(void)
{
	struct ckrm_core_class *root_core = default_cpu_class->core;
	if (!root_core)
		return;

	update_effective_guarantee_limit(root_core);
	update_cpu_demand(root_core);
	alloc_surplus(root_core);
}

/*****************************************************/
/*            Supporting Functions                   */
/*****************************************************/
static pid_t cpu_monitor_pid = -1;
static int thread_exit = 0;

static int ckrm_cpu_monitord(void *nothing)
{
	wait_queue_head_t wait;

	init_waitqueue_head(&wait);

	daemonize("ckrm_cpu_ctrld");
	for (;;) {
		/* sleep for some time before the next try */
		interruptible_sleep_on_timeout(&wait, CPU_MONITOR_INTERVAL);
		ckrm_cpu_monitor();
		if (thread_exit) {
			break;
		}
	}
	cpu_monitor_pid = -1;
	thread_exit = 2;
	printk(KERN_INFO "cpu_monitord exit\n");
	return 0;
}

void ckrm_start_monitor(void)
{
	cpu_monitor_pid = kernel_thread(ckrm_cpu_monitord, NULL, CLONE_KERNEL);
	if (cpu_monitor_pid < 0) {
		printk(KERN_ERR "ckrm_cpu_monitord failed to start\n");
	}
}

void ckrm_kill_monitor(void)
{
	wait_queue_head_t wait;
	int interval = HZ;
	init_waitqueue_head(&wait);

	printk(KERN_INFO "killing process %d\n", cpu_monitor_pid);
	if (cpu_monitor_pid > 0) {
		thread_exit = 1;
		while (thread_exit != 2) {
			interruptible_sleep_on_timeout(&wait, interval);
		}
	}
}

int ckrm_cpu_monitor_init(void)
{
	ckrm_start_monitor();
	return 0;
}

void ckrm_cpu_monitor_exit(void)
{
	ckrm_kill_monitor();
}

module_init(ckrm_cpu_monitor_init);
module_exit(ckrm_cpu_monitor_exit);

MODULE_AUTHOR("Haoqiang Zheng <hzheng@cs.columbia.edu>");
MODULE_DESCRIPTION("Hierarchical CKRM CPU Resource Monitor");
MODULE_LICENSE("GPL");