/* ckrm_cpu_monitor.c - Hierarchical CKRM CPU Resource Monitor
 *
 * Copyright (C) Haoqiang Zheng,  IBM Corp. 2004
 *           (C) Hubertus Franke, IBM Corp. 2004
 *
 * Latest version, more details at http://ckrm.sf.net
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

/* Changes
 *
 * 23 June 2004: Created
 *
 */
#include <linux/module.h>
#include <linux/init.h>
#include <asm/errno.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/ckrm.h>
#include <linux/ckrm_rc.h>
#include <linux/ckrm_tc.h>
#include <asm/div64.h>
#include <linux/ckrm_sched.h>

// #define CONFIG_CKRM_SUPPORT_MAXLIMITS

#define CPU_MONITOR_INTERVAL (HZ) /*how often do we adjust the shares*/

#define CKRM_CPU_DEMAND_RUN 0
#define CKRM_CPU_DEMAND_SLEEP 1
//sample task cpu demand every 32ms
#define CPU_DEMAND_TASK_RECALC  ( 32*1000*1000LL)
#define CPU_DEMAND_CLASS_RECALC (256*1000*1000LL)
#define CPU_DEMAND_TP_CLASS 0
#define CPU_DEMAND_TP_TASK 1

static void update_ckrm_idle(unsigned long surplus);

void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu);
int alloc_surplus(struct ckrm_core_class *root_core);
extern struct ckrm_cpu_class *ckrm_get_cpu_class(struct ckrm_core_class *core);

/*interface to the share definitions*/
static inline int get_my_grt(struct ckrm_cpu_class *cls)
{
        return cls->shares.unused_guarantee;
}

static inline int get_soft_limit(struct ckrm_cpu_class *cls)
{
        return cls->shares.my_limit;
}

static inline int get_mysoft_limit(struct ckrm_cpu_class *cls)
{
        return cls->shares.total_guarantee;
}

static inline int get_hard_limit(struct ckrm_cpu_class *cls)
{
        return cls->shares.total_guarantee;
}

static inline int get_myhard_limit(struct ckrm_cpu_class *cls)
{
        return cls->shares.total_guarantee;
}

static inline void set_eshare(struct ckrm_cpu_class_stat *stat,
                              int new_share)
{
        if (!new_share)
                new_share = 1;

        BUG_ON(new_share < 0);
        stat->eshare = new_share;
}

static inline void set_meshare(struct ckrm_cpu_class_stat *stat,
                               int new_share)
{
        if (!new_share)
                new_share = 1;

        BUG_ON(new_share < 0);
        stat->meshare = new_share;
}

/**
 * get_self_cpu_demand - get the cpu demand of the class itself (excluding children)
 *
 * self_cpu_demand = average cpu demand over all of the class's local queues
 */
static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat)
{
        int cpu_demand = 0;
        int i;
        int cpuonline = 0;

        for_each_online_cpu(i) {
                cpu_demand_check_sleep(stat,i);
                cpu_demand += stat->local_stats[i].cpu_demand;
                cpuonline ++;
        }

        return (cpu_demand/cpuonline);
}
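
/*
 * Worked example (illustrative numbers only): with two online CPUs whose
 * per-CPU cpu_demand values are 60 and 20 (in CKRM share units), the loop
 * above yields (60 + 20) / 2 = 40, i.e. the class's own demand is the
 * average of its local-queue demands, not their sum.
 */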

/*
 * my max demand = min(cpu_demand, my effective hard limit)
 */
static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat)
{
        unsigned long mmax_demand = get_self_cpu_demand(stat);
        if (mmax_demand > stat->mehl)
                mmax_demand = stat->mehl;

        return mmax_demand;
}

static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, int type)
{
        unsigned long long now = sched_clock();

        local_stat->run = 0;
        local_stat->total = 0;
        local_stat->last_sleep = now;
        switch (type) {
        case CPU_DEMAND_TP_CLASS:
                local_stat->recalc_interval = CPU_DEMAND_CLASS_RECALC;
                local_stat->cpu_demand = 0;
                break;
        case CPU_DEMAND_TP_TASK:
                local_stat->recalc_interval = CPU_DEMAND_TASK_RECALC;
                //for a task, the initial cpu_demand is copied from its parent
                break;
        default:
                BUG();
        }
}

void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat, int eshares)
{
        int i;

        stat->stat_lock = SPIN_LOCK_UNLOCKED;
        stat->total_ns = 0;
        stat->max_demand = 0;

        for (i=0; i<NR_CPUS; i++) {
                cpu_demand_stat_init(&stat->local_stats[i],CPU_DEMAND_TP_CLASS);
        }

        stat->egrt = 0;
        stat->megrt = 0;
        stat->ehl = CKRM_SHARE_MAX; /*default: no limit*/
        stat->mehl = CKRM_SHARE_MAX; /*default: no limit */

        stat->eshare = eshares;
        stat->meshare = eshares;

        stat->has_savings = 0;
        stat->demand_per_share = 0;

}

#if 0  // keep handy for debugging if necessary
void ckrm_cpu_class_dump(struct ckrm_cpu_class *clsptr,int num)
{
        struct ckrm_cpu_class_stat* stat = &clsptr->stat;
        printk("%d> %p[%d] mg=%d lim=%d tg=%d maxlim=%d ug=%d\n",num,
                clsptr, (clsptr == get_default_cpu_class()),
                clsptr->shares.my_guarantee,
                clsptr->shares.my_limit,
                clsptr->shares.total_guarantee,
                clsptr->shares.max_limit,
                clsptr->shares.unused_guarantee);
        printk("      egrt=%d megrt=%d ehl=%d mehl=%d esh=%d mesh=%d\n",
                stat->egrt,stat->megrt,stat->ehl,stat->mehl,
                stat->eshare,stat->meshare);
}
#endif

/**********************************************/
/*          surplus allocation                */
/**********************************************/

/*
 * surplus = egrt - demand
 * if surplus < 0, surplus = 0
 */
static inline int get_node_surplus(struct ckrm_cpu_class *cls)
{
        int surplus = cls->stat.egrt - cls->stat.max_demand;

        if (surplus < 0)
                surplus = 0;

        return surplus;
}

/*
 * consume savings in advance because this class gives surplus to others
 * this is a quick hack; it should be integrated with balance_savings()
 */
static inline void consumed_surplus_savings(struct ckrm_cpu_class *clsptr,
                                            int savings_consumed)
{
        long long total_savings;
        ckrm_lrq_t* lrq;
        int i;
        int cpu_online = 0;

        total_savings = 0;
        for_each_online_cpu(i) {
                lrq = get_ckrm_lrq(clsptr,i);
                total_savings += lrq->savings;
                cpu_online ++;
        }

        total_savings -= savings_consumed;
        if (total_savings < 0)
                total_savings = 0;

        //get the average savings
        do_div(total_savings,cpu_online);
        for_each_online_cpu(i) {
                lrq = get_ckrm_lrq(clsptr,i);
                lrq->savings = total_savings;
        }
}

static inline int get_my_node_surplus(struct ckrm_cpu_class *cls)
{
        int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat);
        int savings_consumed;

        if (surplus < 0)
                surplus = 0;

        /*
         * a quick hack for the hierarchical savings distribution;
         * it may not be the right way to do it
         *
         * since this node gives its surplus to other nodes,
         * its savings should be consumed
         * assuming CPU_MONITOR_INTERVAL = (HZ),
         * savings_consumed is roughly how much savings will be consumed
         * over the next second
         */
        if (surplus) {
                savings_consumed = surplus * HZ * (NSEC_PER_MS >> CKRM_SHARE_SHIFT);
                consumed_surplus_savings(cls, savings_consumed) ;
        }

        return surplus;
}
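
/*
 * Rough reading of savings_consumed above (added sketch; it assumes one full
 * CPU corresponds to (1 << CKRM_SHARE_SHIFT) share units and that one tick
 * is about one millisecond): surplus * (NSEC_PER_MS >> CKRM_SHARE_SHIFT) is
 * approximately the donated CPU time in nanoseconds per millisecond, so the
 * extra factor of HZ scales it to about one CPU_MONITOR_INTERVAL, i.e. the
 * savings expected to be consumed over the next second.
 */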

/*
 * all the classes in the queue consume the surplus in order
 * each class consumes an amount proportional to its egrt
 */
static int consume_surplus_in_order(struct list_head* queue,
                                    struct ckrm_cpu_class *p_cls,
                                    int total_surplus)
{
        int total_grt = 0;
        struct ckrm_cpu_class *clsptr;

        /*
         * get total_grt of the classes in the queue
         * total_grt could be maintained instead of re-calculated each time
         */
        list_for_each_entry(clsptr,queue,surplus_queue) {
                if (unlikely(clsptr == p_cls))
                        total_grt += clsptr->stat.megrt;
                else
                        total_grt += clsptr->stat.egrt;
        }

        if (! total_grt)
                goto consume_out;

        //allocate in order
        list_for_each_entry(clsptr,queue,surplus_queue) {
                int surplus_per_share;
                int consumed, my_grt;

                BUG_ON(! total_grt);
                surplus_per_share =
                        (total_surplus << CKRM_SHARE_SHIFT) / total_grt;

                if (surplus_per_share <= 0)
                        break;

                if (unlikely(clsptr == p_cls))  //self node consuming
                        my_grt =  clsptr->stat.megrt;
                else
                        my_grt = clsptr->stat.egrt;

                BUG_ON(clsptr->stat.demand_per_share <= 0);

                if (clsptr->stat.demand_per_share < surplus_per_share)
                        surplus_per_share = clsptr->stat.demand_per_share;

                consumed = surplus_per_share * my_grt;
                consumed >>= CKRM_SHARE_SHIFT;
                total_surplus -= consumed;
                BUG_ON(total_surplus < 0);
                total_grt -= my_grt;

                if (unlikely(clsptr == p_cls))
                        set_meshare(&clsptr->stat,clsptr->stat.meshare + consumed);
                else
                        set_eshare(&clsptr->stat,clsptr->stat.eshare + consumed);
        }
 consume_out:
        if (total_surplus <= 1) //if total_surplus is too small, no need to allocate again
                total_surplus = 0;
        return total_surplus;
}
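
/*
 * Worked example (illustrative numbers only, ignoring the demand_per_share
 * cap): total_surplus = 100, the queue holds A (egrt 300) and B (egrt 100),
 * so total_grt = 400.  First iteration: surplus_per_share is 100/400 of a
 * share, A consumes 100 * 300 / 400 = 75, leaving total_surplus = 25 and
 * total_grt = 100.  Second iteration: surplus_per_share is 25/100 of a
 * share, B consumes 25 * 100 / 100 = 25.  Each class ends up with surplus
 * proportional to its guarantee, and recomputing surplus_per_share every
 * iteration keeps that property even when an earlier class is capped by its
 * own demand.
 */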

/*
 * link all the children of the parent and the parent itself using their surplus_queue field
 * link the whole queue using src_queue
 * if anything goes wrong, return -1
 */
static int get_class_surplus_queue(struct ckrm_core_class *parent,
                                   struct list_head* src_queue)
{
        struct ckrm_core_class *child_core = NULL;
        struct ckrm_cpu_class *p_cls,*c_cls;
        int ret = -1;

        p_cls = ckrm_get_cpu_class(parent);
        if (! p_cls)
                goto link_out;

        INIT_LIST_HEAD(src_queue);

        //add the parent node itself
        list_add(&p_cls->surplus_queue,src_queue);
        do {
                child_core = ckrm_get_next_child(parent, child_core);
                if (child_core) {
                        c_cls = ckrm_get_cpu_class(child_core);
                        if (! c_cls)
                                goto link_out;
                        list_add(&c_cls->surplus_queue,src_queue);
                }
        } while (child_core);

        ret = 0;

 link_out:
        return ret;
}

/*
 * insert the class into the queue based on stat->demand_per_share
 * status: tested
 */
static void insert_surplus_queue(struct list_head* queue, struct ckrm_cpu_class *clsptr)
{
        struct ckrm_cpu_class *cur_cls = NULL;
        int end_of_queue = 1;

        list_for_each_entry(cur_cls,queue,surplus_queue) {
                if (cur_cls->stat.demand_per_share >= clsptr->stat.demand_per_share) {
                        end_of_queue = 0;
                        break;
                }
        }

        //insert the clsptr
        if (! cur_cls || end_of_queue)
                list_add_tail(&clsptr->surplus_queue,queue);
        else
                list_add_tail(&clsptr->surplus_queue,&cur_cls->surplus_queue);
}

/*
 * copy all classes in src_queue to dst_queue,
 * reordering the classes based on their normalized demand
 * if a class is already saturated (eshare >= demand), also remove it from src_queue
 * the return value is currently always 0 and unused
 *
 * @src_queue: source queue
 * @dst_queue: destination queue
 * @check_sl: check soft limit
 * @check_savings: only classes that have savings should be considered
 */

static unsigned long reorder_surplus_queue(struct list_head* src_queue,
                                           struct list_head* dst_queue,
                                           int check_sl, int check_savings,
                                           struct ckrm_cpu_class *p_cls)
{
        struct ckrm_cpu_class *clsptr, *tmp;

        INIT_LIST_HEAD(dst_queue);

        list_for_each_entry_safe(clsptr,tmp,src_queue,surplus_queue) {
                struct ckrm_cpu_class_stat* stat = &clsptr->stat;
                int inc_limit;
                int max_demand, eshare, esl, grt;

                if (unlikely(clsptr == p_cls)) {
                        max_demand = get_mmax_demand(stat);
                        eshare  = stat->meshare;
                        esl = get_mysoft_limit(clsptr);
                        grt = stat->megrt;
                } else {
                        max_demand = stat->max_demand;
                        eshare = stat->eshare;
                        esl = get_soft_limit(clsptr);
                        grt = stat->egrt;
                }

                //hard limit and demand limit
                inc_limit = max_demand - eshare;

                //no additional share needed
                if (inc_limit <= 0 || ! grt) {
                        list_del(&clsptr->surplus_queue);
                        continue;
                }

                //or no more savings
                if (check_savings && ! stat->has_savings)
                        continue;

                //check soft limit
                if (check_sl) {
                        int soft_limit;

                        soft_limit = p_cls->stat.eshare * esl
                                / p_cls->shares.total_guarantee;

                        if (soft_limit < max_demand)
                                inc_limit = soft_limit - eshare;
                        if ( inc_limit <= 0)   /* can turn negative */
                                continue;
                }

                BUG_ON(! grt);
                //get the stat->demand_per_share
                stat->demand_per_share =
                        (inc_limit << CKRM_SHARE_SHIFT) / grt;

                list_del_init(&clsptr->surplus_queue);
                //insert the class into the queue
                insert_surplus_queue(dst_queue,clsptr);
        }
        return 0;
}

/*
 * get all the surplus that should be reallocated to the children
 */
static inline int get_total_surplus(struct ckrm_cpu_class *p_cls,
                                    struct ckrm_core_class *parent)
{
        struct ckrm_cpu_class *c_cls;
        int total_surplus;
        struct ckrm_core_class *child_core = NULL;

        //additional share assigned to this sub node from the parent
        total_surplus = p_cls->stat.eshare - p_cls->stat.egrt;
        BUG_ON(total_surplus < 0);

        //surplus of this node
        total_surplus += get_my_node_surplus(p_cls);
        do {
                child_core = ckrm_get_next_child(parent, child_core);
                if (child_core) {
                        c_cls = ckrm_get_cpu_class(child_core);
                        if (! c_cls) {
                                total_surplus = 0;
                                break;
                        }

                        total_surplus += get_node_surplus(c_cls);
                }
        } while (child_core);

        return total_surplus;
}
/**
 * alloc_surplus_node: re-allocate the shares for a single level
 * @parent: parent node
 * return the remaining surplus
 *
 * The surplus reallocation policy is as follows.
 * -- classes that have eshare >= demand don't need any additional share,
 *    so they don't participate in the surplus allocation.
 * -- all the other classes receive shares in this order:
 * 1. has savings, not over soft limit
 * 2. has savings, but over soft limit
 * 3. no savings, not over soft limit
 * 4. no savings, over soft limit
 *
 * In each of the 4 levels above, classes get surplus proportionally to their guarantees
 */
static int alloc_surplus_node(struct ckrm_core_class *parent)
{
        struct ckrm_cpu_class *p_cls;
        int total_surplus;
        int ret = -1;
        struct list_head src_queue, dst_queue;

        p_cls = ckrm_get_cpu_class(parent);
        if (! p_cls) //safety check
                goto realloc_out;

        ret = 0;
        total_surplus = get_total_surplus(p_cls,parent);

        if (! total_surplus) //no surplus to be allocated
                goto realloc_out;

        /*
         * first round, allocate to classes with savings, check_sl
         */
        get_class_surplus_queue(parent,&src_queue);
        reorder_surplus_queue(&src_queue, &dst_queue, 1, 1,p_cls);
        if (! list_empty(&dst_queue)) {
                total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus);
                if (! total_surplus)
                        goto realloc_out;
        }

        /*
         * second round, check savings, but no check_sl
         */
        //merge the src_queue and dst_queue and reorder
        list_splice(&dst_queue, &src_queue);
        reorder_surplus_queue(&src_queue, &dst_queue, 0, 1,p_cls);
        if (! list_empty(&dst_queue)) {
                total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus);
                if (! total_surplus)
                        goto realloc_out;
        }

        /*
         * third round, no check savings, but check_sl
         */
        //merge the src_queue and dst_queue and reorder
        list_splice(&dst_queue, &src_queue);
        reorder_surplus_queue(&src_queue, &dst_queue, 1, 0,p_cls);
        if (! list_empty(&dst_queue)) {
                total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus);
                if (! total_surplus)
                        goto realloc_out;
        }
        /*
         * fourth round, no check savings, no check_sl
         */
        //merge the src_queue and dst_queue and reorder
        list_splice(&dst_queue, &src_queue);
        reorder_surplus_queue(&src_queue, &dst_queue, 0, 0,p_cls);
        if (! list_empty(&dst_queue))
                total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus);

 realloc_out:
        return ret;
}
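
/*
 * Illustrative walk-through of the four rounds above (made-up classes):
 * suppose the parent has two children, C1 with savings and below its soft
 * limit, and C2 with no savings and already over its soft limit.  Round 1
 * (check_sl=1, check_savings=1) only admits C1, so it gets first call on the
 * surplus; C2 is skipped in rounds 1-3 (it fails the savings check in rounds
 * 1-2 and the soft-limit check in round 3) and only receives whatever is
 * still left in round 4.
 */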

/*
 * return true if the class's total savings > CLASS_MIN_SAVINGS
 */
static int balance_local_savings(struct ckrm_cpu_class *clsptr, int cpu_online)
{
        unsigned long long total_savings;
        ckrm_lrq_t* lrq;
        int i;
#define CLASS_MIN_SAVINGS (10 * NSEC_PER_MS)

        total_savings = 0;
        for_each_online_cpu(i) {
                lrq = get_ckrm_lrq(clsptr,i);
                total_savings += lrq->savings;
        }

        if (total_savings < CLASS_MIN_SAVINGS)
                return 0;

        //get the average savings
        do_div(total_savings,cpu_online);
        for_each_online_cpu(i) {
                lrq = get_ckrm_lrq(clsptr,i);
                lrq->savings = total_savings;
        }

        /*
         * hzheng: this is another quick hack
         * only report savings when this node itself has more demand,
         * ignoring the requirements of child classes
         */
        if (clsptr->stat.megrt < get_mmax_demand(&clsptr->stat))
                return 1;
        else
                return 0;
}

/*
 * check savings status
 * set the has_savings field if the class or its sub classes have savings
 */
static void check_savings_status(struct ckrm_core_class *root_core)
{
        struct ckrm_cpu_class *clsptr;
        int cpu_online;

        cpu_online = cpus_weight(cpu_online_map);

        //class status: demand, share, total_ns, prio, index
        list_for_each_entry(clsptr,&active_cpu_classes,links)
                clsptr->stat.has_savings = balance_local_savings(clsptr,cpu_online);
}

/**
 * alloc_surplus - reallocate unused shares
 *
 * a class's unused share should be allocated to its siblings
 * the re-allocation goes downward from the top
 */
int alloc_surplus(struct ckrm_core_class *root_core)
{
        struct ckrm_core_class *cur_core, *child_core;
        //      struct ckrm_cpu_class *cls;
        int ret = -1;

        check_savings_status(root_core);

        /*initialize*/
        cur_core = root_core;
        child_core = NULL;
        //      cls = ckrm_get_cpu_class(cur_core);

        /*the ckrm idle tasks get whatever is remaining*/
        /*hzheng: uncomment the following line for hard limit support */
        //      update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand);

 repeat:
        //check exit
        if (!cur_core)
                return 0;

        //visit this node only once
        if (! child_core)
                if ( alloc_surplus_node(cur_core) < 0 )
                        return ret;

        //next child
        child_core = ckrm_get_next_child(cur_core, child_core);
        if (child_core) {
                //go down
                cur_core = child_core;
                child_core = NULL;
                goto repeat;
        } else {                //no more children, go back
                child_core = cur_core;
                cur_core = child_core->hnode.parent;
        }
        goto repeat;
}


/**********************************************/
/*          cpu demand                        */
/**********************************************/

/*
 * How CPU demand is calculated:
 * consider the class local runqueue (clr) first
 * at any time, a clr can be in one of the following three states
 * -- run: a task belonging to this class is running on this cpu
 * -- wait: at least one of its tasks is runnable, but the class is not running
 * -- sleep: none of the tasks of this class is runnable
 *
 * cpu_demand(t1,t2) = r(t1,t2)/(r(t1,t2)+s(t1,t2))
 *
 * the cpu_demand of a class =
 *    average cpu_demand over all the class local runqueues
 */
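
/*
 * Worked example of the formula above (illustrative numbers only): if over
 * a sampling window a class local runqueue accumulated r = 30ms of run time
 * and s = 70ms of sleep time, then cpu_demand = 30 / (30 + 70) = 30% of one
 * CPU for that queue; the class-level figure is the average of these
 * per-queue values across the online CPUs.
 */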

/**
 * update_cpu_demand_stat - update the run/sleep accounting of a local stat
 *
 * should be called whenever the state of a task/task local queue changes
 * -- on deschedule: report how long it ran
 * -- on enqueue: report how long it slept
 *
 * recalc_interval controls how often the cpu demand is recalculated;
 * it is expressed in ns
 */
static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat,
                                          int state, unsigned long long len)
{
        local_stat->total += len;
        if (state == CKRM_CPU_DEMAND_RUN)
                local_stat->run += len;

        if (local_stat->total >= local_stat->recalc_interval) {
                local_stat->total >>= CKRM_SHARE_SHIFT;
                if (unlikely(local_stat->run > ULONG_MAX))
                        local_stat->run = ULONG_MAX;

                if (unlikely(local_stat->total > ULONG_MAX))
                        local_stat->total = ULONG_MAX;

                do_div(local_stat->run,(unsigned long)local_stat->total);

                if (unlikely(local_stat->total > ULONG_MAX)) {
                        //happens after a very long sleep
                        local_stat->cpu_demand = local_stat->run;
                } else {
                        local_stat->cpu_demand =
                            (local_stat->cpu_demand + local_stat->run) >> 1;
                }
                local_stat->total = 0;
                local_stat->run = 0;
        }
}
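
/*
 * Worked example of the recalculation above (illustrative numbers,
 * CKRM_SHARE_SHIFT left symbolic): suppose over one recalc_interval the
 * queue saw run = 64ms and total = 256ms.  After total >>= CKRM_SHARE_SHIFT,
 * run / total evaluates to roughly (64/256) << CKRM_SHARE_SHIFT, i.e. 25% of
 * a CPU expressed in share units, and the new cpu_demand becomes the average
 * of the previous value and this sample -- an exponential moving average
 * with weight 1/2.
 */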

/**
 * cpu_demand_event - a cpu_demand event occurred
 * @event: one of the following events:
 *   CPU_DEMAND_ENQUEUE: local class enqueue
 *   CPU_DEMAND_DEQUEUE: local class dequeue
 *   CPU_DEMAND_DESCHEDULE: a task belonging to this local class was descheduled
 *   CPU_DEMAND_INIT: initialization of a task's local stat
 * @len: valid only for CPU_DEMAND_DESCHEDULE, how long the task ran
 */
void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len)
{
        switch (event) {
        case CPU_DEMAND_ENQUEUE:
                len = sched_clock() - local_stat->last_sleep;
                local_stat->last_sleep = 0;
                update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,len);
                break;
        case CPU_DEMAND_DEQUEUE:
                if (! local_stat->last_sleep) {
                        local_stat->last_sleep = sched_clock();
                }
                break;
        case CPU_DEMAND_DESCHEDULE:
                update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_RUN,len);
                break;
        case CPU_DEMAND_INIT: //for task init only
                cpu_demand_stat_init(local_stat,CPU_DEMAND_TP_TASK);
                break;
        default:
                BUG();
        }
}

/**
 * check the class local queue of one cpu
 *
 * to deal with excessively long run/sleep states:
 * whenever ckrm_cpu_monitor is called, check if the class is in the sleep
 * state; if yes, update the sleep record
 */
void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu)
{
        struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu];
        unsigned long long sleep,now;
        if (local_stat->last_sleep) {
                now = sched_clock();
                sleep = now - local_stat->last_sleep;
                local_stat->last_sleep = now;
                update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep);
        }
}

/**
 * update_max_demand: update the effective cpu demand for each class
 * return -1 on error
 *
 * Assume: root_core->parent == NULL
 */
static int update_max_demand(struct ckrm_core_class *root_core)
{
        struct ckrm_core_class *cur_core, *child_core;
        struct ckrm_cpu_class *cls,*c_cls;
        int ret = -1;

        cur_core = root_core;
        child_core = NULL;

 repeat:
        if (!cur_core) { //normal exit
                ret = 0;
                goto out;
        }

        cls = ckrm_get_cpu_class(cur_core);
        if (! cls) //invalid cls, abort
                goto out;

        if (!child_core)        //first visit of this node
                cls->stat.max_demand = get_mmax_demand(&cls->stat);
        else {
                c_cls = ckrm_get_cpu_class(child_core);
                if (c_cls)
                        cls->stat.max_demand += c_cls->stat.max_demand;
                else //invalid c_cls, abort
                        goto out;
        }

        //check the class hard limit
        if (cls->stat.max_demand > cls->stat.ehl)
                cls->stat.max_demand = cls->stat.ehl;

        //next child
        child_core = ckrm_get_next_child(cur_core, child_core);
        if (child_core) {
                //go down
                cur_core = child_core;
                child_core = NULL;
                goto repeat;
        } else {                //no more children, go back
                child_core = cur_core;
                cur_core = child_core->hnode.parent;
        }
        goto repeat;
 out:
        return ret;
}

/**********************************************/
/*          effective guarantee & limit       */
/**********************************************/
/**
 * update_child_effective - update egrt, ehl, mehl for all children of parent
 * @parent: the parent node
 * return -1 if anything goes wrong
 *
 */
static int update_child_effective(struct ckrm_core_class *parent)
{
        struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent);
        struct ckrm_core_class *child_core;
        int ret = -1;

        if (! p_cls)
                return ret;

        child_core = ckrm_get_next_child(parent, NULL);
        while (child_core) {
                struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core);
                if (! c_cls)
                        return ret;

                c_cls->stat.egrt =
                    p_cls->stat.egrt *
                    c_cls->shares.my_guarantee / p_cls->shares.total_guarantee;

                c_cls->stat.megrt = c_cls->stat.egrt * get_my_grt(c_cls)
                        / c_cls->shares.total_guarantee;

                c_cls->stat.ehl =
                    p_cls->stat.ehl *
                    get_hard_limit(c_cls) / p_cls->shares.total_guarantee;

                c_cls->stat.mehl =
                    c_cls->stat.ehl *
                    get_myhard_limit(c_cls) / c_cls->shares.total_guarantee;

                set_eshare(&c_cls->stat,c_cls->stat.egrt);
                set_meshare(&c_cls->stat,c_cls->stat.megrt);

                child_core = ckrm_get_next_child(parent, child_core);
        }
        return 0;
}
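
/*
 * Worked example of the propagation above (illustrative values; assume
 * CKRM_SHARE_MAX is, say, 8192): if the parent's egrt is 8192 and its
 * total_guarantee is 100, a child with my_guarantee = 30 gets
 * egrt = 8192 * 30 / 100 = 2457; if that child's own total_guarantee is 100
 * and its unused_guarantee is 50, its megrt = 2457 * 50 / 100 = 1228.
 * The same pattern applies to ehl/mehl via the hard-limit accessors.
 */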

/**
 * update_effectives: update egrt, ehl, mehl for the whole tree
 * should be called only when the class structure has changed
 *
 * return -1 if anything goes wrong (e.g. the structure changed during the process)
 */
int update_effectives(void)
{
        struct ckrm_core_class *root_core = get_default_cpu_class()->core;
        struct ckrm_core_class *cur_core, *child_core;
        struct ckrm_cpu_class *cls;
        int ret = -1;

        cur_core = root_core;
        child_core = NULL;
        cls = ckrm_get_cpu_class(cur_core);

        //initialize the effectives for the root
        cls->stat.egrt = CKRM_SHARE_MAX; /*egrt of the root is always 100% */
        cls->stat.megrt = cls->stat.egrt * get_my_grt(cls)
                / cls->shares.total_guarantee;
        cls->stat.ehl = CKRM_SHARE_MAX * get_hard_limit(cls)
                / cls->shares.total_guarantee;
        cls->stat.mehl = cls->stat.ehl * get_myhard_limit(cls)
                / cls->shares.total_guarantee;
        set_eshare(&cls->stat,cls->stat.egrt);
        set_meshare(&cls->stat,cls->stat.megrt);

 repeat:
        //check exit
        if (!cur_core)
                return 0;

        //visit this node only once
        if (! child_core)
                if (update_child_effective(cur_core) < 0)
                        return ret; //invalid cur_core node

        //next child
        child_core = ckrm_get_next_child(cur_core, child_core);

        if (child_core) {
                //go down to the next hierarchy level
                cur_core = child_core;
                child_core = NULL;
        } else { //no more children, go back
                child_core = cur_core;
                cur_core = child_core->hnode.parent;
        }
        goto repeat;
}

/**********************************************/
/*           CKRM Idle Tasks                  */
/**********************************************/

#ifdef CONFIG_CKRM_SUPPORT_MAXLIMITS

struct ckrm_cpu_class ckrm_idle_class_obj, *ckrm_idle_class;
struct task_struct* ckrm_idle_tasks[NR_CPUS];

/*how many ckrm idle tasks should be woken up*/
static inline int get_nr_idle(unsigned long surplus)
{
        int cpu_online = cpus_weight(cpu_online_map);
        int nr_idle = 0;

        nr_idle = surplus * cpu_online;
        nr_idle >>= CKRM_SHARE_SHIFT;

        if (surplus)
                nr_idle ++;

        if (nr_idle > cpu_online)
                nr_idle = cpu_online;

        return nr_idle;
}

/**
 * update_ckrm_idle: update the status of the idle class according
 *                   to the new surplus
 * surplus: new system surplus
 *
 * Tasks:
 * -- update the share of the idle class
 * -- wake up idle tasks according to the surplus
 */
void update_ckrm_idle(unsigned long surplus)
{
        int nr_idle = get_nr_idle(surplus);
        int i;
        struct task_struct* idle_task;

        set_eshare(&ckrm_idle_class->stat,surplus);
        set_meshare(&ckrm_idle_class->stat,surplus);
        /*wake up nr_idle idle tasks*/
        for_each_online_cpu(i) {
                idle_task = ckrm_idle_tasks[i];
                if (! idle_task)
                        continue;
                if (unlikely(idle_task->cpu_class != ckrm_idle_class)) {
                        ckrm_cpu_change_class(idle_task,
                                              idle_task->cpu_class,
                                              ckrm_idle_class);
                }
                if (i < nr_idle) {
                        //activate it
                        wake_up_process(idle_task);
                } else {
                        //deactivate it
                        idle_task->state = TASK_INTERRUPTIBLE;
                        set_tsk_need_resched(idle_task);
                }
        }
}

static int ckrm_cpu_idled(void *nothing)
{
        set_user_nice(current,19);
        daemonize("ckrm_idle_task");

        //deactivate it; it will be woken up by ckrm_cpu_monitor
        current->state = TASK_INTERRUPTIBLE;
        schedule();

        /*similar to cpu_idle */
        while (1) {
                while (!need_resched()) {
                        ckrm_cpu_monitor(1);
                        if (current_cpu_data.hlt_works_ok) {
                                local_irq_disable();
                                if (!need_resched()) {
                                        set_tsk_need_resched(current);
                                        safe_halt();
                                } else
                                        local_irq_enable();
                        }
                }
                schedule();
        }
        return 0;
}

/**
 * ckrm_start_ckrm_idle:
 *  create the ckrm_idle_class and start the idle tasks
 *
 */
void ckrm_start_ckrm_idle(void)
{
        int i;
        int ret;
        ckrm_shares_t shares;

        ckrm_idle_class = &ckrm_idle_class_obj;
        memset(ckrm_idle_class,0,sizeof(*ckrm_idle_class));
        /*don't care about the shares */
        init_cpu_class(ckrm_idle_class,&shares);
        printk(KERN_INFO "ckrm idle class %p created\n", ckrm_idle_class);

        for_each_online_cpu(i) {
                ret = kernel_thread(ckrm_cpu_idled, NULL, CLONE_KERNEL);

                /*warn on error, but the system should still work without it*/
                if (ret < 0)
                        printk(KERN_ERR "Warning: can't start ckrm idle tasks\n");
                else {
                        ckrm_idle_tasks[i] = find_task_by_pid(ret);
                        if (!ckrm_idle_tasks[i])
                                printk(KERN_ERR "Warning: can't find ckrm idle task %d\n",ret);
                }
        }
}

void ckrm_stop_ckrm_idle(void)
{
        BUG_ON(1);   // not yet implemented
}

#else

static inline void ckrm_start_ckrm_idle(void) { };
static inline void ckrm_stop_ckrm_idle(void) { };
static inline void update_ckrm_idle(unsigned long surplus) { };

#endif


/**********************************************/
/*          Local Weight                      */
/**********************************************/
/**
 * adjust_lrq_weight: adjust the local weight for each cpu
 *
 * lrq->weight = lrq->pressure * class->weight / total_pressure
 */
static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online)
{
        unsigned long total_pressure = 0;
        ckrm_lrq_t* lrq;
        int i;
        unsigned long class_weight;
        unsigned long long lw;
        struct ckrm_cpu_class_stat *stat;
        unsigned long oweight;
        unsigned long skewed_limit;
        /*
         * if a local queue gets less than 1/SKEWED_SHARE_RATIO of the eshare
         * then we set the skewed_weight
         */
#define SKEWED_SHARE_RATIO 8
#define SKEWED_WEIGHT_MIN 3

        /* get the total pressure of the class; if there is no pressure (the
         * class is idle), then leave the weights as they are
         */
        for_each_online_cpu(i) {
                lrq = get_ckrm_lrq(clsptr,i);
                total_pressure += lrq->lrq_load;
        }

        if (! total_pressure)
                return;

        stat = &clsptr->stat;

        class_weight = cpu_class_weight(clsptr) * cpu_online;

        /* calculate the skewed limit weight */
        skewed_limit = SHARE_TO_WEIGHT(stat->meshare/SKEWED_SHARE_RATIO);
        if (skewed_limit < SKEWED_WEIGHT_MIN)
                skewed_limit = SKEWED_WEIGHT_MIN;

        /* calculate over_weight */
        BUG_ON(stat->meshare < stat->megrt);
        oweight = ((stat->meshare - stat->megrt) << CKRM_SHARE_SHIFT) / stat->meshare;
        oweight = SHARE_TO_WEIGHT(oweight);

        /*
         * update the weight for each cpu, the minimum is 1
         */
        for_each_online_cpu(i) {
                lrq = get_ckrm_lrq(clsptr,i);
                lrq->over_weight = oweight;
                if (! lrq->lrq_load) {
                        /* the local queue is idle: give it the class's base
                         * weight to boost interactiveness
                         */
                        lw = cpu_class_weight(clsptr);
                        if (unlikely(lw==0))
                                lw = 1;
                } else {
                        lw = lrq->lrq_load;
                        lw *= class_weight;
                        do_div(lw,total_pressure);
                        if (unlikely(lw==0))
                                lw = 1;
                        else if (unlikely(lw > CKRM_MAX_WEIGHT))
                                lw = CKRM_MAX_WEIGHT;
                }
                BUG_ON(lw > CKRM_MAX_WEIGHT);

                /*
                 * set skewed_weight and local_weight in the proper order
                 * to avoid a race condition
                 */
                lrq->local_weight = lw;
                if (lw < skewed_limit)
                        lrq->skewed_weight = skewed_limit;
                else
                        lrq->skewed_weight = 0;
                BUG_ON((local_class_weight(lrq) == 1) && (! lrq->skewed_weight));
        }
}
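
/*
 * Worked example of the weight split above (illustrative numbers): with two
 * online CPUs, a per-CPU class weight w (so class_weight = 2w), and lrq_load
 * values of 300 and 100 (total_pressure = 400), the local weights become
 * 300 * 2w / 400 = 1.5w and 100 * 2w / 400 = 0.5w -- the class weight is
 * redistributed toward the busier queue while the total stays at 2w.
 */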

/*
 * assume called with the class_list_lock read lock held
 */

void adjust_local_weight(void)
{
        static spinlock_t lock = SPIN_LOCK_UNLOCKED;
        struct ckrm_cpu_class *clsptr;
        int cpu_online;

        //do nothing if someone is already holding the lock
        if (! spin_trylock(&lock))
                return;

        cpu_online = cpus_weight(cpu_online_map);

        //class status: demand, share, total_ns, prio, index
        list_for_each_entry(clsptr,&active_cpu_classes,links) {
                adjust_lrq_weight(clsptr,cpu_online);
        }

        spin_unlock(&lock);
}

/**********************************************/
/*          Main                              */
/**********************************************/
/**
 * ckrm_cpu_monitor - adjust relative shares of the classes based on their progress
 * @check_min: if check_min is set, the call can't be within 100ms of the last call
 *
 * this function is called every CPU_MONITOR_INTERVAL
 * it computes the cpu demand of each class
 * and re-allocates the unused shares to other classes
 */
void ckrm_cpu_monitor(int check_min)
{
        static spinlock_t lock = SPIN_LOCK_UNLOCKED;
        static unsigned long long last_check = 0;
        struct ckrm_core_class *root_core = get_default_cpu_class()->core;
        unsigned long long now;
        int loc;

#define MIN_CPU_MONITOR_INTERVAL (100*1000*1000)  /* 100 MSEC */

        if (ckrm_cpu_disabled() || !root_core)
                return;

        //do nothing if someone is already holding the lock
        if (! spin_trylock(&lock))
                return;

        read_lock(&class_list_lock);

        now = sched_clock();

        //consecutive checks should be at least 100ms apart
        if (check_min && (now - last_check < MIN_CPU_MONITOR_INTERVAL))
                goto outunlock_np;

        last_check = now;

        if (update_effectives() != 0) {
                loc = 0;
                goto outunlock;
        }

        if (update_max_demand(root_core) != 0) {
                loc = 1;
                goto outunlock;
        }

#warning mef: alloc_surplus call back in system;
        if (alloc_surplus(root_core) != 0) {
                loc = 2;
                goto outunlock;
        }

        adjust_local_weight();

 outunlock_np:
        read_unlock(&class_list_lock);
        spin_unlock(&lock);
        return;

 outunlock:
        printk("ckrm_cpu_monitor(%d) exits prematurely cause=%d\n",check_min,loc);
        goto outunlock_np;
}

/*****************************************************/
/*            Supporting Functions                   */
/*****************************************************/
static pid_t cpu_monitor_pid = -1;
static int thread_exit = 0;

static int ckrm_cpu_monitord(void *nothing)
{
        daemonize("ckrm_cpu_ctrld");
        printk("cpu_monitord started\n");
        thread_exit = 0;
        for (;;) {
                /*sleep for some time before the next try*/
                set_current_state(TASK_INTERRUPTIBLE);
                schedule_timeout(CPU_MONITOR_INTERVAL);
                ckrm_cpu_monitor(1);
                if (thread_exit) {
                        break;
                }
        }
        cpu_monitor_pid = -1;
        thread_exit = 2;
        printk(KERN_DEBUG "cpu_monitord exit\n");
        return 0;
}

void ckrm_cpu_start_monitor(void)
{
        if (cpu_monitor_pid != -1) {
                /* already started ... */
                return;
        }
        cpu_monitor_pid = kernel_thread(ckrm_cpu_monitord, NULL, CLONE_KERNEL);
        if (cpu_monitor_pid < 0) {
                printk(KERN_DEBUG "ckrm_cpu_monitord failed to start\n");
        }
}

void ckrm_cpu_kill_monitor(void)
{
        printk(KERN_DEBUG "killing process %d\n", cpu_monitor_pid);
        if (cpu_monitor_pid > 0) {
                thread_exit = 1;
                while (thread_exit != 2) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        schedule_timeout(CPU_MONITOR_INTERVAL);
                }
        }
}

static int __init ckrm_cpu_init_monitor(void)
{
        if (ckrm_cpu_enabled())
                ckrm_cpu_start_monitor();
        return 0;
}

__initcall(ckrm_cpu_init_monitor);