/* ckrm_cpu_monitor.c - Hierarchical CKRM CPU Resource Monitor
 *
 * Copyright (C) Haoqiang Zheng,  IBM Corp. 2004
 *           (C) Hubertus Franke, IBM Corp. 2004
 *
 * Latest version, more details at http://ckrm.sf.net
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

/* Changes
 *
 * 23 June 2004: Created
 */
#include <linux/module.h>
#include <linux/init.h>
#include <asm/errno.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/ckrm.h>
#include <linux/ckrm_rc.h>
#include <linux/ckrm_tc.h>
#include <asm/div64.h>
#include <linux/ckrm_sched.h>
#define CPU_MONITOR_INTERVAL (2*HZ) /*how often do we adjust the shares*/
#define CKRM_SHARE_ACCURACY 10
#define CKRM_SHARE_MAX (1<<CKRM_SHARE_ACCURACY)

#define CKRM_CPU_DEMAND_RUN 0
#define CKRM_CPU_DEMAND_SLEEP 1
//sample task cpu demand every 64ms
#define CPU_DEMAND_TASK_RECALC (64000000LL)
//sample class cpu demand every 256ms
#define CPU_DEMAND_CLASS_RECALC (256000000LL)
#define CPU_DEMAND_TP_CLASS 0
#define CPU_DEMAND_TP_TASK 1
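
/*
 * Note on units: shares are fixed-point fractions of the machine with
 * CKRM_SHARE_ACCURACY = 10 bits of precision, so CKRM_SHARE_MAX = 1024
 * stands for 100% of the cpu resource. For example, a class entitled to
 * a quarter of the machine carries an effective share of 1024/4 = 256.
 */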
extern struct ckrm_cpu_class *ckrm_get_cpu_class(struct ckrm_core_class *core);
void update_ckrm_idle(unsigned long surplus);
/*interface to share definition*/
static inline int get_soft_limit(struct ckrm_cpu_class *cls)
{
	return cls->shares.my_limit;
}

static inline int get_mysoft_limit(struct ckrm_cpu_class *cls)
{
	return cls->shares.total_guarantee;
}

static inline int get_hard_limit(struct ckrm_cpu_class *cls)
{
	return cls->shares.total_guarantee;
}

static inline int get_myhard_limit(struct ckrm_cpu_class *cls)
{
	return cls->shares.total_guarantee;
}
static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, int type)
{
	unsigned long long now = sched_clock();

	local_stat->run = 0;
	local_stat->total = 0;
	local_stat->last_sleep = now;
	switch (type) {
	case CPU_DEMAND_TP_CLASS:
		local_stat->recalc_interval = CPU_DEMAND_CLASS_RECALC;
		local_stat->cpu_demand = 0;
		break;
	case CPU_DEMAND_TP_TASK:
		local_stat->recalc_interval = CPU_DEMAND_TASK_RECALC;
		//for a task, the initial cpu_demand is copied from its parent
		break;
	default:
		BUG();
	}
}
void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat)
{
	int i;

	stat->stat_lock = SPIN_LOCK_UNLOCKED;
	stat->total_ns = 0;
	stat->max_demand = 0;

	for (i=0; i< NR_CPUS; i++) {
		cpu_demand_stat_init(&stat->local_stats[i],CPU_DEMAND_TP_CLASS);
	}

	stat->egrt = 0;
	stat->megrt = 0;
	stat->ehl = CKRM_SHARE_MAX; /*default: no limit*/
	stat->mehl = CKRM_SHARE_MAX; /*default: no limit */

	stat->eshare = CKRM_SHARE_MAX;
	stat->meshare = CKRM_SHARE_MAX;
}
/**********************************************/
/*          cpu demand                        */
/**********************************************/
/*
 * How CPU demand is calculated:
 * consider the class local runqueue (clr) first
 * at any time, a clr can be in one of the following three states
 * -- run: a task belonging to this class is running on this cpu
 * -- wait: at least one of its tasks is runnable, but the class is not running
 * -- sleep: none of the tasks of this class is runnable
 *
 * over an interval (t1,t2), with r the time spent running and s the time
 * spent sleeping:
 * cpu_demand(t1,t2) = r(t1,t2)/(r(t1,t2)+s(t1,t2))
 *
 * the cpu_demand of a class =
 * average of the cpu_demand of all its class local runqueues
 */
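
/*
 * Worked example (illustrative): if, over a 100ms window, a clr spends
 * r = 60ms running and s = 40ms sleeping (wait time does not enter the
 * formula), then cpu_demand = 60/(60+40) = 0.6, i.e. about
 * 0.6*CKRM_SHARE_MAX = 614 in the fixed-point share units defined above.
 */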
/**
 * update_cpu_demand_stat -
 *    update the cpu demand statistics of a class local runqueue
 *
 * should be called whenever the state of a task/task local queue changes
 * -- when deschedule : report how long it ran
 * -- when enqueue: report how long it slept
 *
 * @len is the length of the run/sleep period in ns; the demand is
 * recalculated whenever the accumulated time reaches recalc_interval
 */
static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat,int state, unsigned long long len)
{
	local_stat->total += len;
	if (state == CKRM_CPU_DEMAND_RUN)
		local_stat->run += len;

	if (local_stat->total >= local_stat->recalc_interval) {
		int very_long_sleep;

		/* scale total down so that run/total comes out in
		 * CKRM_SHARE_MAX units */
		local_stat->total >>= CKRM_SHARE_ACCURACY;
		if (unlikely(local_stat->run > 0xFFFFFFFF))
			local_stat->run = 0xFFFFFFFF;

		/* remember the overflow before clamping, so the
		 * very-long-sleep case below is still detectable */
		very_long_sleep = (local_stat->total > 0xFFFFFFFF);
		if (very_long_sleep)
			local_stat->total = 0xFFFFFFFF;

		do_div(local_stat->run,(unsigned long)local_stat->total);

		if (very_long_sleep) //happens after very long sleep
			local_stat->cpu_demand = local_stat->run;
		else {
			//running average of the old and the new demand
			local_stat->cpu_demand += local_stat->run;
			local_stat->cpu_demand >>= 1;
		}
		local_stat->total = 0;
		local_stat->run = 0;
	}
}
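
/*
 * Worked example (illustrative): with CPU_DEMAND_CLASS_RECALC = 256ms,
 * suppose run has reached 48,000,000ns when total hits 256,000,000ns.
 * total >>= 10 leaves 250,000, and run/total = 192 = 48/256 *
 * CKRM_SHARE_MAX, i.e. a demand of about 18.75% of one cpu, which is
 * then averaged with the previous cpu_demand.
 */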
/**
 * cpu_demand_event - a cpu_demand event occurred
 * @event: one of the following four events:
 *   CPU_DEMAND_ENQUEUE: local class enqueue
 *   CPU_DEMAND_DEQUEUE: local class dequeue
 *   CPU_DEMAND_DESCHEDULE: a task belonging to this local class is descheduled
 *   CPU_DEMAND_INIT: task initialization only
 * @len: valid only for CPU_DEMAND_DESCHEDULE, how long the task has run
 */
void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len)
{
	switch (event) {
	case CPU_DEMAND_ENQUEUE:
		len = sched_clock() - local_stat->last_sleep;
		local_stat->last_sleep = 0;
		update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,len);
		break;
	case CPU_DEMAND_DEQUEUE:
		if (! local_stat->last_sleep) {
			local_stat->last_sleep = sched_clock();
		}
		break;
	case CPU_DEMAND_DESCHEDULE:
		update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_RUN,len);
		break;
	case CPU_DEMAND_INIT: //for task init only
		cpu_demand_stat_init(local_stat,CPU_DEMAND_TP_TASK);
		break;
	default:
		BUG();
	}
}
/**
 * cpu_demand_check_sleep - update the sleep record of a class local runqueue
 *
 * deals with excessively long run/sleep states:
 * -- whenever ckrm_cpu_monitor is called, check if the class is in the
 *    sleep state; if yes, update the sleep record
 */
static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu)
{
	struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu];
	unsigned long long sleep,now;
	if (local_stat->last_sleep) {
		now = sched_clock();
		sleep = now - local_stat->last_sleep;
		local_stat->last_sleep = now;
		update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep);
	}
}
/**
 * get_self_cpu_demand - get cpu demand of the class itself (excluding children)
 *
 * self_cpu_demand = average(cpu demand of all local queues)
 */
static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat)
{
	int cpu_demand = 0;
	int i;
	int cpuonline = 0;

	for_each_online_cpu(i) {
		cpu_demand_check_sleep(stat,i);
		cpu_demand += stat->local_stats[i].cpu_demand;
		cpuonline ++;
	}

	return (cpu_demand/cpuonline);
}
/*
 * my max demand = min(cpu_demand, my effective hard limit)
 */
static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat)
{
	unsigned long mmax_demand = get_self_cpu_demand(stat);

	if (mmax_demand > stat->mehl)
		mmax_demand = stat->mehl;

	return mmax_demand;
}
/**
 * update_max_demand: update the effective cpu demand of each class
 * return -1 on error, 0 on success
 *
 * Assume: root_core->parent == NULL
 */
static int update_max_demand(struct ckrm_core_class *root_core)
{
	struct ckrm_core_class *cur_core, *child_core;
	struct ckrm_cpu_class *cls,*c_cls;
	int ret = -1;

	cur_core = root_core;
	child_core = NULL;

 repeat:
	if (!cur_core) { //normal exit
		ret = 0;
		goto out;
	}

	cls = ckrm_get_cpu_class(cur_core);
	if (! cls) //invalid class, abort
		goto out;

	if (!child_core) //first visit: start from the class's own demand
		cls->stat.max_demand = get_mmax_demand(&cls->stat);
	else {
		//back from a child: add the child's demand
		c_cls = ckrm_get_cpu_class(child_core);
		if (c_cls)
			cls->stat.max_demand += c_cls->stat.max_demand;
		else //invalid c_cls, abort
			goto out;
	}

	//check class hard limit
	if (cls->stat.max_demand > cls->stat.ehl)
		cls->stat.max_demand = cls->stat.ehl;

	//next child
	child_core = ckrm_get_next_child(cur_core, child_core);
	if (child_core) {
		//go down
		cur_core = child_core;
		child_core = NULL;
	} else { //no more children, go back up
		child_core = cur_core;
		cur_core = child_core->hnode.parent;
	}
	goto repeat;
 out:
	return ret;
}
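
/*
 * Worked example (illustrative): for a parent P whose own (my) max
 * demand is 100 and whose children end the walk with max_demand 300 and
 * 200, the post-order traversal above leaves P with
 * max_demand = min(100+300+200, P's ehl).
 */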
/**********************************************/
/*          effective guarantee & limit       */
/**********************************************/
static inline void set_eshare(struct ckrm_cpu_class_stat *stat,
			      int new_share)
{
	if (!new_share)
		new_share = 1; /*minimum of 1 to avoid a zero weight*/
	stat->eshare = new_share;
}

static inline void set_meshare(struct ckrm_cpu_class_stat *stat,
			       int new_share)
{
	if (!new_share)
		new_share = 1; /*minimum of 1 to avoid a zero weight*/
	stat->meshare = new_share;
}
/**
 * update_child_effective - update egrt, ehl, mehl for all children of parent
 * @parent: the parent node
 * return -1 if anything goes wrong
 */
static int update_child_effective(struct ckrm_core_class *parent)
{
	struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent);
	struct ckrm_core_class *child_core;

	if (! p_cls)
		return -1;

	child_core = ckrm_get_next_child(parent, NULL);
	while (child_core) {
		struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core);
		if (! c_cls)
			return -1;

		c_cls->stat.egrt = p_cls->stat.egrt *
			c_cls->shares.my_guarantee / p_cls->shares.total_guarantee;

		c_cls->stat.megrt = c_cls->stat.egrt * c_cls->shares.unused_guarantee
			/ c_cls->shares.total_guarantee;

		c_cls->stat.ehl = p_cls->stat.ehl *
			get_hard_limit(c_cls) / p_cls->shares.total_guarantee;

		c_cls->stat.mehl = c_cls->stat.ehl *
			get_myhard_limit(c_cls) / c_cls->shares.total_guarantee;

		child_core = ckrm_get_next_child(parent, child_core);
	}
	return 0;
}
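
/*
 * Worked example (illustrative): let the parent have egrt = 1024 and
 * total_guarantee = 100, and a child have my_guarantee = 30,
 * unused_guarantee = 50, total_guarantee = 100. The child then gets
 * egrt = 1024*30/100 = 307, of which megrt = 307*50/100 = 153 is
 * reserved for the child's own tasks rather than its sub-classes.
 */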
/**
 * update_effectives: update egrt, ehl, mehl for the whole tree
 * should be called only when the class structure has changed
 *
 * return -1 if anything goes wrong (eg: the structure changed during the process)
 */
static int update_effectives(struct ckrm_core_class *root_core)
{
	struct ckrm_core_class *cur_core, *child_core;
	struct ckrm_cpu_class *cls;

	cur_core = root_core;
	child_core = NULL;
	cls = ckrm_get_cpu_class(cur_core);

	//initialize the effectives for root
	cls->stat.egrt = CKRM_SHARE_MAX; /*egrt of the root is always 100% */
	cls->stat.megrt = cls->stat.egrt * cls->shares.unused_guarantee
		/ cls->shares.total_guarantee;
	cls->stat.ehl = CKRM_SHARE_MAX * get_hard_limit(cls)
		/ cls->shares.total_guarantee;
	cls->stat.mehl = cls->stat.ehl * get_myhard_limit(cls)
		/ cls->shares.total_guarantee;

 repeat:
	//check exit
	if (!cur_core)
		return 0;

	//visit this node: update the effectives of its children
	if (update_child_effective(cur_core) == -1) {
		return -1; //invalid cur_core node
	}

	//next child
	child_core = ckrm_get_next_child(cur_core, child_core);
	if (child_core) {
		//go down to the next hierarchy level
		cur_core = child_core;
		child_core = NULL;
	} else { //no more children, go back up
		child_core = cur_core;
		cur_core = child_core->hnode.parent;
	}
	goto repeat;
}
/**********************************************/
/*          surplus allocation                */
/**********************************************/

/*
 * surplus = egrt - demand
 * if surplus < 0, surplus = 0
 */
static inline int get_node_surplus(struct ckrm_cpu_class *cls)
{
	int surplus = cls->stat.egrt - cls->stat.max_demand;

	if (surplus < 0)
		surplus = 0;

	return surplus;
}

static inline int get_my_node_surplus(struct ckrm_cpu_class *cls)
{
	int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat);

	if (surplus < 0)
		surplus = 0;

	return surplus;
}
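
/*
 * Worked example (illustrative): a class with egrt = 307 whose
 * max_demand is only 100 has 307-100 = 207 share units of surplus to
 * give away; a class whose demand meets or exceeds its guarantee
 * contributes nothing.
 */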
/**
 * node_surplus_consume: let a child consume part of the parent's surplus
 * @check_sl: if check_sl is set, then check the soft limit
 * @total_grt: remaining total guarantee of the children still consuming
 * return how much is consumed
 *
 * implements the CKRM scheduling requirements
 * updates *total_grt if necessary
 */
static inline int node_surplus_consume(int surplus,
				       struct ckrm_core_class *child_core,
				       struct ckrm_cpu_class *p_cls,
				       int check_sl,
				       int *total_grt)
{
	int consumed = 0;
	int inc_limit;
	struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core);

	if (! c_cls || ! *total_grt)
		goto out;

	/*can't consume more than demand or hard limit*/
	if (c_cls->stat.eshare >= c_cls->stat.max_demand)
		goto out;

	consumed =
		surplus * c_cls->shares.my_guarantee / *total_grt;

	if (! consumed) //no more share
		goto out;

	//hard limit and demand limit
	inc_limit = c_cls->stat.max_demand - c_cls->stat.eshare;

	if (check_sl) {
		int esl = p_cls->stat.eshare * get_soft_limit(c_cls)
			/p_cls->shares.total_guarantee;
		if (esl < c_cls->stat.max_demand)
			inc_limit = esl - c_cls->stat.eshare;
	}

	if (consumed > inc_limit)
		consumed = inc_limit;

	c_cls->stat.eshare += consumed;

	//once satisfied, the child's guarantee drops out of later rounds
	if (c_cls->stat.eshare >= c_cls->stat.max_demand)
		*total_grt -= c_cls->shares.my_guarantee;

 out:
	return consumed;
}
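
/*
 * Worked example (illustrative): with surplus = 200, *total_grt = 100
 * and a child whose my_guarantee = 50, the child is offered
 * 200*50/100 = 100 share units; if its headroom (inc_limit) is only 60,
 * it consumes 60, and once satisfied its guarantee is removed from the
 * proportional split of later rounds.
 */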
/**
 * alloc_surplus_node: re-allocate the shares for children under parent
 * @parent: parent node
 *
 * task:
 *  1. get total surplus
 *  2. allocate surplus
 *  3. set the effective_share of each node
 *
 * any surplus left over goes to the parent itself (see realloc_out)
 */
static void alloc_surplus_node(struct ckrm_core_class *parent)
{
	int total_surplus , old_surplus;
	struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent);
	struct ckrm_core_class *child_core = NULL;
	int self_share;
	int check_sl;
	int total_grt;

	if (! p_cls)
		return;

	total_grt = p_cls->shares.total_guarantee;
	total_surplus = get_my_node_surplus(p_cls);

	/*
	 * initialize effective_share
	 * and collect the surplus of all children
	 */
	do {
		child_core = ckrm_get_next_child(parent, child_core);
		if (child_core) {
			struct ckrm_cpu_class *c_cls;

			c_cls = ckrm_get_cpu_class(child_core);
			if (! c_cls)
				return;

			total_surplus += get_node_surplus(c_cls);

			set_eshare(&c_cls->stat, c_cls->stat.egrt);
		}
	} while (child_core);

	if (! total_surplus)
		goto realloc_out;

	/* distribute the surplus */
	child_core = NULL;
	check_sl = 1;
	old_surplus = 0;
	do {
		if (!child_core) {//start a new round
			//ok, everybody reached the soft limit
			if (old_surplus == total_surplus)
				check_sl = 0;
			old_surplus = total_surplus;
		}

		child_core = ckrm_get_next_child(parent, child_core);
		if (child_core)
			total_surplus -=
				node_surplus_consume(old_surplus, child_core,
						     p_cls,check_sl,&total_grt);
		//start a new round if something was allocated in the last round
	} while (child_core || check_sl || total_surplus != old_surplus);

 realloc_out:
	/*how much for itself*/
	self_share = p_cls->stat.eshare *
		p_cls->shares.unused_guarantee / p_cls->shares.total_guarantee;

	if (self_share < p_cls->stat.max_demand) {
		/*any remaining surplus goes to the default class*/
		self_share += total_surplus;
		if (self_share > p_cls->stat.max_demand)
			self_share = p_cls->stat.max_demand;
	}

	set_meshare(&p_cls->stat, self_share);
}
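
/*
 * Worked example (illustrative): with total_surplus = 300 and two
 * children A (my_guarantee = 60) and B (my_guarantee = 40), the first
 * round offers A 300*60/100 = 180 and B 300*40/100 = 120. Whatever a
 * child cannot absorb (soft limit, hard limit or demand) stays in
 * total_surplus and is re-offered in the next round, until a full round
 * allocates nothing; anything still left then flows into the parent's
 * own meshare.
 */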
/**
 * alloc_surplus - reallocate unused shares
 *
 * a class's unused share should be allocated to its siblings
 * the re-allocation goes downward from the top
 */
static int alloc_surplus(struct ckrm_core_class *root_core)
{
	struct ckrm_core_class *cur_core, *child_core;
	struct ckrm_cpu_class *cls;

	/*initialize the root: it owns the full share*/
	cur_core = root_core;
	child_core = NULL;
	cls = ckrm_get_cpu_class(cur_core);
	set_eshare(&cls->stat, cls->stat.egrt);
	/*the ckrm idle tasks get all that's remaining*/
	/*hzheng: uncomment the following line for hard limit support */
	// update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand);

 repeat:
	//check exit
	if (!cur_core)
		return 0;

	//visit this node
	alloc_surplus_node(cur_core);

	//next child
	child_core = ckrm_get_next_child(cur_core, child_core);
	if (child_core) {
		//go down
		cur_core = child_core;
		child_core = NULL;
	} else { //no more children, go back up
		child_core = cur_core;
		cur_core = child_core->hnode.parent;
	}
	goto repeat;
}
/**********************************************/
/*          CKRM Idle Tasks                   */
/**********************************************/
struct ckrm_cpu_class ckrm_idle_class_obj, *ckrm_idle_class;
struct task_struct* ckrm_idle_tasks[NR_CPUS];
/*how many ckrm idle tasks should I wake up*/
static inline int get_nr_idle(unsigned long surplus)
{
	int cpu_online = cpus_weight(cpu_online_map);
	int nr_idle = 0;

	nr_idle = surplus * cpu_online;
	nr_idle >>= CKRM_SHARE_ACCURACY;

	if (surplus)
		nr_idle ++;

	if (nr_idle > cpu_online)
		nr_idle = cpu_online;

	return nr_idle;
}
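
/*
 * Worked example (illustrative): on a 4-cpu box with surplus = 512
 * (half of CKRM_SHARE_MAX), nr_idle = 512*4 >> 10 = 2, rounded up to 3
 * because the surplus is non-zero -- so up to 3 idle tasks are woken to
 * burn cpu time the regular classes are not entitled to use.
 */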
/**
 * update_ckrm_idle: update the status of the idle class according to the new surplus
 * @surplus: new system surplus
 *
 * Task:
 * -- update the share of the idle class
 * -- wake up idle tasks according to surplus
 */
void update_ckrm_idle(unsigned long surplus)
{
	int nr_idle = get_nr_idle(surplus);
	int i;
	struct task_struct* idle_task;

	set_eshare(&ckrm_idle_class->stat,surplus);
	set_meshare(&ckrm_idle_class->stat,surplus);
	/*wake up nr_idle idle tasks*/
	for_each_online_cpu(i) {
		idle_task = ckrm_idle_tasks[i];
		if (! idle_task)
			continue;
		if (unlikely(idle_task->cpu_class != ckrm_idle_class)) {
			ckrm_cpu_change_class(idle_task,
					      idle_task->cpu_class,
					      ckrm_idle_class);
		}
		if (i < nr_idle) {
			//activate it
			wake_up_process(idle_task);
		} else {
			//deactivate it
			idle_task->state = TASK_INTERRUPTIBLE;
			set_tsk_need_resched(idle_task);
		}
	}
}
static int ckrm_cpu_idled(void *nothing)
{
	set_user_nice(current,19);
	daemonize("ckrm_idle_task");

	//deactivate it, it will be woken up by ckrm_cpu_monitor
	current->state = TASK_INTERRUPTIBLE;
	schedule();

	/*similar to cpu_idle */
	while (1) {
		while (!need_resched()) {
			if (current_cpu_data.hlt_works_ok) {
				local_irq_disable();
				if (!need_resched()) {
					set_tsk_need_resched(current);
					safe_halt();
				} else
					local_irq_enable();
			}
		}
		schedule();
	}
	return 0;
}
/**
 * ckrm_start_ckrm_idle:
 *  create the ckrm_idle_class and start the idle tasks
 */
void ckrm_start_ckrm_idle(void)
{
	int i;
	int ret;
	ckrm_shares_t shares;

	ckrm_idle_class = &ckrm_idle_class_obj;
	memset(ckrm_idle_class,0,sizeof(*ckrm_idle_class));
	/*don't care about the shares */
	init_cpu_class(ckrm_idle_class,&shares);
	printk(KERN_INFO"ckrm idle class %p created\n",ckrm_idle_class);

	for_each_online_cpu(i) {
		ret = kernel_thread(ckrm_cpu_idled, 0, CLONE_KERNEL);

		/*warn on error, but the system should still work without it*/
		if (ret < 0)
			printk(KERN_ERR"Warn: can't start ckrm idle tasks\n");
		else {
			ckrm_idle_tasks[i] = find_task_by_pid(ret);
			if (!ckrm_idle_tasks[i])
				printk(KERN_ERR"Warn: can't find ckrm idle task %d\n",ret);
		}
	}
}
/**********************************************/
/*          Local Weight                      */
/**********************************************/
/*
 * adjust_lrq_weight: adjust the local weight of each class local runqueue
 *
 * lrq->local_weight = lrq->lrq_load * class_weight / total_pressure
 */
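
/*
 * Worked example (illustrative): a class with cpu_class_weight = 512 on
 * a 2-cpu box (class_weight = 512*2 = 1024) whose lrq loads are 300 and
 * 100 (total_pressure = 400) gets local weights 300*1024/400 = 768 and
 * 100*1024/400 = 256, shifting the class's weight toward the busier cpu.
 */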
static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online)
{
	unsigned long total_pressure = 0;
	ckrm_lrq_t* lrq;
	int i;
	unsigned long class_weight;
	unsigned long long lw;

	//get the total load (pressure) of the class across cpus
	for_each_online_cpu(i) {
		lrq = get_ckrm_lrq(clsptr,i);
		total_pressure += lrq->lrq_load;
	}

	if (! total_pressure)
		return;

	class_weight = cpu_class_weight(clsptr) * cpu_online;

	/*
	 * update the weight for each cpu, minimum is 1
	 */
	for_each_online_cpu(i) {
		lrq = get_ckrm_lrq(clsptr,i);
		if (! lrq->lrq_load)
			/*give an idle class a high share to boost interactiveness */
			lw = cpu_class_weight(clsptr);
		else {
			lw = lrq->lrq_load * class_weight;
			do_div(lw,total_pressure);
			if (!lw)
				lw = 1;
			else if (lw > CKRM_SHARE_MAX)
				lw = CKRM_SHARE_MAX;
		}

		lrq->local_weight = lw;
	}
}
/*
 * assumes it is called with the class_list_lock read lock held
 */
void adjust_local_weight(void)
{
	static spinlock_t lock = SPIN_LOCK_UNLOCKED;
	struct ckrm_cpu_class *clsptr;
	int cpu_online;

	//do nothing if someone already holds the lock
	if (! spin_trylock(&lock))
		return;

	cpu_online = cpus_weight(cpu_online_map);

	//adjust the weight of each active class
	list_for_each_entry(clsptr,&active_cpu_classes,links) {
		adjust_lrq_weight(clsptr,cpu_online);
	}

	spin_unlock(&lock);
}
/**********************************************/
/*          Main                              */
/**********************************************/
/**
 * ckrm_cpu_monitor - adjust relative shares of the classes based on their progress
 *
 * this function is called every CPU_MONITOR_INTERVAL
 * it computes the cpu demand of each class
 * and re-allocates the unused shares to other classes
 */
void ckrm_cpu_monitor(void)
{
	static spinlock_t lock = SPIN_LOCK_UNLOCKED;
	static unsigned long long last_check = 0;
	struct ckrm_core_class *root_core = get_default_cpu_class()->core;
	unsigned long long now;
#define MIN_CPU_MONITOR_INTERVAL 100000000UL

	if (!root_core)
		return;

	//do nothing if someone already holds the lock
	if (! spin_trylock(&lock))
		return;

	read_lock(&class_list_lock);

	now = sched_clock();

	//consecutive checks should be at least 100ms apart
	if (now - last_check < MIN_CPU_MONITOR_INTERVAL) {
		goto outunlock;
	}
	last_check = now;

	if (update_effectives(root_core) != 0)
		goto outunlock;

	if (update_max_demand(root_core) != 0)
		goto outunlock;

	if (alloc_surplus(root_core) != 0)
		goto outunlock;

	adjust_local_weight();

 outunlock:
	read_unlock(&class_list_lock);
	spin_unlock(&lock);
}
/*****************************************************/
/*            Supporting Functions                   */
/*****************************************************/
static pid_t cpu_monitor_pid = -1;
static int thread_exit = 0;
static int ckrm_cpu_monitord(void *nothing)
{
	wait_queue_head_t wait;

	init_waitqueue_head(&wait);

	daemonize("ckrm_cpu_ctrld");
	for (;;) {
		/*sleep for a while before the next try*/
		interruptible_sleep_on_timeout(&wait, CPU_MONITOR_INTERVAL);
		ckrm_cpu_monitor();
		if (thread_exit)
			break;
	}
	cpu_monitor_pid = -1;
	thread_exit = 2;
	printk("cpu_monitord exit\n");
	return 0;
}
void ckrm_start_monitor(void)
{
	cpu_monitor_pid = kernel_thread(ckrm_cpu_monitord, 0, CLONE_KERNEL);
	if (cpu_monitor_pid < 0) {
		printk("ckrm_cpu_monitord failed to start\n");
	}
}
void ckrm_kill_monitor(void)
{
	wait_queue_head_t wait;
	int interval = HZ; /*poll for the monitor's exit once a second*/

	init_waitqueue_head(&wait);

	printk("killing process %d\n", cpu_monitor_pid);
	if (cpu_monitor_pid > 0) {
		thread_exit = 1;
		while (thread_exit != 2) {
			interruptible_sleep_on_timeout(&wait, interval);
		}
	}
}
int ckrm_cpu_monitor_init(void)
{
	ckrm_start_monitor();
	/*hzheng: uncomment the following line for hard limit support */
	//	ckrm_start_ckrm_idle();
	return 0;
}

void ckrm_cpu_monitor_exit(void)
{
	ckrm_kill_monitor();
}

module_init(ckrm_cpu_monitor_init);
module_exit(ckrm_cpu_monitor_exit);
MODULE_AUTHOR("Haoqiang Zheng <hzheng@cs.columbia.edu>");
MODULE_DESCRIPTION("Hierarchical CKRM CPU Resource Monitor");
MODULE_LICENSE("GPL");