1 /* ckrm_cpu_monitor.c - Hierarchical CKRM CPU Resource Monitor
3 * Copyright (C) Haoqiang Zheng, IBM Corp. 2004
4 * (C) Hubertus Franke, IBM Corp. 2004
6 * Latest version, more details at http://ckrm.sf.net
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
17 * 23 June 2004: Created
20 #include <linux/module.h>
21 #include <linux/init.h>
22 #include <asm/errno.h>
23 #include <linux/list.h>
24 #include <linux/spinlock.h>
25 #include <linux/ckrm.h>
26 #include <linux/ckrm_rc.h>
27 #include <linux/ckrm_tc.h>
28 #include <asm/div64.h>
29 #include <linux/ckrm_sched.h>
/* how often the monitor re-adjusts the shares (in jiffies) */
#define CPU_MONITOR_INTERVAL (HZ)
/* maximum share value: fixed-point scale of 1 << CKRM_SHARE_ACCURACY */
#define CKRM_SHARE_MAX (1<<CKRM_SHARE_ACCURACY)

/* state values passed to update_cpu_demand_stat() */
#define CKRM_CPU_DEMAND_RUN 0
#define CKRM_CPU_DEMAND_SLEEP 1
/* recalculation intervals are in ns: sample task cpu demand every 64ms */
#define CPU_DEMAND_TASK_RECALC (64000000LL)
/* ... and class cpu demand every 256ms */
#define CPU_DEMAND_CLASS_RECALC (256000000LL)
/* type argument of cpu_demand_stat_init(): class vs. task accumulator */
#define CPU_DEMAND_TP_CLASS 0
#define CPU_DEMAND_TP_TASK 1

extern struct ckrm_cpu_class *ckrm_get_cpu_class(struct ckrm_core_class *core);
/* forward declaration: defined below, used by the surplus allocator */
void update_ckrm_idle(unsigned long surplus);
45 /*interface to share definition*/
46 static inline int get_soft_limit(struct ckrm_cpu_class *cls)
48 return cls->shares.my_limit;
51 static inline int get_mysoft_limit(struct ckrm_cpu_class *cls)
53 return cls->shares.total_guarantee;
56 static inline int get_hard_limit(struct ckrm_cpu_class *cls)
58 return cls->shares.total_guarantee;
61 static inline int get_myhard_limit(struct ckrm_cpu_class *cls)
63 return cls->shares.total_guarantee;
/*
 * cpu_demand_stat_init - (re)initialize one cpu-demand accumulator.
 * @local_stat: the accumulator to reset
 * @type: CPU_DEMAND_TP_CLASS or CPU_DEMAND_TP_TASK; selects the
 *        recalculation window (classes average over a longer window
 *        than tasks)
 */
static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, int type)
	unsigned long long now = sched_clock();
	local_stat->total = 0;
	/* treat the object as sleeping from "now" until its first enqueue */
	local_stat->last_sleep = now;
	case CPU_DEMAND_TP_CLASS:
		local_stat->recalc_interval = CPU_DEMAND_CLASS_RECALC;
		local_stat->cpu_demand = 0;
	case CPU_DEMAND_TP_TASK:
		local_stat->recalc_interval = CPU_DEMAND_TASK_RECALC;
		/* for a task, the initial cpu_demand is copied from its parent */
/*
 * ckrm_cpu_stat_init - initialize the per-class scheduling statistics.
 * @stat: the statistics block embedded in a ckrm_cpu_class
 *
 * All effective limits and shares start at CKRM_SHARE_MAX (unlimited)
 * and every per-cpu demand accumulator is reset as a class accumulator.
 */
void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat)
	stat->stat_lock = SPIN_LOCK_UNLOCKED;
	for (i=0; i< NR_CPUS; i++) {
		cpu_demand_stat_init(&stat->local_stats[i],CPU_DEMAND_TP_CLASS);
	stat->ehl = CKRM_SHARE_MAX; /* default: no effective hard limit */
	stat->mehl = CKRM_SHARE_MAX; /* default: no limit on the self portion */
	stat->eshare = CKRM_SHARE_MAX;
	stat->meshare = CKRM_SHARE_MAX;
109 /**********************************************/
111 /**********************************************/
114 * How CPU demand is calculated:
115 * consider class local runqueue (clr) first
 * at any time, a clr can be in one of the following three states
 * -- run: a task belonging to this class is running on this cpu
118 * -- wait: at least one of its task is running, but the class is not running
119 * -- sleep: none of the task of this class is runnable
121 * cpu_demand(t1,t2) = r(t1,t2)/(r(t1,t2)+s(t1,t2))
123 * the cpu_demand of a class =
124 * sum of cpu_demand of all the class local runqueues
/*
 * update_cpu_demand_stat - fold a run/sleep interval into the demand estimate
 * @local_stat: per-cpu accumulator being updated
 * @state: CKRM_CPU_DEMAND_RUN or CKRM_CPU_DEMAND_SLEEP
 * @len: length of the interval, in ns
 *
 * Should be called whenever the state of a task / task local queue changes:
 * -- on deschedule: report how long it ran
 * -- on enqueue:    report how long it slept
 *
 * Once at least recalc_interval ns have accumulated, the ratio run/total
 * is computed as a CKRM_SHARE_ACCURACY-bit fixed-point value and folded
 * into cpu_demand via an exponential moving average (average of old and
 * new samples).
 */
static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat,int state, unsigned long long len)
	local_stat->total += len;
	if (state == CKRM_CPU_DEMAND_RUN)
		local_stat->run += len;
	if (local_stat->total >= local_stat->recalc_interval) {
		/* shift total so that run/total lands on the fixed-point scale */
		local_stat->total >>= CKRM_SHARE_ACCURACY;
		/* clamp both operands into do_div()'s 32-bit divisor range */
		if (unlikely(local_stat->run > 0xFFFFFFFF))
			local_stat->run = 0xFFFFFFFF;
		if (local_stat->total > 0xFFFFFFFF)
			local_stat->total = 0xFFFFFFFF;
		do_div(local_stat->run,(unsigned long)local_stat->total);
		if (local_stat->total > 0xFFFFFFFF) /* happens after a very long sleep */
			local_stat->cpu_demand = local_stat->run;
			/* moving average: (old_demand + new_sample) / 2 */
			local_stat->cpu_demand += local_stat->run;
			local_stat->cpu_demand >>= 1;
		local_stat->total = 0;
/*
 * cpu_demand_event - a cpu_demand event occurred
 * @local_stat: the per-cpu demand accumulator of the affected task/class
 * @event: one of the following events:
 *	CPU_DEMAND_ENQUEUE: local class enqueue (end of a sleep period)
 *	CPU_DEMAND_DEQUEUE: local class dequeue (start of a sleep period)
 *	CPU_DEMAND_DESCHEDULE: a task of this local class deschedules
 *	CPU_DEMAND_INIT: task init only — reset the accumulator
 * @len: valid only for CPU_DEMAND_DESCHEDULE: how long the task ran
 */
void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len)
	case CPU_DEMAND_ENQUEUE:
		/* the sleep period ends now; account it as sleep time */
		len = sched_clock() - local_stat->last_sleep;
		local_stat->last_sleep = 0;
		update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,len);
	case CPU_DEMAND_DEQUEUE:
		/* only start a sleep period if one isn't already running */
		if (! local_stat->last_sleep) {
			local_stat->last_sleep = sched_clock();
	case CPU_DEMAND_DESCHEDULE:
		update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_RUN,len);
	case CPU_DEMAND_INIT: /* for task init only */
		cpu_demand_stat_init(local_stat,CPU_DEMAND_TP_TASK);
/*
 * cpu_demand_check_sleep - account an in-progress sleep period.
 *
 * Deals with excessively long run/sleep states: whenever
 * ckrm_cpu_monitor runs, if the class-local queue on @cpu is currently
 * sleeping, the sleep time accumulated so far is folded into its demand
 * stats and the sleep timestamp restarted.
 */
static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu)
	struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu];
	unsigned long long sleep,now;
	if (local_stat->last_sleep) {
		/* NOTE(review): 'now' is read below; expect now = sched_clock()
		 * before this point — confirm against the full source */
		sleep = now - local_stat->last_sleep;
		local_stat->last_sleep = now;
		update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep);
/*
 * get_self_cpu_demand - cpu demand of the class itself (excluding children)
 *
 * self cpu demand = sum(cpu demand of all local queues), averaged over
 * the number of online cpus; in-progress sleep periods are accounted
 * first so very long sleeps don't inflate the estimate.
 */
static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat)
	for_each_online_cpu(i) {
		cpu_demand_check_sleep(stat,i);
		cpu_demand += stat->local_stats[i].cpu_demand;
	return (cpu_demand/cpuonline);
/*
 * get_mmax_demand - the class's own maximum demand:
 * my max demand = min(self cpu_demand, my effective hard limit)
 */
static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat)
	unsigned long mmax_demand = get_self_cpu_demand(stat);
	if (mmax_demand > stat->mehl)
		mmax_demand = stat->mehl;
/*
 * update_max_demand - update effective cpu demand for each class in the
 * hierarchy rooted at @root_core.
 *
 * Assumes root_core->parent == NULL.  A class's max_demand is its own
 * demand plus the max_demand of all children, capped by its effective
 * hard limit (ehl).  The tree is walked iteratively: descend into each
 * child, then climb back via hnode.parent.  Presumably returns 0 on
 * success and an error if the hierarchy changed mid-walk — confirm.
 */
static int update_max_demand(struct ckrm_core_class *root_core)
	struct ckrm_core_class *cur_core, *child_core;
	struct ckrm_cpu_class *cls,*c_cls;
	cur_core = root_core;
		if (!cur_core) { /* normal exit: whole tree visited */
		cls = ckrm_get_cpu_class(cur_core);
		if (! cls) /* invalid class, abort the walk */
		if (!child_core) /* first visit: start from the class's own demand */
			cls->stat.max_demand = get_mmax_demand(&cls->stat);
			c_cls = ckrm_get_cpu_class(child_core);
				cls->stat.max_demand += c_cls->stat.max_demand;
			else /* invalid c_cls, abort */
		/* cap at the class's effective hard limit */
		if (cls->stat.max_demand > cls->stat.ehl)
			cls->stat.max_demand = cls->stat.ehl;
		child_core = ckrm_get_next_child(cur_core, child_core);
			/* descend into this child's subtree */
			cur_core = child_core;
		} else { /* no more children: climb back to the parent */
			child_core = cur_core;
			cur_core = child_core->hnode.parent;
/**********************************************/
/*        effective guarantee & limit         */
/**********************************************/

/* Set the class's total effective share; negative shares are a bug. */
static inline void set_eshare(struct ckrm_cpu_class_stat *stat,
	BUG_ON(new_share < 0);
	stat->eshare = new_share;
/* Set the class's own (self) effective share; negative shares are a bug. */
static inline void set_meshare(struct ckrm_cpu_class_stat *stat,
	BUG_ON(new_share < 0);
	stat->meshare = new_share;
/*
 * update_child_effective - update egrt, ehl, mehl for all children of parent
 * @parent: the parent node
 * return -1 if anything wrong
 *
 * Each child's effective guarantee (egrt) is its my_guarantee fraction
 * of the parent's egrt; megrt is the part of egrt not promised to the
 * child's own children; ehl/mehl are the hard limits scaled likewise.
 */
static int update_child_effective(struct ckrm_core_class *parent)
	struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent);
	struct ckrm_core_class *child_core;
	child_core = ckrm_get_next_child(parent, NULL);
		struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core);
		/* child's guaranteed fraction of the parent's effective guarantee */
		c_cls->shares.my_guarantee / p_cls->shares.total_guarantee;
		/* portion of egrt the child keeps for itself (unused_guarantee) */
		c_cls->stat.megrt = c_cls->stat.egrt * c_cls->shares.unused_guarantee
			/ c_cls->shares.total_guarantee;
		get_hard_limit(c_cls) / p_cls->shares.total_guarantee;
		get_myhard_limit(c_cls) / c_cls->shares.total_guarantee;
		child_core = ckrm_get_next_child(parent, child_core);
/*
 * update_effectives - update egrt, ehl, mehl for the whole tree.
 * Should be called only when the class structure changed.
 *
 * return -1 if anything wrong happened (eg: the structure changed
 * during the process)
 */
static int update_effectives(struct ckrm_core_class *root_core)
	struct ckrm_core_class *cur_core, *child_core;
	struct ckrm_cpu_class *cls;
	cur_core = root_core;
	cls = ckrm_get_cpu_class(cur_core);
	/* initialize the effectives for the root */
	cls->stat.egrt = CKRM_SHARE_MAX; /* egrt of the root is always 100% */
	cls->stat.megrt = cls->stat.egrt * cls->shares.unused_guarantee
		/ cls->shares.total_guarantee;
	cls->stat.ehl = CKRM_SHARE_MAX * get_hard_limit(cls)
		/ cls->shares.total_guarantee;
	cls->stat.mehl = cls->stat.ehl * get_myhard_limit(cls)
		/ cls->shares.total_guarantee;
	/* iterative tree walk: update children, descend, then climb back */
	if (update_child_effective(cur_core) < 0)
		return ret; /* invalid cur_core node */
	child_core = ckrm_get_next_child(cur_core, child_core);
	/* go down to the next level of the hierarchy */
	cur_core = child_core;
	} else { /* no more children, go back up */
		child_core = cur_core;
		cur_core = child_core->hnode.parent;
/**********************************************/
/*           surplus allocation               */
/**********************************************/

/*
 * Surplus of a class = egrt - max_demand (clamped to 0 when demand
 * exceeds the guarantee).
 */
static inline int get_node_surplus(struct ckrm_cpu_class *cls)
	int surplus = cls->stat.egrt - cls->stat.max_demand;
/* Surplus of the class's own portion: megrt minus its own demand. */
static inline int get_my_node_surplus(struct ckrm_cpu_class *cls)
	int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat);
/*
 * node_surplus_consume - let one child consume part of the surplus
 * @check_sl: if set, also enforce the child's soft limit
 * @total_grt: total guarantee of the parent
 * return how much was consumed
 *
 * The child receives its my_guarantee fraction of the surplus, capped
 * by its demand, hard limit, and (optionally) soft limit; the child's
 * eshare is raised by the consumed amount.
 */
static inline int node_surplus_consume(int surplus,
	struct ckrm_core_class *child_core,
	struct ckrm_cpu_class *p_cls,
	struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core);
	int total_grt = p_cls->shares.total_guarantee;
	if (! c_cls || ! total_grt)
	/* can't consume more than demand or hard limit */
	if (c_cls->stat.eshare >= c_cls->stat.max_demand)
		surplus * c_cls->shares.my_guarantee / total_grt;
	if (! consumed) /* no share left for this child */
	/* cap the increase by demand (and, when check_sl, by soft limit) */
	inc_limit = c_cls->stat.max_demand - c_cls->stat.eshare;
		int esl = p_cls->stat.eshare * get_soft_limit(c_cls)
			/p_cls->shares.total_guarantee;
		if (esl < c_cls->stat.max_demand)
			inc_limit = esl - c_cls->stat.eshare;
	if (consumed > inc_limit)
		consumed = inc_limit;
	BUG_ON(consumed < 0);
	set_eshare(&c_cls->stat,c_cls->stat.eshare + consumed);
	BUG_ON(c_cls->stat.eshare < 0);
/*
 * alloc_surplus_node - re-allocate the shares for children under parent
 * @parent: parent node
 * return the remaining surplus
 *
 * Steps:
 * 1. gather the total surplus (parent's own + each child's)
 * 2. repeatedly hand out surplus to children until none is absorbed
 * 3. set the effective share of each node; leftovers go to the
 *    parent's own (default-class) portion
 */
static int alloc_surplus_node(struct ckrm_core_class *parent)
	int total_surplus , old_surplus;
	struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent);
	struct ckrm_core_class *child_core = NULL;
	total_surplus = get_my_node_surplus(p_cls);
	/* pass 1: collect surplus and reset each child's eshare to its egrt */
		child_core = ckrm_get_next_child(parent, child_core);
			struct ckrm_cpu_class *c_cls;
			c_cls = ckrm_get_cpu_class(child_core);
				total_surplus += get_node_surplus(c_cls);
				set_eshare(&c_cls->stat, c_cls->stat.egrt);
	} while (child_core);
	/* pass 2: distribute the surplus, round by round */
		if (!child_core) { /* start a new round */
			/* nothing consumed last round: everybody reached the soft limit */
			if (old_surplus == total_surplus)
			old_surplus = total_surplus;
		child_core = ckrm_get_next_child(parent, child_core);
			node_surplus_consume(old_surplus, child_core,
			total_surplus -= consumed;
	/* start a new round if something was allocated in the last round */
	} while (child_core || check_sl || total_surplus != old_surplus);
	/* how much for the parent itself */
	self_share = p_cls->stat.eshare *
		p_cls->shares.unused_guarantee / p_cls->shares.total_guarantee;
	if (self_share < p_cls->stat.max_demand) {
		/* any remaining surplus goes to the default class */
		self_share += total_surplus;
		if (self_share > p_cls->stat.max_demand)
			self_share = p_cls->stat.max_demand;
	set_meshare(&p_cls->stat, self_share);
/*
 * alloc_surplus - reallocate unused shares across the whole hierarchy
 *
 * A class's unused share is allocated to its siblings; the
 * re-allocation proceeds downward from the root.
 */
static int alloc_surplus(struct ckrm_core_class *root_core)
	struct ckrm_core_class *cur_core, *child_core;
	struct ckrm_cpu_class *cls;
	cur_core = root_core;
	cls = ckrm_get_cpu_class(cur_core);
	/* the root starts from its full effective guarantee */
	set_eshare(&cls->stat, cls->stat.egrt);
	/* the ckrm idle tasks get all that remains */
	/* hzheng: uncomment the following line for hard limit support */
	// update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand);
	if ( alloc_surplus_node(cur_core) < 0 )
	child_core = ckrm_get_next_child(cur_core, child_core);
	/* descend into this child's subtree */
	cur_core = child_core;
	} else { /* no more children, go back up */
		child_core = cur_core;
		cur_core = child_core->hnode.parent;
/**********************************************/
/*             CKRM Idle Tasks                */
/**********************************************/
/* the idle class object and one idle task per possible cpu */
struct ckrm_cpu_class ckrm_idle_class_obj, *ckrm_idle_class;
struct task_struct* ckrm_idle_tasks[NR_CPUS];

/*
 * get_nr_idle - how many ckrm idle tasks should be woken up for a given
 * surplus: surplus (fixed point) * nr online cpus, scaled back down and
 * capped at the number of online cpus.
 */
static inline int get_nr_idle(unsigned long surplus)
	int cpu_online = cpus_weight(cpu_online_map);
	nr_idle = surplus * cpu_online;
	nr_idle >>= CKRM_SHARE_ACCURACY;
	if (nr_idle > cpu_online)
		nr_idle = cpu_online;
/*
 * update_ckrm_idle - update the idle class for a new system surplus
 * @surplus: new system surplus
 *
 * Tasks:
 * -- set the idle class's shares to the surplus
 * -- wake up get_nr_idle(surplus) idle tasks; put the rest back to
 *    sleep (TASK_INTERRUPTIBLE + resched)
 */
void update_ckrm_idle(unsigned long surplus)
	int nr_idle = get_nr_idle(surplus);
	struct task_struct* idle_task;
	set_eshare(&ckrm_idle_class->stat,surplus);
	set_meshare(&ckrm_idle_class->stat,surplus);
	/* wake up nr_idle idle tasks */
	for_each_online_cpu(i) {
		idle_task = ckrm_idle_tasks[i];
		/* re-attach the task to the idle class if it drifted away */
		if (unlikely(idle_task->cpu_class != ckrm_idle_class)) {
			ckrm_cpu_change_class(idle_task,
					      idle_task->cpu_class,
			wake_up_process(idle_task);
			/* excess idle tasks go (back) to sleep */
			idle_task->state = TASK_INTERRUPTIBLE;
			set_tsk_need_resched(idle_task);
/*
 * ckrm_cpu_idled - body of a ckrm idle task: a lowest-priority daemon
 * that halts the cpu (like the arch idle loop) until it is told to
 * reschedule; woken/slept by ckrm_cpu_monitor via update_ckrm_idle().
 */
static int ckrm_cpu_idled(void *nothing)
	set_user_nice(current,19);
	daemonize("ckrm_idle_task");
	/* deactivate it; it will be woken up by ckrm_cpu_monitor */
	current->state = TASK_INTERRUPTIBLE;
	/* similar to cpu_idle */
	while (!need_resched()) {
		if (current_cpu_data.hlt_works_ok) {
			if (!need_resched()) {
				set_tsk_need_resched(current);
727 * ckrm_start_ckrm_idle:
728 * create the ckrm_idle_class and starts the idle tasks
731 void ckrm_start_ckrm_idle(void)
735 ckrm_shares_t shares;
737 ckrm_idle_class = &ckrm_idle_class_obj;
738 memset(ckrm_idle_class,0,sizeof(shares));
739 /*don't care about the shares */
740 init_cpu_class(ckrm_idle_class,&shares);
741 printk(KERN_INFO"ckrm idle class %x created\n",(int)ckrm_idle_class);
743 for_each_online_cpu(i) {
744 ret = kernel_thread(ckrm_cpu_idled, 0, CLONE_KERNEL);
746 /*warn on error, but the system should still work without it*/
748 printk(KERN_ERR"Warn: can't start ckrm idle tasks\n");
750 ckrm_idle_tasks[i] = find_task_by_pid(ret);
751 if (!ckrm_idle_tasks[i])
752 printk(KERN_ERR"Warn: can't find ckrm idle tasks %d\n",ret);
/**********************************************/
/*          local weight adjustment           */
/**********************************************/
/*
 * adjust_lrq_weight - adjust the per-cpu local weight of one class:
 * lrq->weight = lrq->pressure * class->weight / total_pressure
 */
static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online)
	unsigned long total_pressure = 0;
	unsigned long class_weight;
	unsigned long long lw;
	/* pass 1: sum the load pressure over all online cpus */
	for_each_online_cpu(i) {
		lrq = get_ckrm_lrq(clsptr,i);
		total_pressure += lrq->lrq_load;
	/* nothing queued anywhere: leave the weights alone */
	if (! total_pressure)
	class_weight = cpu_class_weight(clsptr) * cpu_online;
	/*
	 * pass 2: update the weight for each cpu, minimum is 1
	 */
	for_each_online_cpu(i) {
		lrq = get_ckrm_lrq(clsptr,i);
		/* give the idle class a high share to boost interactiveness */
		lw = cpu_class_weight(clsptr);
		/* proportional to this cpu's share of the total pressure */
		lw = lrq->lrq_load * class_weight;
		do_div(lw,total_pressure);
		else if (lw > CKRM_SHARE_MAX)
		lrq->local_weight = lw;
/*
 * adjust_local_weight - recompute the local weight of every active
 * class.  Assumes it is called with class_list_lock read lock held;
 * a trylock makes concurrent invocations no-ops.
 */
void adjust_local_weight(void)
	static spinlock_t lock = SPIN_LOCK_UNLOCKED;
	struct ckrm_cpu_class *clsptr;
	/* do nothing if someone is already holding the lock */
	if (! spin_trylock(&lock))
	cpu_online = cpus_weight(cpu_online_map);
	/* class status: demand, share, total_ns, prio, index */
	list_for_each_entry(clsptr,&active_cpu_classes,links) {
		adjust_lrq_weight(clsptr,cpu_online);
/**********************************************/
/*               main monitor                 */
/**********************************************/
/*
 * ckrm_cpu_monitor - adjust relative shares of the classes based on
 * their progress.
 *
 * Called every CPU_MONITOR_INTERVAL; computes the cpu demand of each
 * class and re-allocates the unused shares to other classes.  A
 * trylock makes concurrent invocations no-ops, and consecutive runs
 * are rate-limited to one per MIN_CPU_MONITOR_INTERVAL ns.
 */
void ckrm_cpu_monitor(void)
	static spinlock_t lock = SPIN_LOCK_UNLOCKED;
	static unsigned long long last_check = 0;
	struct ckrm_core_class *root_core = get_default_cpu_class()->core;
	unsigned long long now;
#define MIN_CPU_MONITOR_INTERVAL 100000000UL
	/* do nothing if someone is already holding the lock */
	if (! spin_trylock(&lock))
	read_lock(&class_list_lock);
	/* consecutive checks should be at least 100ms apart */
	if (now - last_check < MIN_CPU_MONITOR_INTERVAL) {
	if (update_effectives(root_core) != 0)
	if (update_max_demand(root_core) != 0)
	if (alloc_surplus(root_core) != 0)
	adjust_local_weight();
	read_unlock(&class_list_lock);
/*****************************************************/
/*            Supporting Functions                   */
/*****************************************************/
/* pid of the monitor daemon; -1 when it is not running */
static pid_t cpu_monitor_pid = -1;
/* shutdown handshake flag between ckrm_kill_monitor and the daemon */
static int thread_exit = 0;

/*
 * ckrm_cpu_monitord - monitor daemon: periodically runs
 * ckrm_cpu_monitor(), sleeping CPU_MONITOR_INTERVAL between rounds,
 * and clears cpu_monitor_pid on exit.
 */
static int ckrm_cpu_monitord(void *nothing)
	daemonize("ckrm_cpu_ctrld");
	/* sleep for some time before the next try */
	set_current_state(TASK_INTERRUPTIBLE);
	schedule_timeout(CPU_MONITOR_INTERVAL);
	cpu_monitor_pid = -1;
	printk("cpu_monitord exit\n");
903 void ckrm_start_monitor(void)
905 cpu_monitor_pid = kernel_thread(ckrm_cpu_monitord, 0, CLONE_KERNEL);
906 if (cpu_monitor_pid < 0) {
907 printk("ckrm_cpu_monitord for failed\n");
/*
 * ckrm_kill_monitor - stop the monitor daemon and wait (polling every
 * CPU_MONITOR_INTERVAL) until the thread_exit handshake reaches 2.
 */
void ckrm_kill_monitor(void)
	printk("killing process %d\n", cpu_monitor_pid);
	if (cpu_monitor_pid > 0) {
		while (thread_exit != 2) {
			set_current_state(TASK_INTERRUPTIBLE);
			schedule_timeout(CPU_MONITOR_INTERVAL);
/* module init: start the monitor daemon */
int ckrm_cpu_monitor_init(void)
	ckrm_start_monitor();
	/* hzheng: uncomment the following line for hard limit support */
	// ckrm_start_ckrm_idle();

/* module exit: stop the monitor daemon */
void ckrm_cpu_monitor_exit(void)

module_init(ckrm_cpu_monitor_init);
module_exit(ckrm_cpu_monitor_exit);

MODULE_AUTHOR("Haoqiang Zheng <hzheng@cs.columbia.edu>");
MODULE_DESCRIPTION("Hierarchical CKRM CPU Resource Monitor");
MODULE_LICENSE("GPL");