/* include/linux/ckrm_sched.h - Supports CKRM scheduling
 *
 * Copyright (C) Haoqiang Zheng, IBM Corp. 2004
 * Copyright (C) Hubertus Franke, IBM Corp. 2004
 *
 * Latest version, more details at http://ckrm.sf.net
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */
18 #include <linux/sched.h>
19 #include <linux/ckrm_rc.h>
20 #include <linux/ckrm_classqueue.h>
22 #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
25 unsigned int nr_active;
26 unsigned long bitmap[BITMAP_SIZE];
27 struct list_head queue[MAX_PRIO];
#ifdef CONFIG_CKRM_CPU_SCHEDULE
/* per-class local runqueues: reach the arrays through the task's class */
#define rq_active(p,rq)   (get_task_lrq(p)->active)
#define rq_expired(p,rq)  (get_task_lrq(p)->expired)
int __init init_ckrm_sched_res(void);
#else
/* vanilla scheduler: the arrays live directly in the per-cpu runqueue */
#define rq_active(p,rq)   (rq->active)
#define rq_expired(p,rq)  (rq->expired)
static inline void init_ckrm_sched_res(void) {}
static inline int ckrm_cpu_monitor_init(void) {return 0;}
#endif /* CONFIG_CKRM_CPU_SCHEDULE */
41 #ifdef CONFIG_CKRM_CPU_SCHEDULE
42 struct ckrm_runqueue {
43 cq_node_t classqueue_linkobj; /*links in classqueue */
44 struct ckrm_cpu_class *cpu_class; // class it belongs to
45 struct classqueue_struct *classqueue; // classqueue it belongs tow
46 unsigned long long uncounted_ns;
48 prio_array_t *active, *expired, arrays[2];
50 set to 0 on init, become null or array switch
51 set to jiffies whenever an non-interactive job expires
52 reset to jiffies if expires
54 unsigned long expired_timestamp;
57 * highest priority of tasks in active
58 * initialized to be MAX_PRIO
59 * updated on enqueue, dequeue
64 unsigned long lrq_load;
69 * unused CPU time accumulated while thoe class
70 * is inactive goes to savings
73 * a class can't accumulate more than SAVING_THRESHOLD of savings
74 * savings are kept in normalized form (like cvt)
75 * so when task share change the savings should be scaled accordingly
77 unsigned long long savings;
79 unsigned long magic; //for debugging
82 typedef struct ckrm_runqueue ckrm_lrq_t;
85 * ckrm_cpu_class_stat - cpu usage statistics maintained for each class
88 struct ckrm_cpu_class_stat {
91 unsigned long long total_ns; /*how much nano-secs it has consumed */
93 struct ckrm_cpu_demand_stat local_stats[NR_CPUS];
98 unsigned long max_demand; /* the maximun a class can consume */
99 int egrt,megrt; /*effective guarantee*/
100 int ehl,mehl; /*effective hard limit, my effective hard limit*/
103 * eshare: for both default class and its children
104 * meshare: just for the default class
110 #define CKRM_CPU_CLASS_MAGIC 0x7af2abe3
112 #define USAGE_SAMPLE_FREQ HZ //sample every 1 seconds
113 #define NS_PER_SAMPLE (USAGE_SAMPLE_FREQ*(NSEC_PER_SEC/HZ))
114 #define USAGE_WINDOW_SIZE 60 //keep the last 60 sample
117 unsigned long samples[USAGE_WINDOW_SIZE]; //record usages
118 unsigned long sample_pointer; //pointer for the sliding window
119 unsigned long long last_ns; //ns for last sample
120 long long last_sample_jiffies; //in number of jiffies
124 * manages the class status
125 * there should be only one instance of this object for each class in the whole system
127 struct ckrm_cpu_class {
128 struct ckrm_core_class *core;
129 struct ckrm_core_class *parent;
130 struct ckrm_shares shares;
131 spinlock_t cnt_lock; // always grab parent's lock first and then child's
132 struct ckrm_cpu_class_stat stat;
133 struct list_head links; // for linking up in cpu classes
134 ckrm_lrq_t local_queues[NR_CPUS]; // runqueues
135 struct ckrm_usage usage;
136 unsigned long magic; //for debugging
139 #define cpu_class_weight(cls) (cls->stat.meshare)
140 #define local_class_weight(lrq) (lrq->local_weight)
142 static inline int valid_cpu_class(struct ckrm_cpu_class * cls)
144 return (cls && cls->magic == CKRM_CPU_CLASS_MAGIC);
147 struct classqueue_struct *get_cpu_classqueue(int cpu);
148 struct ckrm_cpu_class * get_default_cpu_class(void);
151 static inline void ckrm_usage_init(struct ckrm_usage* usage)
155 for (i=0; i < USAGE_WINDOW_SIZE; i++)
156 usage->samples[i] = 0;
157 usage->sample_pointer = 0;
159 usage->last_sample_jiffies = 0;
163 * this function can be called at any frequency
164 * it's self-contained
166 static inline void ckrm_sample_usage(struct ckrm_cpu_class* clsptr)
168 struct ckrm_usage* usage = &clsptr->usage;
169 unsigned long long cur_sample;
170 int duration = jiffies - usage->last_sample_jiffies;
172 //jiffies wasn't start from 0
173 //so it need to be properly handled
174 if (unlikely(!usage->last_sample_jiffies))
175 usage->last_sample_jiffies = jiffies;
177 //called too frequenctly
178 if (duration < USAGE_SAMPLE_FREQ)
181 usage->last_sample_jiffies = jiffies;
183 cur_sample = clsptr->stat.total_ns - usage->last_ns;
184 usage->last_ns = clsptr->stat.total_ns;
186 //scale it based on the sample duration
187 cur_sample *= ((USAGE_SAMPLE_FREQ<< 15)/duration);
189 usage->samples[usage->sample_pointer] = cur_sample;
190 // printk("sample = %llu jiffies=%lu \n",cur_sample, jiffies);
192 usage->sample_pointer ++;
193 if (usage->sample_pointer >= USAGE_WINDOW_SIZE)
194 usage->sample_pointer = 0;
197 //duration is specified in number of jiffies
198 //return the usage in percentage
199 static inline int get_ckrm_usage(struct ckrm_cpu_class* clsptr, int duration)
201 int nr_samples = duration/USAGE_SAMPLE_FREQ?:1;
202 struct ckrm_usage* usage = &clsptr->usage;
203 unsigned long long total = 0;
206 if (nr_samples > USAGE_WINDOW_SIZE)
207 nr_samples = USAGE_WINDOW_SIZE;
209 idx = usage->sample_pointer;
210 for (i = 0; i< nr_samples; i++) {
212 idx = USAGE_WINDOW_SIZE;
214 total += usage->samples[idx];
217 do_div(total,nr_samples);
218 do_div(total,NS_PER_SAMPLE);
219 do_div(total,cpus_weight(cpu_online_map));
224 #define lrq_nr_running(lrq) \
225 (lrq->active->nr_active + lrq->expired->nr_active)
227 static inline ckrm_lrq_t *
228 get_ckrm_lrq(struct ckrm_cpu_class*cls, int cpu)
230 return &(cls->local_queues[cpu]);
233 static inline ckrm_lrq_t *get_task_lrq(struct task_struct *p)
235 return &(p->cpu_class->local_queues[task_cpu(p)]);
238 #define task_list_entry(list) list_entry(list,struct task_struct,run_list)
239 #define class_list_entry(list) list_entry(list,struct ckrm_runqueue,classqueue_linkobj)
241 /* some additional interfaces exported from sched.c */
243 extern rwlock_t class_list_lock;
244 extern struct list_head active_cpu_classes;
245 unsigned int task_timeslice(task_t *p);
246 void _ckrm_cpu_change_class(task_t *task, struct ckrm_cpu_class *newcls);
248 void init_cpu_classes(void);
249 void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares);
250 void ckrm_cpu_change_class(void *task, void *old, void *new);
253 #define CPU_DEMAND_ENQUEUE 0
254 #define CPU_DEMAND_DEQUEUE 1
255 #define CPU_DEMAND_DESCHEDULE 2
256 #define CPU_DEMAND_INIT 3
258 /*functions exported by ckrm_cpu_monitor.c*/
259 void ckrm_cpu_monitor(void);
260 int ckrm_cpu_monitor_init(void);
261 void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat);
262 void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len);
263 void adjust_local_weight(void);
265 #define get_task_lrq_stat(p) (&(p)->cpu_class->stat.local_stats[task_cpu(p)])
266 #define get_cls_local_stat(cls,cpu) (&(cls)->stat.local_stats[cpu])
267 #define get_rq_local_stat(lrq,cpu) (get_cls_local_stat((lrq)->cpu_class,cpu))
269 /********************************************************************
270 * Parameters that determine how quickly CVT's progress and how
271 * priority can impact a LRQ's runqueue position. See also
272 * get_effective_prio(). These parameters need to adjusted
273 * in accordance to the following example and understanding.
277 * A class with 5% share, can execute 50M nsecs / per sec ~ 2^28.
278 * It's share will be set to 512 = 2^9. The globl CLASSQUEUE_SIZE is set to 2^7.
279 * With CLASS_QUANTIZER=16, the local_cvt of this class will increase
280 * by 2^28/2^9 = 2^19 = 512K.
281 * Setting CLASS_QUANTIZER to 16, 2^(19-16) = 8 slots / per second.
282 * A class with 5% shares, will cover 80 slots / per second.
284 * PRIORITY_QUANTIZER:
286 * How much can top priorities of class impact slot bonus.
287 * There are 40 nice priorities. "2" will allow upto 10 slots improvement
288 * in the RQ thus for 50% class it can perform ~1sec starvation.
290 *******************************************************************/
292 #define CLASS_QUANTIZER 16 //shift from ns to increase class bonus
293 #define PRIORITY_QUANTIZER 2 //controls how much a high prio task can borrow
295 #define CKRM_SHARE_ACCURACY 10
296 #define NSEC_PER_MS 1000000
297 #define NSEC_PER_JIFFIES (NSEC_PER_SEC/HZ)
300 #define MAX_SAVINGS_ABSOLUTE (10LLU*NSEC_PER_SEC) // 10 seconds
302 #define CVT_UPDATE_TICK ((HZ/2)?:1)
304 // ABSOLUTE_CKRM_TUNING determines whether classes can make up
305 // lost time in absolute time or in relative values
307 #define ABSOLUTE_CKRM_TUNING // preferred due to more predictable behavior
#ifdef ABSOLUTE_CKRM_TUNING

#define MAX_SAVINGS MAX_SAVINGS_ABSOLUTE
/* an absolute bonus of 200ms for classes when reactivated */
#define INTERACTIVE_BONUS(lrq) ((200*NSEC_PER_MS)/local_class_weight(lrq))
#define SAVINGS_LEAK_SPEED (CVT_UPDATE_TICK/10*NSEC_PER_JIFFIES)

#define scale_cvt(val,lrq) ((val)*local_class_weight(lrq))
#define unscale_cvt(val,lrq) (do_div(val,local_class_weight(lrq)))

#else /* relative (normalized) tuning */

#define MAX_SAVINGS (MAX_SAVINGS_ABSOLUTE >> CKRM_SHARE_ACCURACY)
/*
 * to improve system responsiveness
 * an inactive class is put a little bit ahead of the current class when it wakes up
 * the amount is set in normalized terms to simplify the calculation
 * for class with 100% share, it can be 2s ahead
 * while for class with 10% share, it can be 200ms ahead
 */
#define INTERACTIVE_BONUS(lrq) (2*NSEC_PER_MS)

/*
 * normalized savings can't be more than MAX_NORMALIZED_SAVINGS
 * based on the current configuration
 * this means that a class with share 100% will accumulate 10s at most
 * while a class with 1% of the share can only accumulate 100ms
 */

/* a class with share 100% can get 100ms every 500ms
 * while a class with share 10% can only get 10ms every 500ms */
#define SAVINGS_LEAK_SPEED ((CVT_UPDATE_TICK/5*NSEC_PER_JIFFIES) >> CKRM_SHARE_ACCURACY)

#define scale_cvt(val,lrq) (val)
#define unscale_cvt(val,lrq) (val)

#endif /* ABSOLUTE_CKRM_TUNING */
349 * get_effective_prio: return the effective priority of a class local queue
351 * class priority = progress * a + urgency * b
352 * progress = queue cvt
353 * urgency = queue top priority
354 * a and b are scaling factors
355 * currently, prio increases by 1 if either: top_priority increase by one
356 * or, local_cvt increases by 4ms
358 static inline int get_effective_prio(ckrm_lrq_t * lrq)
362 prio = lrq->local_cvt >> CLASS_QUANTIZER; // cumulative usage
363 prio += lrq->top_priority >> PRIORITY_QUANTIZER; // queue urgency
368 CVT_t get_local_cur_cvt(int cpu);
371 * update_class_priority:
373 * called whenever cvt or top_priority changes
375 * internal: (calling structure)
376 * update_class_priority
377 * -- set_top_priority
378 * -- class_enqueue_task
379 * -- class_dequeue_task
380 * -- rq_get_next_task (queue switch)
381 * -- update_local_cvt
384 static inline void update_class_priority(ckrm_lrq_t *local_rq)
386 int effective_prio = get_effective_prio(local_rq);
387 classqueue_update_prio(local_rq->classqueue,
388 &local_rq->classqueue_linkobj,
393 * set the new top priority and reposition the queue
394 * called when: task enqueue/dequeue and queue switch
396 static inline void set_top_priority(ckrm_lrq_t *lrq,
399 lrq->top_priority = new_priority;
400 update_class_priority(lrq);
404 * task_load: how much load this task counts
406 static inline unsigned long task_load(struct task_struct* p)
408 return (task_timeslice(p) * p->demand_stat.cpu_demand);
412 * runqueue load is the local_weight of all the classes on this cpu
413 * must be called with class_list_lock held
415 static inline unsigned long ckrm_cpu_load(int cpu)
417 struct ckrm_cpu_class *clsptr;
419 struct ckrm_cpu_demand_stat* l_stat;
423 list_for_each_entry(clsptr,&active_cpu_classes,links) {
424 lrq = get_ckrm_lrq(clsptr,cpu);
425 l_stat = get_cls_local_stat(clsptr,cpu);
426 load = lrq->local_weight;
427 if (l_stat->cpu_demand < load)
428 load = l_stat->cpu_demand;
434 static inline void class_enqueue_task(struct task_struct *p,
435 prio_array_t * array)
440 lrq = get_task_lrq(p);
442 cpu_demand_event(&p->demand_stat,CPU_DEMAND_ENQUEUE,0);
443 lrq->lrq_load += task_load(p);
445 if ((p->prio < lrq->top_priority) && (array == lrq->active))
446 set_top_priority(lrq, p->prio);
448 if (! cls_in_classqueue(&lrq->classqueue_linkobj)) {
449 cpu_demand_event(get_task_lrq_stat(p),CPU_DEMAND_ENQUEUE,0);
450 effective_prio = get_effective_prio(lrq);
451 classqueue_enqueue(lrq->classqueue, &lrq->classqueue_linkobj, effective_prio);
456 static inline void class_dequeue_task(struct task_struct *p,
457 prio_array_t * array)
459 ckrm_lrq_t *lrq = get_task_lrq(p);
460 unsigned long load = task_load(p);
462 BUG_ON(lrq->lrq_load < load);
463 lrq->lrq_load -= load;
465 cpu_demand_event(&p->demand_stat,CPU_DEMAND_DEQUEUE,0);
467 if ((array == lrq->active) && (p->prio == lrq->top_priority)
468 && list_empty(&(array->queue[p->prio])))
469 set_top_priority(lrq,
470 find_next_bit(array->bitmap, MAX_PRIO,
475 * called after a task is switched out. Update the local cvt accounting
476 * we need to stick with long instead of long long due to nonexistent 64-bit division
478 static inline void update_local_cvt(struct task_struct *p, unsigned long nsec)
480 ckrm_lrq_t * lrq = get_task_lrq(p);
482 unsigned long cvt_inc = nsec / local_class_weight(lrq);
484 lrq->local_cvt += cvt_inc;
485 lrq->uncounted_ns += nsec;
487 update_class_priority(lrq);
490 static inline int class_preempts_curr(struct task_struct * p, struct task_struct* curr)
492 struct cq_node_struct* node1 = &(get_task_lrq(p)->classqueue_linkobj);
493 struct cq_node_struct* node2 = &(get_task_lrq(curr)->classqueue_linkobj);
495 return (class_compare_prio(node1,node2) < 0);
499 * return a random value with range [0, (val-1)]
501 static inline int get_ckrm_rand(unsigned long val)
504 static int last_rand[NR_CPUS];
505 int cpu = smp_processor_id();
507 rand = last_rand[cpu];
512 last_rand[cpu] = rand;
516 void update_class_cputime(int this_cpu);
518 /**********************************************/
519 /* PID_LOAD_BALANCING */
520 /**********************************************/
/* PID-controller state for CKRM load balancing */
struct ckrm_load_struct {
	unsigned long load_p;	/* proportional */
	unsigned long load_i;	/* integral */
	long load_d;		/* derivative */
};

typedef struct ckrm_load_struct ckrm_load_t;
529 static inline void ckrm_load_init(ckrm_load_t* ckrm_load) {
530 ckrm_load->load_p = 0;
531 ckrm_load->load_i = 0;
532 ckrm_load->load_d = 0;
535 void ckrm_load_sample(ckrm_load_t* ckrm_load,int cpu);
536 long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group);
537 #define rq_ckrm_load(rq) (&((rq)->ckrm_load))
539 static inline void ckrm_sched_tick(unsigned long j,int this_cpu,struct ckrm_load_struct* ckrm_load)
541 read_lock(&class_list_lock);
544 ckrm_load_sample(ckrm_load,this_cpu);
547 if (! (j % CVT_UPDATE_TICK)) {
548 // printk("ckrm_sched j=%lu\n",j);
549 classqueue_update_base(get_cpu_classqueue(this_cpu));
550 update_class_cputime(this_cpu);
553 read_unlock(&class_list_lock);
556 #endif //CONFIG_CKRM_CPU_SCHEDULE