1 /* include/linux/ckrm_sched.h - Supports CKRM scheduling
3 * Copyright (C) Haoqiang Zheng, IBM Corp. 2004
4 * Copyright (C) Hubertus Franke, IBM Corp. 2004
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
17 * Please read Documentation/ckrm/cpu_sched for a general overview of
18 * how the O(1) CKRM scheduler works.
20 * ckrm_sched.h provides the definition for the per class local runqueue.
27 #include <linux/sched.h>
28 #include <linux/ckrm_rc.h>
29 #include <linux/ckrm_classqueue.h>
31 #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
/* BITMAP_SIZE: number of longs needed to hold MAX_PRIO+1 priority bits,
 * rounded up to a whole number of longs. */
/* Fields of the per-runqueue priority array (presumably struct prio_array,
 * matching the prio_array_t used below — the 'struct ... {' and closing
 * '};' lines are not visible in this listing; TODO confirm against the
 * full header). One run list per priority level, plus a bitmap of
 * non-empty levels and a count of runnable tasks. */
34 unsigned int nr_active;
35 unsigned long bitmap[BITMAP_SIZE];
36 struct list_head queue[MAX_PRIO];
40 #ifndef CONFIG_CKRM_CPU_SCHEDULE
/* CKRM CPU scheduling compiled out: fall back to the vanilla O(1)
 * scheduler's per-runqueue active/expired arrays and stub out the CKRM
 * init/monitor entry points. */
42 #define rq_active(p,rq) (rq->active)
43 #define rq_expired(p,rq) (rq->expired)
/* No-op when CKRM scheduling is disabled. */
44 static inline void init_ckrm_sched_res(void) {}
/* Monitor init trivially succeeds (returns 0) when CKRM is disabled. */
45 static inline int ckrm_cpu_monitor_init(void) {return 0;}
/* CONFIG_CKRM_CPU_SCHEDULE enabled: route active/expired through the
 * task's class-local runqueue instead of the global one.
 * NOTE(review): the '#else' line separating the two halves is not
 * visible in this listing. */
49 #define rq_active(p,rq) (get_task_lrq(p)->active)
50 #define rq_expired(p,rq) (get_task_lrq(p)->expired)
/* Global on/off switch for CKRM scheduling behavior.
 * NOTE(review): the enum's closing '};' line is not visible in this
 * listing. */
52 enum ckrm_sched_mode {
53 CKRM_SCHED_MODE_DISABLED, /* always use default linux scheduling */
54 /* effectively disables the ckrm scheduler */
55 CKRM_SCHED_MODE_ENABLED /* always uses ckrm scheduling behavior */
58 extern unsigned int ckrm_sched_mode; /* true internal sched_mode (DIS/EN ABLED) */
60 int __init init_ckrm_sched_res(void);
62 typedef unsigned long long CVT_t; // cumulative virtual time
/* Per-class, per-CPU local runqueue: each CPU class owns one of these on
 * every CPU, and the classqueue orders them by effective priority.
 * NOTE(review): several field declarations referenced by the inline
 * helpers below (top_priority, local_cvt, local_weight, savings, ...)
 * fall in lines not visible in this listing — confirm against the full
 * header before relying on the exact layout. */
64 struct ckrm_runqueue {
65 cq_node_t classqueue_linkobj; /*links in classqueue */
66 struct ckrm_cpu_class *cpu_class; // class it belongs to
67 struct classqueue_struct *classqueue; // classqueue it belongs to
68 unsigned long long uncounted_ns;
70 prio_array_t *active, *expired, arrays[2];
/* NOTE(review): the next three lines are the interior of a comment whose
 * opening marker (presumably describing expired_timestamp) is not
 * visible in this listing. */
72 set to 0 on init, become null or array switch
73 set to jiffies whenever a non-interactive job expires
74 reset to jiffies if expires
76 unsigned long expired_timestamp;
77 int best_expired_prio;
/* NOTE(review): the next three lines document a field (presumably
 * 'int top_priority;', used by set_top_priority() below) whose
 * declaration line is not visible in this listing. */
80 * highest priority of tasks in active
81 * initialized to be MAX_PRIO
82 * updated on enqueue, dequeue
87 unsigned long lrq_load;
89 /* Three different weights are distinguished:
90 * local_weight, skewed_weight, over_weight:
92 * - local_weight: main weight to drive CVT progression
93 * - over_weight: weight to reduce savings when over its guarantee
94 * - skewed_weight: weight to use when local_weight is too small,
95 * avoids starvation problems.
102 * unused CPU time accumulated while the class
103 * is inactive goes to savings
105 * initialized to be 0
106 * a class can't accumulate more than SAVING_THRESHOLD of savings
110 unsigned long magic; //for debugging
111 } ____cacheline_aligned_in_smp;
113 #define CKRM_LRQ_MAGIC (0xACDC0702)
115 typedef struct ckrm_runqueue ckrm_lrq_t;
117 #define ckrm_cpu_disabled() (ckrm_sched_mode == CKRM_SCHED_MODE_DISABLED)
118 #define ckrm_cpu_enabled() (ckrm_sched_mode == CKRM_SCHED_MODE_ENABLED)
121 * ckrm_cpu_class_stat - cpu usage statistics maintained for each class
/* NOTE(review): several fields (eshare/meshare, the savings flag, the
 * closing '};') fall in lines not visible in this listing. */
124 struct ckrm_cpu_class_stat {
125 spinlock_t stat_lock;
127 unsigned long long total_ns; /*how much nano-secs it has consumed */
129 struct ckrm_cpu_demand_stat local_stats[NR_CPUS];
134 unsigned long max_demand; /* the maximum a class can consume */
135 int egrt,megrt; /*effective guarantee, my effective guarantee*/
136 int ehl,mehl; /*effective hard limit, my effective hard limit*/
/* NOTE(review): the next two lines document the eshare/meshare fields,
 * whose declaration lines are not visible in this listing. */
139 * eshare: for both default class and its children
140 * meshare: just for the default class
145 /* a boolean indicates if the class has savings or not */
/* Scratch value used while reordering the surplus queue (see
 * reorder_surplus_queue in ckrm_cpu_monitor.c). */
151 int demand_per_share;
154 #define CKRM_CPU_CLASS_MAGIC 0x7af2abe3
/* Sliding-window CPU usage sampling parameters. */
156 #define USAGE_SAMPLE_FREQ (HZ) // sample once per second
157 #define USAGE_MAX_HISTORY (60) // keep the last 60 usage samples
158 #define NS_PER_SAMPLE (USAGE_SAMPLE_FREQ*(NSEC_PER_SEC/HZ))
/* Fields of the per-class usage history (presumably struct ckrm_usage,
 * matching ckrm_usage_init()/ckrm_sample_usage() below — the
 * 'struct ... {' and closing '};' lines are not visible in this
 * listing). */
161 unsigned long samples[USAGE_MAX_HISTORY]; //record usages
162 unsigned long sample_pointer; // pointer for the sliding window
163 unsigned long long last_ns; // ns for last sample
164 long long last_sample_jiffies; // in number of jiffies
168 * CPU controller object allocated for each CLASS
/* NOTE(review): the struct's closing '};' line is not visible in this
 * listing. */
170 struct ckrm_cpu_class {
171 struct ckrm_core_class *core;
172 struct ckrm_core_class *parent;
173 struct ckrm_shares shares;
174 spinlock_t cnt_lock; // always grab parent's lock first and then child's
175 struct ckrm_cpu_class_stat stat;
176 struct list_head links; // for linking up in cpu classes
177 struct list_head surplus_queue; //used for surplus allocation
178 ckrm_lrq_t* local_queues[NR_CPUS]; // per-CPU class-local runqueues
179 struct ckrm_usage usage;
180 unsigned long magic; // CKRM_CPU_CLASS_MAGIC, for debugging
186 #define cpu_class_weight(cls) (SHARE_TO_WEIGHT(cls->stat.meshare))
187 #define local_class_weight(lrq) (lrq->local_weight)
189 static inline int valid_cpu_class(struct ckrm_cpu_class * cls)
/* Non-zero iff cls is non-NULL and carries the expected magic cookie;
 * used to detect stale or corrupted class pointers.
 * NOTE(review): the surrounding '{' and '}' lines are not visible in
 * this listing. */
191 return (cls && cls->magic == CKRM_CPU_CLASS_MAGIC);
194 struct classqueue_struct *get_cpu_classqueue(int cpu);
195 struct ckrm_cpu_class * get_default_cpu_class(void);
198 static inline void ckrm_usage_init(struct ckrm_usage* usage)
/* Zero the sliding-window usage history and reset its cursor and
 * timestamps.
 * NOTE(review): the '{', the loop-index declaration ('int i;'), a
 * presumable last_ns reset, and the closing '}' fall in lines not
 * visible in this listing. */
202 for (i=0; i < USAGE_MAX_HISTORY; i++)
203 usage->samples[i] = 0;
204 usage->sample_pointer = 0;
206 usage->last_sample_jiffies = 0;
210 * this function can be called at any frequency
211 * it's self-contained
/* ckrm_sample_usage - fold the class's CPU consumption since the last
 * sample into the sliding usage-history window, scaling by the actual
 * elapsed duration (fixed-point, <<15).
 * NOTE(review): the function braces and the 'return;' that presumably
 * follows the 'duration < USAGE_SAMPLE_FREQ' check fall in lines not
 * visible in this listing. */
213 static inline void ckrm_sample_usage(struct ckrm_cpu_class* clsptr)
215 struct ckrm_usage* usage = &clsptr->usage;
216 unsigned long long cur_sample;
217 int duration = jiffies - usage->last_sample_jiffies;
219 // jiffies doesn't start from 0,
220 // so the first call just records the current timestamp
221 if (unlikely(!usage->last_sample_jiffies))
222 usage->last_sample_jiffies = jiffies;
224 // called too frequently: skip this sample
225 if (duration < USAGE_SAMPLE_FREQ)
228 usage->last_sample_jiffies = jiffies;
230 cur_sample = clsptr->stat.total_ns - usage->last_ns;
231 usage->last_ns = clsptr->stat.total_ns;
233 // scale it based on the sample duration
234 cur_sample *= ((USAGE_SAMPLE_FREQ<< 15)/duration);
236 usage->samples[usage->sample_pointer] = cur_sample;
237 // printk("sample = %llu jiffies=%lu \n",cur_sample, jiffies);
239 usage->sample_pointer ++;
240 if (usage->sample_pointer >= USAGE_MAX_HISTORY)
241 usage->sample_pointer = 0;
244 #define lrq_nr_running(lrq) \
245 (lrq->active->nr_active + lrq->expired->nr_active)
/* Class-local runqueue of (cls, cpu).
 * NOTE(review): the function braces are not visible in this listing. */
247 static inline ckrm_lrq_t *get_ckrm_lrq(struct ckrm_cpu_class*cls, int cpu)
249 return cls->local_queues[cpu];
/* Class-local runqueue the task currently belongs to: its class's queue
 * on the task's CPU.
 * NOTE(review): the function braces are not visible in this listing. */
252 static inline ckrm_lrq_t *get_task_lrq(struct task_struct *p)
254 return p->cpu_class->local_queues[task_cpu(p)];
257 #define task_list_entry(list) list_entry(list,struct task_struct,run_list)
258 #define class_list_entry(list) list_entry(list,struct ckrm_runqueue,classqueue_linkobj)
260 /* some additional interfaces exported from sched.c */
262 extern rwlock_t class_list_lock;
263 extern struct list_head active_cpu_classes;
264 unsigned int task_timeslice(task_t *p);
265 void _ckrm_cpu_change_class(task_t *task, struct ckrm_cpu_class *newcls);
267 void init_cpu_classes(void);
268 void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares);
269 void ckrm_cpu_change_class(void *task, void *old, void *new);
271 #define CPU_DEMAND_ENQUEUE 0
272 #define CPU_DEMAND_DEQUEUE 1
273 #define CPU_DEMAND_DESCHEDULE 2
274 #define CPU_DEMAND_INIT 3
276 /*functions exported by ckrm_cpu_monitor.c*/
277 int update_effectives(void);
278 void ckrm_cpu_monitor(int check_min);
279 int ckrm_cpu_monitor_init(void);
280 void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat, int eshares);
281 void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len);
282 void adjust_local_weight(void);
284 #define get_task_lrq_stat(p) (&(p)->cpu_class->stat.local_stats[task_cpu(p)])
285 #define get_cls_local_stat(cls,cpu) (&(cls)->stat.local_stats[cpu])
286 #define get_rq_local_stat(lrq,cpu) (get_cls_local_stat((lrq)->cpu_class,cpu))
288 /********************************************************************
289 * Parameters that determine how quickly CVT's progress and how
290 * priority can impact a LRQ's runqueue position. See also
291 * get_effective_prio(). These parameters need to be adjusted
292 * in accordance to the following example and understanding.
296 * A class with 50% share, can execute 500 ms / per sec ~ 2^29 ns.
297 * Its share will be set to 512 = 2^9. The global CLASSQUEUE_SIZE is set to 2^7.
298 * With CLASS_QUANTIZER=16, the local_cvt of this class will increase
299 * by 2^29/2^9 = 2^20 = 1024K.
300 * Setting CLASS_QUANTIZER to 16, 2^(20-16) = 16 slots / per second.
301 * Do the same math, a class with any share value, will cover 16 slots / per second.
302 * So 2^8 total slots is good track for 8 seconds of system execution
304 * PRIORITY_QUANTIZER:
306 * How much can top priorities of class impact slot bonus.
307 * There are 40 nice priorities, range from -20 to 19, with default nice = 0
308 * "2" will allow upto 5 slots improvement
309 * when certain task within the class has a nice value of -20
310 * in the RQ thus for 50% class it can perform ~300 msec starvation.
312 *******************************************************************/
315 * The class priority is biased toward classes with high priority tasks.
316 * But we need to prevent this bias from starving other classes.
317 * If a class has nice value of -20, how much it can starve the default class?
318 * priority bonus = (120-100) >> PRIORITY_QUANTIZER,
319 * if PRIORITY_QUANTIZER = 2, then it's 5 steps ahead
320 * A class without bonus thus can't get to run until:
321 * bonus * CKRM_MAX_WEIGHT * CVT_INC_PERSHARE = (120-100) >> PRIORITY_QUANTIZER
322 * (1 << CKRM_WEIGHT_SHIFT)
323 * (1 << CLASS_QUANTIZER)
327 * CKRM_WEIGHT_SHIFT and CLASS_QUANTIZER control how much a class with
328 * high priority task can starve a normal priority class, so it should
329 * be constant CLASS_QUANTIZER should not be too small otherwise we
330 * don't have enough bins in the classqueue.
331 * The ideal value of CLASS_QUANTIZER is 20, but a little smaller is acceptable
334 #define CLASS_QUANTIZER (18)// shift from ns to increase class bonus
335 #define PRIORITY_QUANTIZER (2) // how much a high prio task can borrow
336 #define CKRM_WEIGHT_SHIFT (8) // 1/2^x == finest weight granularity
337 #define CKRM_MAX_WEIGHT (1<<CKRM_WEIGHT_SHIFT) // - " -
340 * shares are set in a hierarchical path. Since specified share settings
341 * of a class (c) are relative to the parent (p) and its totals
342 * the shares can get very small, dependent on how many classes are
346 #define CKRM_SHARE_SHIFT (13)
347 #define CKRM_SHARE_MAX (1 << CKRM_SHARE_SHIFT)
349 #define SHARE_TO_WEIGHT(x) ((x) >> (CKRM_SHARE_SHIFT - CKRM_WEIGHT_SHIFT))
350 #define WEIGHT_TO_SHARE(x) ((x) << (CKRM_SHARE_SHIFT - CKRM_WEIGHT_SHIFT))
352 /* Other constants */
354 #define NSEC_PER_MS (1000000)
355 #define NSEC_PER_JIFFIES (NSEC_PER_SEC/HZ)
357 #define MAX_SAVINGS_ABSOLUTE (4LLU*NSEC_PER_SEC) // 4 seconds
358 #define CVT_UPDATE_TICK ((HZ/2)?:1)
359 #define MAX_SAVINGS MAX_SAVINGS_ABSOLUTE
360 #define SAVINGS_LEAK_SPEED (CVT_UPDATE_TICK/10*NSEC_PER_JIFFIES)
363 * get_effective_prio: return the effective priority of a class local queue
365 * class priority = progress * a + urgency * b
366 * progress = queue cvt
367 * urgency = queue top priority
368 * a and b are scaling factors
369 * currently, prio increases by 1 if either: top_priority increase by one
370 * or, local_cvt increases by 4ms
372 static inline int get_effective_prio(ckrm_lrq_t * lrq)
/* Effective classqueue priority = scaled cumulative virtual time
 * (progress) + scaled top task priority (urgency); smaller is better.
 * NOTE(review): the 'int prio;' declaration, 'return prio;' and the
 * function braces fall in lines not visible in this listing. */
376 prio = lrq->local_cvt >> CLASS_QUANTIZER; // cumulative usage
377 prio += lrq->top_priority >> PRIORITY_QUANTIZER; // queue urgency
382 CVT_t get_local_cur_cvt(int cpu);
385 * update_class_priority:
387 * called whenever cvt or top_priority changes
389 * internal: (calling structure)
390 * update_class_priority
391 * -- set_top_priority
392 * -- class_enqueue_task
393 * -- class_dequeue_task
394 * -- rq_get_next_task (queue switch)
395 * -- update_local_cvt
398 static inline void update_class_priority(ckrm_lrq_t *local_rq)
/* Recompute the lrq's effective priority and reposition it inside its
 * classqueue.
 * NOTE(review): the final argument line (presumably 'effective_prio);')
 * and the function braces fall in lines not visible in this listing. */
400 int effective_prio = get_effective_prio(local_rq);
401 classqueue_update_prio(local_rq->classqueue,
402 &local_rq->classqueue_linkobj,
407 * set the new top priority and reposition the queue
408 * called when: task enqueue/dequeue and queue switch
410 static inline void set_top_priority(ckrm_lrq_t *lrq,
/* NOTE(review): the second parameter line (presumably
 * 'int new_priority') and the function braces fall in lines not visible
 * in this listing. */
413 lrq->top_priority = new_priority;
414 update_class_priority(lrq);
418 * task_load: how much load this task counts
420 static inline unsigned long task_load(struct task_struct* p)
/* Load contribution of one task = its timeslice length times its
 * measured CPU demand.
 * NOTE(review): the function braces are not visible in this listing. */
422 return (task_timeslice(p) * p->demand_stat.cpu_demand);
426 * moved to ckrm_sched.c
427 * but may need to make it static inline to improve performance
429 void update_local_cvt(struct task_struct *p, unsigned long nsec);
431 static inline int class_preempts_curr(struct task_struct * p, struct task_struct* curr)
/* Non-zero when p's class-local runqueue sits at a strictly better
 * classqueue position than curr's, i.e. p's class should preempt.
 * NOTE(review): the function braces are not visible in this listing. */
433 struct cq_node_struct* node1 = &(get_task_lrq(p)->classqueue_linkobj);
434 struct cq_node_struct* node2 = &(get_task_lrq(curr)->classqueue_linkobj);
436 return (class_compare_prio(node1,node2) < 0);
440 * return a random value with range [0, (val-1)]
442 static inline int get_ckrm_rand(unsigned long val)
/* Cheap per-CPU pseudo-random value in [0, val-1]; keeps its state in
 * last_rand[cpu] so successive calls on a CPU advance the sequence.
 * NOTE(review): the 'rand' declaration, the actual generator step
 * between loading and storing last_rand[cpu], and the return fall in
 * lines not visible in this listing — do not rely on the exact
 * algorithm from this view. */
445 static int last_rand[NR_CPUS];
446 int cpu = smp_processor_id();
448 rand = last_rand[cpu];
453 last_rand[cpu] = rand;
457 void update_class_cputime(int this_cpu, int idle);
459 /**********************************************/
460 /* PID_LOAD_BALANCING */
461 /**********************************************/
463 #define CPU_PID_CTRL_TICK 32
/* PID-style load terms used for CKRM load balancing (see the
 * PID_LOAD_BALANCING section header above).
 * NOTE(review): the struct's closing '};' and ckrm_load_init's closing
 * '}' fall in lines not visible in this listing. */
465 struct ckrm_load_struct {
466 unsigned long load_p; /*proportional*/
467 unsigned long load_i; /*integral */
468 long load_d; /*derivative */
471 typedef struct ckrm_load_struct ckrm_load_t;
/* Reset all three PID terms to zero. */
473 static inline void ckrm_load_init(ckrm_load_t* ckrm_load) {
474 ckrm_load->load_p = 0;
475 ckrm_load->load_i = 0;
476 ckrm_load->load_d = 0;
479 void ckrm_load_sample(ckrm_load_t* ckrm_load,int cpu);
480 long ckrm_get_pressure(ckrm_load_t* ckrm_load, int local_group);
481 #define rq_ckrm_load(rq) (&((rq)->ckrm_load))
484 #endif /*CONFIG_CKRM_CPU_SCHEDULE */