/* include/linux/ckrm_sched.h - Supports CKRM scheduling
 *
 * Copyright (C) Haoqiang Zheng, IBM Corp. 2004
 * Copyright (C) Hubertus Franke, IBM Corp. 2004
 *
 * Latest version, more details at http://ckrm.sf.net
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

#ifndef _CKRM_SCHED_H
#define _CKRM_SCHED_H

#include <linux/sched.h>
#include <linux/ckrm_rc.h>
#include <linux/ckrm_classqueue.h>

#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))

struct prio_array {
	unsigned int nr_active;
	unsigned long bitmap[BITMAP_SIZE];
	struct list_head queue[MAX_PRIO];
};

#ifdef CONFIG_CKRM_CPU_SCHEDULE
#define rq_active(p,rq)   (get_task_lrq(p)->active)
#define rq_expired(p,rq)  (get_task_lrq(p)->expired)
int __init init_ckrm_sched_res(void);
#else
#define rq_active(p,rq)   (rq->active)
#define rq_expired(p,rq)  (rq->expired)
static inline void init_ckrm_sched_res(void) {}
static inline int ckrm_cpu_monitor_init(void) {return 0;}
#endif //CONFIG_CKRM_CPU_SCHEDULE

#ifdef CONFIG_CKRM_CPU_SCHEDULE
struct ckrm_runqueue {
	cq_node_t classqueue_linkobj;		/* links in classqueue */
	struct ckrm_cpu_class *cpu_class;	// class it belongs to
	struct classqueue_struct *classqueue;	// classqueue it belongs to
	unsigned long long uncounted_ns;

	prio_array_t *active, *expired, arrays[2];
	/*
	 * set to 0 on init or on array switch
	 * set to jiffies whenever a non-interactive job expires
	 * reset to jiffies if it expires
	 */
	unsigned long expired_timestamp;

	/*
	 * highest priority of tasks in active
	 * initialized to be MAX_PRIO
	 * updated on enqueue, dequeue
	 */
	int top_priority;
	CVT_t local_cvt;

	unsigned long lrq_load;
	int local_weight;

	/*
	 * unused CPU time accumulated while the class
	 * is inactive goes to savings
	 *
	 * initialized to be 0
	 * a class can't accumulate more than SAVING_THRESHOLD of savings
	 */
	unsigned long long savings;

	unsigned long magic;	//for debugging
};

typedef struct ckrm_runqueue ckrm_lrq_t;

/**
 * ckrm_cpu_class_stat - cpu usage statistics maintained for each class
 *
 */
struct ckrm_cpu_class_stat {
	spinlock_t stat_lock;

	unsigned long long total_ns;	/* how many nano-secs it has consumed */

	struct ckrm_cpu_demand_stat local_stats[NR_CPUS];

	unsigned long max_demand;	/* the maximum a class can consume */
	int egrt, megrt;	/* effective guarantee, my effective guarantee */
	int ehl, mehl;		/* effective hard limit, my effective hard limit */

	/*
	 * eshare: for both default class and its children
	 * meshare: just for the default class
	 */
	int eshare;
	int meshare;
};

#define CKRM_CPU_CLASS_MAGIC 0x7af2abe3

#define USAGE_SAMPLE_FREQ HZ	//sample once per second
#define NS_PER_SAMPLE (USAGE_SAMPLE_FREQ*(NSEC_PER_SEC/HZ))
#define USAGE_WINDOW_SIZE 60	//keep the last 60 samples

struct ckrm_usage {
	unsigned long samples[USAGE_WINDOW_SIZE];	//record usages
	unsigned long sample_pointer;	//pointer for the sliding window
	unsigned long long last_ns;	//ns for last sample
	long long last_sample_jiffies;	//in number of jiffies
};

/*
 * manages the class status
 * there should be only one instance of this object for each class in the whole system
 */
struct ckrm_cpu_class {
	struct ckrm_core_class *core;
	struct ckrm_core_class *parent;
	struct ckrm_shares shares;
	spinlock_t cnt_lock;	// always grab parent's lock first and then child's
	struct ckrm_cpu_class_stat stat;
	struct list_head links;	// for linking up in cpu classes
	ckrm_lrq_t local_queues[NR_CPUS];	// runqueues
	struct ckrm_usage usage;
	unsigned long magic;	//for debugging
};
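/*
 * Illustrative sketch (not part of the original interface): how the pieces
 * above hang together.  Each ckrm_cpu_class owns one ckrm_lrq_t local
 * runqueue per CPU plus a single ckrm_usage sliding window; the helpers
 * defined further below resolve a task or a (class, cpu) pair to the right
 * local queue, roughly:
 *
 *	struct ckrm_cpu_class *cls = p->cpu_class;
 *	ckrm_lrq_t *lrq = &cls->local_queues[task_cpu(p)];  // == get_task_lrq(p)
 *	int weight = lrq->local_weight;                      // per-cpu share weight
 */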
#define cpu_class_weight(cls) (cls->stat.meshare)
#define local_class_weight(lrq) (lrq->local_weight)

static inline int valid_cpu_class(struct ckrm_cpu_class *cls)
{
	return (cls && cls->magic == CKRM_CPU_CLASS_MAGIC);
}

struct classqueue_struct *get_cpu_classqueue(int cpu);
struct ckrm_cpu_class *get_default_cpu_class(void);

static inline void ckrm_usage_init(struct ckrm_usage *usage)
{
	int i;

	for (i = 0; i < USAGE_WINDOW_SIZE; i++)
		usage->samples[i] = 0;
	usage->sample_pointer = 0;
	usage->last_ns = 0;
	usage->last_sample_jiffies = 0;
}

/*
 * this function can be called at any frequency
 * it's self-contained
 */
static inline void ckrm_sample_usage(struct ckrm_cpu_class *clsptr)
{
	struct ckrm_usage *usage = &clsptr->usage;
	unsigned long long cur_sample;
	int duration = jiffies - usage->last_sample_jiffies;

	//jiffies doesn't start from 0,
	//so it needs to be handled properly
	if (unlikely(!usage->last_sample_jiffies))
		usage->last_sample_jiffies = jiffies;

	//called too frequently
	if (duration < USAGE_SAMPLE_FREQ)
		return;

	usage->last_sample_jiffies = jiffies;

	cur_sample = clsptr->stat.total_ns - usage->last_ns;
	usage->last_ns = clsptr->stat.total_ns;

	//scale it based on the sample duration
	cur_sample *= ((USAGE_SAMPLE_FREQ << 15) / duration);
	cur_sample >>= 15;
	usage->samples[usage->sample_pointer] = cur_sample;
	//	printk("sample = %llu jiffies=%lu \n",cur_sample, jiffies);

	usage->sample_pointer++;
	if (usage->sample_pointer >= USAGE_WINDOW_SIZE)
		usage->sample_pointer = 0;
}
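/*
 * Worked example (illustrative, with assumed numbers): suppose HZ=1000, so
 * USAGE_SAMPLE_FREQ=1000 jiffies, and ckrm_sample_usage() runs after
 * duration=1500 jiffies during which the class consumed 300ms of CPU time.
 * The scaling above normalizes that to one sample period:
 *
 *	cur_sample *= (1000 << 15) / 1500;	// factor ~0.666 with 15 fractional bits
 *	cur_sample >>= 15;			// -> ~200ms worth of ns per period
 *
 * The <<15 / >>15 pair is integer fixed-point arithmetic, avoiding
 * floating point in kernel code.
 */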
//duration is specified in number of jiffies
//return the usage in percentage
static inline int get_ckrm_usage(struct ckrm_cpu_class *clsptr, int duration)
{
	int nr_samples = duration / USAGE_SAMPLE_FREQ ? : 1;
	struct ckrm_usage *usage = &clsptr->usage;
	unsigned long long total = 0;
	int i, idx;

	if (nr_samples > USAGE_WINDOW_SIZE)
		nr_samples = USAGE_WINDOW_SIZE;

	idx = usage->sample_pointer;
	for (i = 0; i < nr_samples; i++) {
		if (!idx)
			idx = USAGE_WINDOW_SIZE;
		idx--;
		total += usage->samples[idx];
	}
	total *= 100;
	do_div(total, nr_samples);
	do_div(total, NS_PER_SAMPLE);
	do_div(total, cpus_weight(cpu_online_map));
	return total;
}

#define lrq_nr_running(lrq) \
	(lrq->active->nr_active + lrq->expired->nr_active)

static inline ckrm_lrq_t *get_ckrm_lrq(struct ckrm_cpu_class *cls, int cpu)
{
	return &(cls->local_queues[cpu]);
}

static inline ckrm_lrq_t *get_task_lrq(struct task_struct *p)
{
	return &(p->cpu_class->local_queues[task_cpu(p)]);
}

#define task_list_entry(list)  list_entry(list,struct task_struct,run_list)
#define class_list_entry(list) list_entry(list,struct ckrm_runqueue,classqueue_linkobj)

/* some additional interfaces exported from sched.c */
struct runqueue;
extern rwlock_t class_list_lock;
extern struct list_head active_cpu_classes;
unsigned int task_timeslice(task_t *p);
void _ckrm_cpu_change_class(task_t *task, struct ckrm_cpu_class *newcls);

void init_cpu_classes(void);
void init_cpu_class(struct ckrm_cpu_class *cls, ckrm_shares_t *shares);
void ckrm_cpu_change_class(void *task, void *old, void *new);

#define CPU_DEMAND_ENQUEUE 0
#define CPU_DEMAND_DEQUEUE 1
#define CPU_DEMAND_DESCHEDULE 2
#define CPU_DEMAND_INIT 3

/* functions exported by ckrm_cpu_monitor.c */
void ckrm_cpu_monitor(int check_min);
int ckrm_cpu_monitor_init(void);
void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat);
void cpu_demand_event(struct ckrm_cpu_demand_stat *local_stat, int event,
		      unsigned long long len);
void adjust_local_weight(void);

#define get_task_lrq_stat(p) (&(p)->cpu_class->stat.local_stats[task_cpu(p)])
#define get_cls_local_stat(cls,cpu) (&(cls)->stat.local_stats[cpu])
#define get_rq_local_stat(lrq,cpu) (get_cls_local_stat((lrq)->cpu_class,cpu))

/********************************************************************
 * Parameters that determine how quickly CVTs progress and how much
 * priority can impact an LRQ's position in the classqueue.  See also
 * get_effective_prio().  These parameters need to be adjusted in
 * accordance with the following example and understanding.
 *
 * CLASS_QUANTIZER:
 *
 * A class with a 50% share can execute 500 ms per second, ~2^29 ns.
 * Its share weight will be set to 512 = 2^9.  The global CLASSQUEUE_SIZE
 * is set to 2^7.  The local_cvt of this class will increase by
 * 2^29/2^9 = 2^20 = 1024K per second.  With CLASS_QUANTIZER=16, that is
 * 2^(20-16) = 16 slots per second.  By the same math, a class with any
 * share value will cover 16 slots per second, so the 2^7 total slots
 * track about 8 seconds of system execution.
 *
 * PRIORITY_QUANTIZER:
 *
 * How much the top priority of a class can improve its slot position.
 * There are 40 nice priorities, ranging from -20 to 19, with default
 * nice = 0.  A value of 2 allows up to 5 slots of improvement when some
 * task within the class has a nice value of -20; in the RQ this lets a
 * 50% class inflict roughly ~300 msec of starvation.
 *******************************************************************/
#define CLASS_QUANTIZER 16	//shift from ns to increase class bonus
#define PRIORITY_QUANTIZER 2	//controls how much a high prio task can borrow
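/*
 * Illustrative arithmetic (assumed values, following the example above):
 * nice -20 sits 20 priority levels above the default nice 0, and
 * 20 >> PRIORITY_QUANTIZER(2) = 5 slots of bonus.  At 16 slots per second
 * (from the CLASS_QUANTIZER example), 5 slots correspond to roughly
 * 5/16 s ~ 300 msec, which is where the starvation figure above comes from.
 */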
#define CKRM_SHARE_ACCURACY 13
#define NSEC_PER_MS 1000000
#define NSEC_PER_JIFFIES (NSEC_PER_SEC/HZ)

#define MAX_SAVINGS_ABSOLUTE (10LLU*NSEC_PER_SEC)	// 10 seconds
#define CVT_UPDATE_TICK ((HZ/2)?:1)

// ABSOLUTE_CKRM_TUNING determines whether classes can make up
// lost time in absolute time or in relative values
#define ABSOLUTE_CKRM_TUNING	// preferred due to more predictable behavior

#ifdef ABSOLUTE_CKRM_TUNING
#define MAX_SAVINGS MAX_SAVINGS_ABSOLUTE

//an absolute bonus of 200ms for classes when reactivated
#define INTERACTIVE_BONUS(lrq) ((200*NSEC_PER_MS)/local_class_weight(lrq))
#define SAVINGS_LEAK_SPEED (CVT_UPDATE_TICK/10*NSEC_PER_JIFFIES)

#define scale_cvt(val,lrq)   ((val)*local_class_weight(lrq))
#define unscale_cvt(val,lrq) (do_div(val,local_class_weight(lrq)))

#else

#define MAX_SAVINGS (MAX_SAVINGS_ABSOLUTE >> CKRM_SHARE_ACCURACY)
/*
 * to improve system responsiveness
 * an inactive class is put a little bit ahead of the current class when it wakes up
 * the amount is set in normalized terms to simplify the calculation
 * for a class with 100% share, it can be 2s ahead
 * while for a class with 10% share, it can be 200ms ahead
 */
#define INTERACTIVE_BONUS(lrq) (2*NSEC_PER_MS)

/*
 * normalized savings can't be more than MAX_NORMALIZED_SAVINGS
 * based on the current configuration
 * this means that a class with 100% share will accumulate 10s at most
 * while a class with 1% of the share can only accumulate 100ms
 */

//a class with 100% share can get 100ms every 500ms
//while a class with 10% share can only get 10ms every 500ms
#define SAVINGS_LEAK_SPEED ((CVT_UPDATE_TICK/5*NSEC_PER_JIFFIES) >> CKRM_SHARE_ACCURACY)

#define scale_cvt(val,lrq)   (val)
#define unscale_cvt(val,lrq) (val)

#endif

/**
 * get_effective_prio: return the effective priority of a class local queue
 *
 * class priority = progress * a + urgency * b
 * progress = queue cvt
 * urgency = queue top priority
 * a and b are scaling factors
 * currently, prio increases by 1 if either top_priority increases by one
 * or local_cvt increases by 4ms
 */
static inline int get_effective_prio(ckrm_lrq_t *lrq)
{
	int prio;

	prio = lrq->local_cvt >> CLASS_QUANTIZER;	// cumulative usage
#ifndef URGENCY_SUPPORT
#warning "ACB removing urgency calculation from get_effective_prio"
#else
	prio += lrq->top_priority >> PRIORITY_QUANTIZER;	// queue urgency
#endif

	return prio;
}

CVT_t get_local_cur_cvt(int cpu);

/**
 * update_class_priority:
 *
 * called whenever cvt or top_priority changes
 *
 * internal: (calling structure)
 * update_class_priority
 *   -- set_top_priority
 *      -- class_enqueue_task
 *      -- class_dequeue_task
 *      -- rq_get_next_task (queue switch)
 *   -- update_local_cvt
 *      -- schedule
 */
static inline void update_class_priority(ckrm_lrq_t *local_rq)
{
	int effective_prio = get_effective_prio(local_rq);
	classqueue_update_prio(local_rq->classqueue,
			       &local_rq->classqueue_linkobj,
			       effective_prio);
}

/*
 * set the new top priority and reposition the queue
 * called when: task enqueue/dequeue and queue switch
 */
static inline void set_top_priority(ckrm_lrq_t *lrq, int new_priority)
{
	lrq->top_priority = new_priority;
	update_class_priority(lrq);
}

/*
 * task_load: how much load this task counts
 */
static inline unsigned long task_load(struct task_struct *p)
{
	return (task_timeslice(p) * p->demand_stat.cpu_demand);
}
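/*
 * Illustrative sketch (assumed numbers, see update_local_cvt() below): when
 * a task is switched out after running for nsec nanoseconds, its class's
 * local_cvt advances by nsec / local_class_weight(lrq), so a class with a
 * small weight ages faster in the classqueue for the same CPU time:
 *
 *	nsec = 10 * NSEC_PER_MS;	// 10,000,000 ns
 *	local_weight = 512  ->  cvt_inc ~ 19531 ns
 *	local_weight = 128  ->  cvt_inc ~ 78125 ns  (1/4 the share, 4x the aging)
 */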
/*
 * runqueue load is the local_weight of all the classes on this cpu
 * must be called with class_list_lock held
 */
static inline unsigned long ckrm_cpu_load(int cpu)
{
	struct ckrm_cpu_class *clsptr;
	ckrm_lrq_t *lrq;
	struct ckrm_cpu_demand_stat *l_stat;
	int total_load = 0;
	int load;

	list_for_each_entry(clsptr, &active_cpu_classes, links) {
		lrq = get_ckrm_lrq(clsptr, cpu);
		l_stat = get_cls_local_stat(clsptr, cpu);
		load = lrq->local_weight;
		if (l_stat->cpu_demand < load)
			load = l_stat->cpu_demand;
		total_load += load;
	}
	return total_load;
}

static inline void class_enqueue_task(struct task_struct *p,
				      prio_array_t *array)
{
	ckrm_lrq_t *lrq;
	int effective_prio;

	lrq = get_task_lrq(p);

	cpu_demand_event(&p->demand_stat, CPU_DEMAND_ENQUEUE, 0);
	lrq->lrq_load += task_load(p);

	if ((p->prio < lrq->top_priority) && (array == lrq->active))
		set_top_priority(lrq, p->prio);

	if (!cls_in_classqueue(&lrq->classqueue_linkobj)) {
		cpu_demand_event(get_task_lrq_stat(p), CPU_DEMAND_ENQUEUE, 0);
		effective_prio = get_effective_prio(lrq);
		classqueue_enqueue(lrq->classqueue, &lrq->classqueue_linkobj,
				   effective_prio);
	}
}

static inline void class_dequeue_task(struct task_struct *p,
				      prio_array_t *array)
{
	ckrm_lrq_t *lrq = get_task_lrq(p);
	unsigned long load = task_load(p);

	BUG_ON(lrq->lrq_load < load);
	lrq->lrq_load -= load;

	cpu_demand_event(&p->demand_stat, CPU_DEMAND_DEQUEUE, 0);

	if ((array == lrq->active) && (p->prio == lrq->top_priority)
	    && list_empty(&(array->queue[p->prio])))
		set_top_priority(lrq,
				 find_next_bit(array->bitmap, MAX_PRIO,
					       p->prio));
}

/*
 * called after a task is switched out. Update the local cvt accounting.
 * we need to stick with long instead of long long because 64-bit division
 * is not available here
 */
static inline void update_local_cvt(struct task_struct *p, unsigned long nsec)
{
	ckrm_lrq_t *lrq = get_task_lrq(p);
	unsigned long cvt_inc = nsec / local_class_weight(lrq);

	lrq->local_cvt += cvt_inc;
	lrq->uncounted_ns += nsec;

	update_class_priority(lrq);
}

static inline int class_preempts_curr(struct task_struct *p,
				      struct task_struct *curr)
{
	struct cq_node_struct *node1 = &(get_task_lrq(p)->classqueue_linkobj);
	struct cq_node_struct *node2 = &(get_task_lrq(curr)->classqueue_linkobj);

	return (class_compare_prio(node1, node2) < 0);
}

/*
 * return a rotating per-cpu value in the range [0, (val-1)]
 */
static inline int get_ckrm_rand(unsigned long val)
{
	int rand;
	static int last_rand[NR_CPUS];
	int cpu = smp_processor_id();

	rand = last_rand[cpu];
	rand++;
	if (rand >= val)
		rand = 0;

	last_rand[cpu] = rand;
	return rand;
}

void update_class_cputime(int this_cpu);

/**********************************************/
/*          PID_LOAD_BALANCING                */
/**********************************************/
struct ckrm_load_struct {
	unsigned long load_p;	/* proportional */
	unsigned long load_i;	/* integral     */
	long load_d;		/* derivative   */
};

typedef struct ckrm_load_struct ckrm_load_t;

static inline void ckrm_load_init(ckrm_load_t *ckrm_load)
{
	ckrm_load->load_p = 0;
	ckrm_load->load_i = 0;
	ckrm_load->load_d = 0;
}

void ckrm_load_sample(ckrm_load_t *ckrm_load, int cpu);
long pid_get_pressure(ckrm_load_t *ckrm_load, int local_group);
#define rq_ckrm_load(rq) (&((rq)->ckrm_load))

static inline void ckrm_sched_tick(unsigned long j, int this_cpu,
				   struct ckrm_load_struct *ckrm_load)
{
	read_lock(&class_list_lock);

#ifdef CONFIG_SMP
	ckrm_load_sample(ckrm_load, this_cpu);
#endif

	if (!(j % CVT_UPDATE_TICK)) {
		//	printk("ckrm_sched j=%lu\n",j);
		classqueue_update_base(get_cpu_classqueue(this_cpu));
		update_class_cputime(this_cpu);
	}

	read_unlock(&class_list_lock);
}

#endif //CONFIG_CKRM_CPU_SCHEDULE

#endif