From: Marc Fiuczynski Date: Tue, 28 Sep 2004 06:14:31 +0000 (+0000) Subject: Merge in the version 7 cpu controller from CKRM. X-Git-Tag: before-enable-kexec-patch~60 X-Git-Url: http://git.onelab.eu/?a=commitdiff_plain;h=c7089c9ff74690b4a1387b120eb5710f3510249f;p=linux-2.6.git Merge in the version 7 cpu controller from CKRM. - Had to shuffle the runqueue data structure from sched.h to sched.c. - Fixed an arithmetic fault that would occur early in boot due to an uninitialized CKRM cpu controller data structure. Odd that the CKRM folks didn't hit this bug, too. --- diff --git a/include/linux/ckrm_classqueue.h b/include/linux/ckrm_classqueue.h index 1bdf9b775..a825336cb 100644 --- a/include/linux/ckrm_classqueue.h +++ b/include/linux/ckrm_classqueue.h @@ -116,7 +116,7 @@ void classqueue_update_prio(struct classqueue_struct *cq, cq_node_t * node, int cq_node_t *classqueue_get_head(struct classqueue_struct *cq); /*update the base priority of the classqueue*/ -void classqueue_update_base(struct classqueue_struct *cq, int new_base); +void classqueue_update_base(struct classqueue_struct *cq); /** * class_compare_prio: compare the priority of this two nodes diff --git a/include/linux/ckrm_sched.h b/include/linux/ckrm_sched.h index 9d82214fb..b7e6b30d0 100644 --- a/include/linux/ckrm_sched.h +++ b/include/linux/ckrm_sched.h @@ -15,30 +15,34 @@ #ifndef _CKRM_SCHED_H #define _CKRM_SCHED_H -#define CC_BUG_ON_DO(cond,action) do { if (cond) action; BUG_ON(cond); } while(0) -#define CC_BUG_ON(cond) BUG_ON(cond) - #include #include #include -//update every second -#define CVT_UPDATE_TICK (1*HZ/1 ?: 1) -#define CLASS_BONUS_RATE 22 // shift from ns to increase class bonus -#define PRIORITY_BONUS_RATE 0 // ?? Hubertus - #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) + struct prio_array { - int nr_active; + unsigned int nr_active; unsigned long bitmap[BITMAP_SIZE]; struct list_head queue[MAX_PRIO]; }; -struct ckrm_local_runqueue { +#ifdef CONFIG_CKRM_CPU_SCHEDULE +#define rq_active(p,rq) (get_task_lrq(p)->active) +#define rq_expired(p,rq) (get_task_lrq(p)->expired) +int __init init_ckrm_sched_res(void); +#else +#define rq_active(p,rq) (rq->active) +#define rq_expired(p,rq) (rq->expired) +static inline void init_ckrm_sched_res(void) {} +static inline int ckrm_cpu_monitor_init(void) {return 0;} +#endif //CONFIG_CKRM_CPU_SCHEDULE + +#ifdef CONFIG_CKRM_CPU_SCHEDULE +struct ckrm_runqueue { cq_node_t classqueue_linkobj; /*links in classqueue */ struct ckrm_cpu_class *cpu_class; // class it belongs to struct classqueue_struct *classqueue; // classqueue it belongs tow - CVT_t uncounted_cvt; unsigned long long uncounted_ns; prio_array_t *active, *expired, arrays[2]; @@ -55,19 +59,27 @@ struct ckrm_local_runqueue { * updated on enqueue, dequeue */ int top_priority; - CVT_t local_cvt; // snapshot of local_cvt, update on every loadbalance + CVT_t local_cvt; + + unsigned long lrq_load; + int local_weight; + + + /* + * unused CPU time accumulated while thoe class + * is inactive goes to savings + * + * initialized to be 0 + * a class can't accumulate more than SAVING_THRESHOLD of savings + * savings are kept in normalized form (like cvt) + * so when task share change the savings should be scaled accordingly + */ + unsigned long long savings; + unsigned long magic; //for debugging }; -/** - * @last_sleep: the last time it sleeps, last_sleep = 0 when not sleeping - */ -struct ckrm_cpu_class_local_stat { - unsigned long long run; - unsigned long long total; - unsigned long long last_sleep; - unsigned long 
cpu_demand; /*estimated cpu demand */ -}; +typedef struct ckrm_runqueue ckrm_lrq_t; /** * ckrm_cpu_class_stat - cpu usage statistics maintained for each class @@ -78,22 +90,35 @@ struct ckrm_cpu_class_stat { unsigned long long total_ns; /*how much nano-secs it has consumed */ - struct ckrm_cpu_class_local_stat local_stats[NR_CPUS]; - unsigned long cpu_demand; + struct ckrm_cpu_demand_stat local_stats[NR_CPUS]; + + /* + * + */ + unsigned long max_demand; /* the maximun a class can consume */ + int egrt,megrt; /*effective guarantee*/ + int ehl,mehl; /*effective hard limit, my effective hard limit*/ - /*temp stat used by cpu monitor */ - int effective_guarantee; - int effective_limit; - int glut; //true or false /* - * effective_share: for both default class and its children - * self_effective_share: just for the default class + * eshare: for both default class and its children + * meshare: just for the default class */ - int effective_share; - int self_effective_share; + int eshare; + int meshare; }; -typedef struct ckrm_cpu_class_stat ckrm_stat_t; +#define CKRM_CPU_CLASS_MAGIC 0x7af2abe3 + +#define USAGE_SAMPLE_FREQ HZ //sample every 1 seconds +#define NS_PER_SAMPLE (USAGE_SAMPLE_FREQ*(NSEC_PER_SEC/HZ)) +#define USAGE_WINDOW_SIZE 60 //keep the last 60 sample + +struct ckrm_usage { + unsigned long samples[USAGE_WINDOW_SIZE]; //record usages + unsigned long sample_pointer; //pointer for the sliding window + unsigned long long last_ns; //ns for last sample + long long last_sample_jiffies; //in number of jiffies +}; /* * manages the class status @@ -104,72 +129,221 @@ struct ckrm_cpu_class { struct ckrm_core_class *parent; struct ckrm_shares shares; spinlock_t cnt_lock; // always grab parent's lock first and then child's - CVT_t global_cvt; // total cummulative virtual time struct ckrm_cpu_class_stat stat; struct list_head links; // for linking up in cpu classes - struct ckrm_local_runqueue local_queues[NR_CPUS]; // runqueues + ckrm_lrq_t local_queues[NR_CPUS]; // runqueues + struct ckrm_usage usage; + unsigned long magic; //for debugging }; -#if CONFIG_CKRM_CPU_SCHEDULE -#define rq_active(p,rq) (get_task_class_queue(p)->active) -#define rq_expired(p,rq) (get_task_class_queue(p)->expired) -#else -#define rq_active(p,rq) (rq->active) -#define rq_expired(p,rq) (rq->expired) -#endif +#define cpu_class_weight(cls) (cls->stat.meshare) +#define local_class_weight(lrq) (lrq->local_weight) -//#define cpu_class_weight(cls) (cls->shares.my_guarantee) -#define cpu_class_weight(cls) (cls->stat.self_effective_share) - -#define bpt_queue(cpu) (& (cpu_rq(cpu)->classqueue) ) -CVT_t get_min_cvt(int cpu); +static inline int valid_cpu_class(struct ckrm_cpu_class * cls) +{ + return (cls && cls->magic == CKRM_CPU_CLASS_MAGIC); +} struct classqueue_struct *get_cpu_classqueue(int cpu); +struct ckrm_cpu_class * get_default_cpu_class(void); + + +static inline void ckrm_usage_init(struct ckrm_usage* usage) +{ + int i; + + for (i=0; i < USAGE_WINDOW_SIZE; i++) + usage->samples[i] = 0; + usage->sample_pointer = 0; + usage->last_ns = 0; + usage->last_sample_jiffies = 0; +} + +/* + * this function can be called at any frequency + * it's self-contained + */ +static inline void ckrm_sample_usage(struct ckrm_cpu_class* clsptr) +{ + struct ckrm_usage* usage = &clsptr->usage; + unsigned long long cur_sample; + int duration = jiffies - usage->last_sample_jiffies; + + //jiffies wasn't start from 0 + //so it need to be properly handled + if (unlikely(!usage->last_sample_jiffies)) + usage->last_sample_jiffies = jiffies; + + //called 
too frequenctly + if (duration < USAGE_SAMPLE_FREQ) + return; + + usage->last_sample_jiffies = jiffies; + + cur_sample = clsptr->stat.total_ns - usage->last_ns; + usage->last_ns = clsptr->stat.total_ns; + + //scale it based on the sample duration + cur_sample *= ((USAGE_SAMPLE_FREQ<< 15)/duration); + cur_sample >>= 15; + usage->samples[usage->sample_pointer] = cur_sample; + // printk("sample = %llu jiffies=%lu \n",cur_sample, jiffies); + + usage->sample_pointer ++; + if (usage->sample_pointer >= USAGE_WINDOW_SIZE) + usage->sample_pointer = 0; +} + +//duration is specified in number of jiffies +//return the usage in percentage +static inline int get_ckrm_usage(struct ckrm_cpu_class* clsptr, int duration) +{ + int nr_samples = duration/USAGE_SAMPLE_FREQ?:1; + struct ckrm_usage* usage = &clsptr->usage; + unsigned long long total = 0; + int i, idx; + + if (nr_samples > USAGE_WINDOW_SIZE) + nr_samples = USAGE_WINDOW_SIZE; + + idx = usage->sample_pointer; + for (i = 0; i< nr_samples; i++) { + if (! idx) + idx = USAGE_WINDOW_SIZE; + idx --; + total += usage->samples[idx]; + } + total *= 100; + do_div(total,nr_samples); + do_div(total,NS_PER_SAMPLE); + do_div(total,cpus_weight(cpu_online_map)); + return total; +} -extern struct ckrm_cpu_class default_cpu_class_obj; -#define default_cpu_class (&default_cpu_class_obj) -#define local_queue_nr_running(local_queue) \ - (local_queue->active->nr_active + local_queue->expired->nr_active) +#define lrq_nr_running(lrq) \ + (lrq->active->nr_active + lrq->expired->nr_active) -static inline struct ckrm_local_runqueue * -get_ckrm_local_runqueue(struct ckrm_cpu_class*cls, int cpu) +static inline ckrm_lrq_t * +get_ckrm_lrq(struct ckrm_cpu_class*cls, int cpu) { return &(cls->local_queues[cpu]); } -static inline struct ckrm_local_runqueue *get_task_class_queue(struct task_struct *p) +static inline ckrm_lrq_t *get_task_lrq(struct task_struct *p) { return &(p->cpu_class->local_queues[task_cpu(p)]); } #define task_list_entry(list) list_entry(list,struct task_struct,run_list) -#define class_list_entry(list) list_entry(list,struct ckrm_local_runqueue,classqueue_linkobj) +#define class_list_entry(list) list_entry(list,struct ckrm_runqueue,classqueue_linkobj) /* some additional interfaces exported from sched.c */ struct runqueue; -void dequeue_task(struct task_struct *p, prio_array_t * array); -void enqueue_task(struct task_struct *p, prio_array_t * array); -struct runqueue *task_rq_lock(task_t * p, unsigned long *flags); -void task_rq_unlock(struct runqueue *rq, unsigned long *flags); -extern spinlock_t cvt_lock; extern rwlock_t class_list_lock; extern struct list_head active_cpu_classes; +unsigned int task_timeslice(task_t *p); +void _ckrm_cpu_change_class(task_t *task, struct ckrm_cpu_class *newcls); -/*functions exported by ckrm_cpu_class.c*/ -int __init init_ckrm_sched_res(void); void init_cpu_classes(void); +void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares); +void ckrm_cpu_change_class(void *task, void *old, void *new); + -/*functions exported by ckrm_cpu_monitor.c*/ -void ckrm_cpu_monitor(void); -void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat); #define CPU_DEMAND_ENQUEUE 0 #define CPU_DEMAND_DEQUEUE 1 #define CPU_DEMAND_DESCHEDULE 2 -void cpu_demand_event(struct ckrm_cpu_class_local_stat* local_stat, int event, unsigned long long len); +#define CPU_DEMAND_INIT 3 + +/*functions exported by ckrm_cpu_monitor.c*/ +void ckrm_cpu_monitor(void); +int ckrm_cpu_monitor_init(void); +void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat); +void 
cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len); +void adjust_local_weight(void); + +#define get_task_lrq_stat(p) (&(p)->cpu_class->stat.local_stats[task_cpu(p)]) +#define get_cls_local_stat(cls,cpu) (&(cls)->stat.local_stats[cpu]) +#define get_rq_local_stat(lrq,cpu) (get_cls_local_stat((lrq)->cpu_class,cpu)) + +/******************************************************************** + * Parameters that determine how quickly CVT's progress and how + * priority can impact a LRQ's runqueue position. See also + * get_effective_prio(). These parameters need to adjusted + * in accordance to the following example and understanding. + * + * CLASS_QUANTIZER: + * + * A class with 5% share, can execute 50M nsecs / per sec ~ 2^28. + * It's share will be set to 512 = 2^9. The globl CLASSQUEUE_SIZE is set to 2^7. + * With CLASS_QUANTIZER=16, the local_cvt of this class will increase + * by 2^28/2^9 = 2^19 = 512K. + * Setting CLASS_QUANTIZER to 16, 2^(19-16) = 8 slots / per second. + * A class with 5% shares, will cover 80 slots / per second. + * + * PRIORITY_QUANTIZER: + * + * How much can top priorities of class impact slot bonus. + * There are 40 nice priorities. "2" will allow upto 10 slots improvement + * in the RQ thus for 50% class it can perform ~1sec starvation. + * + *******************************************************************/ + +#define CLASS_QUANTIZER 16 //shift from ns to increase class bonus +#define PRIORITY_QUANTIZER 2 //controls how much a high prio task can borrow + +#define CKRM_SHARE_ACCURACY 10 +#define NSEC_PER_MS 1000000 +#define NSEC_PER_JIFFIES (NSEC_PER_SEC/HZ) + + +#define MAX_SAVINGS_ABSOLUTE (10LLU*NSEC_PER_SEC) // 10 seconds + +#define CVT_UPDATE_TICK ((HZ/2)?:1) + +// ABSOLUTE_CKRM_TUNING determines whether classes can make up +// lost time in absolute time or in relative values + +#define ABSOLUTE_CKRM_TUNING // preferred due to more predictable behavior + +#ifdef ABSOLUTE_CKRM_TUNING + +#define MAX_SAVINGS MAX_SAVINGS_ABSOLUTE +//an absolute bonus of 200ms for classes when reactivated +#define INTERACTIVE_BONUS(lrq) ((200*NSEC_PER_MS)/local_class_weight(lrq)) +#define SAVINGS_LEAK_SPEED (CVT_UPDATE_TICK/10*NSEC_PER_JIFFIES) + +#define scale_cvt(val,lrq) ((val)*local_class_weight(lrq)) +#define unscale_cvt(val,lrq) (do_div(val,local_class_weight(lrq))) + +#else + +#define MAX_SAVINGS (MAX_SAVINGS_ABSOLUTE >> CKRM_SHARE_ACCURACY) +/* + * to improve system responsiveness + * an inactive class is put a little bit ahead of the current class when it wakes up + * the amount is set in normalized termis to simplify the calculation + * for class with 100% share, it can be 2s ahead + * while for class with 10% share, it can be 200ms ahead + */ +#define INTERACTIVE_BONUS(lrq) (2*NSEC_PER_MS) + +/* + * normalized savings can't be more than MAX_NORMALIZED_SAVINGS + * based on the current configuration + * this means that a class with share 100% will accumulate 10s at most + * while a class with 1% of the share can only accumulate 100ms + */ + +//a class with share 100% can get 100ms every 500ms +//while a class with share 10% can only get 10ms every 500ms +#define SAVINGS_LEAK_SPEED ((CVT_UPDATE_TICK/5*NSEC_PER_JIFFIES) >> CKRM_SHARE_ACCURACY) + +#define scale_cvt(val,lrq) (val) +#define unscale_cvt(val,lrq) (val) + +#endif -#define get_task_local_stat(p) (&(p)->cpu_class->stat.local_stats[task_cpu(p)]) -#define get_rq_local_stat(lrq,cpu) (&(lrq)->cpu_class->stat.local_stats[cpu]) /** * get_effective_prio: return the effective priority 
of a class local queue @@ -181,18 +355,18 @@ void cpu_demand_event(struct ckrm_cpu_class_local_stat* local_stat, int event, u * currently, prio increases by 1 if either: top_priority increase by one * or, local_cvt increases by 4ms */ -static inline int get_effective_prio(struct ckrm_local_runqueue * lcq) +static inline int get_effective_prio(ckrm_lrq_t * lrq) { int prio; - // cumulative usage - prio = lcq->local_cvt >> CLASS_BONUS_RATE; - // queue urgency - prio += lcq->top_priority >> PRIORITY_BONUS_RATE; + prio = lrq->local_cvt >> CLASS_QUANTIZER; // cumulative usage + prio += lrq->top_priority >> PRIORITY_QUANTIZER; // queue urgency return prio; } +CVT_t get_local_cur_cvt(int cpu); + /** * update_class_priority: * @@ -206,9 +380,8 @@ static inline int get_effective_prio(struct ckrm_local_runqueue * lcq) * -- rq_get_next_task (queue switch) * -- update_local_cvt * -- schedule - * -- update_global_cvt */ -static inline void update_class_priority(struct ckrm_local_runqueue *local_rq) +static inline void update_class_priority(ckrm_lrq_t *local_rq) { int effective_prio = get_effective_prio(local_rq); classqueue_update_prio(local_rq->classqueue, @@ -220,42 +393,80 @@ static inline void update_class_priority(struct ckrm_local_runqueue *local_rq) * set the new top priority and reposition the queue * called when: task enqueue/dequeue and queue switch */ -static inline void set_top_priority(struct ckrm_local_runqueue *class_queue, +static inline void set_top_priority(ckrm_lrq_t *lrq, int new_priority) { - class_queue->top_priority = new_priority; - update_class_priority(class_queue); + lrq->top_priority = new_priority; + update_class_priority(lrq); +} + +/* + * task_load: how much load this task counts + */ +static inline unsigned long task_load(struct task_struct* p) +{ + return (task_timeslice(p) * p->demand_stat.cpu_demand); +} + +/* + * runqueue load is the local_weight of all the classes on this cpu + * must be called with class_list_lock held + */ +static inline unsigned long ckrm_cpu_load(int cpu) +{ + struct ckrm_cpu_class *clsptr; + ckrm_lrq_t* lrq; + struct ckrm_cpu_demand_stat* l_stat; + int total_load = 0; + int load; + + list_for_each_entry(clsptr,&active_cpu_classes,links) { + lrq = get_ckrm_lrq(clsptr,cpu); + l_stat = get_cls_local_stat(clsptr,cpu); + load = lrq->local_weight; + if (l_stat->cpu_demand < load) + load = l_stat->cpu_demand; + total_load += load; + } + return total_load; } static inline void class_enqueue_task(struct task_struct *p, prio_array_t * array) { - struct ckrm_local_runqueue *queue; + ckrm_lrq_t *lrq; int effective_prio; - queue = get_task_class_queue(p); + lrq = get_task_lrq(p); + + cpu_demand_event(&p->demand_stat,CPU_DEMAND_ENQUEUE,0); + lrq->lrq_load += task_load(p); - if (! cls_in_classqueue(&queue->classqueue_linkobj)) { - cpu_demand_event(get_task_local_stat(p),CPU_DEMAND_ENQUEUE,0); - /*make sure the cvt of this class is up to date*/ - queue->local_cvt = get_min_cvt(task_cpu(p)); - effective_prio = get_effective_prio(queue); - classqueue_enqueue(queue->classqueue, &queue->classqueue_linkobj, effective_prio); + if ((p->prio < lrq->top_priority) && (array == lrq->active)) + set_top_priority(lrq, p->prio); + + if (! 
cls_in_classqueue(&lrq->classqueue_linkobj)) { + cpu_demand_event(get_task_lrq_stat(p),CPU_DEMAND_ENQUEUE,0); + effective_prio = get_effective_prio(lrq); + classqueue_enqueue(lrq->classqueue, &lrq->classqueue_linkobj, effective_prio); } - - if ((p->prio < queue->top_priority) && (array == queue->active)) - set_top_priority(queue, p->prio); } static inline void class_dequeue_task(struct task_struct *p, prio_array_t * array) { - struct ckrm_local_runqueue *queue = get_task_class_queue(p); + ckrm_lrq_t *lrq = get_task_lrq(p); + unsigned long load = task_load(p); - if ((array == queue->active) && (p->prio == queue->top_priority) + BUG_ON(lrq->lrq_load < load); + lrq->lrq_load -= load; + + cpu_demand_event(&p->demand_stat,CPU_DEMAND_DEQUEUE,0); + + if ((array == lrq->active) && (p->prio == lrq->top_priority) && list_empty(&(array->queue[p->prio]))) - set_top_priority(queue, + set_top_priority(lrq, find_next_bit(array->bitmap, MAX_PRIO, p->prio)); } @@ -266,32 +477,82 @@ static inline void class_dequeue_task(struct task_struct *p, */ static inline void update_local_cvt(struct task_struct *p, unsigned long nsec) { - struct ckrm_local_runqueue *class_queue = get_task_class_queue(p); - struct ckrm_cpu_class *cls = class_queue->cpu_class; + ckrm_lrq_t * lrq = get_task_lrq(p); + + unsigned long cvt_inc = nsec / local_class_weight(lrq); + + lrq->local_cvt += cvt_inc; + lrq->uncounted_ns += nsec; - unsigned long cvt_inc = nsec / cpu_class_weight(cls); + update_class_priority(lrq); +} - class_queue->local_cvt += cvt_inc; - class_queue->uncounted_cvt += cvt_inc; +static inline int class_preempts_curr(struct task_struct * p, struct task_struct* curr) +{ + struct cq_node_struct* node1 = &(get_task_lrq(p)->classqueue_linkobj); + struct cq_node_struct* node2 = &(get_task_lrq(curr)->classqueue_linkobj); - class_queue->uncounted_ns += nsec; - update_class_priority(class_queue); + return (class_compare_prio(node1,node2) < 0); } /* - * called during loadbalancing - * to charge the class with locally accumulated cvt + * return a random value with range [0, (val-1)] */ -void update_global_cvts(int this_cpu); +static inline int get_ckrm_rand(unsigned long val) +{ + int rand; + static int last_rand[NR_CPUS]; + int cpu = smp_processor_id(); + + rand = last_rand[cpu]; + rand ++; + if (rand >= val) + rand = 0; + + last_rand[cpu] = rand; + return rand; +} -/** - * - */ -static inline int class_preempts_curr(struct task_struct * p, struct task_struct* curr) +void update_class_cputime(int this_cpu); + +/**********************************************/ +/* PID_LOAD_BALANCING */ +/**********************************************/ +struct ckrm_load_struct { + unsigned long load_p; /*propotional*/ + unsigned long load_i; /*integral */ + long load_d; /*derivative */ +}; + +typedef struct ckrm_load_struct ckrm_load_t; + +static inline void ckrm_load_init(ckrm_load_t* ckrm_load) { + ckrm_load->load_p = 0; + ckrm_load->load_i = 0; + ckrm_load->load_d = 0; +} + +void ckrm_load_sample(ckrm_load_t* ckrm_load,int cpu); +long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group); +#define rq_ckrm_load(rq) (&((rq)->ckrm_load)) + +static inline void ckrm_sched_tick(unsigned long j,int this_cpu,struct ckrm_load_struct* ckrm_load) { - struct cq_node_struct* node1 = &(get_task_class_queue(p)->classqueue_linkobj); - struct cq_node_struct* node2 = &(get_task_class_queue(curr)->classqueue_linkobj); + read_lock(&class_list_lock); + +#ifdef CONFIG_SMP + ckrm_load_sample(ckrm_load,this_cpu); +#endif - return (class_compare_prio(node1,node2) < 
0); + if (! (j % CVT_UPDATE_TICK)) { + // printk("ckrm_sched j=%lu\n",j); + classqueue_update_base(get_cpu_classqueue(this_cpu)); + update_class_cputime(this_cpu); + } + + read_unlock(&class_list_lock); } + +#endif //CONFIG_CKRM_CPU_SCHEDULE + #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 98f7a1eba..dd5005295 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -94,7 +94,7 @@ extern unsigned long avenrun[]; /* Load averages */ extern int nr_threads; extern int last_pid; DECLARE_PER_CPU(unsigned long, process_counts); -DECLARE_PER_CPU(struct runqueue, runqueues); +// DECLARE_PER_CPU(struct runqueue, runqueues); -- removed after ckrm cpu v7 merge extern int nr_processes(void); extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); @@ -429,6 +429,25 @@ int set_current_groups(struct group_info *group_info); struct audit_context; /* See audit.c */ struct mempolicy; +#ifdef CONFIG_CKRM_CPU_SCHEDULE +/** + * ckrm_cpu_demand_stat - used to track the cpu demand of a task/class + * @run: how much time it has been running since the counter started + * @total: total time since the counter started + * @last_sleep: the last time it sleeps, last_sleep = 0 when not sleeping + * @recalc_interval: how often do we recalculate the cpu_demand + * @cpu_demand: moving average of run/total + */ +struct ckrm_cpu_demand_stat { + unsigned long long run; + unsigned long long total; + unsigned long long last_sleep; + unsigned long long recalc_interval; + unsigned long cpu_demand; /*estimated cpu demand */ +}; +#endif + + struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ struct thread_info *thread_info; @@ -528,7 +547,6 @@ struct task_struct { /* signal handlers */ struct signal_struct *signal; struct sighand_struct *sighand; - sigset_t blocked, real_blocked; struct sigpending pending; @@ -594,7 +612,9 @@ struct task_struct { struct list_head taskclass_link; #ifdef CONFIG_CKRM_CPU_SCHEDULE struct ckrm_cpu_class *cpu_class; -#endif + //track cpu demand of this task + struct ckrm_cpu_demand_stat demand_stat; +#endif //CONFIG_CKRM_CPU_SCHEDULE #endif // CONFIG_CKRM_TYPE_TASKCLASS #ifdef CONFIG_CKRM_RES_MEM struct list_head mm_peers; // list of tasks using same mm_struct @@ -781,83 +801,6 @@ extern int idle_cpu(int cpu); void yield(void); -/* - * These are the runqueue data structures: - */ -typedef struct runqueue runqueue_t; - -#ifdef CONFIG_CKRM_CPU_SCHEDULE -#include -#endif - -#ifdef CONFIG_CKRM_CPU_SCHEDULE - -/** - * if belong to different class, compare class priority - * otherwise compare task priority - */ -#define TASK_PREEMPTS_CURR(p, rq) \ - (((p)->cpu_class != (rq)->curr->cpu_class) && ((rq)->curr != (rq)->idle))? class_preempts_curr((p),(rq)->curr) : ((p)->prio < (rq)->curr->prio) -#else -#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) -struct prio_array { - unsigned int nr_active; - unsigned long bitmap[BITMAP_SIZE]; - struct list_head queue[MAX_PRIO]; -}; -#define rq_active(p,rq) (rq->active) -#define rq_expired(p,rq) (rq->expired) -#define ckrm_rebalance_tick(j,this_cpu) do {} while (0) -#define TASK_PREEMPTS_CURR(p, rq) \ - ((p)->prio < (rq)->curr->prio) -#endif - -/* - * This is the main, per-CPU runqueue data structure. - * - * Locking rule: those places that want to lock multiple runqueues - * (such as the load balancing or the thread migration code), lock - * acquire operations must be ordered by ascending &runqueue. 
- */ -struct runqueue { - spinlock_t lock; - - /* - * nr_running and cpu_load should be in the same cacheline because - * remote CPUs use both these fields when doing load calculation. - */ - unsigned long nr_running; -#if defined(CONFIG_SMP) - unsigned long cpu_load; -#endif - unsigned long long nr_switches, nr_preempt; - unsigned long expired_timestamp, nr_uninterruptible; - unsigned long long timestamp_last_tick; - task_t *curr, *idle; - struct mm_struct *prev_mm; -#ifdef CONFIG_CKRM_CPU_SCHEDULE - unsigned long ckrm_cpu_load; - struct classqueue_struct classqueue; -#else - prio_array_t *active, *expired, arrays[2]; -#endif - int best_expired_prio; - atomic_t nr_iowait; - -#ifdef CONFIG_SMP - struct sched_domain *sd; - - /* For active balancing */ - int active_balance; - int push_cpu; - - task_t *migration_thread; - struct list_head migration_queue; -#endif - struct list_head hold_queue; - int idle_tokens; -}; - /* * The default (Linux) execution domain. */ @@ -894,6 +837,7 @@ static inline struct user_struct *get_uid(struct user_struct *u) atomic_inc(&u->__count); return u; } + extern void free_uid(struct user_struct *); extern void switch_uid(struct user_struct *); @@ -999,6 +943,7 @@ static inline int capable(int cap) } #endif + /* * Routines for handling mm_structs */ @@ -1132,7 +1077,7 @@ static inline struct mm_struct * get_task_mm(struct task_struct * task) return mm; } - + /* set thread flags in other task's structures * - see asm/thread_info.h for TIF_xxxx flags available */ diff --git a/init/Kconfig b/init/Kconfig index da1b24f0e..e63697a39 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -175,21 +175,12 @@ config CKRM_RES_NUMTASKS config CKRM_CPU_SCHEDULE bool "CKRM CPU scheduler" depends on CKRM_TYPE_TASKCLASS - default m + default y help Use CKRM CPU scheduler instead of Linux Scheduler Say N if unsure, Y to use the feature. -config CKRM_CPU_MONITOR - bool "CKRM CPU Resoure Monitor" - depends on CKRM_CPU_SCHEDULE - default m - help - Monitor CPU Resource Usage of the classes - - Say N if unsure, Y to use the feature. - config CKRM_RES_BLKIO tristate " Disk I/O Resource Controller" depends on CKRM_TYPE_TASKCLASS && IOSCHED_CFQ diff --git a/init/main.c b/init/main.c index e93d25685..6416eab8d 100644 --- a/init/main.c +++ b/init/main.c @@ -55,6 +55,7 @@ int __init init_ckrm_sched_res(void); #else #define init_ckrm_sched_res() ((void)0) #endif +//#include /* * This is one of the first .c files built. Error out early @@ -476,6 +477,7 @@ asmlinkage void __init start_kernel(void) * printk() and can access its per-cpu storage. */ smp_prepare_boot_cpu(); + /* * Set up the scheduler prior starting any interrupts (such as the * timer interrupt). Full topology setup happens at smp_init() @@ -695,7 +697,9 @@ static int init(void * unused) * firmware files. 
*/ populate_rootfs(); + do_basic_setup(); + init_ckrm_sched_res(); sched_init_smp(); diff --git a/kernel/Makefile b/kernel/Makefile index 905f3c59d..ec5001052 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -27,12 +27,9 @@ obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_IKCONFIG_PROC) += configs.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o -obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_classqueue.o -obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_sched.o +obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_classqueue.o ckrm_sched.o obj-$(CONFIG_AUDIT) += audit.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o -obj-$(CONFIG_KGDB) += kgdbstub.o - ifneq ($(CONFIG_IA64),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/ckrm/Makefile b/kernel/ckrm/Makefile index 32b576b9b..b32530977 100644 --- a/kernel/ckrm/Makefile +++ b/kernel/ckrm/Makefile @@ -9,6 +9,5 @@ endif obj-$(CONFIG_CKRM_RES_NUMTASKS) += ckrm_numtasks.o obj-$(CONFIG_CKRM_TYPE_SOCKETCLASS) += ckrm_sockc.o obj-$(CONFIG_CKRM_RES_LISTENAQ) += ckrm_laq.o - obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_cpu_class.o - obj-$(CONFIG_CKRM_CPU_MONITOR) += ckrm_cpu_monitor.o - obj-$(CONFIG_CKRM_RES_MEM) += ckrm_mem.o + obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_cpu_class.o ckrm_cpu_monitor.o + obj-$(CONFIG_CKRM_RES_MEM) += ckrm_mem.o diff --git a/kernel/ckrm/ckrm_cpu_class.c b/kernel/ckrm/ckrm_cpu_class.c index 0ded7f3c6..ad45380ee 100644 --- a/kernel/ckrm/ckrm_cpu_class.c +++ b/kernel/ckrm/ckrm_cpu_class.c @@ -23,17 +23,32 @@ #include #include - struct ckrm_res_ctlr cpu_rcbs; +/** + * insert_cpu_class - insert a class to active_cpu_class list + * + * insert the class in decreasing order of class weight + */ +static inline void insert_cpu_class(struct ckrm_cpu_class *cls) +{ + list_add(&cls->links,&active_cpu_classes); +} + /* * initialize a class object and its local queues */ - static void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares) +void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares) { int i,j,k; prio_array_t *array; - struct ckrm_local_runqueue* queue; + ckrm_lrq_t* queue; + + cls->shares = *shares; + cls->cnt_lock = SPIN_LOCK_UNLOCKED; + ckrm_cpu_stat_init(&cls->stat); + ckrm_usage_init(&cls->usage); + cls->magic = CKRM_CPU_CLASS_MAGIC; for (i = 0 ; i < NR_CPUS ; i++) { queue = &cls->local_queues[i]; @@ -58,34 +73,37 @@ struct ckrm_res_ctlr cpu_rcbs; queue->top_priority = MAX_PRIO; cq_node_init(&queue->classqueue_linkobj); queue->local_cvt = 0; - queue->uncounted_cvt = 0; + queue->lrq_load = 0; + queue->local_weight = cpu_class_weight(cls); queue->uncounted_ns = 0; + queue->savings = 0; queue->magic = 0x43FF43D7; } - cls->shares = *shares; - cls->global_cvt = 0; - cls->cnt_lock = SPIN_LOCK_UNLOCKED; - ckrm_cpu_stat_init(&cls->stat); - // add to class list write_lock(&class_list_lock); - list_add(&cls->links,&active_cpu_classes); + insert_cpu_class(cls); write_unlock(&class_list_lock); } static inline void set_default_share(ckrm_shares_t *shares) { shares->my_guarantee = 0; - shares->my_limit = CKRM_SHARE_DFLT_MAX_LIMIT; shares->total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - shares->max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; shares->unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - shares->cur_max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; + shares->my_limit = CKRM_SHARE_DFLT_MAX_LIMIT; + shares->max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; + shares->cur_max_limit = 0; } -struct ckrm_cpu_class * ckrm_get_cpu_class(struct ckrm_core_class *core) { - return ckrm_get_res_class(core, 
cpu_rcbs.resid, struct ckrm_cpu_class); +struct ckrm_cpu_class * ckrm_get_cpu_class(struct ckrm_core_class *core) +{ + struct ckrm_cpu_class * cls; + cls = ckrm_get_res_class(core, cpu_rcbs.resid, struct ckrm_cpu_class); + if (valid_cpu_class(cls)) + return cls; + else + return NULL; } @@ -94,7 +112,7 @@ void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class struct ckrm_cpu_class *cls; if (! parent) /*root class*/ - cls = default_cpu_class; + cls = get_default_cpu_class(); else cls = (struct ckrm_cpu_class *) kmalloc(sizeof(struct ckrm_cpu_class),GFP_ATOMIC); @@ -113,7 +131,7 @@ void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class cls->parent = parent; } } else - printk("alloc_cpu_class failed GFP_ATOMIC\n"); + printk(KERN_ERR"alloc_cpu_class failed\n"); return cls; } @@ -132,7 +150,7 @@ static void ckrm_free_cpu_class(void *my_res) return; /*the default class can't be freed*/ - if (cls == default_cpu_class) + if (cls == get_default_cpu_class()) return; // Assuming there will be no children when this function is called @@ -187,7 +205,16 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share) parres = NULL; } + /* + * hzheng: CKRM_SHARE_DONTCARE should be handled + */ + if (new_share->my_guarantee == CKRM_SHARE_DONTCARE) + new_share->my_guarantee = 0; + rc = set_shares(new_share, cur, par); + if (cur->my_limit == CKRM_SHARE_DONTCARE) + cur->my_limit = cur->max_limit; + spin_unlock(&cls->cnt_lock); if (cls->parent) { @@ -196,9 +223,6 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share) return rc; } -/* - * translate the global_CVT to ticks - */ static int ckrm_cpu_get_share(void *my_res, struct ckrm_shares *shares) { @@ -213,35 +237,42 @@ static int ckrm_cpu_get_share(void *my_res, int ckrm_cpu_get_stats(void *my_res, struct seq_file * sfile) { struct ckrm_cpu_class *cls = my_res; + struct ckrm_cpu_class_stat* stat = &cls->stat; + ckrm_lrq_t* lrq; + int i; if (!cls) return -EINVAL; seq_printf(sfile, "-------- CPU Class Status Start---------\n"); - seq_printf(sfile, " gua= %d limit= %d\n", + seq_printf(sfile, "Share:\n\tgrt= %d limit= %d total_grt= %d max_limit= %d\n", cls->shares.my_guarantee, - cls->shares.my_limit); - seq_printf(sfile, " total_gua= %d limit= %d\n", + cls->shares.my_limit, cls->shares.total_guarantee, cls->shares.max_limit); - seq_printf(sfile, " used_gua= %d cur_limit= %d\n", + seq_printf(sfile, "\tunused_grt= %d cur_max_limit= %d\n", cls->shares.unused_guarantee, cls->shares.cur_max_limit); - seq_printf(sfile, " Share= %d\n",cpu_class_weight(cls)); - seq_printf(sfile, " cvt= %llu\n",cls->local_queues[0].local_cvt); - seq_printf(sfile, " total_ns= %llu\n",cls->stat.total_ns); - seq_printf(sfile, " prio= %d\n",cls->local_queues[0].classqueue_linkobj.prio); - seq_printf(sfile, " index= %d\n",cls->local_queues[0].classqueue_linkobj.index); - seq_printf(sfile, " run= %llu\n",cls->stat.local_stats[0].run); - seq_printf(sfile, " total= %llu\n",cls->stat.local_stats[0].total); - seq_printf(sfile, " cpu_demand= %lu\n",cls->stat.cpu_demand); - - seq_printf(sfile, " effective_guarantee= %d\n",cls->stat.effective_guarantee); - seq_printf(sfile, " effective_limit= %d\n",cls->stat.effective_limit); - seq_printf(sfile, " effective_share= %d\n",cls->stat.effective_share); - seq_printf(sfile, "-------- CPU Class Status END ---------\n"); + seq_printf(sfile, "Effective:\n\tegrt= %d\n",stat->egrt); + seq_printf(sfile, "\tmegrt= %d\n",stat->megrt); + seq_printf(sfile, "\tehl= %d\n",stat->ehl); + 
seq_printf(sfile, "\tmehl= %d\n",stat->mehl); + seq_printf(sfile, "\teshare= %d\n",stat->eshare); + seq_printf(sfile, "\tmeshare= %d\n",cpu_class_weight(cls)); + seq_printf(sfile, "\tmax_demand= %lu\n",stat->max_demand); + seq_printf(sfile, "\ttotal_ns= %llu\n",stat->total_ns); + seq_printf(sfile, "\tusage(2,10,60)= %d %d %d\n", + get_ckrm_usage(cls,2*HZ), + get_ckrm_usage(cls,10*HZ), + get_ckrm_usage(cls,60*HZ) + ); + for_each_online_cpu(i) { + lrq = get_ckrm_lrq(cls,i); + seq_printf(sfile, "\tlrq %d demand= %lu weight= %d lrq_load= %lu cvt= %llu sav=%lu\n",i,stat->local_stats[i].cpu_demand,local_class_weight(lrq),lrq->lrq_load,lrq->local_cvt,lrq->savings); + } + seq_printf(sfile, "-------- CPU Class Status END ---------\n"); return 0; } @@ -249,28 +280,16 @@ int ckrm_cpu_get_stats(void *my_res, struct seq_file * sfile) /* * task will remain in the same cpu but on a different local runqueue */ -static void ckrm_cpu_change_class(void *task, void *old, void *new) +void ckrm_cpu_change_class(void *task, void *old, void *new) { struct task_struct *tsk = task; struct ckrm_cpu_class *newcls = new; - unsigned long flags; - struct runqueue *rq; - prio_array_t *array; /*sanity checking*/ if (!task || ! old || !new) return; - rq = task_rq_lock(tsk,&flags); - array = tsk->array; - if (array) { - dequeue_task(tsk,array); - tsk->cpu_class = newcls; - enqueue_task(tsk,rq_active(tsk,rq)); - } else { - tsk->cpu_class = newcls; - } - task_rq_unlock(rq,&flags); + _ckrm_cpu_change_class(tsk,newcls); } /*dummy function, not used*/ @@ -297,7 +316,7 @@ static int ckrm_cpu_set_config(void *my_res, const char *cfgstr) } struct ckrm_res_ctlr cpu_rcbs = { - .res_name = "CKRM CPU Class", + .res_name = "cpu", .res_hdepth = 1, .resid = -1, .res_alloc = ckrm_alloc_cpu_class, @@ -339,10 +358,11 @@ void init_cpu_classes(void) //init classqueues for each processor for (i=0; i < NR_CPUS; i++) classqueue_init(get_cpu_classqueue(i)); -/* - * hzheng: initialize the default cpu class - * required for E14 since ckrm_init is called after sched_init - */ + + /* + * hzheng: initialize the default cpu class + * required for E14/E15 since ckrm_init is called after sched_init + */ ckrm_alloc_cpu_class(NULL,NULL); } diff --git a/kernel/ckrm/ckrm_cpu_monitor.c b/kernel/ckrm/ckrm_cpu_monitor.c index 674ee6e50..c83c83fca 100644 --- a/kernel/ckrm/ckrm_cpu_monitor.c +++ b/kernel/ckrm/ckrm_cpu_monitor.c @@ -28,36 +28,84 @@ #include #include -#define CPU_MONITOR_INTERVAL (4*HZ) /*how often do we adjust the shares*/ -#define CKRM_SHARE_ACCURACY 7 +#define CPU_MONITOR_INTERVAL (HZ) /*how often do we adjust the shares*/ #define CKRM_SHARE_MAX (1<shares.my_limit; +} + +static inline int get_mysoft_limit(struct ckrm_cpu_class *cls) +{ + return cls->shares.total_guarantee; +} + +static inline int get_hard_limit(struct ckrm_cpu_class *cls) +{ + return cls->shares.total_guarantee; +} + +static inline int get_myhard_limit(struct ckrm_cpu_class *cls) +{ + return cls->shares.total_guarantee; +} + + +static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, int type) +{ + unsigned long long now = sched_clock(); + + local_stat->run = 0; + local_stat->total = 0; + local_stat->last_sleep = now; + switch (type) { + case CPU_DEMAND_TP_CLASS: + local_stat->recalc_interval = CPU_DEMAND_CLASS_RECALC; + local_stat->cpu_demand = 0; + break; + case CPU_DEMAND_TP_TASK: + local_stat->recalc_interval = CPU_DEMAND_TASK_RECALC; + //for task, the init cpu_demand is copied from its parent + break; + default: + BUG(); + } +} void 
ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat) { int i; - struct ckrm_cpu_class_local_stat* local_stat; - unsigned long long now = sched_clock(); stat->stat_lock = SPIN_LOCK_UNLOCKED; stat->total_ns = 0; - stat->cpu_demand = 0; + stat->max_demand = 0; for (i=0; i< NR_CPUS; i++) { - local_stat = &stat->local_stats[i]; - local_stat->run = 0; - local_stat->total = 0; - local_stat->last_sleep = now; - local_stat->cpu_demand = 0; + cpu_demand_stat_init(&stat->local_stats[i],CPU_DEMAND_TP_CLASS); } - stat->effective_guarantee = 0; - stat->effective_limit = 0; - stat->glut = 0; - stat->effective_share = 100; - stat->self_effective_share = 100; + stat->egrt = 0; + stat->megrt = 0; + stat->ehl = CKRM_SHARE_MAX; /*default: no limit*/ + stat->mehl = CKRM_SHARE_MAX; /*default: no limit */ + + stat->eshare = CKRM_SHARE_MAX; + stat->meshare = CKRM_SHARE_MAX; } + /**********************************************/ /* cpu demand */ /**********************************************/ @@ -77,52 +125,42 @@ void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat) */ /** - * update_cpu_demand - update a state change + * update_cpu_demand_stat - * - * should be called whenever the state of a local queue changes + * should be called whenever the state of a task/task local queue changes * -- when deschedule : report how much run * -- when enqueue: report how much sleep * - * to deal with excessive long run/sleep state - * -- whenever the the ckrm_cpu_monitor is called, check if the class is in sleep state, if yes, then update sleep record + * how often should we recalculate the cpu demand + * the number is in ns */ -#define CKRM_CPU_DEMAND_RUN 0 -#define CKRM_CPU_DEMAND_SLEEP 1 -//how often should we recalculate the cpu demand, in ns -#define CPU_DEMAND_CAL_THRESHOLD (1000000000LL) -static inline void update_local_cpu_demand(struct ckrm_cpu_class_local_stat* local_stat,int state, unsigned long long len) +static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat,int state, unsigned long long len) { local_stat->total += len; if (state == CKRM_CPU_DEMAND_RUN) local_stat->run += len; - if (local_stat->total >= CPU_DEMAND_CAL_THRESHOLD) { + if (local_stat->total >= local_stat->recalc_interval) { local_stat->total >>= CKRM_SHARE_ACCURACY; - if (local_stat->total > 0xFFFFFFFF) - local_stat->total = 0xFFFFFFFF; + if (unlikely(local_stat->run > 0xFFFFFFFF)) + local_stat->run = 0xFFFFFFFF; + if (local_stat->total > 0xFFFFFFFF) + local_stat->total = 0xFFFFFFFF; + do_div(local_stat->run,(unsigned long)local_stat->total); - local_stat->cpu_demand +=local_stat->run; - local_stat->cpu_demand >>= 1; + + if (local_stat->total > 0xFFFFFFFF) //happens after very long sleep + local_stat->cpu_demand = local_stat->run; + else { + local_stat->cpu_demand += local_stat->run; + local_stat->cpu_demand >>= 1; + } local_stat->total = 0; local_stat->run = 0; } } -static inline void cpu_demand_update_run(struct ckrm_cpu_class_local_stat* local_stat, unsigned long long len) -{ - update_local_cpu_demand(local_stat,CKRM_CPU_DEMAND_RUN,len); -} - -static inline void cpu_demand_update_sleep(struct ckrm_cpu_class_local_stat* local_stat, unsigned long long len) -{ - update_local_cpu_demand(local_stat,CKRM_CPU_DEMAND_SLEEP,len); -} - -#define CPU_DEMAND_ENQUEUE 0 -#define CPU_DEMAND_DEQUEUE 1 -#define CPU_DEMAND_DESCHEDULE 2 - /** * cpu_demand_event - and cpu_demand event occured * @event: one of the following three events: @@ -131,19 +169,24 @@ static inline void cpu_demand_update_sleep(struct ckrm_cpu_class_local_stat* loc * 
CPU_DEMAND_DESCHEDULE: one task belong a certain local class deschedule * @len: valid only for CPU_DEMAND_DESCHEDULE, how long the task has been run */ -void cpu_demand_event(struct ckrm_cpu_class_local_stat* local_stat, int event, unsigned long long len) +void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len) { switch (event) { case CPU_DEMAND_ENQUEUE: len = sched_clock() - local_stat->last_sleep; local_stat->last_sleep = 0; - cpu_demand_update_sleep(local_stat,len); + update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,len); break; case CPU_DEMAND_DEQUEUE: - local_stat->last_sleep = sched_clock(); + if (! local_stat->last_sleep) { + local_stat->last_sleep = sched_clock(); + } break; case CPU_DEMAND_DESCHEDULE: - cpu_demand_update_run(local_stat,len); + update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_RUN,len); + break; + case CPU_DEMAND_INIT: //for task init only + cpu_demand_stat_init(local_stat,CPU_DEMAND_TP_TASK); break; default: BUG(); @@ -152,18 +195,19 @@ void cpu_demand_event(struct ckrm_cpu_class_local_stat* local_stat, int event, u /** * check all the class local queue - * if local queueu is not in runqueue, then it's in sleep state - * if compare to last sleep, + * + * to deal with excessive long run/sleep state + * -- whenever the the ckrm_cpu_monitor is called, check if the class is in sleep state, if yes, then update sleep record */ static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu) { - struct ckrm_cpu_class_local_stat * local_stat = &stat->local_stats[cpu]; + struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu]; unsigned long long sleep,now; if (local_stat->last_sleep) { now = sched_clock(); sleep = now - local_stat->last_sleep; local_stat->last_sleep = now; - cpu_demand_update_sleep(local_stat,sleep); + update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep); } } @@ -172,51 +216,72 @@ static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int * * self_cpu_demand = sum(cpu demand of all local queues) */ -static unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat - *stat) +static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat) { int cpu_demand = 0; int i; + int cpuonline = 0; for_each_online_cpu(i) { cpu_demand_check_sleep(stat,i); cpu_demand += stat->local_stats[i].cpu_demand; + cpuonline ++; } - if (cpu_demand > CKRM_SHARE_MAX) - cpu_demand = CKRM_SHARE_MAX; - return cpu_demand; + return (cpu_demand/cpuonline); } /* - * update effective cpu demand for each class - * assume the root_core->parent == NULL + * my max demand = min(cpu_demand, my effective hard limit) */ -static void update_cpu_demand(struct ckrm_core_class *root_core) +static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat) +{ + unsigned long mmax_demand = get_self_cpu_demand(stat); + if (mmax_demand > stat->mehl) + mmax_demand = stat->mehl; + + return mmax_demand; +} + +/** + * update_max_demand: update effective cpu demand for each class + * return -1 on error + * + * Assume: the root_core->parent == NULL + */ +static int update_max_demand(struct ckrm_core_class *root_core) { struct ckrm_core_class *cur_core, *child_core; - struct ckrm_cpu_class *cls; + struct ckrm_cpu_class *cls,*c_cls; + int ret = -1; cur_core = root_core; child_core = NULL; - /* - * iterate the tree - * update cpu_demand of each node - */ - repeat: - if (!cur_core) - return; + + repeat: + if (!cur_core) { //normal exit + ret = 0; + goto out; + } cls = 
ckrm_get_cpu_class(cur_core); + if (! cls) //invalid c_cls, abort + goto out; + if (!child_core) //first child - cls->stat.cpu_demand = get_self_cpu_demand(&cls->stat); + cls->stat.max_demand = get_mmax_demand(&cls->stat); else { - cls->stat.cpu_demand += - ckrm_get_cpu_class(child_core)->stat.cpu_demand; - if (cls->stat.cpu_demand > CKRM_SHARE_MAX) - cls->stat.cpu_demand = CKRM_SHARE_MAX; + c_cls = ckrm_get_cpu_class(child_core); + if (c_cls) + cls->stat.max_demand += c_cls->stat.max_demand; + else //invalid c_cls, abort + goto out; } + //check class hard limit + if (cls->stat.max_demand > cls->stat.ehl) + cls->stat.max_demand = cls->stat.ehl; + //next child child_core = ckrm_get_next_child(cur_core, child_core); if (child_core) { @@ -229,78 +294,116 @@ static void update_cpu_demand(struct ckrm_core_class *root_core) cur_core = child_core->hnode.parent; } goto repeat; + out: + return ret; } /**********************************************/ /* effective guarantee & limit */ /**********************************************/ -static inline void set_effective_share(struct ckrm_cpu_class_stat *stat, +static inline void set_eshare(struct ckrm_cpu_class_stat *stat, int new_share) { if (!new_share) new_share = 1; - stat->effective_share = new_share; + + BUG_ON(new_share < 0); + stat->eshare = new_share; } -static inline void set_self_effective_share(struct ckrm_cpu_class_stat *stat, +static inline void set_meshare(struct ckrm_cpu_class_stat *stat, int new_share) { if (!new_share) new_share = 1; - stat->self_effective_share = new_share; + + BUG_ON(new_share < 0); + stat->meshare = new_share; } -static inline void update_child_effective(struct ckrm_core_class *parent) +/** + *update_child_effective - update egrt, ehl, mehl for all children of parent + *@parent: the parent node + *return -1 if anything wrong + * + */ +static int update_child_effective(struct ckrm_core_class *parent) { struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent); - struct ckrm_core_class *child_core = ckrm_get_next_child(parent, NULL); + struct ckrm_core_class *child_core; + int ret = -1; + + if (! p_cls) + return ret; + child_core = ckrm_get_next_child(parent, NULL); while (child_core) { struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core); + if (! 
c_cls) + return ret; - c_cls->stat.effective_guarantee = - p_cls->stat.effective_guarantee * + c_cls->stat.egrt = + p_cls->stat.egrt * c_cls->shares.my_guarantee / p_cls->shares.total_guarantee; - c_cls->stat.effective_limit = - p_cls->stat.effective_guarantee * c_cls->shares.my_limit / - p_cls->shares.total_guarantee; + + c_cls->stat.megrt = c_cls->stat.egrt * c_cls->shares.unused_guarantee + / c_cls->shares.total_guarantee; + + c_cls->stat.ehl = + p_cls->stat.ehl * + get_hard_limit(c_cls) / p_cls->shares.total_guarantee; + + c_cls->stat.mehl = + c_cls->stat.ehl * + get_myhard_limit(c_cls) / c_cls->shares.total_guarantee; child_core = ckrm_get_next_child(parent, child_core); }; - + return 0; } -/* - * update effective guarantee and effective limit - * -- effective share = parent->effective->share * share/parent->total_share - * -- effective limit = parent->effective->share * limit/parent->total_share +/** + * update_effectives: update egrt, ehl, mehl for the whole tree * should be called only when class structure changed + * + * return -1 if anything wrong happened (eg: the structure changed during the process) */ -static void update_effective_guarantee_limit(struct ckrm_core_class *root_core) +static int update_effectives(struct ckrm_core_class *root_core) { - struct ckrm_core_class *cur_core, *child_core = NULL; + struct ckrm_core_class *cur_core, *child_core; struct ckrm_cpu_class *cls; + int ret = -1; cur_core = root_core; + child_core = NULL; cls = ckrm_get_cpu_class(cur_core); - cls->stat.effective_guarantee = CKRM_SHARE_MAX; - cls->stat.effective_limit = cls->stat.effective_guarantee; - repeat: + //initialize the effectives for root + cls->stat.egrt = CKRM_SHARE_MAX; /*egrt of the root is always 100% */ + cls->stat.megrt = cls->stat.egrt * cls->shares.unused_guarantee + / cls->shares.total_guarantee; + cls->stat.ehl = CKRM_SHARE_MAX * get_hard_limit(cls) + / cls->shares.total_guarantee; + cls->stat.mehl = cls->stat.ehl * get_myhard_limit(cls) + / cls->shares.total_guarantee; + + repeat: //check exit if (!cur_core) - return; + return 0; //visit this node - update_child_effective(cur_core); + if (update_child_effective(cur_core) < 0) + return ret; //invalid cur_core node + //next child child_core = ckrm_get_next_child(cur_core, child_core); + if (child_core) { - //go down + //go down to the next hier cur_core = child_core; child_core = NULL; - goto repeat; - } else { //no more child, go back + } else { //no more child, go back child_core = cur_core; cur_core = child_core->hnode.parent; } @@ -312,12 +415,12 @@ static void update_effective_guarantee_limit(struct ckrm_core_class *root_core) /**********************************************/ /* - * surplus = my_effective_share - demand + * surplus = egrt - demand * if surplus < 0, surplus = 0 */ static inline int get_node_surplus(struct ckrm_cpu_class *cls) { - int surplus = cls->stat.effective_guarantee - cls->stat.cpu_demand; + int surplus = cls->stat.egrt - cls->stat.max_demand; if (surplus < 0) surplus = 0; @@ -325,122 +428,199 @@ static inline int get_node_surplus(struct ckrm_cpu_class *cls) return surplus; } -/* - * consume the surplus +static inline int get_my_node_surplus(struct ckrm_cpu_class *cls) +{ + int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat); + + if (surplus < 0) + surplus = 0; + + return surplus; +} + +/** + * node_surplus_consume: consume the surplus + * @ckeck_sl: if check_sl is set, then check soft_limit + * @total_grt: total guarantee * return how much consumed - * set glut when necessary + * return -1 on 
error + * + * implements all the CKRM Scheduling Requirement + * update total_grt if necessary */ -static inline int node_surplus_consume(int old_surplus, +static inline int node_surplus_consume(int surplus, struct ckrm_core_class *child_core, - struct ckrm_cpu_class *p_cls) + struct ckrm_cpu_class *p_cls, + int check_sl + ) { int consumed = 0; int inc_limit; + int glut = 1; struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core); + int total_grt = p_cls->shares.total_guarantee; - if (c_cls->stat.glut) + BUG_ON(surplus < 0); + + if (! c_cls || ! total_grt) goto out; - //check demand - if (c_cls->stat.effective_share >= c_cls->stat.cpu_demand) { - c_cls->stat.glut = 1; + /*can't consume more than demand or hard limit*/ + if (c_cls->stat.eshare >= c_cls->stat.max_demand) goto out; - } consumed = - old_surplus * c_cls->shares.my_guarantee / - p_cls->shares.total_guarantee; + surplus * c_cls->shares.my_guarantee / total_grt; - //check limit - inc_limit = c_cls->stat.effective_limit - c_cls->stat.effective_share; - if (inc_limit <= consumed) { - c_cls->stat.glut = 1; - consumed = inc_limit; + if (! consumed) //no more share + goto out; + + //hard limit and demand limit + inc_limit = c_cls->stat.max_demand - c_cls->stat.eshare; + + if (check_sl) { + int esl = p_cls->stat.eshare * get_soft_limit(c_cls) + /p_cls->shares.total_guarantee; + if (esl < c_cls->stat.max_demand) + inc_limit = esl - c_cls->stat.eshare; } - c_cls->stat.effective_share += consumed; - out: + + if (consumed > inc_limit) + consumed = inc_limit; + else + glut = 0; + + BUG_ON(consumed < 0); + set_eshare(&c_cls->stat,c_cls->stat.eshare + consumed); + BUG_ON(c_cls->stat.eshare < 0); + + out: return consumed; } -/* - * re-allocate the shares for all the childs under this node +/** + * alloc_surplus_node: re-allocate the shares for children under parent + * @parent: parent node + * return the remaining surplus + * * task: * 1. get total surplus * 2. allocate surplus * 3. set the effective_share of each node */ -static void alloc_surplus_node(struct ckrm_core_class *parent) +static int alloc_surplus_node(struct ckrm_core_class *parent) { - int total_surplus = 0, old_surplus = 0; + int total_surplus , old_surplus; struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent); struct ckrm_core_class *child_core = NULL; int self_share; + int check_sl; + int ret = -1; + + if (! p_cls) + return ret; + + total_surplus = get_my_node_surplus(p_cls); /* - * calculate surplus - * total_surplus = sum(child_surplus) - * reset glut flag * initialize effective_share */ do { child_core = ckrm_get_next_child(parent, child_core); if (child_core) { - struct ckrm_cpu_class *c_cls = - ckrm_get_cpu_class(child_core); - ckrm_stat_t *stat = &c_cls->stat; + struct ckrm_cpu_class *c_cls; + + c_cls = ckrm_get_cpu_class(child_core); + if (! c_cls) + return ret; total_surplus += get_node_surplus(c_cls); - stat->glut = 0; - set_effective_share(stat, stat->effective_guarantee); + + set_eshare(&c_cls->stat, c_cls->stat.egrt); } } while (child_core); - /*distribute the surplus */ + if (! 
total_surplus) + goto realloc_out; + + /* distribute the surplus */ child_core = NULL; + check_sl = 1; + old_surplus = 0; do { - if (!child_core) //keep the surplus of last round + if (!child_core) {//start a new round + + //ok, everybody reached the soft limit + if (old_surplus == total_surplus) + check_sl = 0; old_surplus = total_surplus; + } child_core = ckrm_get_next_child(parent, child_core); - if (child_core) { - total_surplus -= - node_surplus_consume(old_surplus, child_core, - p_cls); + if (child_core) { + int consumed = 0; + consumed -= + node_surplus_consume(old_surplus, child_core, + p_cls,check_sl); + if (consumed >= 0) + total_surplus -= consumed; + else + return ret; } //start a new round if something is allocated in the last round - } while (child_core || (total_surplus != old_surplus)); + } while (child_core || check_sl || total_surplus != old_surplus); - //any remaining surplus goes to the default class - self_share = p_cls->stat.effective_share * + realloc_out: + /*how much for itself*/ + self_share = p_cls->stat.eshare * p_cls->shares.unused_guarantee / p_cls->shares.total_guarantee; - self_share += total_surplus; - set_self_effective_share(&p_cls->stat, self_share); + if (self_share < p_cls->stat.max_demand) { + /*any remaining surplus goes to the default class*/ + self_share += total_surplus; + if (self_share > p_cls->stat.max_demand) + self_share = p_cls->stat.max_demand; + } + + set_meshare(&p_cls->stat, self_share); + return 0; } /** * alloc_surplus - reallocate unused shares * * class A's usused share should be allocated to its siblings + * the re-allocation goes downward from the top */ -static void alloc_surplus(struct ckrm_core_class *root_core) +static int alloc_surplus(struct ckrm_core_class *root_core) { - struct ckrm_core_class *cur_core, *child_core = NULL; + struct ckrm_core_class *cur_core, *child_core; struct ckrm_cpu_class *cls; + int ret = -1; + /*initialize*/ cur_core = root_core; + child_core = NULL; cls = ckrm_get_cpu_class(cur_core); - cls->stat.glut = 0; - set_effective_share(&cls->stat, cls->stat.effective_guarantee); + + //set root eshare + set_eshare(&cls->stat, cls->stat.egrt); + + /*the ckrm idle tasks get all what's remaining*/ + /*hzheng: uncomment the following like for hard limit support */ + // update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand); + repeat: //check exit if (!cur_core) - return; + return 0; //visit this node - alloc_surplus_node(cur_core); + if ( alloc_surplus_node(cur_core) < 0 ) + return ret; + //next child child_core = ckrm_get_next_child(cur_core, child_core); if (child_core) { @@ -455,6 +635,199 @@ static void alloc_surplus(struct ckrm_core_class *root_core) goto repeat; } +/**********************************************/ +/* CKRM Idle Tasks */ +/**********************************************/ +struct ckrm_cpu_class ckrm_idle_class_obj, *ckrm_idle_class; +struct task_struct* ckrm_idle_tasks[NR_CPUS]; + +/*how many ckrm idle tasks should I wakeup*/ +static inline int get_nr_idle(unsigned long surplus) +{ + int cpu_online = cpus_weight(cpu_online_map); + int nr_idle = 0; + + nr_idle = surplus * cpu_online; + nr_idle >>= CKRM_SHARE_ACCURACY; + + if (surplus) + nr_idle ++; + + if (nr_idle > cpu_online) + nr_idle = cpu_online; + + return nr_idle; +} + +/** + * update_ckrm_idle: update the status of the idle class according to the new surplus + * surplus: new system surplus + * + * Task: + * -- update share of the idle class + * -- wakeup idle tasks according to surplus + */ +void update_ckrm_idle(unsigned long surplus) 
+{ + int nr_idle = get_nr_idle(surplus); + int i; + struct task_struct* idle_task; + + set_eshare(&ckrm_idle_class->stat,surplus); + set_meshare(&ckrm_idle_class->stat,surplus); + /*wake up nr_idle idle tasks*/ + for_each_online_cpu(i) { + idle_task = ckrm_idle_tasks[i]; + if (unlikely(idle_task->cpu_class != ckrm_idle_class)) { + ckrm_cpu_change_class(idle_task, + idle_task->cpu_class, + ckrm_idle_class); + } + if (! idle_task) + continue; + if (i < nr_idle) { + //activate it + wake_up_process(idle_task); + } else { + //deactivate it + idle_task->state = TASK_INTERRUPTIBLE; + set_tsk_need_resched(idle_task); + } + } +} + +static int ckrm_cpu_idled(void *nothing) +{ + set_user_nice(current,19); + daemonize("ckrm_idle_task"); + + //deactivate it, it will be waked up by ckrm_cpu_monitor + current->state = TASK_INTERRUPTIBLE; + schedule(); + + /*similar to cpu_idle */ + while (1) { + while (!need_resched()) { + ckrm_cpu_monitor(); + if (current_cpu_data.hlt_works_ok) { + local_irq_disable(); + if (!need_resched()) { + set_tsk_need_resched(current); + safe_halt(); + } else + local_irq_enable(); + } + } + schedule(); + } + return 0; +} + +/** + * ckrm_start_ckrm_idle: + * create the ckrm_idle_class and starts the idle tasks + * + */ +void ckrm_start_ckrm_idle(void) +{ + int i; + int ret; + ckrm_shares_t shares; + + ckrm_idle_class = &ckrm_idle_class_obj; + memset(ckrm_idle_class,0,sizeof(shares)); + /*don't care about the shares */ + init_cpu_class(ckrm_idle_class,&shares); + printk(KERN_INFO"ckrm idle class %x created\n",(int)ckrm_idle_class); + + for_each_online_cpu(i) { + ret = kernel_thread(ckrm_cpu_idled, 0, CLONE_KERNEL); + + /*warn on error, but the system should still work without it*/ + if (ret < 0) + printk(KERN_ERR"Warn: can't start ckrm idle tasks\n"); + else { + ckrm_idle_tasks[i] = find_task_by_pid(ret); + if (!ckrm_idle_tasks[i]) + printk(KERN_ERR"Warn: can't find ckrm idle tasks %d\n",ret); + } + } +} + +/**********************************************/ +/* Local Weight */ +/**********************************************/ +/** + * adjust_class_local_weight: adjust the local weight for each cpu + * + * lrq->weight = lpr->pressure * class->weight / total_pressure + */ +static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online) +{ + unsigned long total_pressure = 0; + ckrm_lrq_t* lrq; + int i; + unsigned long class_weight; + unsigned long long lw; + + //get total pressure + for_each_online_cpu(i) { + lrq = get_ckrm_lrq(clsptr,i); + total_pressure += lrq->lrq_load; + } + + if (! total_pressure) + return; + + class_weight = cpu_class_weight(clsptr) * cpu_online; + + /* + * update weight for each cpu, minimun is 1 + */ + for_each_online_cpu(i) { + lrq = get_ckrm_lrq(clsptr,i); + if (! lrq->lrq_load) + /*give idle class a high share to boost interactiveness */ + lw = cpu_class_weight(clsptr); + else { + lw = lrq->lrq_load * class_weight; + do_div(lw,total_pressure); + if (!lw) + lw = 1; + else if (lw > CKRM_SHARE_MAX) + lw = CKRM_SHARE_MAX; + } + + lrq->local_weight = lw; + } +} + +/* + * assume called with class_list_lock read lock held + */ +void adjust_local_weight(void) +{ + static spinlock_t lock = SPIN_LOCK_UNLOCKED; + struct ckrm_cpu_class *clsptr; + int cpu_online; + + //do nothing if someone already holding the lock + if (! 
spin_trylock(&lock)) + return; + + cpu_online = cpus_weight(cpu_online_map); + + //class status: demand, share,total_ns prio, index + list_for_each_entry(clsptr,&active_cpu_classes,links) { + adjust_lrq_weight(clsptr,cpu_online); + } + + spin_unlock(&lock); +} + +/**********************************************/ +/* Main */ +/**********************************************/ /** *ckrm_cpu_monitor - adjust relative shares of the classes based on their progress * @@ -464,13 +837,43 @@ static void alloc_surplus(struct ckrm_core_class *root_core) */ void ckrm_cpu_monitor(void) { - struct ckrm_core_class *root_core = default_cpu_class->core; + static spinlock_t lock = SPIN_LOCK_UNLOCKED; + static unsigned long long last_check = 0; + struct ckrm_core_class *root_core = get_default_cpu_class()->core; + unsigned long long now; +#define MIN_CPU_MONITOR_INTERVAL 100000000UL + if (!root_core) return; - update_effective_guarantee_limit(root_core); - update_cpu_demand(root_core); - alloc_surplus(root_core); + //do nothing if someone already holding the lock + if (! spin_trylock(&lock)) + return; + + read_lock(&class_list_lock); + + now = sched_clock(); + + //consecutive check should be at least 100ms apart + if (now - last_check < MIN_CPU_MONITOR_INTERVAL) { + goto outunlock; + } + last_check = now; + + if (update_effectives(root_core) != 0) + goto outunlock; + + if (update_max_demand(root_core) != 0) + goto outunlock; + + if (alloc_surplus(root_core) != 0) + goto outunlock; + + adjust_local_weight(); + + outunlock: + read_unlock(&class_list_lock); + spin_unlock(&lock); } /*****************************************************/ @@ -481,14 +884,11 @@ static int thread_exit = 0; static int ckrm_cpu_monitord(void *nothing) { - wait_queue_head_t wait; - - init_waitqueue_head(&wait); - daemonize("ckrm_cpu_ctrld"); for (;;) { /*sleep for sometime before next try*/ - interruptible_sleep_on_timeout(&wait, CPU_MONITOR_INTERVAL); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(CPU_MONITOR_INTERVAL); ckrm_cpu_monitor(); if (thread_exit) { break; @@ -510,15 +910,14 @@ void ckrm_start_monitor(void) void ckrm_kill_monitor(void) { - wait_queue_head_t wait; int interval = HZ; - init_waitqueue_head(&wait); printk("killing process %d\n", cpu_monitor_pid); if (cpu_monitor_pid > 0) { thread_exit = 1; while (thread_exit != 2) { - interruptible_sleep_on_timeout(&wait, interval); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(CPU_MONITOR_INTERVAL); } } } @@ -526,6 +925,8 @@ void ckrm_kill_monitor(void) int ckrm_cpu_monitor_init(void) { ckrm_start_monitor(); + /*hzheng: uncomment the following like for hard limit support */ + // ckrm_start_ckrm_idle(); return 0; } diff --git a/kernel/ckrm_classqueue.c b/kernel/ckrm_classqueue.c index 1929aaf4e..0400844a3 100644 --- a/kernel/ckrm_classqueue.c +++ b/kernel/ckrm_classqueue.c @@ -133,12 +133,16 @@ void classqueue_update_prio(struct classqueue_struct *cq, //add to new positon, round robin for classes with same priority list_add_tail(&(node->list), &cq->array.queue[index]); - __set_bit(index, cq->array.bitmap); - + __set_bit(index, cq->array.bitmap); node->index = index; } -cq_node_t *classqueue_get_head(struct classqueue_struct *cq) +/** + *classqueue_get_min_prio: return the priority of the last node in queue + * + * this function can be called without runqueue lock held + */ +static inline int classqueue_get_min_prio(struct classqueue_struct *cq) { cq_node_t *result = NULL; int pos; @@ -147,9 +151,36 @@ cq_node_t *classqueue_get_head(struct classqueue_struct 
*cq) * search over the bitmap to get the first class in the queue */ pos = find_next_bit(cq->array.bitmap, CLASSQUEUE_SIZE, cq->base_offset); - if (pos >= CLASSQUEUE_SIZE) { //do circular search from the beginning + //do circular search from the beginning + if (pos >= CLASSQUEUE_SIZE) pos = find_first_bit(cq->array.bitmap, CLASSQUEUE_SIZE); + + if (pos < CLASSQUEUE_SIZE) { + result = list_entry(cq->array.queue[pos].next, cq_node_t, list); + if (list_empty(&cq->array.queue[pos])) + result = NULL; } + if (result) + return result->prio; + else + return 0; +} + +/** + * this function must be called with runqueue lock held + */ +cq_node_t *classqueue_get_head(struct classqueue_struct *cq) +{ + cq_node_t *result = NULL; + int pos; + + /* + * search over the bitmap to get the first class in the queue + */ + pos = find_next_bit(cq->array.bitmap, CLASSQUEUE_SIZE, cq->base_offset); + //do circular search from the beginning + if (pos >= CLASSQUEUE_SIZE) + pos = find_first_bit(cq->array.bitmap, CLASSQUEUE_SIZE); if (pos < CLASSQUEUE_SIZE) { BUG_ON(list_empty(&cq->array.queue[pos])); @@ -162,15 +193,17 @@ cq_node_t *classqueue_get_head(struct classqueue_struct *cq) * Moving the end of queue forward * the new_base here is logical, we need to translate to the abosule position */ -void classqueue_update_base(struct classqueue_struct *cq, int new_base) +void classqueue_update_base(struct classqueue_struct *cq) { - if (!cq_nr_member(cq)) { + int new_base; + + if (! cq_nr_member(cq)) { cq->base_offset = -1; //not defined return; } - // assert(new_base >= cq->base); - + new_base = classqueue_get_min_prio(cq); + if (new_base > cq->base) { cq->base_offset = get_index(cq, &new_base); cq->base = new_base; diff --git a/kernel/ckrm_sched.c b/kernel/ckrm_sched.c index ba716d4c5..9c653a3b6 100644 --- a/kernel/ckrm_sched.c +++ b/kernel/ckrm_sched.c @@ -15,57 +15,192 @@ #include #include +rwlock_t class_list_lock = RW_LOCK_UNLOCKED; +LIST_HEAD(active_cpu_classes); // list of active cpu classes; anchor + +struct ckrm_cpu_class default_cpu_class_obj; + +struct ckrm_cpu_class * get_default_cpu_class(void) { + return (&default_cpu_class_obj); +} + /*******************************************************/ /* CVT Management */ /*******************************************************/ -#define CVT_WINDOW_SIZE (CLASSQUEUE_SIZE << CLASS_BONUS_RATE) -static CVT_t max_CVT = CVT_WINDOW_SIZE; -/* - * Also ensure that the classes global cvt is upgraded to the - * minimum CVT in the system, as a class might not have run for a while - */ -static void update_global_cvt(struct ckrm_cpu_class *cpu_class, int cpu) +static inline void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt) { - struct ckrm_local_runqueue *class_queue = - get_ckrm_local_runqueue(cpu_class, cpu); CVT_t min_cvt; - CVT_t local_cvt_old = class_queue->local_cvt; + CVT_t bonus; - spin_lock(&cvt_lock); - if (class_queue->uncounted_cvt) { - cpu_class->global_cvt += class_queue->uncounted_cvt; - class_queue->uncounted_cvt = 0; - } - min_cvt = max_CVT - CVT_WINDOW_SIZE; - if (cpu_class->global_cvt < min_cvt) - cpu_class->global_cvt = min_cvt; - else if (cpu_class->global_cvt > max_CVT) - max_CVT = cpu_class->global_cvt; - -/* update local cvt from global cvt*/ -#if 0 - class_queue->local_cvt = cpu_class->global_cvt; -#endif - spin_unlock(&cvt_lock); - - if (class_queue->local_cvt != local_cvt_old) - update_class_priority(class_queue); + //just a safty measure + if (unlikely(! 
cur_cvt)) + return; + + /* + * Always leaving a small bonus for inactive classes + * allows them to compete for cycles immediately when the become + * active. This should improve interactive behavior + */ + bonus = INTERACTIVE_BONUS(lrq); + //cvt can't be negative + if (cur_cvt > bonus) + min_cvt = cur_cvt - bonus; + else + min_cvt = 0; + + if (lrq->local_cvt < min_cvt) { + CVT_t lost_cvt; + + lost_cvt = scale_cvt(min_cvt - lrq->local_cvt,lrq); + lrq->local_cvt = min_cvt; + + /* add what the class lost to its savings*/ + lrq->savings += lost_cvt; + if (lrq->savings > MAX_SAVINGS) + lrq->savings = MAX_SAVINGS; + } else if (lrq->savings) { + /* + *if a class saving and falling behind + * then start to use it saving in a leaking bucket way + */ + CVT_t savings_used; + + savings_used = scale_cvt((lrq->local_cvt - min_cvt),lrq); + if (savings_used > lrq->savings) + savings_used = lrq->savings; + + if (savings_used > SAVINGS_LEAK_SPEED) + savings_used = SAVINGS_LEAK_SPEED; + + BUG_ON(lrq->savings < savings_used); + lrq->savings -= savings_used; + unscale_cvt(savings_used,lrq); + BUG_ON(lrq->local_cvt < savings_used); + // lrq->local_cvt -= savings_used; + } } /* + * return the max_cvt of all the classes + */ +static inline CVT_t get_max_cvt(int this_cpu) +{ + struct ckrm_cpu_class *clsptr; + ckrm_lrq_t * lrq; + CVT_t max_cvt; + + max_cvt = 0; + + /*update class time, at the same time get max_cvt */ + list_for_each_entry(clsptr, &active_cpu_classes, links) { + lrq = get_ckrm_lrq(clsptr, this_cpu); + if (lrq->local_cvt > max_cvt) + max_cvt = lrq->local_cvt; + } + + return max_cvt; +} + +/** + * update_class_cputime - updates cvt of inactive classes + * -- an inactive class shouldn't starve others when it comes back + * -- the cpu time it lost when it's inactive should be accumulated + * -- its accumulated saving should be compensated (in a leaky bucket fashion) + * * class_list_lock must have been acquired */ -void update_global_cvts(int this_cpu) +void update_class_cputime(int this_cpu) { struct ckrm_cpu_class *clsptr; - struct ckrm_local_runqueue *class_queue; + ckrm_lrq_t * lrq; + CVT_t cur_cvt; + + /* + * a class's local_cvt must not be significantly smaller than min_cvt + * of active classes otherwise, it will starve other classes when it + * is reactivated. + * + * Hence we keep all local_cvt's within a range of the min_cvt off + * all active classes (approximated by the local_cvt of the currently + * running class) and account for how many cycles where thus taken + * from an inactive class building a savings (not to exceed a few seconds) + * for a class to gradually make up upon reactivation, without + * starvation of other classes. + * + */ + cur_cvt = get_local_cur_cvt(this_cpu); - /*for each class*/ + /* + * cur_cvt == 0 means the system is now idle + * in this case, we use max_cvt as cur_cvt + * max_cvt roughly represents the cvt of the class + * that has just finished running + * + * fairness wouldn't be a problem since we account for whatever lost in savings + * if the system is not busy, the system responsiveness is not a problem. + * still fine if the sytem is busy, but happened to be idle at this certain point + * since bias toward interactive classes (class priority) is a more important way to improve system responsiveness + */ + if (unlikely(! 
cur_cvt)) { + cur_cvt = get_max_cvt(this_cpu); + //return; + } + + /* + * - check the local cvt of all the classes + * - update total_ns received by the class + * - do a usage sampling for the whole class + */ list_for_each_entry(clsptr, &active_cpu_classes, links) { - update_global_cvt(clsptr, this_cpu); - class_queue = get_ckrm_local_runqueue(clsptr, this_cpu); - clsptr->stat.total_ns += class_queue->uncounted_ns; - class_queue->uncounted_ns = 0; + lrq = get_ckrm_lrq(clsptr, this_cpu); + + spin_lock(&clsptr->stat.stat_lock); + clsptr->stat.total_ns += lrq->uncounted_ns; + ckrm_sample_usage(clsptr); + spin_unlock(&clsptr->stat.stat_lock); + lrq->uncounted_ns = 0; + + check_inactive_class(lrq,cur_cvt); } } + +/*******************************************************/ +/* PID load balancing stuff */ +/*******************************************************/ +#define PID_SAMPLE_T 32 +#define PID_KP 20 +#define PID_KI 60 +#define PID_KD 20 + +/** + * sample pid load periodically + */ +void ckrm_load_sample(ckrm_load_t* pid,int cpu) +{ + long load; + long err; + + if (jiffies % PID_SAMPLE_T) + return; + + adjust_local_weight(); + + load = ckrm_cpu_load(cpu); + err = load - pid->load_p; + pid->load_d = err; + pid->load_p = load; + pid->load_i *= 9; + pid->load_i += load; + pid->load_i /= 10; +} + +long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group) +{ + long pressure; + pressure = ckrm_load->load_p * PID_KP; + pressure += ckrm_load->load_i * PID_KI; + pressure += ckrm_load->load_d * PID_KD; + pressure /= 100; + return pressure; +} diff --git a/kernel/sched.c b/kernel/sched.c index 74a53bf05..947dda24e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -163,6 +163,21 @@ EXPORT_SYMBOL(dump_oncpu); #define LOW_CREDIT(p) \ ((p)->interactive_credit < -CREDIT_LIMIT) +#ifdef CONFIG_CKRM_CPU_SCHEDULE +/* + * if belong to different class, compare class priority + * otherwise compare task priority + */ +#define TASK_PREEMPTS_CURR(p, rq) \ + ( ((p)->cpu_class != (rq)->curr->cpu_class) \ + && ((rq)->curr != (rq)->idle) && ((p) != (rq)->idle )) \ + ? class_preempts_curr((p),(rq)->curr) \ + : ((p)->prio < (rq)->curr->prio) +#else +#define TASK_PREEMPTS_CURR(p, rq) \ + ((p)->prio < (rq)->curr->prio) +#endif + /* * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ] * to time slice values. @@ -178,14 +193,71 @@ EXPORT_SYMBOL(dump_oncpu); ((MAX_TIMESLICE - MIN_TIMESLICE) * \ (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1))) -static unsigned int task_timeslice(task_t *p) +unsigned int task_timeslice(task_t *p) { return BASE_TIMESLICE(p); } #define task_hot(p, now, sd) ((now) - (p)->timestamp < (sd)->cache_hot_time) -DEFINE_PER_CPU(struct runqueue, runqueues); +/* + * These are the runqueue data structures: + */ + +typedef struct runqueue runqueue_t; +#include +#include + +/* + * This is the main, per-CPU runqueue data structure. + * + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the thread migration code), lock + * acquire operations must be ordered by ascending &runqueue. + */ +struct runqueue { + spinlock_t lock; + + /* + * nr_running and cpu_load should be in the same cacheline because + * remote CPUs use both these fields when doing load calculation. 
+ */ + unsigned long nr_running; +#if defined(CONFIG_SMP) + unsigned long cpu_load; +#endif + unsigned long long nr_switches, nr_preempt; + unsigned long expired_timestamp, nr_uninterruptible; + unsigned long long timestamp_last_tick; + task_t *curr, *idle; + struct mm_struct *prev_mm; +#ifdef CONFIG_CKRM_CPU_SCHEDULE + struct classqueue_struct classqueue; + ckrm_load_t ckrm_load; +#else + prio_array_t *active, *expired, arrays[2]; +#endif + int best_expired_prio; + atomic_t nr_iowait; + +#ifdef CONFIG_SMP + struct sched_domain *sd; + + /* For active balancing */ + int active_balance; + int push_cpu; + + task_t *migration_thread; + struct list_head migration_queue; +#endif + +#ifdef CONFIG_VSERVER_HARDCPU + struct list_head hold_queue; + int idle_tokens; +#endif +}; + +static DEFINE_PER_CPU(struct runqueue, runqueues); #define for_each_domain(cpu, domain) \ for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) @@ -204,95 +276,86 @@ DEFINE_PER_CPU(struct runqueue, runqueues); # define task_running(rq, p) ((rq)->curr == (p)) #endif -#ifdef CONFIG_CKRM_CPU_SCHEDULE -#include -spinlock_t cvt_lock = SPIN_LOCK_UNLOCKED; -rwlock_t class_list_lock = RW_LOCK_UNLOCKED; -LIST_HEAD(active_cpu_classes); // list of active cpu classes; anchor -struct ckrm_cpu_class default_cpu_class_obj; - /* - * the minimum CVT allowed is the base_cvt - * otherwise, it will starve others + * task_rq_lock - lock the runqueue a given task resides on and disable + * interrupts. Note the ordering: we can safely lookup the task_rq without + * explicitly disabling preemption. */ -CVT_t get_min_cvt(int cpu) +static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) { - cq_node_t *node; - struct ckrm_local_runqueue * lrq; - CVT_t min_cvt; + struct runqueue *rq; - node = classqueue_get_head(bpt_queue(cpu)); - lrq = (node) ? class_list_entry(node) : NULL; - - if (lrq) - min_cvt = lrq->local_cvt; - else - min_cvt = 0; - - return min_cvt; +repeat_lock_task: + local_irq_save(*flags); + rq = task_rq(p); + spin_lock(&rq->lock); + if (unlikely(rq != task_rq(p))) { + spin_unlock_irqrestore(&rq->lock, *flags); + goto repeat_lock_task; + } + return rq; +} + +static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) +{ + spin_unlock_irqrestore(&rq->lock, *flags); } /* - * update the classueue base for all the runqueues - * TODO: we can only update half of the min_base to solve the movebackward issue + * rq_lock - lock a given runqueue and disable interrupts. */ -static inline void check_update_class_base(int this_cpu) { - unsigned long min_base = 0xFFFFFFFF; - cq_node_t *node; - int i; +static runqueue_t *this_rq_lock(void) +{ + runqueue_t *rq; - if (! cpu_online(this_cpu)) return; + local_irq_disable(); + rq = this_rq(); + spin_lock(&rq->lock); - /* - * find the min_base across all the processors - */ - for_each_online_cpu(i) { - /* - * I should change it to directly use bpt->base - */ - node = classqueue_get_head(bpt_queue(i)); - if (node && node->prio < min_base) { - min_base = node->prio; - } - } - if (min_base != 0xFFFFFFFF) - classqueue_update_base(bpt_queue(this_cpu),min_base); + return rq; } -static inline void ckrm_rebalance_tick(int j,int this_cpu) +static inline void rq_unlock(runqueue_t *rq) { -#ifdef CONFIG_CKRM_CPU_SCHEDULE - read_lock(&class_list_lock); - if (!(j % CVT_UPDATE_TICK)) - update_global_cvts(this_cpu); - -#define CKRM_BASE_UPDATE_RATE 400 - if (! 
(jiffies % CKRM_BASE_UPDATE_RATE)) - check_update_class_base(this_cpu); - - read_unlock(&class_list_lock); -#endif + spin_unlock_irq(&rq->lock); } -static inline struct ckrm_local_runqueue *rq_get_next_class(struct runqueue *rq) +#ifdef CONFIG_CKRM_CPU_SCHEDULE +static inline ckrm_lrq_t *rq_get_next_class(struct runqueue *rq) { cq_node_t *node = classqueue_get_head(&rq->classqueue); return ((node) ? class_list_entry(node) : NULL); } +/* + * return the cvt of the current running class + * if no current running class, return 0 + * assume cpu is valid (cpu_online(cpu) == 1) + */ +CVT_t get_local_cur_cvt(int cpu) +{ + ckrm_lrq_t * lrq = rq_get_next_class(cpu_rq(cpu)); + + if (lrq) + return lrq->local_cvt; + else + return 0; +} + static inline struct task_struct * rq_get_next_task(struct runqueue* rq) { prio_array_t *array; struct task_struct *next; - struct ckrm_local_runqueue *queue; + ckrm_lrq_t *queue; + int idx; int cpu = smp_processor_id(); next = rq->idle; retry_next_class: if ((queue = rq_get_next_class(rq))) { - array = queue->active; //check switch active/expired queue - if (unlikely(!queue->active->nr_active)) { + array = queue->active; + if (unlikely(!array->nr_active)) { queue->active = queue->expired; queue->expired = array; queue->expired_timestamp = 0; @@ -305,20 +368,20 @@ static inline struct task_struct * rq_get_next_task(struct runqueue* rq) &queue->classqueue_linkobj); cpu_demand_event(get_rq_local_stat(queue,cpu),CPU_DEMAND_DEQUEUE,0); } - goto retry_next_class; } - BUG_ON(!queue->active->nr_active); - next = task_list_entry(array->queue[queue->top_priority].next); + BUG_ON(!array->nr_active); + + idx = queue->top_priority; + if (queue->top_priority == MAX_PRIO) { + BUG_ON(1); + } + + next = task_list_entry(array->queue[idx].next); } return next; } - -static inline void rq_load_inc(runqueue_t *rq, struct task_struct *p) { rq->ckrm_cpu_load += cpu_class_weight(p->cpu_class); } -static inline void rq_load_dec(runqueue_t *rq, struct task_struct *p) { rq->ckrm_cpu_load -= cpu_class_weight(p->cpu_class); } - -#else /*CONFIG_CKRM_CPU_SCHEDULE*/ - +#else /*! CONFIG_CKRM_CPU_SCHEDULE*/ static inline struct task_struct * rq_get_next_task(struct runqueue* rq) { prio_array_t *array; @@ -345,59 +408,14 @@ static inline struct task_struct * rq_get_next_task(struct runqueue* rq) static inline void class_enqueue_task(struct task_struct* p, prio_array_t *array) { } static inline void class_dequeue_task(struct task_struct* p, prio_array_t *array) { } static inline void init_cpu_classes(void) { } -static inline void rq_load_inc(runqueue_t *rq, struct task_struct *p) { } -static inline void rq_load_dec(runqueue_t *rq, struct task_struct *p) { } +#define rq_ckrm_load(rq) NULL +static inline void ckrm_sched_tick(int j,int this_cpu,void* name) {} #endif /* CONFIG_CKRM_CPU_SCHEDULE */ - -/* - * task_rq_lock - lock the runqueue a given task resides on and disable - * interrupts. Note the ordering: we can safely lookup the task_rq without - * explicitly disabling preemption. - */ -runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) -{ - struct runqueue *rq; - -repeat_lock_task: - local_irq_save(*flags); - rq = task_rq(p); - spin_lock(&rq->lock); - if (unlikely(rq != task_rq(p))) { - spin_unlock_irqrestore(&rq->lock, *flags); - goto repeat_lock_task; - } - return rq; -} - -void task_rq_unlock(runqueue_t *rq, unsigned long *flags) -{ - spin_unlock_irqrestore(&rq->lock, *flags); -} - -/* - * rq_lock - lock a given runqueue and disable interrupts. 
- */ -static runqueue_t *this_rq_lock(void) -{ - runqueue_t *rq; - - local_irq_disable(); - rq = this_rq(); - spin_lock(&rq->lock); - - return rq; -} - -static inline void rq_unlock(runqueue_t *rq) -{ - spin_unlock_irq(&rq->lock); -} - /* * Adding/removing a task to/from a priority array: */ -void dequeue_task(struct task_struct *p, prio_array_t *array) +static void dequeue_task(struct task_struct *p, prio_array_t *array) { BUG_ON(! array); array->nr_active--; @@ -407,7 +425,7 @@ void dequeue_task(struct task_struct *p, prio_array_t *array) class_dequeue_task(p,array); } -void enqueue_task(struct task_struct *p, prio_array_t *array) +static void enqueue_task(struct task_struct *p, prio_array_t *array) { list_add_tail(&p->run_list, array->queue + p->prio); __set_bit(p->prio, array->bitmap); @@ -471,7 +489,6 @@ static inline void __activate_task(task_t *p, runqueue_t *rq) { enqueue_task(p, rq_active(p,rq)); rq->nr_running++; - rq_load_inc(rq,p); } /* @@ -481,7 +498,6 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq) { enqueue_task_head(p, rq_active(p,rq)); rq->nr_running++; - rq_load_inc(rq,p); } static void recalc_task_prio(task_t *p, unsigned long long now) @@ -613,7 +629,6 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) static void deactivate_task(struct task_struct *p, runqueue_t *rq) { rq->nr_running--; - rq_load_dec(rq,p); if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible++; dequeue_task(p, p->array); @@ -987,6 +1002,10 @@ void fastcall sched_fork(task_t *p) INIT_LIST_HEAD(&p->run_list); p->array = NULL; spin_lock_init(&p->switch_lock); +#ifdef CONFIG_CKRM_CPU_SCHEDULE + cpu_demand_event(&p->demand_stat,CPU_DEMAND_INIT,0); +#endif + #ifdef CONFIG_PREEMPT /* * During context-switch we hold precisely one spinlock, which @@ -1062,7 +1081,7 @@ void fastcall wake_up_forked_process(task_t * p) p->array = current->array; p->array->nr_active++; rq->nr_running++; - rq_load_inc(rq,p); + class_enqueue_task(p,p->array); } task_rq_unlock(rq, &flags); } @@ -1395,7 +1414,7 @@ lock_again: p->array = current->array; p->array->nr_active++; rq->nr_running++; - rq_load_inc(rq,p); + class_enqueue_task(p,p->array); } } else { /* Not the local CPU - must adjust timestamp */ @@ -1500,13 +1519,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, { dequeue_task(p, src_array); src_rq->nr_running--; - rq_load_dec(src_rq,p); - set_task_cpu(p, this_cpu); this_rq->nr_running++; - rq_load_inc(this_rq,p); enqueue_task(p, this_array); - p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + this_rq->timestamp_last_tick; /* @@ -1546,133 +1561,61 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, } #ifdef CONFIG_CKRM_CPU_SCHEDULE - -struct ckrm_cpu_class *find_unbalanced_class(int busiest_cpu, int this_cpu, unsigned long *cls_imbalance) +static inline int ckrm_preferred_task(task_t *tmp,long min, long max, + int phase, enum idle_type idle) { - struct ckrm_cpu_class *most_unbalanced_class = NULL; - struct ckrm_cpu_class *clsptr; - int max_unbalance = 0; - - list_for_each_entry(clsptr,&active_cpu_classes,links) { - struct ckrm_local_runqueue *this_lrq = get_ckrm_local_runqueue(clsptr,this_cpu); - struct ckrm_local_runqueue *busiest_lrq = get_ckrm_local_runqueue(clsptr,busiest_cpu); - int unbalance_degree; - - unbalance_degree = (local_queue_nr_running(busiest_lrq) - local_queue_nr_running(this_lrq)) * cpu_class_weight(clsptr); - if (unbalance_degree >= *cls_imbalance) - continue; // already looked at this class + long pressure = 
task_load(tmp); + + if (pressure > max) + return 0; - if (unbalance_degree > max_unbalance) { - max_unbalance = unbalance_degree; - most_unbalanced_class = clsptr; - } - } - *cls_imbalance = max_unbalance; - return most_unbalanced_class; + if ((idle == NOT_IDLE) && ! phase && (pressure <= min)) + return 0; + return 1; } - /* - * find_busiest_queue - find the busiest runqueue among the cpus in cpumask. + * move tasks for a specic local class + * return number of tasks pulled */ -static int find_busiest_cpu(runqueue_t *this_rq, int this_cpu, int idle, - int *imbalance) +static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq, + runqueue_t *this_rq, + runqueue_t *busiest, + struct sched_domain *sd, + int this_cpu, + enum idle_type idle, + long* pressure_imbalance) { - int cpu_load, load, max_load, i, busiest_cpu; - runqueue_t *busiest, *rq_src; - - - /*Hubertus ... the concept of nr_running is replace with cpu_load */ - cpu_load = this_rq->ckrm_cpu_load; - - busiest = NULL; - busiest_cpu = -1; - - max_load = -1; - for_each_online_cpu(i) { - rq_src = cpu_rq(i); - load = rq_src->ckrm_cpu_load; - - if ((load > max_load) && (rq_src != this_rq)) { - busiest = rq_src; - busiest_cpu = i; - max_load = load; - } - } - - if (likely(!busiest)) - goto out; - - *imbalance = max_load - cpu_load; - - /* It needs an at least ~25% imbalance to trigger balancing. */ - if (!idle && ((*imbalance)*4 < max_load)) { - busiest = NULL; - goto out; - } - - double_lock_balance(this_rq, busiest); - /* - * Make sure nothing changed since we checked the - * runqueue length. - */ - if (busiest->ckrm_cpu_load <= cpu_load) { - spin_unlock(&busiest->lock); - busiest = NULL; - } -out: - return (busiest ? busiest_cpu : -1); -} - -static int load_balance(int this_cpu, runqueue_t *this_rq, - struct sched_domain *sd, enum idle_type idle) -{ - int imbalance, idx; - int busiest_cpu; - runqueue_t *busiest; - prio_array_t *array; + prio_array_t *array, *dst_array; struct list_head *head, *curr; task_t *tmp; - struct ckrm_local_runqueue * busiest_local_queue; - struct ckrm_cpu_class *clsptr; - int weight; - unsigned long cls_imbalance; // so we can retry other classes - - // need to update global CVT based on local accumulated CVTs - read_lock(&class_list_lock); - busiest_cpu = find_busiest_cpu(this_rq, this_cpu, idle, &imbalance); - if (busiest_cpu == -1) - goto out; - - busiest = cpu_rq(busiest_cpu); - - /* - * We only want to steal a number of tasks equal to 1/2 the imbalance, - * otherwise we'll just shift the imbalance to the new queue: - */ - imbalance /= 2; - - /* now find class on that runqueue with largest inbalance */ - cls_imbalance = 0xFFFFFFFF; - - retry_other_class: - clsptr = find_unbalanced_class(busiest_cpu, this_cpu, &cls_imbalance); - if (!clsptr) - goto out_unlock; - - busiest_local_queue = get_ckrm_local_runqueue(clsptr,busiest_cpu); - weight = cpu_class_weight(clsptr); - + int idx; + int pulled = 0; + int phase = -1; + long pressure_min, pressure_max; + /*hzheng: magic : 90% balance is enough*/ + long balance_min = *pressure_imbalance / 10; +/* + * we don't want to migrate tasks that will reverse the balance + * or the tasks that make too small difference + */ +#define CKRM_BALANCE_MAX_RATIO 100 +#define CKRM_BALANCE_MIN_RATIO 1 + start: + phase ++; /* * We first consider expired tasks. Those will likely not be * executed in the near future, and they are most likely to * be cache-cold, thus switching CPUs has the least effect * on them. 
*/ - if (busiest_local_queue->expired->nr_active) - array = busiest_local_queue->expired; - else - array = busiest_local_queue->active; + if (src_lrq->expired->nr_active) { + array = src_lrq->expired; + dst_array = dst_lrq->expired; + } else { + array = src_lrq->active; + dst_array = dst_lrq->active; + } new_array: /* Start searching at priority 0: */ @@ -1683,11 +1626,15 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, else idx = find_next_bit(array->bitmap, MAX_PRIO, idx); if (idx >= MAX_PRIO) { - if (array == busiest_local_queue->expired && busiest_local_queue->active->nr_active) { - array = busiest_local_queue->active; + if (array == src_lrq->expired && src_lrq->active->nr_active) { + array = src_lrq->active; + dst_array = dst_lrq->active; goto new_array; } - goto retry_other_class; + if ((! phase) && (! pulled) && (idle != IDLE)) + goto start; //try again + else + goto out; //finished search for this lrq } head = array->queue + idx; @@ -1697,42 +1644,365 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, curr = curr->prev; - if (!can_migrate_task(tmp, busiest, this_cpu, sd,idle)) { + if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { if (curr != head) goto skip_queue; idx++; goto skip_bitmap; } - pull_task(busiest, array, tmp, this_rq, rq_active(tmp,this_rq),this_cpu); + + pressure_min = *pressure_imbalance * CKRM_BALANCE_MIN_RATIO/100; + pressure_max = *pressure_imbalance * CKRM_BALANCE_MAX_RATIO/100; /* - * tmp BUG FIX: hzheng - * load balancing can make the busiest local queue empty - * thus it should be removed from bpt + * skip the tasks that will reverse the balance too much */ - if (! local_queue_nr_running(busiest_local_queue)) { - classqueue_dequeue(busiest_local_queue->classqueue,&busiest_local_queue->classqueue_linkobj); - cpu_demand_event(get_rq_local_stat(busiest_local_queue,busiest_cpu),CPU_DEMAND_DEQUEUE,0); + if (ckrm_preferred_task(tmp,pressure_min,pressure_max,phase,idle)) { + *pressure_imbalance -= task_load(tmp); + pull_task(busiest, array, tmp, + this_rq, dst_array, this_cpu); + pulled++; + + if (*pressure_imbalance <= balance_min) + goto out; } + + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + out: + return pulled; +} - imbalance -= weight; - if (!idle && (imbalance>0)) { - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; +static inline long ckrm_rq_imbalance(runqueue_t *this_rq,runqueue_t *dst_rq) +{ + long imbalance; + /* + * make sure after balance, imbalance' > - imbalance/2 + * we don't want the imbalance be reversed too much + */ + imbalance = pid_get_pressure(rq_ckrm_load(dst_rq),0) + - pid_get_pressure(rq_ckrm_load(this_rq),1); + imbalance /= 2; + return imbalance; +} + +/* + * try to balance the two runqueues + * + * Called with both runqueues locked. 
+ * if move_tasks is called, it will try to move at least one task over + */ +static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, + unsigned long max_nr_move, struct sched_domain *sd, + enum idle_type idle) +{ + struct ckrm_cpu_class *clsptr,*vip_cls = NULL; + ckrm_lrq_t* src_lrq,*dst_lrq; + long pressure_imbalance, pressure_imbalance_old; + int src_cpu = task_cpu(busiest->curr); + struct list_head *list; + int pulled = 0; + long imbalance; + + imbalance = ckrm_rq_imbalance(this_rq,busiest); + + if ((idle == NOT_IDLE && imbalance <= 0) || busiest->nr_running <= 1) + goto out; + + //try to find the vip class + list_for_each_entry(clsptr,&active_cpu_classes,links) { + src_lrq = get_ckrm_lrq(clsptr,src_cpu); + + if (! lrq_nr_running(src_lrq)) + continue; + + if (! vip_cls || cpu_class_weight(vip_cls) < cpu_class_weight(clsptr) ) + { + vip_cls = clsptr; + } } - out_unlock: - spin_unlock(&busiest->lock); + + /* + * do search from the most significant class + * hopefully, less tasks will be migrated this way + */ + clsptr = vip_cls; + + move_class: + if (! clsptr) + goto out; + + + src_lrq = get_ckrm_lrq(clsptr,src_cpu); + if (! lrq_nr_running(src_lrq)) + goto other_class; + + dst_lrq = get_ckrm_lrq(clsptr,this_cpu); + + //how much pressure for this class should be transferred + pressure_imbalance = src_lrq->lrq_load * imbalance/src_lrq->local_weight; + if (pulled && ! pressure_imbalance) + goto other_class; + + pressure_imbalance_old = pressure_imbalance; + + //move tasks + pulled += + ckrm_cls_move_tasks(src_lrq,dst_lrq, + this_rq, + busiest, + sd,this_cpu,idle, + &pressure_imbalance); + + /* + * hzheng: 2 is another magic number + * stop balancing if the imbalance is less than 25% of the orig + */ + if (pressure_imbalance <= (pressure_imbalance_old >> 2)) + goto out; + + //update imbalance + imbalance *= pressure_imbalance / pressure_imbalance_old; + other_class: + //who is next? + list = clsptr->links.next; + if (list == &active_cpu_classes) + list = list->next; + clsptr = list_entry(list, typeof(*clsptr), links); + if (clsptr != vip_cls) + goto move_class; out: - read_unlock(&class_list_lock); + return pulled; +} + +/** + * ckrm_check_balance - is load balancing necessary? 
+ * return 0 if load balancing is not necessary + * otherwise return the average load of the system + * also, update nr_group + * + * heuristics: + * no load balancing if it's load is over average + * no load balancing if it's load is far more than the min + * task: + * read the status of all the runqueues + */ +static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu, + enum idle_type idle, int* nr_group) +{ + struct sched_group *group = sd->groups; + unsigned long min_load, max_load, avg_load; + unsigned long total_load, this_load, total_pwr; + + max_load = this_load = total_load = total_pwr = 0; + min_load = 0xFFFFFFFF; + *nr_group = 0; + + do { + cpumask_t tmp; + unsigned long load; + int local_group; + int i, nr_cpus = 0; + + /* Tally up the load of all CPUs in the group */ + cpus_and(tmp, group->cpumask, cpu_online_map); + if (unlikely(cpus_empty(tmp))) + goto nextgroup; + + avg_load = 0; + local_group = cpu_isset(this_cpu, group->cpumask); + + for_each_cpu_mask(i, tmp) { + load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),local_group); + nr_cpus++; + avg_load += load; + } + + if (!nr_cpus) + goto nextgroup; + + total_load += avg_load; + total_pwr += group->cpu_power; + + /* Adjust by relative CPU power of the group */ + avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + + if (local_group) { + this_load = avg_load; + goto nextgroup; + } else if (avg_load > max_load) { + max_load = avg_load; + } + if (avg_load < min_load) { + min_load = avg_load; + } +nextgroup: + group = group->next; + *nr_group = *nr_group + 1; + } while (group != sd->groups); + + if (!max_load || this_load >= max_load) + goto out_balanced; + + avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; + + /* hzheng: debugging: 105 is a magic number + * 100*max_load <= sd->imbalance_pct*this_load) + * should use imbalance_pct instead + */ + if (this_load > avg_load + || 100*max_load < 105*this_load + || 100*min_load < 70*this_load + ) + goto out_balanced; + + return avg_load; + out_balanced: return 0; } +/** + * any group that has above average load is considered busy + * find the busiest queue from any of busy group + */ +static runqueue_t * +ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu, + unsigned long avg_load, enum idle_type idle, + int nr_group) +{ + struct sched_group *group; + runqueue_t * busiest=NULL; + unsigned long rand; + + group = sd->groups; + rand = get_ckrm_rand(nr_group); + nr_group = 0; + + do { + unsigned long load,total_load,max_load; + cpumask_t tmp; + int i; + runqueue_t * grp_busiest; + + cpus_and(tmp, group->cpumask, cpu_online_map); + if (unlikely(cpus_empty(tmp))) + goto find_nextgroup; -static inline void idle_balance(int this_cpu, runqueue_t *this_rq) + total_load = 0; + max_load = 0; + grp_busiest = NULL; + for_each_cpu_mask(i, tmp) { + load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),0); + total_load += load; + if (load > max_load) { + max_load = load; + grp_busiest = cpu_rq(i); + } + } + + total_load = (total_load * SCHED_LOAD_SCALE) / group->cpu_power; + if (total_load > avg_load) { + busiest = grp_busiest; + if (nr_group >= rand) + break; + } + find_nextgroup: + group = group->next; + nr_group ++; + } while (group != sd->groups); + + return busiest; +} + +/** + * load_balance - pressure based load balancing algorithm used by ckrm + */ +static int ckrm_load_balance(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd, enum idle_type idle) { + runqueue_t *busiest; + unsigned long avg_load; + int nr_moved,nr_group; + + avg_load = 
ckrm_check_balance(sd, this_cpu, idle, &nr_group); + if (! avg_load) + goto out_balanced; + + busiest = ckrm_find_busy_queue(sd,this_cpu,avg_load,idle,nr_group); + if (! busiest) + goto out_balanced; + /* + * This should be "impossible", but since load + * balancing is inherently racy and statistical, + * it could happen in theory. + */ + if (unlikely(busiest == this_rq)) { + WARN_ON(1); + goto out_balanced; + } + + nr_moved = 0; + if (busiest->nr_running > 1) { + /* + * Attempt to move tasks. If find_busiest_group has found + * an imbalance but busiest->nr_running <= 1, the group is + * still unbalanced. nr_moved simply stays zero, so it is + * correctly treated as an imbalance. + */ + double_lock_balance(this_rq, busiest); + nr_moved = move_tasks(this_rq, this_cpu, busiest, + 0,sd, idle); + spin_unlock(&busiest->lock); + if (nr_moved) { + adjust_local_weight(); + } + } + + if (!nr_moved) + sd->nr_balance_failed ++; + else + sd->nr_balance_failed = 0; + + /* We were unbalanced, so reset the balancing interval */ + sd->balance_interval = sd->min_interval; + + return nr_moved; + +out_balanced: + /* tune up the balancing interval */ + if (sd->balance_interval < sd->max_interval) + sd->balance_interval *= 2; + + return 0; } -#else /* CONFIG_CKRM_CPU_SCHEDULE */ + +/* + * this_rq->lock is already held + */ +static inline int load_balance_newidle(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd) +{ + int ret; + read_lock(&class_list_lock); + ret = ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE); + read_unlock(&class_list_lock); + return ret; +} + +static inline int load_balance(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd, enum idle_type idle) +{ + int ret; + + spin_lock(&this_rq->lock); + read_lock(&class_list_lock); + ret= ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE); + read_unlock(&class_list_lock); + spin_unlock(&this_rq->lock); + return ret; +} +#else /*! CONFIG_CKRM_CPU_SCHEDULE */ /* * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, * as part of a balancing operation within "domain". Returns the number of @@ -2097,6 +2367,8 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, out: return nr_moved; } +#endif /* CONFIG_CKRM_CPU_SCHEDULE*/ + /* * idle_balance is called by schedule() if this_cpu is about to become @@ -2182,7 +2454,6 @@ next_group: group = group->next; } while (group != sd->groups); } -#endif /* CONFIG_CKRM_CPU_SCHEDULE*/ /* * rebalance_tick will get called every timer tick, on every CPU. 
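
The ckrm_load_balance() path above keys its decisions off pid_get_pressure(), which combines the three load terms sampled in ckrm_load_sample() (kernel/ckrm_sched.c earlier in this patch) with the weights PID_KP=20, PID_KI=60, PID_KD=20. The stand-alone sketch below only illustrates that combination; struct pid_load, sample(), pressure() and the load numbers are made-up names and values, not part of the patch. With the integral term carrying 60 of the 100 points, the pressure presumably tracks a smoothed average rather than a momentary spike, which fits a balancer that should not thrash on short load bursts.

#include <stdio.h>

struct pid_load { long p, i, d; };	/* mirrors load_p/load_i/load_d */

static void sample(struct pid_load *l, long load)
{
	l->d = load - l->p;		/* derivative: change since last sample */
	l->p = load;			/* proportional: latest load */
	l->i = (l->i * 9 + load) / 10;	/* integral: exponential average, as in ckrm_load_sample() */
}

static long pressure(const struct pid_load *l)
{
	/* same weighting as pid_get_pressure(): KP=20, KI=60, KD=20 */
	return (l->p * 20 + l->i * 60 + l->d * 20) / 100;
}

int main(void)
{
	struct pid_load l = { 0, 0, 0 };
	long loads[] = { 100, 100, 400, 100 };	/* a brief load spike */
	int k;

	for (k = 0; k < 4; k++) {
		sample(&l, loads[k]);
		printf("load=%ld pressure=%ld\n", loads[k], pressure(&l));
	}
	return 0;
}
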
@@ -2203,8 +2474,6 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, unsigned long j = jiffies + CPU_OFFSET(this_cpu); struct sched_domain *sd; - ckrm_rebalance_tick(j,this_cpu); - /* Update our load */ old_load = this_rq->cpu_load; this_load = this_rq->nr_running * SCHED_LOAD_SCALE; @@ -2243,9 +2512,7 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, */ static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) { - ckrm_rebalance_tick(jiffies,cpu); } - static inline void idle_balance(int cpu, runqueue_t *rq) { } @@ -2267,7 +2534,6 @@ static inline int wake_priority_sleeper(runqueue_t *rq) } DEFINE_PER_CPU(struct kernel_stat, kstat) = { { 0 } }; - EXPORT_PER_CPU_SYMBOL(kstat); /* @@ -2291,7 +2557,7 @@ EXPORT_PER_CPU_SYMBOL(kstat); #define EXPIRED_STARVING(rq) \ (STARVATION_LIMIT && ((rq)->expired_timestamp && \ (jiffies - (rq)->expired_timestamp >= \ - STARVATION_LIMIT * (local_queue_nr_running(rq)) + 1))) + STARVATION_LIMIT * (lrq_nr_running(rq)) + 1))) #endif /* @@ -2323,8 +2589,10 @@ void scheduler_tick(int user_ticks, int sys_ticks) } if (p == rq->idle) { +#ifdef CONFIG_VSERVER_HARDCPU if (!--rq->idle_tokens && !list_empty(&rq->hold_queue)) set_need_resched(); +#endif if (atomic_read(&rq->nr_iowait) > 0) cpustat->iowait += sys_ticks; @@ -2332,6 +2600,7 @@ void scheduler_tick(int user_ticks, int sys_ticks) cpustat->idle += sys_ticks; if (wake_priority_sleeper(rq)) goto out; + ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq)); rebalance_tick(cpu, rq, IDLE); return; } @@ -2373,7 +2642,7 @@ void scheduler_tick(int user_ticks, int sys_ticks) if (vx_need_resched(p)) { #ifdef CONFIG_CKRM_CPU_SCHEDULE /* Hubertus ... we can abstract this out */ - struct ckrm_local_runqueue* rq = get_task_class_queue(p); + ckrm_lrq_t* rq = get_task_lrq(p); #endif dequeue_task(p, rq->active); set_tsk_need_resched(p); @@ -2420,6 +2689,7 @@ void scheduler_tick(int user_ticks, int sys_ticks) out_unlock: spin_unlock(&rq->lock); out: + ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq)); rebalance_tick(cpu, rq, NOT_IDLE); } @@ -2569,6 +2839,19 @@ need_resched: spin_lock_irq(&rq->lock); +#ifdef CONFIG_CKRM_CPU_SCHEDULE + if (prev != rq->idle) { + unsigned long long run = now - prev->timestamp; + ckrm_lrq_t * lrq = get_task_lrq(prev); + + lrq->lrq_load -= task_load(prev); + cpu_demand_event(&prev->demand_stat,CPU_DEMAND_DESCHEDULE,run); + lrq->lrq_load += task_load(prev); + + cpu_demand_event(get_task_lrq_stat(prev),CPU_DEMAND_DESCHEDULE,run); + update_local_cvt(prev, run); + } +#endif /* * if entering off of a kernel preemption go straight * to picking the next task. 
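
The block this hunk adds near the top of schedule() removes the outgoing task's load contribution from its local runqueue, fires a CPU_DEMAND_DESCHEDULE event, and then adds the contribution back. The sketch below spells that pattern out under one assumption the hunk itself does not show, namely that task_load() is derived from the per-task demand statistic, so the contribution changes across the cpu_demand_event() call. Every name here (fake_task, contribution, the constants) is illustrative, not the kernel's.

#include <stdio.h>

struct fake_task {
	unsigned long demand;	/* stands in for p->demand_stat */
	unsigned long weight;
};

static unsigned long contribution(const struct fake_task *t)
{
	return t->demand * t->weight;	/* stand-in for task_load(p) */
}

int main(void)
{
	struct fake_task t = { .demand = 4, .weight = 3 };
	unsigned long lrq_load = 100 + contribution(&t);

	lrq_load -= contribution(&t);	/* drop the stale contribution */
	t.demand = 7;			/* stand-in for cpu_demand_event(..., CPU_DEMAND_DESCHEDULE, run) */
	lrq_load += contribution(&t);	/* re-add with the fresh demand */

	printf("lrq_load is now %lu\n", lrq_load);
	return 0;
}
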
@@ -2617,17 +2900,14 @@ pick_next: #endif if (unlikely(!rq->nr_running)) { idle_balance(cpu, rq); - if (!rq->nr_running) { - next = rq->idle; - rq->expired_timestamp = 0; - wake_sleeping_dependent(cpu, rq); - goto switch_tasks; - } } next = rq_get_next_task(rq); - if (next == rq->idle) + if (next == rq->idle) { + rq->expired_timestamp = 0; + wake_sleeping_dependent(cpu, rq); goto switch_tasks; + } if (dependent_sleeper(cpu, rq, next)) { next = rq->idle; @@ -2669,14 +2949,6 @@ switch_tasks: rq->nr_preempt++; RCU_qsctr(task_cpu(prev))++; -#ifdef CONFIG_CKRM_CPU_SCHEDULE - if (prev != rq->idle) { - unsigned long long run = now - prev->timestamp; - cpu_demand_event(get_task_local_stat(prev),CPU_DEMAND_DESCHEDULE,run); - update_local_cvt(prev, run); - } -#endif - prev->sleep_avg -= run_time; if ((long)prev->sleep_avg <= 0) { prev->sleep_avg = 0; @@ -2719,7 +2991,6 @@ switch_tasks: } EXPORT_SYMBOL(schedule); - #ifdef CONFIG_PREEMPT /* * this is is the entry point to schedule() from in-kernel preemption @@ -3820,7 +4091,6 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (!cpu_isset(dest_cpu, p->cpus_allowed)) goto out; - set_task_cpu(p, dest_cpu); if (p->array) { /* * Sync timestamp with rq_dest's before activating. @@ -3831,10 +4101,12 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) p->timestamp = p->timestamp - rq_src->timestamp_last_tick + rq_dest->timestamp_last_tick; deactivate_task(p, rq_src); + set_task_cpu(p, dest_cpu); activate_task(p, rq_dest, 0); if (TASK_PREEMPTS_CURR(p, rq_dest)) resched_task(rq_dest->curr); - } + } else + set_task_cpu(p, dest_cpu); out: double_rq_unlock(rq_src, rq_dest); @@ -3869,9 +4141,7 @@ static int migration_thread(void * data) } if (rq->active_balance) { -#ifndef CONFIG_CKRM_CPU_SCHEDULE active_load_balance(rq, cpu); -#endif rq->active_balance = 0; } @@ -4346,9 +4616,6 @@ void __init sched_init(void) { runqueue_t *rq; int i; -#ifndef CONFIG_CKRM_CPU_SCHEDULE - int j, k; -#endif #ifdef CONFIG_SMP /* Set up an initial dummy domain for early boot */ @@ -4367,46 +4634,50 @@ void __init sched_init(void) sched_group_init.next = &sched_group_init; sched_group_init.cpu_power = SCHED_LOAD_SCALE; #endif - init_cpu_classes(); for (i = 0; i < NR_CPUS; i++) { #ifndef CONFIG_CKRM_CPU_SCHEDULE + int j, k; prio_array_t *array; -#endif + rq = cpu_rq(i); spin_lock_init(&rq->lock); -#ifndef CONFIG_CKRM_CPU_SCHEDULE + for (j = 0; j < 2; j++) { + array = rq->arrays + j; + for (k = 0; k < MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(MAX_PRIO, array->bitmap); + } + rq->active = rq->arrays; rq->expired = rq->arrays + 1; #else - rq->ckrm_cpu_load = 0; + rq = cpu_rq(i); + spin_lock_init(&rq->lock); #endif + rq->best_expired_prio = MAX_PRIO; #ifdef CONFIG_SMP rq->sd = &sched_domain_init; rq->cpu_load = 0; +#ifdef CONFIG_CKRM_CPU_SCHEDULE + ckrm_load_init(rq_ckrm_load(rq)); +#endif rq->active_balance = 0; rq->push_cpu = 0; rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); #endif +#ifdef CONFIG_VSERVER_HARDCPU INIT_LIST_HEAD(&rq->hold_queue); - atomic_set(&rq->nr_iowait, 0); - -#ifndef CONFIG_CKRM_CPU_SCHEDULE - for (j = 0; j < 2; j++) { - array = rq->arrays + j; - for (k = 0; k < MAX_PRIO; k++) { - INIT_LIST_HEAD(array->queue + k); - __clear_bit(k, array->bitmap); - } - // delimiter for bitsearch - __set_bit(MAX_PRIO, array->bitmap); - } #endif + atomic_set(&rq->nr_iowait, 0); } /* @@ -4418,7 +4689,8 @@ void __init 
sched_init(void) rq->idle = current; set_task_cpu(current, smp_processor_id()); #ifdef CONFIG_CKRM_CPU_SCHEDULE - current->cpu_class = default_cpu_class; + cpu_demand_event(&(current)->demand_stat,CPU_DEMAND_INIT,0); + current->cpu_class = get_default_cpu_class(); current->array = NULL; #endif wake_up_forked_process(current); @@ -4512,10 +4784,30 @@ EXPORT_SYMBOL(task_running_sys); #ifdef CONFIG_CKRM_CPU_SCHEDULE /** * return the classqueue object of a certain processor - * Note: not supposed to be used in performance sensitive functions */ struct classqueue_struct * get_cpu_classqueue(int cpu) { return (& (cpu_rq(cpu)->classqueue) ); } + +/** + * _ckrm_cpu_change_class - change the class of a task + */ +void _ckrm_cpu_change_class(task_t *tsk, struct ckrm_cpu_class *newcls) +{ + prio_array_t *array; + struct runqueue *rq; + unsigned long flags; + + rq = task_rq_lock(tsk,&flags); + array = tsk->array; + if (array) { + dequeue_task(tsk,array); + tsk->cpu_class = newcls; + enqueue_task(tsk,rq_active(tsk,rq)); + } else + tsk->cpu_class = newcls; + + task_rq_unlock(rq,&flags); +} #endif
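
As a closing worked example, the numbers below run the adjust_lrq_weight() formula from earlier in this patch: an lrq's local weight is lrq_load * (class_weight * ncpu) / total_pressure, clamped to at least 1 (and to CKRM_SHARE_MAX above, not reached here), while an idle lrq keeps the full class weight. The class weight of 512 and the per-cpu loads are made up; the point is that the resulting per-cpu weights (768 and 256) split class_weight * ncpu in proportion to where the class's load actually sits.

#include <stdio.h>

int main(void)
{
	const unsigned long class_weight = 512;		/* cpu_class_weight(cls), illustrative */
	const unsigned long lrq_load[2] = { 300, 100 };	/* per-cpu load of this class, illustrative */
	unsigned long total = lrq_load[0] + lrq_load[1];
	int ncpu = 2, i;

	for (i = 0; i < ncpu; i++) {
		unsigned long lw;

		if (!lrq_load[i])
			lw = class_weight;	/* idle lrq keeps the full class weight */
		else {
			lw = lrq_load[i] * class_weight * ncpu / total;
			if (!lw)
				lw = 1;		/* minimum weight is 1 */
		}
		printf("cpu%d local_weight = %lu\n", i, lw);
	}
	return 0;
}
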