From: Marc Fiuczynski Date: Sat, 25 Sep 2004 02:24:09 +0000 (+0000) Subject: ckrm_E16rc1 cpu controller v5 X-Git-Tag: ckrm_E16rc1-cpu-controller-v5~1 X-Git-Url: http://git.onelab.eu/?a=commitdiff_plain;h=47df59a68e5537f0cb5b9962f18e5e698a4cec20;p=linux-2.6.git ckrm_E16rc1 cpu controller v5 --- diff --git a/fs/exec.c b/fs/exec.c index b0a98b43f..b0acd4297 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -47,7 +47,6 @@ #include #include #include -#include #include #include @@ -548,18 +547,6 @@ static int exec_mmap(struct mm_struct *mm) tsk->active_mm = mm; activate_mm(active_mm, mm); task_unlock(tsk); -#ifdef CONFIG_CKRM_RES_MEM - if (old_mm) { - spin_lock(&old_mm->peertask_lock); - list_del(&tsk->mm_peers); - ckrm_mem_evaluate_mm(old_mm); - spin_unlock(&old_mm->peertask_lock); - } - spin_lock(&mm->peertask_lock); - list_add_tail(&tsk->mm_peers, &mm->tasklist); - ckrm_mem_evaluate_mm(mm); - spin_unlock(&mm->peertask_lock); -#endif if (old_mm) { if (active_mm != old_mm) BUG(); mmput(old_mm); diff --git a/include/linux/ckrm_classqueue.h b/include/linux/ckrm_classqueue.h index 1bdf9b775..a825336cb 100644 --- a/include/linux/ckrm_classqueue.h +++ b/include/linux/ckrm_classqueue.h @@ -116,7 +116,7 @@ void classqueue_update_prio(struct classqueue_struct *cq, cq_node_t * node, int cq_node_t *classqueue_get_head(struct classqueue_struct *cq); /*update the base priority of the classqueue*/ -void classqueue_update_base(struct classqueue_struct *cq, int new_base); +void classqueue_update_base(struct classqueue_struct *cq); /** * class_compare_prio: compare the priority of this two nodes diff --git a/include/linux/ckrm_sched.h b/include/linux/ckrm_sched.h index 9d82214fb..b3e180a5e 100644 --- a/include/linux/ckrm_sched.h +++ b/include/linux/ckrm_sched.h @@ -15,30 +15,34 @@ #ifndef _CKRM_SCHED_H #define _CKRM_SCHED_H -#define CC_BUG_ON_DO(cond,action) do { if (cond) action; BUG_ON(cond); } while(0) -#define CC_BUG_ON(cond) BUG_ON(cond) - #include #include #include -//update every second -#define CVT_UPDATE_TICK (1*HZ/1 ?: 1) -#define CLASS_BONUS_RATE 22 // shift from ns to increase class bonus -#define PRIORITY_BONUS_RATE 0 // ?? 
Hubertus
-
 #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
+
 struct prio_array {
- int nr_active;
+ unsigned int nr_active;
 unsigned long bitmap[BITMAP_SIZE];
 struct list_head queue[MAX_PRIO];
 };
-struct ckrm_local_runqueue {
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+#define rq_active(p,rq) (get_task_lrq(p)->active)
+#define rq_expired(p,rq) (get_task_lrq(p)->expired)
+int __init init_ckrm_sched_res(void);
+#else
+#define rq_active(p,rq) (rq->active)
+#define rq_expired(p,rq) (rq->expired)
+static inline void init_ckrm_sched_res(void) {}
+static inline int ckrm_cpu_monitor_init(void) {return 0;}
+#endif
+
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+struct ckrm_runqueue {
 cq_node_t classqueue_linkobj; /*links in classqueue */
 struct ckrm_cpu_class *cpu_class; // class it belongs to
 struct classqueue_struct *classqueue; // classqueue it belongs to
- CVT_t uncounted_cvt;
 unsigned long long uncounted_ns;
 prio_array_t *active, *expired, arrays[2];
@@ -55,19 +59,27 @@ struct ckrm_local_runqueue {
 * updated on enqueue, dequeue
 */
 int top_priority;
- CVT_t local_cvt; // snapshot of local_cvt, update on every loadbalance
+ CVT_t local_cvt;
+
+ unsigned long lrq_load;
+ int local_weight;
+
+
+ /*
+ * unused CPU time accumulated while the class
+ * is inactive goes to savings
+ *
+ * initialized to be 0
+ * a class can't accumulate more than SAVING_THRESHOLD of savings
+ * savings are kept in normalized form (like cvt)
+ * so when task shares change the savings should be scaled accordingly
+ */
+ unsigned long long savings;
+ unsigned long magic; //for debugging
 };
-/**
- * @last_sleep: the last time it sleeps, last_sleep = 0 when not sleeping
- */
-struct ckrm_cpu_class_local_stat {
- unsigned long long run;
- unsigned long long total;
- unsigned long long last_sleep;
- unsigned long cpu_demand; /*estimated cpu demand */
-};
+typedef struct ckrm_runqueue ckrm_lrq_t;
 /**
 * ckrm_cpu_class_stat - cpu usage statistics maintained for each class
@@ -78,22 +90,35 @@ struct ckrm_cpu_class_stat {
 unsigned long long total_ns; /*how much nano-secs it has consumed */
- struct ckrm_cpu_class_local_stat local_stats[NR_CPUS];
- unsigned long cpu_demand;
+ struct ckrm_cpu_demand_stat local_stats[NR_CPUS];
+
+ /*
+ *
+ */
+ unsigned long max_demand; /* the maximum a class can consume */
+ int egrt,megrt; /*effective guarantee*/
+ int ehl,mehl; /*effective hard limit, my effective hard limit*/
- /*temp stat used by cpu monitor */
- int effective_guarantee;
- int effective_limit;
- int glut; //true or false
 /*
- * effective_share: for both default class and its children
- * self_effective_share: just for the default class
+ * eshare: for both default class and its children
+ * meshare: just for the default class
 */
- int effective_share;
- int self_effective_share;
+ int eshare;
+ int meshare;
 };
-typedef struct ckrm_cpu_class_stat ckrm_stat_t;
+#define CKRM_CPU_CLASS_MAGIC 0x7af2abe3
+
+#define USAGE_SAMPLE_FREQ HZ //sample every 1 second
+#define NS_PER_SAMPLE (USAGE_SAMPLE_FREQ*(NSEC_PER_SEC/HZ))
+#define USAGE_WINDOW_SIZE 60 //keep the last 60 samples
+
+struct ckrm_usage {
+ unsigned long samples[USAGE_WINDOW_SIZE]; //record usages
+ unsigned long sample_pointer; //pointer for the sliding window
+ unsigned long long last_ns; //ns for last sample
+ unsigned long long last_sample_jiffies; //in number of jiffies
+};
 /*
 * manages the class status
 */
@@ -104,72 +129,189 @@ struct ckrm_cpu_class {
 struct ckrm_core_class *parent;
 struct ckrm_shares shares;
 spinlock_t cnt_lock; // always grab parent's lock first and
then child's - CVT_t global_cvt; // total cummulative virtual time struct ckrm_cpu_class_stat stat; struct list_head links; // for linking up in cpu classes - struct ckrm_local_runqueue local_queues[NR_CPUS]; // runqueues + ckrm_lrq_t local_queues[NR_CPUS]; // runqueues + struct ckrm_usage usage; + unsigned long magic; //for debugging }; -#if CONFIG_CKRM_CPU_SCHEDULE -#define rq_active(p,rq) (get_task_class_queue(p)->active) -#define rq_expired(p,rq) (get_task_class_queue(p)->expired) -#else -#define rq_active(p,rq) (rq->active) -#define rq_expired(p,rq) (rq->expired) -#endif - -//#define cpu_class_weight(cls) (cls->shares.my_guarantee) -#define cpu_class_weight(cls) (cls->stat.self_effective_share) +#define cpu_class_weight(cls) (cls->stat.meshare) +#define local_class_weight(lrq) (lrq->local_weight) -#define bpt_queue(cpu) (& (cpu_rq(cpu)->classqueue) ) -CVT_t get_min_cvt(int cpu); +static inline int valid_cpu_class(struct ckrm_cpu_class * cls) +{ + return (cls && cls->magic == CKRM_CPU_CLASS_MAGIC); +} struct classqueue_struct *get_cpu_classqueue(int cpu); +struct ckrm_cpu_class * get_default_cpu_class(void); + + +static inline void ckrm_usage_init(struct ckrm_usage* usage) +{ + int i; + + for (i=0; i < USAGE_WINDOW_SIZE; i++) + usage->samples[i] = 0; + usage->sample_pointer = 0; + usage->last_ns = 0; + usage->last_sample_jiffies = 0; +} + +/* + * this function can be called at any frequency + * it's self-contained + */ +static inline void ckrm_sample_usage(struct ckrm_cpu_class* clsptr) +{ + struct ckrm_usage* usage = &clsptr->usage; + unsigned long long cur_sample; + int duration = jiffies - usage->last_sample_jiffies; + +// printk("\tckrm_sample_usage %ld %p: %lld\n",jiffies, clsptr,cur_sample); + + if (duration < USAGE_SAMPLE_FREQ) + return; + + cur_sample = clsptr->stat.total_ns - usage->last_ns; + //scale it based on the sample duration + cur_sample *= ((duration << 10)/USAGE_SAMPLE_FREQ); + cur_sample >>= 10; + + usage->samples[usage->sample_pointer++] = cur_sample; + usage->last_sample_jiffies = jiffies; + usage->last_ns = clsptr->stat.total_ns; + if (usage->sample_pointer >= USAGE_WINDOW_SIZE) + usage->sample_pointer = 0; +} -extern struct ckrm_cpu_class default_cpu_class_obj; -#define default_cpu_class (&default_cpu_class_obj) +//duration is specified in number of jiffies +//return the usage in percentage +static inline int get_ckrm_usage(struct ckrm_cpu_class* clsptr, int duration) +{ + int nr_samples = duration/USAGE_SAMPLE_FREQ?:1; + struct ckrm_usage* usage = &clsptr->usage; + unsigned long long total = 0; + int i, idx; + + if (nr_samples > USAGE_WINDOW_SIZE) + nr_samples = USAGE_WINDOW_SIZE; + + idx = usage->sample_pointer; + for (i = 0; i< nr_samples; i++) { + if (! 
idx) + idx = USAGE_WINDOW_SIZE; + idx --; + total += usage->samples[idx]; + } + total *= 100; + do_div(total,nr_samples); + do_div(total,NS_PER_SAMPLE); + // printk("percent %lld\n",total); + return total; +} -#define local_queue_nr_running(local_queue) \ - (local_queue->active->nr_active + local_queue->expired->nr_active) -static inline struct ckrm_local_runqueue * -get_ckrm_local_runqueue(struct ckrm_cpu_class*cls, int cpu) +#define lrq_nr_running(lrq) \ + (lrq->active->nr_active + lrq->expired->nr_active) + +static inline ckrm_lrq_t * +get_ckrm_lrq(struct ckrm_cpu_class*cls, int cpu) { return &(cls->local_queues[cpu]); } -static inline struct ckrm_local_runqueue *get_task_class_queue(struct task_struct *p) +static inline ckrm_lrq_t *get_task_lrq(struct task_struct *p) { return &(p->cpu_class->local_queues[task_cpu(p)]); } #define task_list_entry(list) list_entry(list,struct task_struct,run_list) -#define class_list_entry(list) list_entry(list,struct ckrm_local_runqueue,classqueue_linkobj) +#define class_list_entry(list) list_entry(list,struct ckrm_runqueue,classqueue_linkobj) /* some additional interfaces exported from sched.c */ struct runqueue; -void dequeue_task(struct task_struct *p, prio_array_t * array); -void enqueue_task(struct task_struct *p, prio_array_t * array); -struct runqueue *task_rq_lock(task_t * p, unsigned long *flags); -void task_rq_unlock(struct runqueue *rq, unsigned long *flags); -extern spinlock_t cvt_lock; extern rwlock_t class_list_lock; extern struct list_head active_cpu_classes; +unsigned int task_timeslice(task_t *p); +void _ckrm_cpu_change_class(task_t *task, struct ckrm_cpu_class *newcls); -/*functions exported by ckrm_cpu_class.c*/ -int __init init_ckrm_sched_res(void); void init_cpu_classes(void); +void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares); +void ckrm_cpu_change_class(void *task, void *old, void *new); + -/*functions exported by ckrm_cpu_monitor.c*/ -void ckrm_cpu_monitor(void); -void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat); #define CPU_DEMAND_ENQUEUE 0 #define CPU_DEMAND_DEQUEUE 1 #define CPU_DEMAND_DESCHEDULE 2 -void cpu_demand_event(struct ckrm_cpu_class_local_stat* local_stat, int event, unsigned long long len); +#define CPU_DEMAND_INIT 3 + +/*functions exported by ckrm_cpu_monitor.c*/ +void ckrm_cpu_monitor(void); +int ckrm_cpu_monitor_init(void); +void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat); +void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len); +void adjust_local_weight(void); + +#define get_task_lrq_stat(p) (&(p)->cpu_class->stat.local_stats[task_cpu(p)]) +#define get_cls_local_stat(cls,cpu) (&(cls)->stat.local_stats[cpu]) +#define get_rq_local_stat(lrq,cpu) (get_cls_local_stat((lrq)->cpu_class,cpu)) + +#define CLASS_QUANTIZER 22 //shift from ns to increase class bonus +#define PRIORITY_QUANTIZER 0 //controls how much a high prio task can borrow +#define CKRM_SHARE_ACCURACY 10 +#define NSEC_PER_MS 1000000 +#define NSEC_PER_JIFFIES (NSEC_PER_SEC/HZ) + + +#define MAX_SAVINGS_ABSOLUTE (10LLU*NSEC_PER_SEC) // 10 seconds + +#define CVT_UPDATE_TICK ((HZ/2)?:1) + +// ABSOLUTE_CKRM_TUNING determines whether classes can make up +// lost time in absolute time or in relative values + +#define ABSOLUTE_CKRM_TUNING // preferred due to more predictable behavior + +#ifdef ABSOLUTE_CKRM_TUNING + +#define MAX_SAVINGS MAX_SAVINGS_ABSOLUTE +//an absolute bonus of 200ms for classes when reactivated +#define INTERACTIVE_BONUS(lrq) 
((200*NSEC_PER_MS)/local_class_weight(lrq)) +#define SAVINGS_LEAK_SPEED (CVT_UPDATE_TICK/10*NSEC_PER_JIFFIES) + +#define scale_cvt(val,lrq) ((val)*local_class_weight(lrq)) +#define unscale_cvt(val,lrq) (do_div(val,local_class_weight(lrq))) + +#else + +#define MAX_SAVINGS (MAX_SAVINGS_ABSOLUTE >> CKRM_SHARE_ACCURACY) +/* + * to improve system responsiveness + * an inactive class is put a little bit ahead of the current class when it wakes up + * the amount is set in normalized termis to simplify the calculation + * for class with 100% share, it can be 2s ahead + * while for class with 10% share, it can be 200ms ahead + */ +#define INTERACTIVE_BONUS(lrq) (2*NSEC_PER_MS) + +/* + * normalized savings can't be more than MAX_NORMALIZED_SAVINGS + * based on the current configuration + * this means that a class with share 100% will accumulate 10s at most + * while a class with 1% of the share can only accumulate 100ms + */ + +//a class with share 100% can get 100ms every 500ms +//while a class with share 10% can only get 10ms every 500ms +#define SAVINGS_LEAK_SPEED ((CVT_UPDATE_TICK/5*NSEC_PER_JIFFIES) >> CKRM_SHARE_ACCURACY) + +#define scale_cvt(val,lrq) (val) +#define unscale_cvt(val,lrq) (val) + +#endif -#define get_task_local_stat(p) (&(p)->cpu_class->stat.local_stats[task_cpu(p)]) -#define get_rq_local_stat(lrq,cpu) (&(lrq)->cpu_class->stat.local_stats[cpu]) /** * get_effective_prio: return the effective priority of a class local queue @@ -181,18 +323,18 @@ void cpu_demand_event(struct ckrm_cpu_class_local_stat* local_stat, int event, u * currently, prio increases by 1 if either: top_priority increase by one * or, local_cvt increases by 4ms */ -static inline int get_effective_prio(struct ckrm_local_runqueue * lcq) +static inline int get_effective_prio(ckrm_lrq_t * lrq) { int prio; - // cumulative usage - prio = lcq->local_cvt >> CLASS_BONUS_RATE; - // queue urgency - prio += lcq->top_priority >> PRIORITY_BONUS_RATE; + prio = lrq->local_cvt >> CLASS_QUANTIZER; // cumulative usage + prio += lrq->top_priority >> PRIORITY_QUANTIZER; // queue urgency return prio; } +CVT_t get_local_cur_cvt(int cpu); + /** * update_class_priority: * @@ -206,9 +348,8 @@ static inline int get_effective_prio(struct ckrm_local_runqueue * lcq) * -- rq_get_next_task (queue switch) * -- update_local_cvt * -- schedule - * -- update_global_cvt */ -static inline void update_class_priority(struct ckrm_local_runqueue *local_rq) +static inline void update_class_priority(ckrm_lrq_t *local_rq) { int effective_prio = get_effective_prio(local_rq); classqueue_update_prio(local_rq->classqueue, @@ -220,42 +361,80 @@ static inline void update_class_priority(struct ckrm_local_runqueue *local_rq) * set the new top priority and reposition the queue * called when: task enqueue/dequeue and queue switch */ -static inline void set_top_priority(struct ckrm_local_runqueue *class_queue, +static inline void set_top_priority(ckrm_lrq_t *lrq, int new_priority) { - class_queue->top_priority = new_priority; - update_class_priority(class_queue); + lrq->top_priority = new_priority; + update_class_priority(lrq); +} + +/* + * task_load: how much load this task counts + */ +static inline unsigned long task_load(struct task_struct* p) +{ + return (task_timeslice(p) * p->demand_stat.cpu_demand); +} + +/* + * runqueue load is the local_weight of all the classes on this cpu + * must be called with class_list_lock held + */ +static inline unsigned long ckrm_cpu_load(int cpu) +{ + struct ckrm_cpu_class *clsptr; + ckrm_lrq_t* lrq; + struct ckrm_cpu_demand_stat* 
l_stat; + int total_load = 0; + int load; + + list_for_each_entry(clsptr,&active_cpu_classes,links) { + lrq = get_ckrm_lrq(clsptr,cpu); + l_stat = get_cls_local_stat(clsptr,cpu); + load = lrq->local_weight; + if (l_stat->cpu_demand < load) + load = l_stat->cpu_demand; + total_load += load; + } + return total_load; } static inline void class_enqueue_task(struct task_struct *p, prio_array_t * array) { - struct ckrm_local_runqueue *queue; + ckrm_lrq_t *lrq; int effective_prio; - queue = get_task_class_queue(p); + lrq = get_task_lrq(p); + + cpu_demand_event(&p->demand_stat,CPU_DEMAND_ENQUEUE,0); + lrq->lrq_load += task_load(p); - if (! cls_in_classqueue(&queue->classqueue_linkobj)) { - cpu_demand_event(get_task_local_stat(p),CPU_DEMAND_ENQUEUE,0); - /*make sure the cvt of this class is up to date*/ - queue->local_cvt = get_min_cvt(task_cpu(p)); - effective_prio = get_effective_prio(queue); - classqueue_enqueue(queue->classqueue, &queue->classqueue_linkobj, effective_prio); + if ((p->prio < lrq->top_priority) && (array == lrq->active)) + set_top_priority(lrq, p->prio); + + if (! cls_in_classqueue(&lrq->classqueue_linkobj)) { + cpu_demand_event(get_task_lrq_stat(p),CPU_DEMAND_ENQUEUE,0); + effective_prio = get_effective_prio(lrq); + classqueue_enqueue(lrq->classqueue, &lrq->classqueue_linkobj, effective_prio); } - - if ((p->prio < queue->top_priority) && (array == queue->active)) - set_top_priority(queue, p->prio); } static inline void class_dequeue_task(struct task_struct *p, prio_array_t * array) { - struct ckrm_local_runqueue *queue = get_task_class_queue(p); + ckrm_lrq_t *lrq = get_task_lrq(p); + unsigned long load = task_load(p); + + BUG_ON(lrq->lrq_load < load); + lrq->lrq_load -= load; + + cpu_demand_event(&p->demand_stat,CPU_DEMAND_DEQUEUE,0); - if ((array == queue->active) && (p->prio == queue->top_priority) + if ((array == lrq->active) && (p->prio == lrq->top_priority) && list_empty(&(array->queue[p->prio]))) - set_top_priority(queue, + set_top_priority(lrq, find_next_bit(array->bitmap, MAX_PRIO, p->prio)); } @@ -266,32 +445,81 @@ static inline void class_dequeue_task(struct task_struct *p, */ static inline void update_local_cvt(struct task_struct *p, unsigned long nsec) { - struct ckrm_local_runqueue *class_queue = get_task_class_queue(p); - struct ckrm_cpu_class *cls = class_queue->cpu_class; + ckrm_lrq_t * lrq = get_task_lrq(p); - unsigned long cvt_inc = nsec / cpu_class_weight(cls); + unsigned long cvt_inc = nsec / local_class_weight(lrq); - class_queue->local_cvt += cvt_inc; - class_queue->uncounted_cvt += cvt_inc; + lrq->local_cvt += cvt_inc; + lrq->uncounted_ns += nsec; - class_queue->uncounted_ns += nsec; - update_class_priority(class_queue); + update_class_priority(lrq); +} + +static inline int class_preempts_curr(struct task_struct * p, struct task_struct* curr) +{ + struct cq_node_struct* node1 = &(get_task_lrq(p)->classqueue_linkobj); + struct cq_node_struct* node2 = &(get_task_lrq(curr)->classqueue_linkobj); + + return (class_compare_prio(node1,node2) < 0); } /* - * called during loadbalancing - * to charge the class with locally accumulated cvt + * return a random value with range [0, (val-1)] */ -void update_global_cvts(int this_cpu); +static inline int get_ckrm_rand(unsigned long val) +{ + int rand; + static int last_rand[NR_CPUS]; + int cpu = smp_processor_id(); + + rand = last_rand[cpu]; + rand ++; + if (rand >= val) + rand = 0; + + last_rand[cpu] = rand; + return rand; +} -/** - * - */ -static inline int class_preempts_curr(struct task_struct * p, struct task_struct* 
curr) +void update_class_cputime(int this_cpu); + +/**********************************************/ +/* PID_LOAD_BALANCING */ +/**********************************************/ +struct ckrm_load_struct { + unsigned long load_p; /*propotional*/ + unsigned long load_i; /*integral */ + long load_d; /*derivative */ +}; + +typedef struct ckrm_load_struct ckrm_load_t; + +static inline void ckrm_load_init(ckrm_load_t* ckrm_load) { + ckrm_load->load_p = 0; + ckrm_load->load_i = 0; + ckrm_load->load_d = 0; +} + +void ckrm_load_sample(ckrm_load_t* ckrm_load,int cpu); +long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group); +#define rq_ckrm_load(rq) (&((rq)->ckrm_load)) + +static inline void ckrm_sched_tick(int j,int this_cpu,struct ckrm_load_struct* ckrm_load) { - struct cq_node_struct* node1 = &(get_task_class_queue(p)->classqueue_linkobj); - struct cq_node_struct* node2 = &(get_task_class_queue(curr)->classqueue_linkobj); + read_lock(&class_list_lock); - return (class_compare_prio(node1,node2) < 0); +#ifdef CONFIG_SMP + ckrm_load_sample(ckrm_load,this_cpu); +#endif + + if (!(j % CVT_UPDATE_TICK)) { + classqueue_update_base(get_cpu_classqueue(this_cpu)); + update_class_cputime(this_cpu); + } + + read_unlock(&class_list_lock); } + +#endif /*CONFIG_CKRM_CPU_SCHEDULE */ + #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 0e7989075..5c584cced 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -229,9 +229,6 @@ struct page { void *virtual; /* Kernel virtual address (NULL if not kmapped, ie. highmem) */ #endif /* WANT_PAGE_VIRTUAL */ -#ifdef CONFIG_CKRM_RES_MEM - void *memclass; -#endif // CONFIG_CKRM_RES_MEM }; /* diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 5edb739b4..47762ca69 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -1,11 +1,9 @@ -#include static inline void add_page_to_active_list(struct zone *zone, struct page *page) { list_add(&page->lru, &zone->active_list); zone->nr_active++; - ckrm_mem_inc_active(page); } static inline void @@ -13,7 +11,6 @@ add_page_to_inactive_list(struct zone *zone, struct page *page) { list_add(&page->lru, &zone->inactive_list); zone->nr_inactive++; - ckrm_mem_inc_inactive(page); } static inline void @@ -21,7 +18,6 @@ del_page_from_active_list(struct zone *zone, struct page *page) { list_del(&page->lru); zone->nr_active--; - ckrm_mem_dec_active(page); } static inline void @@ -29,7 +25,6 @@ del_page_from_inactive_list(struct zone *zone, struct page *page) { list_del(&page->lru); zone->nr_inactive--; - ckrm_mem_dec_inactive(page); } static inline void @@ -39,9 +34,7 @@ del_page_from_lru(struct zone *zone, struct page *page) if (PageActive(page)) { ClearPageActive(page); zone->nr_active--; - ckrm_mem_dec_active(page); } else { zone->nr_inactive--; - ckrm_mem_dec_inactive(page); } } diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index c70f46a4e..c6f5063f0 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -77,7 +77,6 @@ #define PG_compound 19 /* Part of a compound page */ #define PG_anon 20 /* Anonymous: anon_vma in mapping */ -#define PG_ckrm_account 21 /* This page is accounted by CKRM */ /* diff --git a/include/linux/sched.h b/include/linux/sched.h index f975c7693..c1bd9eaf6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -230,11 +230,6 @@ struct mm_struct { struct kioctx *ioctx_list; struct kioctx default_kioctx; -#ifdef CONFIG_CKRM_RES_MEM - struct ckrm_mem_res *memclass; - struct list_head tasklist; /* list of all 
tasks sharing this address space */ - spinlock_t peertask_lock; /* protect above tasklist */ -#endif }; extern int mmlist_nr; @@ -393,6 +388,24 @@ int set_current_groups(struct group_info *group_info); struct audit_context; /* See audit.c */ struct mempolicy; +#ifdef CONFIG_CKRM_CPU_SCHEDULE +/** + * ckrm_cpu_demand_stat - used to track the cpu demand of a task/class + * @run: how much time it has been running since the counter started + * @total: total time since the counter started + * @last_sleep: the last time it sleeps, last_sleep = 0 when not sleeping + * @recalc_interval: how often do we recalculate the cpu_demand + * @cpu_demand: moving average of run/total + */ +struct ckrm_cpu_demand_stat { + unsigned long long run; + unsigned long long total; + unsigned long long last_sleep; + unsigned long long recalc_interval; + unsigned long cpu_demand; /*estimated cpu demand */ +}; +#endif + struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ struct thread_info *thread_info; @@ -492,7 +505,6 @@ struct task_struct { /* signal handlers */ struct signal_struct *signal; struct sighand_struct *sighand; - sigset_t blocked, real_blocked; struct sigpending pending; @@ -541,11 +553,14 @@ struct task_struct { // .. Hubertus should change to CONFIG_CKRM_TYPE_TASKCLASS struct ckrm_task_class *taskclass; struct list_head taskclass_link; +#ifdef CONFIG_CKRM_CPU_SCHEDULE + struct ckrm_cpu_class *cpu_class; + //track cpu demand of this task + struct ckrm_cpu_demand_stat demand_stat; +#endif //CONFIG_CKRM_CPU_SCHEDULE #endif // CONFIG_CKRM_TYPE_TASKCLASS -#ifdef CONFIG_CKRM_RES_MEM - struct list_head mm_peers; // list of tasks using same mm_struct -#endif // CONFIG_CKRM_RES_MEM #endif // CONFIG_CKRM + struct task_delay_info delays; }; @@ -866,6 +881,7 @@ static inline int capable(int cap) } #endif + /* * Routines for handling mm_structs */ @@ -1000,7 +1016,6 @@ static inline struct mm_struct * get_task_mm(struct task_struct * task) return mm; } - /* set thread flags in other task's structures * - see asm/thread_info.h for TIF_xxxx flags available */ diff --git a/init/Kconfig b/init/Kconfig index 4fdce31f9..e5480f047 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -172,25 +172,14 @@ config CKRM_RES_NUMTASKS Say N if unsure, Y to use the feature. -config CKRM_RES_MEM - bool "Class based physical memory controller" +config CKRM_CPU_SCHEDULE + bool "CKRM CPU scheduler" + depends on CKRM_TYPE_TASKCLASS default y - depends on CKRM help - Provide the basic support for collecting physical memory usage information - among classes. Say Y if you want to know the memory usage of each class. - -config CKRM_MEM_LRUORDER_CHANGE - bool "Change the LRU ordering of scanned pages" - default n - depends on CKRM_RES_MEM - help - While trying to free pages, by default(n), scanned pages are left were they - are found if they belong to relatively under-used class. In this case the - LRU ordering of the memory subsystemis left intact. If this option is chosen, - then the scanned pages are moved to the tail of the list(active or inactive). - Changing this to yes reduces the checking overhead but violates the approximate - LRU order that is maintained by the paging subsystem. + Use CKRM CPU scheduler instead of Linux Scheduler + + Say N if unsure, Y to use the feature. 
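As an illustration of the run/total bookkeeping that the ckrm_cpu_demand_stat kerneldoc above describes (cpu_demand kept as a moving average of run over total), here is a minimal stand-alone C sketch; the struct, constants, and fixed-point shift below are simplified stand-ins chosen for the example, not the kernel's own definitions:

/* Illustrative user-space sketch only; simplified stand-in for the
 * patch's update_cpu_demand_stat(). Names and constants here are
 * assumptions for demonstration. */
#include <stdio.h>

#define DEMAND_SHIFT 10                    /* assumed fixed-point accuracy */
#define RECALC_INTERVAL_NS 1000000000ULL   /* recalculate roughly once per accounted second */

struct demand_stat {
	unsigned long long run;    /* ns spent running since the last recalculation */
	unsigned long long total;  /* ns of wall time since the last recalculation */
	unsigned long cpu_demand;  /* moving average of run/total, scaled by 2^DEMAND_SHIFT */
};

/* Fold a run or sleep interval into the moving average once enough
 * wall time has accumulated. */
static void demand_account(struct demand_stat *s, int running, unsigned long long len_ns)
{
	s->total += len_ns;
	if (running)
		s->run += len_ns;

	if (s->total >= RECALC_INTERVAL_NS) {
		unsigned long sample =
			(unsigned long)((s->run << DEMAND_SHIFT) / s->total);

		/* new average = (old average + new sample) / 2 */
		s->cpu_demand = (s->cpu_demand + sample) >> 1;
		s->run = 0;
		s->total = 0;
	}
}

int main(void)
{
	struct demand_stat s = { 0, 0, 0 };

	demand_account(&s, 1, 600000000ULL);  /* ran for 600 ms */
	demand_account(&s, 0, 400000000ULL);  /* then slept for 400 ms */
	printf("cpu_demand = %lu out of %d\n", s.cpu_demand, 1 << DEMAND_SHIFT);
	return 0;
}

The kernel code added further down (update_cpu_demand_stat in ckrm_cpu_monitor.c) additionally clamps the counters and uses do_div(), but the averaging idea is the same.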
config CKRM_TYPE_SOCKETCLASS bool "Class Manager for socket groups" diff --git a/init/main.c b/init/main.c index 44a43d447..7a93e4edf 100644 --- a/init/main.c +++ b/init/main.c @@ -50,6 +50,7 @@ #include #include +#include /* * This is one of the first .c files built. Error out early @@ -680,6 +681,7 @@ static int init(void * unused) do_basic_setup(); + init_ckrm_sched_res(); /* * check if there is an early userspace init. If yes, let it do all * the work diff --git a/kernel/Makefile b/kernel/Makefile index 97364d362..2038a7247 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -21,6 +21,7 @@ obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_IKCONFIG_PROC) += configs.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o +obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_classqueue.o ckrm_sched.o obj-$(CONFIG_AUDIT) += audit.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o diff --git a/kernel/ckrm/Makefile b/kernel/ckrm/Makefile index da0055430..de490232b 100644 --- a/kernel/ckrm/Makefile +++ b/kernel/ckrm/Makefile @@ -9,4 +9,4 @@ endif obj-$(CONFIG_CKRM_RES_NUMTASKS) += ckrm_numtasks.o obj-$(CONFIG_CKRM_TYPE_SOCKETCLASS) += ckrm_sockc.o obj-$(CONFIG_CKRM_RES_LISTENAQ) += ckrm_laq.o - obj-$(CONFIG_CKRM_RES_MEM) += ckrm_mem.o + obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_cpu_class.o ckrm_cpu_monitor.o diff --git a/kernel/ckrm/ckrm_cpu_class.c b/kernel/ckrm/ckrm_cpu_class.c index 0ded7f3c6..2624a4797 100644 --- a/kernel/ckrm/ckrm_cpu_class.c +++ b/kernel/ckrm/ckrm_cpu_class.c @@ -23,17 +23,32 @@ #include #include - struct ckrm_res_ctlr cpu_rcbs; +/** + * insert_cpu_class - insert a class to active_cpu_class list + * + * insert the class in decreasing order of class weight + */ +static inline void insert_cpu_class(struct ckrm_cpu_class *cls) +{ + list_add(&cls->links,&active_cpu_classes); +} + /* * initialize a class object and its local queues */ - static void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares) +void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares) { int i,j,k; prio_array_t *array; - struct ckrm_local_runqueue* queue; + ckrm_lrq_t* queue; + + cls->shares = *shares; + cls->cnt_lock = SPIN_LOCK_UNLOCKED; + ckrm_cpu_stat_init(&cls->stat); + ckrm_usage_init(&cls->usage); + cls->magic = CKRM_CPU_CLASS_MAGIC; for (i = 0 ; i < NR_CPUS ; i++) { queue = &cls->local_queues[i]; @@ -58,34 +73,37 @@ struct ckrm_res_ctlr cpu_rcbs; queue->top_priority = MAX_PRIO; cq_node_init(&queue->classqueue_linkobj); queue->local_cvt = 0; - queue->uncounted_cvt = 0; + queue->lrq_load = 0; + queue->local_weight = cpu_class_weight(cls); queue->uncounted_ns = 0; + queue->savings = 0; queue->magic = 0x43FF43D7; } - cls->shares = *shares; - cls->global_cvt = 0; - cls->cnt_lock = SPIN_LOCK_UNLOCKED; - ckrm_cpu_stat_init(&cls->stat); - // add to class list write_lock(&class_list_lock); - list_add(&cls->links,&active_cpu_classes); + insert_cpu_class(cls); write_unlock(&class_list_lock); } static inline void set_default_share(ckrm_shares_t *shares) { shares->my_guarantee = 0; - shares->my_limit = CKRM_SHARE_DFLT_MAX_LIMIT; shares->total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - shares->max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; shares->unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - shares->cur_max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; + shares->my_limit = CKRM_SHARE_DFLT_MAX_LIMIT; + shares->max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; + shares->cur_max_limit = 0; } -struct ckrm_cpu_class * ckrm_get_cpu_class(struct ckrm_core_class *core) { - return ckrm_get_res_class(core, 
cpu_rcbs.resid, struct ckrm_cpu_class); +struct ckrm_cpu_class * ckrm_get_cpu_class(struct ckrm_core_class *core) +{ + struct ckrm_cpu_class * cls; + cls = ckrm_get_res_class(core, cpu_rcbs.resid, struct ckrm_cpu_class); + if (valid_cpu_class(cls)) + return cls; + else + return NULL; } @@ -94,7 +112,7 @@ void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class struct ckrm_cpu_class *cls; if (! parent) /*root class*/ - cls = default_cpu_class; + cls = get_default_cpu_class(); else cls = (struct ckrm_cpu_class *) kmalloc(sizeof(struct ckrm_cpu_class),GFP_ATOMIC); @@ -113,7 +131,7 @@ void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class cls->parent = parent; } } else - printk("alloc_cpu_class failed GFP_ATOMIC\n"); + printk(KERN_ERR"alloc_cpu_class failed\n"); return cls; } @@ -132,7 +150,7 @@ static void ckrm_free_cpu_class(void *my_res) return; /*the default class can't be freed*/ - if (cls == default_cpu_class) + if (cls == get_default_cpu_class()) return; // Assuming there will be no children when this function is called @@ -187,7 +205,16 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share) parres = NULL; } + /* + * hzheng: CKRM_SHARE_DONTCARE should be handled + */ + if (new_share->my_guarantee == CKRM_SHARE_DONTCARE) + new_share->my_guarantee = 0; + rc = set_shares(new_share, cur, par); + if (cur->my_limit == CKRM_SHARE_DONTCARE) + cur->my_limit = cur->max_limit; + spin_unlock(&cls->cnt_lock); if (cls->parent) { @@ -196,9 +223,6 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share) return rc; } -/* - * translate the global_CVT to ticks - */ static int ckrm_cpu_get_share(void *my_res, struct ckrm_shares *shares) { @@ -213,35 +237,42 @@ static int ckrm_cpu_get_share(void *my_res, int ckrm_cpu_get_stats(void *my_res, struct seq_file * sfile) { struct ckrm_cpu_class *cls = my_res; + struct ckrm_cpu_class_stat* stat = &cls->stat; + ckrm_lrq_t* lrq; + int i; if (!cls) return -EINVAL; seq_printf(sfile, "-------- CPU Class Status Start---------\n"); - seq_printf(sfile, " gua= %d limit= %d\n", + seq_printf(sfile, "Share:\n\tgrt= %d limit= %d total_grt= %d max_limit= %d\n", cls->shares.my_guarantee, - cls->shares.my_limit); - seq_printf(sfile, " total_gua= %d limit= %d\n", + cls->shares.my_limit, cls->shares.total_guarantee, cls->shares.max_limit); - seq_printf(sfile, " used_gua= %d cur_limit= %d\n", + seq_printf(sfile, "\tunused_grt= %d cur_max_limit= %d\n", cls->shares.unused_guarantee, cls->shares.cur_max_limit); - seq_printf(sfile, " Share= %d\n",cpu_class_weight(cls)); - seq_printf(sfile, " cvt= %llu\n",cls->local_queues[0].local_cvt); - seq_printf(sfile, " total_ns= %llu\n",cls->stat.total_ns); - seq_printf(sfile, " prio= %d\n",cls->local_queues[0].classqueue_linkobj.prio); - seq_printf(sfile, " index= %d\n",cls->local_queues[0].classqueue_linkobj.index); - seq_printf(sfile, " run= %llu\n",cls->stat.local_stats[0].run); - seq_printf(sfile, " total= %llu\n",cls->stat.local_stats[0].total); - seq_printf(sfile, " cpu_demand= %lu\n",cls->stat.cpu_demand); - - seq_printf(sfile, " effective_guarantee= %d\n",cls->stat.effective_guarantee); - seq_printf(sfile, " effective_limit= %d\n",cls->stat.effective_limit); - seq_printf(sfile, " effective_share= %d\n",cls->stat.effective_share); - seq_printf(sfile, "-------- CPU Class Status END ---------\n"); + seq_printf(sfile, "Effective:\n\tegrt= %d\n",stat->egrt); + seq_printf(sfile, "\tmegrt= %d\n",stat->megrt); + seq_printf(sfile, "\tehl= %d\n",stat->ehl); + 
seq_printf(sfile, "\tmehl= %d\n",stat->mehl); + seq_printf(sfile, "\teshare= %d\n",stat->eshare); + seq_printf(sfile, "\tmeshare= %d\n",cpu_class_weight(cls)); + seq_printf(sfile, "\tmax_demand= %lu\n",stat->max_demand); + seq_printf(sfile, "\ttotal_ns= %llu\n",stat->total_ns); + seq_printf(sfile, "\tusage(2,10,60)= %d %d %d\n", + get_ckrm_usage(cls,2*HZ), + get_ckrm_usage(cls,10*HZ), + get_ckrm_usage(cls,60*HZ) + ); + for_each_online_cpu(i) { + lrq = get_ckrm_lrq(cls,i); + seq_printf(sfile, "\tlrq %d demand= %lu weight= %d lrq_load= %lu cvt= %llu\n",i,stat->local_stats[i].cpu_demand,local_class_weight(lrq),lrq->lrq_load,lrq->local_cvt); + } + seq_printf(sfile, "-------- CPU Class Status END ---------\n"); return 0; } @@ -249,28 +280,16 @@ int ckrm_cpu_get_stats(void *my_res, struct seq_file * sfile) /* * task will remain in the same cpu but on a different local runqueue */ -static void ckrm_cpu_change_class(void *task, void *old, void *new) +void ckrm_cpu_change_class(void *task, void *old, void *new) { struct task_struct *tsk = task; struct ckrm_cpu_class *newcls = new; - unsigned long flags; - struct runqueue *rq; - prio_array_t *array; /*sanity checking*/ if (!task || ! old || !new) return; - rq = task_rq_lock(tsk,&flags); - array = tsk->array; - if (array) { - dequeue_task(tsk,array); - tsk->cpu_class = newcls; - enqueue_task(tsk,rq_active(tsk,rq)); - } else { - tsk->cpu_class = newcls; - } - task_rq_unlock(rq,&flags); + _ckrm_cpu_change_class(tsk,newcls); } /*dummy function, not used*/ @@ -297,7 +316,7 @@ static int ckrm_cpu_set_config(void *my_res, const char *cfgstr) } struct ckrm_res_ctlr cpu_rcbs = { - .res_name = "CKRM CPU Class", + .res_name = "cpu", .res_hdepth = 1, .resid = -1, .res_alloc = ckrm_alloc_cpu_class, @@ -339,10 +358,11 @@ void init_cpu_classes(void) //init classqueues for each processor for (i=0; i < NR_CPUS; i++) classqueue_init(get_cpu_classqueue(i)); -/* - * hzheng: initialize the default cpu class - * required for E14 since ckrm_init is called after sched_init - */ + + /* + * hzheng: initialize the default cpu class + * required for E14/E15 since ckrm_init is called after sched_init + */ ckrm_alloc_cpu_class(NULL,NULL); } diff --git a/kernel/ckrm/ckrm_cpu_monitor.c b/kernel/ckrm/ckrm_cpu_monitor.c index 674ee6e50..70e155a79 100644 --- a/kernel/ckrm/ckrm_cpu_monitor.c +++ b/kernel/ckrm/ckrm_cpu_monitor.c @@ -28,36 +28,84 @@ #include #include -#define CPU_MONITOR_INTERVAL (4*HZ) /*how often do we adjust the shares*/ -#define CKRM_SHARE_ACCURACY 7 +#define CPU_MONITOR_INTERVAL (HZ) /*how often do we adjust the shares*/ #define CKRM_SHARE_MAX (1<shares.my_limit; +} + +static inline int get_mysoft_limit(struct ckrm_cpu_class *cls) +{ + return cls->shares.total_guarantee; +} + +static inline int get_hard_limit(struct ckrm_cpu_class *cls) +{ + return cls->shares.total_guarantee; +} + +static inline int get_myhard_limit(struct ckrm_cpu_class *cls) +{ + return cls->shares.total_guarantee; +} + + +static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, int type) +{ + unsigned long long now = sched_clock(); + + local_stat->run = 0; + local_stat->total = 0; + local_stat->last_sleep = now; + switch (type) { + case CPU_DEMAND_TP_CLASS: + local_stat->recalc_interval = CPU_DEMAND_CLASS_RECALC; + local_stat->cpu_demand = 0; + break; + case CPU_DEMAND_TP_TASK: + local_stat->recalc_interval = CPU_DEMAND_TASK_RECALC; + //for task, the init cpu_demand is copied from its parent + break; + default: + BUG(); + } +} void ckrm_cpu_stat_init(struct 
ckrm_cpu_class_stat *stat) { int i; - struct ckrm_cpu_class_local_stat* local_stat; - unsigned long long now = sched_clock(); stat->stat_lock = SPIN_LOCK_UNLOCKED; stat->total_ns = 0; - stat->cpu_demand = 0; + stat->max_demand = 0; for (i=0; i< NR_CPUS; i++) { - local_stat = &stat->local_stats[i]; - local_stat->run = 0; - local_stat->total = 0; - local_stat->last_sleep = now; - local_stat->cpu_demand = 0; + cpu_demand_stat_init(&stat->local_stats[i],CPU_DEMAND_TP_CLASS); } - stat->effective_guarantee = 0; - stat->effective_limit = 0; - stat->glut = 0; - stat->effective_share = 100; - stat->self_effective_share = 100; + stat->egrt = 0; + stat->megrt = 0; + stat->ehl = CKRM_SHARE_MAX; /*default: no limit*/ + stat->mehl = CKRM_SHARE_MAX; /*default: no limit */ + + stat->eshare = CKRM_SHARE_MAX; + stat->meshare = CKRM_SHARE_MAX; } + /**********************************************/ /* cpu demand */ /**********************************************/ @@ -77,52 +125,42 @@ void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat) */ /** - * update_cpu_demand - update a state change + * update_cpu_demand_stat - * - * should be called whenever the state of a local queue changes + * should be called whenever the state of a task/task local queue changes * -- when deschedule : report how much run * -- when enqueue: report how much sleep * - * to deal with excessive long run/sleep state - * -- whenever the the ckrm_cpu_monitor is called, check if the class is in sleep state, if yes, then update sleep record + * how often should we recalculate the cpu demand + * the number is in ns */ -#define CKRM_CPU_DEMAND_RUN 0 -#define CKRM_CPU_DEMAND_SLEEP 1 -//how often should we recalculate the cpu demand, in ns -#define CPU_DEMAND_CAL_THRESHOLD (1000000000LL) -static inline void update_local_cpu_demand(struct ckrm_cpu_class_local_stat* local_stat,int state, unsigned long long len) +static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat,int state, unsigned long long len) { local_stat->total += len; if (state == CKRM_CPU_DEMAND_RUN) local_stat->run += len; - if (local_stat->total >= CPU_DEMAND_CAL_THRESHOLD) { + if (local_stat->total >= local_stat->recalc_interval) { local_stat->total >>= CKRM_SHARE_ACCURACY; - if (local_stat->total > 0xFFFFFFFF) - local_stat->total = 0xFFFFFFFF; + if (unlikely(local_stat->run > 0xFFFFFFFF)) + local_stat->run = 0xFFFFFFFF; + if (local_stat->total > 0xFFFFFFFF) + local_stat->total = 0xFFFFFFFF; + do_div(local_stat->run,(unsigned long)local_stat->total); - local_stat->cpu_demand +=local_stat->run; - local_stat->cpu_demand >>= 1; + + if (local_stat->total > 0xFFFFFFFF) //happens after very long sleep + local_stat->cpu_demand = local_stat->run; + else { + local_stat->cpu_demand += local_stat->run; + local_stat->cpu_demand >>= 1; + } local_stat->total = 0; local_stat->run = 0; } } -static inline void cpu_demand_update_run(struct ckrm_cpu_class_local_stat* local_stat, unsigned long long len) -{ - update_local_cpu_demand(local_stat,CKRM_CPU_DEMAND_RUN,len); -} - -static inline void cpu_demand_update_sleep(struct ckrm_cpu_class_local_stat* local_stat, unsigned long long len) -{ - update_local_cpu_demand(local_stat,CKRM_CPU_DEMAND_SLEEP,len); -} - -#define CPU_DEMAND_ENQUEUE 0 -#define CPU_DEMAND_DEQUEUE 1 -#define CPU_DEMAND_DESCHEDULE 2 - /** * cpu_demand_event - and cpu_demand event occured * @event: one of the following three events: @@ -131,19 +169,24 @@ static inline void cpu_demand_update_sleep(struct ckrm_cpu_class_local_stat* loc * CPU_DEMAND_DESCHEDULE: 
one task belong a certain local class deschedule * @len: valid only for CPU_DEMAND_DESCHEDULE, how long the task has been run */ -void cpu_demand_event(struct ckrm_cpu_class_local_stat* local_stat, int event, unsigned long long len) +void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len) { switch (event) { case CPU_DEMAND_ENQUEUE: len = sched_clock() - local_stat->last_sleep; local_stat->last_sleep = 0; - cpu_demand_update_sleep(local_stat,len); + update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,len); break; case CPU_DEMAND_DEQUEUE: - local_stat->last_sleep = sched_clock(); + if (! local_stat->last_sleep) { + local_stat->last_sleep = sched_clock(); + } break; case CPU_DEMAND_DESCHEDULE: - cpu_demand_update_run(local_stat,len); + update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_RUN,len); + break; + case CPU_DEMAND_INIT: //for task init only + cpu_demand_stat_init(local_stat,CPU_DEMAND_TP_TASK); break; default: BUG(); @@ -152,18 +195,19 @@ void cpu_demand_event(struct ckrm_cpu_class_local_stat* local_stat, int event, u /** * check all the class local queue - * if local queueu is not in runqueue, then it's in sleep state - * if compare to last sleep, + * + * to deal with excessive long run/sleep state + * -- whenever the the ckrm_cpu_monitor is called, check if the class is in sleep state, if yes, then update sleep record */ static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu) { - struct ckrm_cpu_class_local_stat * local_stat = &stat->local_stats[cpu]; + struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu]; unsigned long long sleep,now; if (local_stat->last_sleep) { now = sched_clock(); sleep = now - local_stat->last_sleep; local_stat->last_sleep = now; - cpu_demand_update_sleep(local_stat,sleep); + update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep); } } @@ -172,51 +216,72 @@ static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int * * self_cpu_demand = sum(cpu demand of all local queues) */ -static unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat - *stat) +static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat) { int cpu_demand = 0; int i; + int cpuonline = 0; for_each_online_cpu(i) { cpu_demand_check_sleep(stat,i); cpu_demand += stat->local_stats[i].cpu_demand; + cpuonline ++; } - if (cpu_demand > CKRM_SHARE_MAX) - cpu_demand = CKRM_SHARE_MAX; - return cpu_demand; + return (cpu_demand/cpuonline); } /* - * update effective cpu demand for each class - * assume the root_core->parent == NULL + * my max demand = min(cpu_demand, my effective hard limit) */ -static void update_cpu_demand(struct ckrm_core_class *root_core) +static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat) +{ + unsigned long mmax_demand = get_self_cpu_demand(stat); + if (mmax_demand > stat->mehl) + mmax_demand = stat->mehl; + + return mmax_demand; +} + +/** + * update_max_demand: update effective cpu demand for each class + * return -1 on error + * + * Assume: the root_core->parent == NULL + */ +static int update_max_demand(struct ckrm_core_class *root_core) { struct ckrm_core_class *cur_core, *child_core; - struct ckrm_cpu_class *cls; + struct ckrm_cpu_class *cls,*c_cls; + int ret = -1; cur_core = root_core; child_core = NULL; - /* - * iterate the tree - * update cpu_demand of each node - */ - repeat: - if (!cur_core) - return; + + repeat: + if (!cur_core) { //normal exit + ret = 0; + goto out; + } cls = ckrm_get_cpu_class(cur_core); + 
if (! cls) //invalid c_cls, abort + goto out; + if (!child_core) //first child - cls->stat.cpu_demand = get_self_cpu_demand(&cls->stat); + cls->stat.max_demand = get_mmax_demand(&cls->stat); else { - cls->stat.cpu_demand += - ckrm_get_cpu_class(child_core)->stat.cpu_demand; - if (cls->stat.cpu_demand > CKRM_SHARE_MAX) - cls->stat.cpu_demand = CKRM_SHARE_MAX; + c_cls = ckrm_get_cpu_class(child_core); + if (c_cls) + cls->stat.max_demand += c_cls->stat.max_demand; + else //invalid c_cls, abort + goto out; } + //check class hard limit + if (cls->stat.max_demand > cls->stat.ehl) + cls->stat.max_demand = cls->stat.ehl; + //next child child_core = ckrm_get_next_child(cur_core, child_core); if (child_core) { @@ -229,78 +294,116 @@ static void update_cpu_demand(struct ckrm_core_class *root_core) cur_core = child_core->hnode.parent; } goto repeat; + out: + return ret; } /**********************************************/ /* effective guarantee & limit */ /**********************************************/ -static inline void set_effective_share(struct ckrm_cpu_class_stat *stat, +static inline void set_eshare(struct ckrm_cpu_class_stat *stat, int new_share) { if (!new_share) new_share = 1; - stat->effective_share = new_share; + + BUG_ON(new_share < 0); + stat->eshare = new_share; } -static inline void set_self_effective_share(struct ckrm_cpu_class_stat *stat, +static inline void set_meshare(struct ckrm_cpu_class_stat *stat, int new_share) { if (!new_share) new_share = 1; - stat->self_effective_share = new_share; + + BUG_ON(new_share < 0); + stat->meshare = new_share; } -static inline void update_child_effective(struct ckrm_core_class *parent) +/** + *update_child_effective - update egrt, ehl, mehl for all children of parent + *@parent: the parent node + *return -1 if anything wrong + * + */ +static int update_child_effective(struct ckrm_core_class *parent) { struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent); - struct ckrm_core_class *child_core = ckrm_get_next_child(parent, NULL); + struct ckrm_core_class *child_core; + int ret = -1; + if (! p_cls) + return ret; + + child_core = ckrm_get_next_child(parent, NULL); while (child_core) { struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core); + if (! 
c_cls) + return ret; - c_cls->stat.effective_guarantee = - p_cls->stat.effective_guarantee * + c_cls->stat.egrt = + p_cls->stat.egrt * c_cls->shares.my_guarantee / p_cls->shares.total_guarantee; - c_cls->stat.effective_limit = - p_cls->stat.effective_guarantee * c_cls->shares.my_limit / - p_cls->shares.total_guarantee; + + c_cls->stat.megrt = c_cls->stat.egrt * c_cls->shares.unused_guarantee + / c_cls->shares.total_guarantee; + + c_cls->stat.ehl = + p_cls->stat.ehl * + get_hard_limit(c_cls) / p_cls->shares.total_guarantee; + + c_cls->stat.mehl = + c_cls->stat.ehl * + get_myhard_limit(c_cls) / c_cls->shares.total_guarantee; child_core = ckrm_get_next_child(parent, child_core); }; - + return 0; } -/* - * update effective guarantee and effective limit - * -- effective share = parent->effective->share * share/parent->total_share - * -- effective limit = parent->effective->share * limit/parent->total_share +/** + * update_effectives: update egrt, ehl, mehl for the whole tree * should be called only when class structure changed + * + * return -1 if anything wrong happened (eg: the structure changed during the process) */ -static void update_effective_guarantee_limit(struct ckrm_core_class *root_core) +static int update_effectives(struct ckrm_core_class *root_core) { - struct ckrm_core_class *cur_core, *child_core = NULL; + struct ckrm_core_class *cur_core, *child_core; struct ckrm_cpu_class *cls; + int ret = -1; cur_core = root_core; + child_core = NULL; cls = ckrm_get_cpu_class(cur_core); - cls->stat.effective_guarantee = CKRM_SHARE_MAX; - cls->stat.effective_limit = cls->stat.effective_guarantee; - repeat: + //initialize the effectives for root + cls->stat.egrt = CKRM_SHARE_MAX; /*egrt of the root is always 100% */ + cls->stat.megrt = cls->stat.egrt * cls->shares.unused_guarantee + / cls->shares.total_guarantee; + cls->stat.ehl = CKRM_SHARE_MAX * get_hard_limit(cls) + / cls->shares.total_guarantee; + cls->stat.mehl = cls->stat.ehl * get_myhard_limit(cls) + / cls->shares.total_guarantee; + + repeat: //check exit if (!cur_core) - return; + return 0; //visit this node - update_child_effective(cur_core); + if (update_child_effective(cur_core) < 0) + return ret; //invalid cur_core node + //next child child_core = ckrm_get_next_child(cur_core, child_core); + if (child_core) { - //go down + //go down to the next hier cur_core = child_core; child_core = NULL; - goto repeat; - } else { //no more child, go back + } else { //no more child, go back child_core = cur_core; cur_core = child_core->hnode.parent; } @@ -312,12 +415,12 @@ static void update_effective_guarantee_limit(struct ckrm_core_class *root_core) /**********************************************/ /* - * surplus = my_effective_share - demand + * surplus = egrt - demand * if surplus < 0, surplus = 0 */ static inline int get_node_surplus(struct ckrm_cpu_class *cls) { - int surplus = cls->stat.effective_guarantee - cls->stat.cpu_demand; + int surplus = cls->stat.egrt - cls->stat.max_demand; if (surplus < 0) surplus = 0; @@ -325,122 +428,199 @@ static inline int get_node_surplus(struct ckrm_cpu_class *cls) return surplus; } -/* - * consume the surplus +static inline int get_my_node_surplus(struct ckrm_cpu_class *cls) +{ + int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat); + + if (surplus < 0) + surplus = 0; + + return surplus; +} + +/** + * node_surplus_consume: consume the surplus + * @ckeck_sl: if check_sl is set, then check soft_limit + * @total_grt: total guarantee * return how much consumed - * set glut when necessary + * return -1 on 
error + * + * implements all the CKRM Scheduling Requirement + * update total_grt if necessary */ -static inline int node_surplus_consume(int old_surplus, +static inline int node_surplus_consume(int surplus, struct ckrm_core_class *child_core, - struct ckrm_cpu_class *p_cls) + struct ckrm_cpu_class *p_cls, + int check_sl + ) { int consumed = 0; int inc_limit; + int glut = 1; struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core); + int total_grt = p_cls->shares.total_guarantee; + + BUG_ON(surplus < 0); - if (c_cls->stat.glut) + if (! c_cls || ! total_grt) goto out; - //check demand - if (c_cls->stat.effective_share >= c_cls->stat.cpu_demand) { - c_cls->stat.glut = 1; + /*can't consume more than demand or hard limit*/ + if (c_cls->stat.eshare >= c_cls->stat.max_demand) goto out; - } consumed = - old_surplus * c_cls->shares.my_guarantee / - p_cls->shares.total_guarantee; + surplus * c_cls->shares.my_guarantee / total_grt; - //check limit - inc_limit = c_cls->stat.effective_limit - c_cls->stat.effective_share; - if (inc_limit <= consumed) { - c_cls->stat.glut = 1; - consumed = inc_limit; + if (! consumed) //no more share + goto out; + + //hard limit and demand limit + inc_limit = c_cls->stat.max_demand - c_cls->stat.eshare; + + if (check_sl) { + int esl = p_cls->stat.eshare * get_soft_limit(c_cls) + /p_cls->shares.total_guarantee; + if (esl < c_cls->stat.max_demand) + inc_limit = esl - c_cls->stat.eshare; } - c_cls->stat.effective_share += consumed; - out: + + if (consumed > inc_limit) + consumed = inc_limit; + else + glut = 0; + + BUG_ON(consumed < 0); + set_eshare(&c_cls->stat,c_cls->stat.eshare + consumed); + BUG_ON(c_cls->stat.eshare < 0); + + out: return consumed; } -/* - * re-allocate the shares for all the childs under this node +/** + * alloc_surplus_node: re-allocate the shares for children under parent + * @parent: parent node + * return the remaining surplus + * * task: * 1. get total surplus * 2. allocate surplus * 3. set the effective_share of each node */ -static void alloc_surplus_node(struct ckrm_core_class *parent) +static int alloc_surplus_node(struct ckrm_core_class *parent) { - int total_surplus = 0, old_surplus = 0; + int total_surplus , old_surplus; struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent); struct ckrm_core_class *child_core = NULL; int self_share; + int check_sl; + int ret = -1; + + if (! p_cls) + return ret; + + total_surplus = get_my_node_surplus(p_cls); /* - * calculate surplus - * total_surplus = sum(child_surplus) - * reset glut flag * initialize effective_share */ do { child_core = ckrm_get_next_child(parent, child_core); if (child_core) { - struct ckrm_cpu_class *c_cls = - ckrm_get_cpu_class(child_core); - ckrm_stat_t *stat = &c_cls->stat; + struct ckrm_cpu_class *c_cls; + + c_cls = ckrm_get_cpu_class(child_core); + if (! c_cls) + return ret; total_surplus += get_node_surplus(c_cls); - stat->glut = 0; - set_effective_share(stat, stat->effective_guarantee); + + set_eshare(&c_cls->stat, c_cls->stat.egrt); } } while (child_core); - /*distribute the surplus */ + if (! 
total_surplus) + goto realloc_out; + + /* distribute the surplus */ child_core = NULL; + check_sl = 1; + old_surplus = 0; do { - if (!child_core) //keep the surplus of last round + if (!child_core) {//start a new round + + //ok, everybody reached the soft limit + if (old_surplus == total_surplus) + check_sl = 0; old_surplus = total_surplus; + } child_core = ckrm_get_next_child(parent, child_core); - if (child_core) { - total_surplus -= - node_surplus_consume(old_surplus, child_core, - p_cls); + if (child_core) { + int consumed = 0; + consumed -= + node_surplus_consume(old_surplus, child_core, + p_cls,check_sl); + if (consumed >= 0) + total_surplus -= consumed; + else + return ret; } //start a new round if something is allocated in the last round - } while (child_core || (total_surplus != old_surplus)); + } while (child_core || check_sl || total_surplus != old_surplus); - //any remaining surplus goes to the default class - self_share = p_cls->stat.effective_share * + realloc_out: + /*how much for itself*/ + self_share = p_cls->stat.eshare * p_cls->shares.unused_guarantee / p_cls->shares.total_guarantee; - self_share += total_surplus; - set_self_effective_share(&p_cls->stat, self_share); + if (self_share < p_cls->stat.max_demand) { + /*any remaining surplus goes to the default class*/ + self_share += total_surplus; + if (self_share > p_cls->stat.max_demand) + self_share = p_cls->stat.max_demand; + } + + set_meshare(&p_cls->stat, self_share); + return 0; } /** * alloc_surplus - reallocate unused shares * * class A's usused share should be allocated to its siblings + * the re-allocation goes downward from the top */ -static void alloc_surplus(struct ckrm_core_class *root_core) +static int alloc_surplus(struct ckrm_core_class *root_core) { - struct ckrm_core_class *cur_core, *child_core = NULL; + struct ckrm_core_class *cur_core, *child_core; struct ckrm_cpu_class *cls; + int ret = -1; + /*initialize*/ cur_core = root_core; + child_core = NULL; cls = ckrm_get_cpu_class(cur_core); - cls->stat.glut = 0; - set_effective_share(&cls->stat, cls->stat.effective_guarantee); + + //set root eshare + set_eshare(&cls->stat, cls->stat.egrt); + + /*the ckrm idle tasks get all what's remaining*/ + /*hzheng: uncomment the following like for hard limit support */ + // update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand); + repeat: //check exit if (!cur_core) - return; + return 0; //visit this node - alloc_surplus_node(cur_core); + if ( alloc_surplus_node(cur_core) < 0 ) + return ret; + //next child child_core = ckrm_get_next_child(cur_core, child_core); if (child_core) { @@ -455,6 +635,199 @@ static void alloc_surplus(struct ckrm_core_class *root_core) goto repeat; } +/**********************************************/ +/* CKRM Idle Tasks */ +/**********************************************/ +struct ckrm_cpu_class ckrm_idle_class_obj, *ckrm_idle_class; +struct task_struct* ckrm_idle_tasks[NR_CPUS]; + +/*how many ckrm idle tasks should I wakeup*/ +static inline int get_nr_idle(unsigned long surplus) +{ + int cpu_online = cpus_weight(cpu_online_map); + int nr_idle = 0; + + nr_idle = surplus * cpu_online; + nr_idle >>= CKRM_SHARE_ACCURACY; + + if (surplus) + nr_idle ++; + + if (nr_idle > cpu_online) + nr_idle = cpu_online; + + return nr_idle; +} + +/** + * update_ckrm_idle: update the status of the idle class according to the new surplus + * surplus: new system surplus + * + * Task: + * -- update share of the idle class + * -- wakeup idle tasks according to surplus + */ +void update_ckrm_idle(unsigned long surplus) 
+{ + int nr_idle = get_nr_idle(surplus); + int i; + struct task_struct* idle_task; + + set_eshare(&ckrm_idle_class->stat,surplus); + set_meshare(&ckrm_idle_class->stat,surplus); + /*wake up nr_idle idle tasks*/ + for_each_online_cpu(i) { + idle_task = ckrm_idle_tasks[i]; + if (unlikely(idle_task->cpu_class != ckrm_idle_class)) { + ckrm_cpu_change_class(idle_task, + idle_task->cpu_class, + ckrm_idle_class); + } + if (! idle_task) + continue; + if (i < nr_idle) { + //activate it + wake_up_process(idle_task); + } else { + //deactivate it + idle_task->state = TASK_INTERRUPTIBLE; + set_tsk_need_resched(idle_task); + } + } +} + +static int ckrm_cpu_idled(void *nothing) +{ + set_user_nice(current,19); + daemonize("ckrm_idle_task"); + + //deactivate it, it will be waked up by ckrm_cpu_monitor + current->state = TASK_INTERRUPTIBLE; + schedule(); + + /*similar to cpu_idle */ + while (1) { + while (!need_resched()) { + ckrm_cpu_monitor(); + if (current_cpu_data.hlt_works_ok) { + local_irq_disable(); + if (!need_resched()) { + set_tsk_need_resched(current); + safe_halt(); + } else + local_irq_enable(); + } + } + schedule(); + } + return 0; +} + +/** + * ckrm_start_ckrm_idle: + * create the ckrm_idle_class and starts the idle tasks + * + */ +void ckrm_start_ckrm_idle(void) +{ + int i; + int ret; + ckrm_shares_t shares; + + ckrm_idle_class = &ckrm_idle_class_obj; + memset(ckrm_idle_class,0,sizeof(shares)); + /*don't care about the shares */ + init_cpu_class(ckrm_idle_class,&shares); + printk(KERN_INFO"ckrm idle class %x created\n",(int)ckrm_idle_class); + + for_each_online_cpu(i) { + ret = kernel_thread(ckrm_cpu_idled, 0, CLONE_KERNEL); + + /*warn on error, but the system should still work without it*/ + if (ret < 0) + printk(KERN_ERR"Warn: can't start ckrm idle tasks\n"); + else { + ckrm_idle_tasks[i] = find_task_by_pid(ret); + if (!ckrm_idle_tasks[i]) + printk(KERN_ERR"Warn: can't find ckrm idle tasks %d\n",ret); + } + } +} + +/**********************************************/ +/* Local Weight */ +/**********************************************/ +/** + * adjust_class_local_weight: adjust the local weight for each cpu + * + * lrq->weight = lpr->pressure * class->weight / total_pressure + */ +static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online) +{ + unsigned long total_pressure = 0; + ckrm_lrq_t* lrq; + int i; + unsigned long class_weight; + unsigned long long lw; + + //get total pressure + for_each_online_cpu(i) { + lrq = get_ckrm_lrq(clsptr,i); + total_pressure += lrq->lrq_load; + } + + if (! total_pressure) + return; + + class_weight = cpu_class_weight(clsptr) * cpu_online; + + /* + * update weight for each cpu, minimun is 1 + */ + for_each_online_cpu(i) { + lrq = get_ckrm_lrq(clsptr,i); + if (! lrq->lrq_load) + /*give idle class a high share to boost interactiveness */ + lw = cpu_class_weight(clsptr); + else { + lw = lrq->lrq_load * class_weight; + do_div(lw,total_pressure); + if (!lw) + lw = 1; + else if (lw > CKRM_SHARE_MAX) + lw = CKRM_SHARE_MAX; + } + + lrq->local_weight = lw; + } +} + +/* + * assume called with class_list_lock read lock held + */ +void adjust_local_weight(void) +{ + static spinlock_t lock = SPIN_LOCK_UNLOCKED; + struct ckrm_cpu_class *clsptr; + int cpu_online; + + //do nothing if someone already holding the lock + if (! 
spin_trylock(&lock)) + return; + + cpu_online = cpus_weight(cpu_online_map); + + //class status: demand, share,total_ns prio, index + list_for_each_entry(clsptr,&active_cpu_classes,links) { + adjust_lrq_weight(clsptr,cpu_online); + } + + spin_unlock(&lock); +} + +/**********************************************/ +/* Main */ +/**********************************************/ /** *ckrm_cpu_monitor - adjust relative shares of the classes based on their progress * @@ -464,13 +837,43 @@ static void alloc_surplus(struct ckrm_core_class *root_core) */ void ckrm_cpu_monitor(void) { - struct ckrm_core_class *root_core = default_cpu_class->core; + static spinlock_t lock = SPIN_LOCK_UNLOCKED; + static unsigned long long last_check = 0; + struct ckrm_core_class *root_core = get_default_cpu_class()->core; + unsigned long long now; +#define MIN_CPU_MONITOR_INTERVAL 100000000UL + if (!root_core) return; - update_effective_guarantee_limit(root_core); - update_cpu_demand(root_core); - alloc_surplus(root_core); + //do nothing if someone already holding the lock + if (! spin_trylock(&lock)) + return; + + read_lock(&class_list_lock); + + now = sched_clock(); + + //consecutive check should be at least 100ms apart + if (now - last_check < MIN_CPU_MONITOR_INTERVAL) { + goto outunlock; + } + last_check = now; + + if (update_effectives(root_core) != 0) + goto outunlock; + + if (update_max_demand(root_core) != 0) + goto outunlock; + + if (alloc_surplus(root_core) != 0) + goto outunlock; + + adjust_local_weight(); + + outunlock: + read_unlock(&class_list_lock); + spin_unlock(&lock); } /*****************************************************/ @@ -526,6 +929,8 @@ void ckrm_kill_monitor(void) int ckrm_cpu_monitor_init(void) { ckrm_start_monitor(); + /*hzheng: uncomment the following like for hard limit support */ + // ckrm_start_ckrm_idle(); return 0; } diff --git a/kernel/ckrm_classqueue.c b/kernel/ckrm_classqueue.c index 1929aaf4e..0400844a3 100644 --- a/kernel/ckrm_classqueue.c +++ b/kernel/ckrm_classqueue.c @@ -133,12 +133,16 @@ void classqueue_update_prio(struct classqueue_struct *cq, //add to new positon, round robin for classes with same priority list_add_tail(&(node->list), &cq->array.queue[index]); - __set_bit(index, cq->array.bitmap); - + __set_bit(index, cq->array.bitmap); node->index = index; } -cq_node_t *classqueue_get_head(struct classqueue_struct *cq) +/** + *classqueue_get_min_prio: return the priority of the last node in queue + * + * this function can be called without runqueue lock held + */ +static inline int classqueue_get_min_prio(struct classqueue_struct *cq) { cq_node_t *result = NULL; int pos; @@ -147,9 +151,36 @@ cq_node_t *classqueue_get_head(struct classqueue_struct *cq) * search over the bitmap to get the first class in the queue */ pos = find_next_bit(cq->array.bitmap, CLASSQUEUE_SIZE, cq->base_offset); - if (pos >= CLASSQUEUE_SIZE) { //do circular search from the beginning + //do circular search from the beginning + if (pos >= CLASSQUEUE_SIZE) pos = find_first_bit(cq->array.bitmap, CLASSQUEUE_SIZE); + + if (pos < CLASSQUEUE_SIZE) { + result = list_entry(cq->array.queue[pos].next, cq_node_t, list); + if (list_empty(&cq->array.queue[pos])) + result = NULL; } + if (result) + return result->prio; + else + return 0; +} + +/** + * this function must be called with runqueue lock held + */ +cq_node_t *classqueue_get_head(struct classqueue_struct *cq) +{ + cq_node_t *result = NULL; + int pos; + + /* + * search over the bitmap to get the first class in the queue + */ + pos = 
find_next_bit(cq->array.bitmap, CLASSQUEUE_SIZE, cq->base_offset); + //do circular search from the beginning + if (pos >= CLASSQUEUE_SIZE) + pos = find_first_bit(cq->array.bitmap, CLASSQUEUE_SIZE); if (pos < CLASSQUEUE_SIZE) { BUG_ON(list_empty(&cq->array.queue[pos])); @@ -162,15 +193,17 @@ cq_node_t *classqueue_get_head(struct classqueue_struct *cq) * Moving the end of queue forward * the new_base here is logical, we need to translate to the abosule position */ -void classqueue_update_base(struct classqueue_struct *cq, int new_base) +void classqueue_update_base(struct classqueue_struct *cq) { - if (!cq_nr_member(cq)) { + int new_base; + + if (! cq_nr_member(cq)) { cq->base_offset = -1; //not defined return; } - // assert(new_base >= cq->base); - + new_base = classqueue_get_min_prio(cq); + if (new_base > cq->base) { cq->base_offset = get_index(cq, &new_base); cq->base = new_base; diff --git a/kernel/ckrm_sched.c b/kernel/ckrm_sched.c index ba716d4c5..e762b2d7c 100644 --- a/kernel/ckrm_sched.c +++ b/kernel/ckrm_sched.c @@ -15,57 +15,141 @@ #include #include +rwlock_t class_list_lock = RW_LOCK_UNLOCKED; +LIST_HEAD(active_cpu_classes); // list of active cpu classes; anchor + +struct ckrm_cpu_class default_cpu_class_obj; + +struct ckrm_cpu_class * get_default_cpu_class(void) { + return (&default_cpu_class_obj); +} + /*******************************************************/ /* CVT Management */ /*******************************************************/ -#define CVT_WINDOW_SIZE (CLASSQUEUE_SIZE << CLASS_BONUS_RATE) -static CVT_t max_CVT = CVT_WINDOW_SIZE; -/* - * Also ensure that the classes global cvt is upgraded to the - * minimum CVT in the system, as a class might not have run for a while +/** + * update_class_cputime - updates cvt of inactive classes + * -- an inactive class shouldn't starve others when it comes back + * -- the cpu time it lost when it's inactive should be accumulated + * -- its accumulated saving should be compensated (in a leaky bucket fashion) + * + * class_list_lock must have been acquired */ -static void update_global_cvt(struct ckrm_cpu_class *cpu_class, int cpu) +void update_class_cputime(int this_cpu) { - struct ckrm_local_runqueue *class_queue = - get_ckrm_local_runqueue(cpu_class, cpu); - CVT_t min_cvt; - CVT_t local_cvt_old = class_queue->local_cvt; - - spin_lock(&cvt_lock); - if (class_queue->uncounted_cvt) { - cpu_class->global_cvt += class_queue->uncounted_cvt; - class_queue->uncounted_cvt = 0; + struct ckrm_cpu_class *clsptr; + ckrm_lrq_t * lrq; + CVT_t cur_cvt,min_cvt; + + /* + * a class's local_cvt must not be significantly smaller than min_cvt + * of active classes otherwise, it will starve other classes when it + * is reactivated. + * + * Hence we keep all local_cvt's within a range of the min_cvt off + * all active classes (approximated by the local_cvt of the currently + * running class) and account for how many cycles where thus taken + * from an inactive class building a savings (not to exceed a few seconds) + * for a class to gradually make up upon reactivation, without + * starvation of other classes. 
+ * + */ + + // printk("update_class_cputime(%d)\n",this_cpu); + + cur_cvt = get_local_cur_cvt(this_cpu); + + /* + * - check the local cvt of all the classes + * - update total_ns received by the class + * - do a usage sampling for the whole class + */ + list_for_each_entry(clsptr, &active_cpu_classes, links) { + lrq = get_ckrm_lrq(clsptr, this_cpu); + + spin_lock(&clsptr->stat.stat_lock); + clsptr->stat.total_ns += lrq->uncounted_ns; + ckrm_sample_usage(clsptr); + spin_unlock(&clsptr->stat.stat_lock); + + lrq->uncounted_ns = 0; + + /* + * Always leaving a small bonus for inactive classes + * allows them to compete for cycles immediately when the become + * active. This should improve interactive behavior + */ + min_cvt = cur_cvt - INTERACTIVE_BONUS(lrq); + + if (lrq->local_cvt < min_cvt) { + CVT_t lost_cvt; + + lost_cvt = scale_cvt(min_cvt - lrq->local_cvt,lrq); + lrq->local_cvt = min_cvt; + + /* add what the class lost to its savings*/ + lrq->savings += lost_cvt; + if (lrq->savings > MAX_SAVINGS) + lrq->savings = MAX_SAVINGS; + + } else if (lrq->savings) { + /* + *if a class saving and falling behind + * then start to use it saving in a leaking bucket way + */ + CVT_t savings_used; + + savings_used = scale_cvt((lrq->local_cvt - min_cvt),lrq); + if (savings_used > lrq->savings) + savings_used = lrq->savings; + + if (savings_used > SAVINGS_LEAK_SPEED) + savings_used = SAVINGS_LEAK_SPEED; + + lrq->savings -= savings_used; + unscale_cvt(savings_used,lrq); + lrq->local_cvt -= savings_used; + } } - min_cvt = max_CVT - CVT_WINDOW_SIZE; - if (cpu_class->global_cvt < min_cvt) - cpu_class->global_cvt = min_cvt; - else if (cpu_class->global_cvt > max_CVT) - max_CVT = cpu_class->global_cvt; - -/* update local cvt from global cvt*/ -#if 0 - class_queue->local_cvt = cpu_class->global_cvt; -#endif - spin_unlock(&cvt_lock); - - if (class_queue->local_cvt != local_cvt_old) - update_class_priority(class_queue); } -/* - * class_list_lock must have been acquired +/*******************************************************/ +/* PID load balancing stuff */ +/*******************************************************/ +#define PID_SAMPLE_T 32 +#define PID_KP 20 +#define PID_KI 60 +#define PID_KD 20 + +/** + * sample pid load periodically */ -void update_global_cvts(int this_cpu) +void ckrm_load_sample(ckrm_load_t* pid,int cpu) { - struct ckrm_cpu_class *clsptr; - struct ckrm_local_runqueue *class_queue; + long load; + long err; - /*for each class*/ - list_for_each_entry(clsptr, &active_cpu_classes, links) { - update_global_cvt(clsptr, this_cpu); - class_queue = get_ckrm_local_runqueue(clsptr, this_cpu); - clsptr->stat.total_ns += class_queue->uncounted_ns; - class_queue->uncounted_ns = 0; - } + if (jiffies % PID_SAMPLE_T) + return; + + adjust_local_weight(); + + load = ckrm_cpu_load(cpu); + err = load - pid->load_p; + pid->load_d = err; + pid->load_p = load; + pid->load_i *= 9; + pid->load_i += load; + pid->load_i /= 10; +} + +long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group) +{ + long pressure; + pressure = ckrm_load->load_p * PID_KP; + pressure += ckrm_load->load_i * PID_KI; + pressure += ckrm_load->load_d * PID_KD; + pressure /= 100; + return pressure; } diff --git a/kernel/exit.c b/kernel/exit.c index 70c92e58b..ca75e5ea5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include @@ -514,12 +513,6 @@ static inline void __exit_mm(struct task_struct * tsk) task_lock(tsk); tsk->mm = NULL; up_read(&mm->mmap_sem); -#ifdef CONFIG_CKRM_RES_MEM - 
spin_lock(&mm->peertask_lock); - list_del_init(&tsk->mm_peers); - ckrm_mem_evaluate_mm(mm); - spin_unlock(&mm->peertask_lock); -#endif enter_lazy_tlb(mm, current); task_unlock(tsk); mmput(mm); diff --git a/kernel/fork.c b/kernel/fork.c index e639ce1c8..37c727ae1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include @@ -266,9 +265,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) ckrm_cb_newtask(tsk); /* One for us, one for whoever does the "release_task()" (usually parent) */ atomic_set(&tsk->usage,2); -#ifdef CONFIG_CKRM_RES_MEM - INIT_LIST_HEAD(&tsk->mm_peers); -#endif return tsk; } @@ -421,10 +417,6 @@ static struct mm_struct * mm_init(struct mm_struct * mm) mm->ioctx_list = NULL; mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm); mm->free_area_cache = TASK_UNMAPPED_BASE; -#ifdef CONFIG_CKRM_RES_MEM - INIT_LIST_HEAD(&mm->tasklist); - mm->peertask_lock = SPIN_LOCK_UNLOCKED; -#endif if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -445,10 +437,6 @@ struct mm_struct * mm_alloc(void) if (mm) { memset(mm, 0, sizeof(*mm)); mm = mm_init(mm); -#ifdef CONFIG_CKRM_RES_MEM - mm->memclass = GET_MEM_CLASS(current); - mem_class_get(mm->memclass); -#endif } return mm; } @@ -463,13 +451,6 @@ void fastcall __mmdrop(struct mm_struct *mm) BUG_ON(mm == &init_mm); mm_free_pgd(mm); destroy_context(mm); -#ifdef CONFIG_CKRM_RES_MEM - /* class can be null and mm's tasklist can be empty here */ - if (mm->memclass) { - mem_class_put(mm->memclass); - mm->memclass = NULL; - } -#endif free_mm(mm); } @@ -597,7 +578,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) good_mm: tsk->mm = mm; tsk->active_mm = mm; - ckrm_init_mm_to_task(mm, tsk); return 0; free_pt: diff --git a/kernel/sched.c b/kernel/sched.c index 0e1d0a2ed..148d1ac9b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -17,7 +17,6 @@ * 2003-09-03 Interactivity tuning by Con Kolivas. * 2004-04-02 Scheduler domains code by Nick Piggin */ - #include #include #include @@ -157,8 +156,19 @@ #define LOW_CREDIT(p) \ ((p)->interactive_credit < -CREDIT_LIMIT) +#ifdef CONFIG_CKRM_CPU_SCHEDULE +/* + * if belong to different class, compare class priority + * otherwise compare task priority + */ +#define TASK_PREEMPTS_CURR(p, rq) \ + (((p)->cpu_class != (rq)->curr->cpu_class) && ((rq)->curr != (rq)->idle))? class_preempts_curr((p),(rq)->curr) : ((p)->prio < (rq)->curr->prio) + +#else + #define TASK_PREEMPTS_CURR(p, rq) \ ((p)->prio < (rq)->curr->prio) +#endif /* * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ] @@ -175,7 +185,7 @@ ((MAX_TIMESLICE - MIN_TIMESLICE) * \ (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1))) -static unsigned int task_timeslice(task_t *p) +unsigned int task_timeslice(task_t *p) { return BASE_TIMESLICE(p); } @@ -186,15 +196,9 @@ static unsigned int task_timeslice(task_t *p) * These are the runqueue data structures: */ -#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) - typedef struct runqueue runqueue_t; - -struct prio_array { - unsigned int nr_active; - unsigned long bitmap[BITMAP_SIZE]; - struct list_head queue[MAX_PRIO]; -}; +#include +#include /* * This is the main, per-CPU runqueue data structure. 
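Note on the scheduler hunks that follow: they replace the single active/expired prio_array pair on each runqueue with per-class local runqueues hung off a classqueue, so schedule() first picks the best class and only then the best task inside that class's active array (see rq_get_next_class()/rq_get_next_task() below). The stand-alone C sketch here models only that two-level selection; the types and helpers (struct fake_lrq, pick_next_class(), pick_next_prio()) are illustrative stand-ins, not the kernel's ckrm_lrq_t or classqueue API.

/*
 * Simplified, user-space model of the two-level pick-next logic added by
 * this patch: classqueue -> class local runqueue -> task priority array.
 * All names and numbers are demo stand-ins, not kernel interfaces.
 */
#include <stdio.h>
#include <string.h>

#define MAX_PRIO   8            /* tiny priority range for the demo */
#define MAX_CLASS  4

struct fake_lrq {
	int class_prio;                 /* position in the class queue    */
	int nr_active;                  /* runnable tasks in this class   */
	int task_prio[MAX_PRIO];        /* per-priority task counts       */
};

/* first level: best (lowest) class priority among classes with work */
static struct fake_lrq *pick_next_class(struct fake_lrq *lrq, int n)
{
	struct fake_lrq *best = NULL;
	int i;

	for (i = 0; i < n; i++) {
		if (!lrq[i].nr_active)
			continue;
		if (!best || lrq[i].class_prio < best->class_prio)
			best = &lrq[i];
	}
	return best;
}

/* second level: first non-empty priority inside the chosen class */
static int pick_next_prio(struct fake_lrq *lrq)
{
	int idx;

	for (idx = 0; idx < MAX_PRIO; idx++)
		if (lrq->task_prio[idx])
			return idx;
	return -1;                      /* empty class, caller skips it */
}

int main(void)
{
	struct fake_lrq classes[MAX_CLASS];
	struct fake_lrq *cls;

	memset(classes, 0, sizeof(classes));
	classes[1].class_prio = 3;
	classes[1].nr_active = 2;
	classes[1].task_prio[5] = 2;
	classes[2].class_prio = 1;      /* better class priority wins */
	classes[2].nr_active = 1;
	classes[2].task_prio[6] = 1;

	cls = pick_next_class(classes, MAX_CLASS);
	if (cls)
		printf("picked class prio %d, task prio %d\n",
		       cls->class_prio, pick_next_prio(cls));
	return 0;
}
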
@@ -219,7 +223,12 @@ struct runqueue { unsigned long long timestamp_last_tick; task_t *curr, *idle; struct mm_struct *prev_mm; - prio_array_t *active, *expired, arrays[2]; +#ifdef CONFIG_CKRM_CPU_SCHEDULE + struct classqueue_struct classqueue; + ckrm_load_t ckrm_load; +#else + prio_array_t *active, *expired, arrays[2]; +#endif int best_expired_prio; atomic_t nr_iowait; @@ -298,15 +307,108 @@ static inline void rq_unlock(runqueue_t *rq) spin_unlock_irq(&rq->lock); } +#ifdef CONFIG_CKRM_CPU_SCHEDULE +static inline ckrm_lrq_t *rq_get_next_class(struct runqueue *rq) +{ + cq_node_t *node = classqueue_get_head(&rq->classqueue); + return ((node) ? class_list_entry(node) : NULL); +} + +/* + * return the cvt of the current running class + * if no current running class, return 0 + * assume cpu is valid (cpu_online(cpu) == 1) + */ +CVT_t get_local_cur_cvt(int cpu) +{ + ckrm_lrq_t * lrq = rq_get_next_class(cpu_rq(cpu)); + + if (lrq) + return lrq->local_cvt; + else + return 0; +} + +static inline struct task_struct * rq_get_next_task(struct runqueue* rq) +{ + prio_array_t *array; + struct task_struct *next; + ckrm_lrq_t *queue; + int idx; + int cpu = smp_processor_id(); + + next = rq->idle; + retry_next_class: + if ((queue = rq_get_next_class(rq))) { + //check switch active/expired queue + array = queue->active; + if (unlikely(!array->nr_active)) { + queue->active = queue->expired; + queue->expired = array; + queue->expired_timestamp = 0; + + if (queue->active->nr_active) + set_top_priority(queue, + find_first_bit(queue->active->bitmap, MAX_PRIO)); + else { + classqueue_dequeue(queue->classqueue, + &queue->classqueue_linkobj); + cpu_demand_event(get_rq_local_stat(queue,cpu),CPU_DEMAND_DEQUEUE,0); + } + goto retry_next_class; + } + BUG_ON(!array->nr_active); + + idx = queue->top_priority; + if (queue->top_priority == MAX_PRIO) { + BUG_ON(1); + } + + next = task_list_entry(array->queue[idx].next); + } + return next; +} +#else /*! CONFIG_CKRM_CPU_SCHEDULE*/ +static inline struct task_struct * rq_get_next_task(struct runqueue* rq) +{ + prio_array_t *array; + struct list_head *queue; + int idx; + + array = rq->active; + if (unlikely(!array->nr_active)) { + /* + * Switch the active and expired arrays. + */ + rq->active = rq->expired; + rq->expired = array; + array = rq->active; + rq->expired_timestamp = 0; + } + + idx = sched_find_first_bit(array->bitmap); + queue = array->queue + idx; + return list_entry(queue->next, task_t, run_list); +} + +static inline void class_enqueue_task(struct task_struct* p, prio_array_t *array) { } +static inline void class_dequeue_task(struct task_struct* p, prio_array_t *array) { } +static inline void init_cpu_classes(void) { } +#define rq_ckrm_load(rq) NULL +static inline void ckrm_sched_tick(int j,int this_cpu,void* name) {} +#endif /* CONFIG_CKRM_CPU_SCHEDULE */ + /* * Adding/removing a task to/from a priority array: */ static void dequeue_task(struct task_struct *p, prio_array_t *array) { + BUG_ON(! 
array); array->nr_active--; list_del(&p->run_list); if (list_empty(array->queue + p->prio)) __clear_bit(p->prio, array->bitmap); + class_dequeue_task(p,array); } static void enqueue_task(struct task_struct *p, prio_array_t *array) @@ -315,6 +417,7 @@ static void enqueue_task(struct task_struct *p, prio_array_t *array) __set_bit(p->prio, array->bitmap); array->nr_active++; p->array = array; + class_enqueue_task(p,array); } /* @@ -328,6 +431,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) __set_bit(p->prio, array->bitmap); array->nr_active++; p->array = array; + class_enqueue_task(p,array); } /* @@ -366,7 +470,7 @@ static int effective_prio(task_t *p) */ static inline void __activate_task(task_t *p, runqueue_t *rq) { - enqueue_task(p, rq->active); + enqueue_task(p, rq_active(p,rq)); rq->nr_running++; } @@ -375,7 +479,7 @@ static inline void __activate_task(task_t *p, runqueue_t *rq) */ static inline void __activate_idle_task(task_t *p, runqueue_t *rq) { - enqueue_task_head(p, rq->active); + enqueue_task_head(p, rq_active(p,rq)); rq->nr_running++; } @@ -881,6 +985,10 @@ void fastcall sched_fork(task_t *p) INIT_LIST_HEAD(&p->run_list); p->array = NULL; spin_lock_init(&p->switch_lock); +#ifdef CONFIG_CKRM_CPU_SCHEDULE + cpu_demand_event(&p->demand_stat,CPU_DEMAND_INIT,0); +#endif + #ifdef CONFIG_PREEMPT /* * During context-switch we hold precisely one spinlock, which @@ -956,6 +1064,7 @@ void fastcall wake_up_forked_process(task_t * p) p->array = current->array; p->array->nr_active++; rq->nr_running++; + class_enqueue_task(p,p->array); } task_rq_unlock(rq, &flags); } @@ -1278,6 +1387,7 @@ lock_again: p->array = current->array; p->array->nr_active++; rq->nr_running++; + class_enqueue_task(p,p->array); } } else { /* Not the local CPU - must adjust timestamp */ @@ -1423,6 +1533,449 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, return 1; } +#ifdef CONFIG_CKRM_CPU_SCHEDULE +static inline int ckrm_preferred_task(task_t *tmp,long min, long max, + int phase, enum idle_type idle) +{ + long pressure = task_load(tmp); + + if (pressure > max) + return 0; + + if ((idle == NOT_IDLE) && ! phase && (pressure <= min)) + return 0; + return 1; +} + +/* + * move tasks for a specic local class + * return number of tasks pulled + */ +static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq, + runqueue_t *this_rq, + runqueue_t *busiest, + struct sched_domain *sd, + int this_cpu, + enum idle_type idle, + long* pressure_imbalance) +{ + prio_array_t *array, *dst_array; + struct list_head *head, *curr; + task_t *tmp; + int idx; + int pulled = 0; + int phase = -1; + long pressure_min, pressure_max; + /*hzheng: magic : 90% balance is enough*/ + long balance_min = *pressure_imbalance / 10; +/* + * we don't want to migrate tasks that will reverse the balance + * or the tasks that make too small difference + */ +#define CKRM_BALANCE_MAX_RATIO 100 +#define CKRM_BALANCE_MIN_RATIO 1 + start: + phase ++; + /* + * We first consider expired tasks. Those will likely not be + * executed in the near future, and they are most likely to + * be cache-cold, thus switching CPUs has the least effect + * on them. 
+ */ + if (src_lrq->expired->nr_active) { + array = src_lrq->expired; + dst_array = dst_lrq->expired; + } else { + array = src_lrq->active; + dst_array = dst_lrq->active; + } + + new_array: + /* Start searching at priority 0: */ + idx = 0; + skip_bitmap: + if (!idx) + idx = sched_find_first_bit(array->bitmap); + else + idx = find_next_bit(array->bitmap, MAX_PRIO, idx); + if (idx >= MAX_PRIO) { + if (array == src_lrq->expired && src_lrq->active->nr_active) { + array = src_lrq->active; + dst_array = dst_lrq->active; + goto new_array; + } + if ((! phase) && (! pulled) && (idle != IDLE)) + goto start; //try again + else + goto out; //finished search for this lrq + } + + head = array->queue + idx; + curr = head->prev; + skip_queue: + tmp = list_entry(curr, task_t, run_list); + + curr = curr->prev; + + if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } + + pressure_min = *pressure_imbalance * CKRM_BALANCE_MIN_RATIO/100; + pressure_max = *pressure_imbalance * CKRM_BALANCE_MAX_RATIO/100; + /* + * skip the tasks that will reverse the balance too much + */ + if (ckrm_preferred_task(tmp,pressure_min,pressure_max,phase,idle)) { + *pressure_imbalance -= task_load(tmp); + pull_task(busiest, array, tmp, + this_rq, dst_array, this_cpu); + pulled++; + + if (*pressure_imbalance <= balance_min) + goto out; + } + + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + out: + return pulled; +} + +static inline long ckrm_rq_imbalance(runqueue_t *this_rq,runqueue_t *dst_rq) +{ + long imbalance; + /* + * make sure after balance, imbalance' > - imbalance/2 + * we don't want the imbalance be reversed too much + */ + imbalance = pid_get_pressure(rq_ckrm_load(dst_rq),0) + - pid_get_pressure(rq_ckrm_load(this_rq),1); + imbalance /= 2; + return imbalance; +} + +/* + * try to balance the two runqueues + * + * Called with both runqueues locked. + * if move_tasks is called, it will try to move at least one task over + */ +static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, + unsigned long max_nr_move, struct sched_domain *sd, + enum idle_type idle) +{ + struct ckrm_cpu_class *clsptr,*vip_cls = NULL; + ckrm_lrq_t* src_lrq,*dst_lrq; + long pressure_imbalance, pressure_imbalance_old; + int src_cpu = task_cpu(busiest->curr); + struct list_head *list; + int pulled = 0; + long imbalance; + + imbalance = ckrm_rq_imbalance(this_rq,busiest); + + if ((idle == NOT_IDLE && imbalance <= 0) || busiest->nr_running <= 1) + goto out; + + //try to find the vip class + list_for_each_entry(clsptr,&active_cpu_classes,links) { + src_lrq = get_ckrm_lrq(clsptr,src_cpu); + + if (! lrq_nr_running(src_lrq)) + continue; + + if (! vip_cls || cpu_class_weight(vip_cls) < cpu_class_weight(clsptr) ) + { + vip_cls = clsptr; + } + } + + /* + * do search from the most significant class + * hopefully, less tasks will be migrated this way + */ + clsptr = vip_cls; + + move_class: + if (! clsptr) + goto out; + + + src_lrq = get_ckrm_lrq(clsptr,src_cpu); + if (! lrq_nr_running(src_lrq)) + goto other_class; + + dst_lrq = get_ckrm_lrq(clsptr,this_cpu); + + //how much pressure for this class should be transferred + pressure_imbalance = src_lrq->lrq_load * imbalance/src_lrq->local_weight; + if (pulled && ! 
pressure_imbalance) + goto other_class; + + pressure_imbalance_old = pressure_imbalance; + + //move tasks + pulled += + ckrm_cls_move_tasks(src_lrq,dst_lrq, + this_rq, + busiest, + sd,this_cpu,idle, + &pressure_imbalance); + + /* + * hzheng: 2 is another magic number + * stop balancing if the imbalance is less than 25% of the orig + */ + if (pressure_imbalance <= (pressure_imbalance_old >> 2)) + goto out; + + //update imbalance + imbalance *= pressure_imbalance / pressure_imbalance_old; + other_class: + //who is next? + list = clsptr->links.next; + if (list == &active_cpu_classes) + list = list->next; + clsptr = list_entry(list, typeof(*clsptr), links); + if (clsptr != vip_cls) + goto move_class; + out: + return pulled; +} + +/** + * ckrm_check_balance - is load balancing necessary? + * return 0 if load balancing is not necessary + * otherwise return the average load of the system + * also, update nr_group + * + * heuristics: + * no load balancing if it's load is over average + * no load balancing if it's load is far more than the min + * task: + * read the status of all the runqueues + */ +static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu, + enum idle_type idle, int* nr_group) +{ + struct sched_group *group = sd->groups; + unsigned long min_load, max_load, avg_load; + unsigned long total_load, this_load, total_pwr; + + max_load = this_load = total_load = total_pwr = 0; + min_load = 0xFFFFFFFF; + *nr_group = 0; + + do { + cpumask_t tmp; + unsigned long load; + int local_group; + int i, nr_cpus = 0; + + /* Tally up the load of all CPUs in the group */ + cpus_and(tmp, group->cpumask, cpu_online_map); + if (unlikely(cpus_empty(tmp))) + goto nextgroup; + + avg_load = 0; + local_group = cpu_isset(this_cpu, group->cpumask); + + for_each_cpu_mask(i, tmp) { + load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),local_group); + nr_cpus++; + avg_load += load; + } + + if (!nr_cpus) + goto nextgroup; + + total_load += avg_load; + total_pwr += group->cpu_power; + + /* Adjust by relative CPU power of the group */ + avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + + if (local_group) { + this_load = avg_load; + goto nextgroup; + } else if (avg_load > max_load) { + max_load = avg_load; + } + if (avg_load < min_load) { + min_load = avg_load; + } +nextgroup: + group = group->next; + *nr_group = *nr_group + 1; + } while (group != sd->groups); + + if (!max_load || this_load >= max_load) + goto out_balanced; + + avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; + + /* hzheng: debugging: 105 is a magic number + * 100*max_load <= sd->imbalance_pct*this_load) + * should use imbalance_pct instead + */ + if (this_load > avg_load + || 100*max_load < 105*this_load + || 100*min_load < 70*this_load + ) + goto out_balanced; + + return avg_load; + out_balanced: + return 0; +} + +/** + * any group that has above average load is considered busy + * find the busiest queue from any of busy group + */ +static runqueue_t * +ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu, + unsigned long avg_load, enum idle_type idle, + int nr_group) +{ + struct sched_group *group; + runqueue_t * busiest=NULL; + unsigned long rand; + + group = sd->groups; + rand = get_ckrm_rand(nr_group); + nr_group = 0; + + do { + unsigned long load,total_load,max_load; + cpumask_t tmp; + int i; + runqueue_t * grp_busiest; + + cpus_and(tmp, group->cpumask, cpu_online_map); + if (unlikely(cpus_empty(tmp))) + goto find_nextgroup; + + total_load = 0; + max_load = 0; + grp_busiest = NULL; + for_each_cpu_mask(i, 
tmp) { + load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),0); + total_load += load; + if (load > max_load) { + max_load = load; + grp_busiest = cpu_rq(i); + } + } + + total_load = (total_load * SCHED_LOAD_SCALE) / group->cpu_power; + if (total_load > avg_load) { + busiest = grp_busiest; + if (nr_group >= rand) + break; + } + find_nextgroup: + group = group->next; + nr_group ++; + } while (group != sd->groups); + + return busiest; +} + +/** + * load_balance - pressure based load balancing algorithm used by ckrm + */ +static int ckrm_load_balance(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd, enum idle_type idle) +{ + runqueue_t *busiest; + unsigned long avg_load; + int nr_moved,nr_group; + + avg_load = ckrm_check_balance(sd, this_cpu, idle, &nr_group); + if (! avg_load) + goto out_balanced; + + busiest = ckrm_find_busy_queue(sd,this_cpu,avg_load,idle,nr_group); + if (! busiest) + goto out_balanced; + /* + * This should be "impossible", but since load + * balancing is inherently racy and statistical, + * it could happen in theory. + */ + if (unlikely(busiest == this_rq)) { + WARN_ON(1); + goto out_balanced; + } + + nr_moved = 0; + if (busiest->nr_running > 1) { + /* + * Attempt to move tasks. If find_busiest_group has found + * an imbalance but busiest->nr_running <= 1, the group is + * still unbalanced. nr_moved simply stays zero, so it is + * correctly treated as an imbalance. + */ + double_lock_balance(this_rq, busiest); + nr_moved = move_tasks(this_rq, this_cpu, busiest, + 0,sd, idle); + spin_unlock(&busiest->lock); + if (nr_moved) { + adjust_local_weight(); + } + } + + if (!nr_moved) + sd->nr_balance_failed ++; + else + sd->nr_balance_failed = 0; + + /* We were unbalanced, so reset the balancing interval */ + sd->balance_interval = sd->min_interval; + + return nr_moved; + +out_balanced: + /* tune up the balancing interval */ + if (sd->balance_interval < sd->max_interval) + sd->balance_interval *= 2; + + return 0; +} + +/* + * this_rq->lock is already held + */ +static inline int load_balance_newidle(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd) +{ + int ret; + read_lock(&class_list_lock); + ret = ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE); + read_unlock(&class_list_lock); + return ret; +} + +static inline int load_balance(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd, enum idle_type idle) +{ + int ret; + + spin_lock(&this_rq->lock); + read_lock(&class_list_lock); + ret= ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE); + read_unlock(&class_list_lock); + spin_unlock(&this_rq->lock); + return ret; +} +#else /*! CONFIG_CKRM_CPU_SCHEDULE */ /* * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, * as part of a balancing operation within "domain". Returns the number of @@ -1787,6 +2340,8 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, out: return nr_moved; } +#endif /* CONFIG_CKRM_CPU_SCHEDULE*/ + /* * idle_balance is called by schedule() if this_cpu is about to become @@ -1924,7 +2479,7 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, } } } -#else +#else /* SMP*/ /* * on UP we do not need to balance between CPUs: */ @@ -1951,8 +2506,7 @@ static inline int wake_priority_sleeper(runqueue_t *rq) return 0; } -DEFINE_PER_CPU(struct kernel_stat, kstat); - +DEFINE_PER_CPU(struct kernel_stat, kstat) = { { 0 } }; EXPORT_PER_CPU_SYMBOL(kstat); /* @@ -1965,11 +2519,19 @@ EXPORT_PER_CPU_SYMBOL(kstat); * increasing number of running tasks. 
We also ignore the interactivity * if a better static_prio task has expired: */ + +#ifndef CONFIG_CKRM_CPU_SCHEDULE #define EXPIRED_STARVING(rq) \ ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ (jiffies - (rq)->expired_timestamp >= \ STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ ((rq)->curr->static_prio > (rq)->best_expired_prio)) +#else +#define EXPIRED_STARVING(rq) \ + (STARVATION_LIMIT && ((rq)->expired_timestamp && \ + (jiffies - (rq)->expired_timestamp >= \ + STARVATION_LIMIT * (lrq_nr_running(rq)) + 1))) +#endif /* * This function gets called by the timer code, with HZ frequency. @@ -2006,6 +2568,7 @@ void scheduler_tick(int user_ticks, int sys_ticks) cpustat->idle += sys_ticks; if (wake_priority_sleeper(rq)) goto out; +//will break ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq)); rebalance_tick(cpu, rq, IDLE); return; } @@ -2016,7 +2579,7 @@ void scheduler_tick(int user_ticks, int sys_ticks) cpustat->system += sys_ticks; /* Task might have expired already, but not scheduled off yet */ - if (p->array != rq->active) { + if (p->array != rq_active(p,rq)) { set_tsk_need_resched(p); goto out; } @@ -2039,12 +2602,16 @@ void scheduler_tick(int user_ticks, int sys_ticks) set_tsk_need_resched(p); /* put it at the end of the queue: */ - dequeue_task(p, rq->active); - enqueue_task(p, rq->active); + dequeue_task(p, rq_active(p,rq)); + enqueue_task(p, rq_active(p,rq)); } goto out_unlock; } if (!--p->time_slice) { +#ifdef CONFIG_CKRM_CPU_SCHEDULE + /* Hubertus ... we can abstract this out */ + ckrm_lrq_t* rq = get_task_lrq(p); +#endif dequeue_task(p, rq->active); set_tsk_need_resched(p); p->prio = effective_prio(p); @@ -2055,8 +2622,8 @@ void scheduler_tick(int user_ticks, int sys_ticks) rq->expired_timestamp = jiffies; if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { enqueue_task(p, rq->expired); - if (p->static_prio < rq->best_expired_prio) - rq->best_expired_prio = p->static_prio; + if (p->static_prio < this_rq()->best_expired_prio) + this_rq()->best_expired_prio = p->static_prio; } else enqueue_task(p, rq->active); } else { @@ -2079,17 +2646,18 @@ void scheduler_tick(int user_ticks, int sys_ticks) if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - p->time_slice) % TIMESLICE_GRANULARITY(p)) && (p->time_slice >= TIMESLICE_GRANULARITY(p)) && - (p->array == rq->active)) { + (p->array == rq_active(p,rq))) { - dequeue_task(p, rq->active); + dequeue_task(p, rq_active(p,rq)); set_tsk_need_resched(p); p->prio = effective_prio(p); - enqueue_task(p, rq->active); + enqueue_task(p, rq_active(p,rq)); } } out_unlock: spin_unlock(&rq->lock); out: + ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq)); rebalance_tick(cpu, rq, NOT_IDLE); } @@ -2187,10 +2755,9 @@ asmlinkage void __sched schedule(void) task_t *prev, *next; runqueue_t *rq; prio_array_t *array; - struct list_head *queue; unsigned long long now; unsigned long run_time; - int cpu, idx; + int cpu; /* * Test if we are atomic. Since do_exit() needs to call into @@ -2226,6 +2793,19 @@ need_resched: spin_lock_irq(&rq->lock); +#ifdef CONFIG_CKRM_CPU_SCHEDULE + if (prev != rq->idle) { + unsigned long long run = now - prev->timestamp; + ckrm_lrq_t * lrq = get_task_lrq(prev); + + lrq->lrq_load -= task_load(prev); + cpu_demand_event(&prev->demand_stat,CPU_DEMAND_DESCHEDULE,run); + lrq->lrq_load += task_load(prev); + + cpu_demand_event(get_task_lrq_stat(prev),CPU_DEMAND_DESCHEDULE,run); + update_local_cvt(prev, run); + } +#endif /* * if entering off of a kernel preemption go straight * to picking the next task. 
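Note on the CKRM balancing path added above: it steers migration by a PID-style load "pressure" rather than raw queue length. ckrm_load_sample() keeps a proportional term (latest load), a slowly decaying integral term, and a derivative term, and pid_get_pressure() combines them with the PID_KP/PID_KI/PID_KD weights. The user-space sketch below reproduces just that arithmetic; struct pid_load and the sample load values are invented for the demo (the kernel feeds in ckrm_cpu_load()), and the integral update is written as the equivalent (i*9 + load)/10.

/*
 * Stand-alone model of the PID-style pressure used by the CKRM load
 * balancer (ckrm_load_sample()/pid_get_pressure() in this patch).
 * Raw load samples below are made up for the demo.
 */
#include <stdio.h>

#define PID_KP 20
#define PID_KI 60
#define PID_KD 20

struct pid_load {
	long load_p;    /* proportional: latest sample           */
	long load_i;    /* integral: decaying average of samples */
	long load_d;    /* derivative: change since last sample  */
};

static void load_sample(struct pid_load *pid, long load)
{
	long err = load - pid->load_p;

	pid->load_d = err;
	pid->load_p = load;
	pid->load_i = (pid->load_i * 9 + load) / 10;   /* slow decay */
}

static long get_pressure(const struct pid_load *pid)
{
	long pressure = pid->load_p * PID_KP
		      + pid->load_i * PID_KI
		      + pid->load_d * PID_KD;
	return pressure / 100;
}

int main(void)
{
	/* hypothetical per-sample load values for one CPU */
	long samples[] = { 100, 120, 200, 180, 90 };
	struct pid_load pid = { 0, 0, 0 };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		load_sample(&pid, samples[i]);
		printf("sample %ld -> pressure %ld\n",
		       samples[i], get_pressure(&pid));
	}
	return 0;
}
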
@@ -2243,30 +2823,15 @@ need_resched: cpu = smp_processor_id(); if (unlikely(!rq->nr_running)) { idle_balance(cpu, rq); - if (!rq->nr_running) { - next = rq->idle; - rq->expired_timestamp = 0; - wake_sleeping_dependent(cpu, rq); - goto switch_tasks; - } } - array = rq->active; - if (unlikely(!array->nr_active)) { - /* - * Switch the active and expired arrays. - */ - rq->active = rq->expired; - rq->expired = array; - array = rq->active; + next = rq_get_next_task(rq); + if (next == rq->idle) { rq->expired_timestamp = 0; - rq->best_expired_prio = MAX_PRIO; + wake_sleeping_dependent(cpu, rq); + goto switch_tasks; } - idx = sched_find_first_bit(array->bitmap); - queue = array->queue + idx; - next = list_entry(queue->next, task_t, run_list); - if (dependent_sleeper(cpu, rq, next)) { next = rq->idle; goto switch_tasks; @@ -2321,7 +2886,6 @@ switch_tasks: } EXPORT_SYMBOL(schedule); - #ifdef CONFIG_PREEMPT /* * this is is the entry point to schedule() from in-kernel preemption @@ -3009,7 +3573,7 @@ asmlinkage long sys_sched_yield(void) { runqueue_t *rq = this_rq_lock(); prio_array_t *array = current->array; - prio_array_t *target = rq->expired; + prio_array_t *target = rq_expired(current,rq); /* * We implement yielding by moving the task into the expired @@ -3019,7 +3583,7 @@ asmlinkage long sys_sched_yield(void) * array.) */ if (unlikely(rt_task(current))) - target = rq->active; + target = rq_active(current,rq); dequeue_task(current, array); enqueue_task(current, target); @@ -3396,7 +3960,6 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (!cpu_isset(dest_cpu, p->cpus_allowed)) goto out; - set_task_cpu(p, dest_cpu); if (p->array) { /* * Sync timestamp with rq_dest's before activating. @@ -3407,10 +3970,12 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) p->timestamp = p->timestamp - rq_src->timestamp_last_tick + rq_dest->timestamp_last_tick; deactivate_task(p, rq_src); + set_task_cpu(p, dest_cpu); activate_task(p, rq_dest, 0); if (TASK_PREEMPTS_CURR(p, rq_dest)) resched_task(rq_dest->curr); - } + } else + set_task_cpu(p, dest_cpu); out: double_rq_unlock(rq_src, rq_dest); @@ -3919,7 +4484,7 @@ int in_sched_functions(unsigned long addr) void __init sched_init(void) { runqueue_t *rq; - int i, j, k; + int i; #ifdef CONFIG_SMP /* Set up an initial dummy domain for early boot */ @@ -3938,36 +4503,49 @@ void __init sched_init(void) sched_group_init.next = &sched_group_init; sched_group_init.cpu_power = SCHED_LOAD_SCALE; #endif + init_cpu_classes(); for (i = 0; i < NR_CPUS; i++) { +#ifndef CONFIG_CKRM_CPU_SCHEDULE + int j, k; prio_array_t *array; rq = cpu_rq(i); spin_lock_init(&rq->lock); + + for (j = 0; j < 2; j++) { + array = rq->arrays + j; + for (k = 0; k < MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(MAX_PRIO, array->bitmap); + } + rq->active = rq->arrays; rq->expired = rq->arrays + 1; +#else + rq = cpu_rq(i); + spin_lock_init(&rq->lock); +#endif + rq->best_expired_prio = MAX_PRIO; #ifdef CONFIG_SMP rq->sd = &sched_domain_init; rq->cpu_load = 0; +#ifdef CONFIG_CKRM_CPU_SCHEDULE + ckrm_load_init(rq_ckrm_load(rq)); +#endif rq->active_balance = 0; rq->push_cpu = 0; rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); #endif atomic_set(&rq->nr_iowait, 0); - - for (j = 0; j < 2; j++) { - array = rq->arrays + j; - for (k = 0; k < MAX_PRIO; k++) { - INIT_LIST_HEAD(array->queue + k); - __clear_bit(k, array->bitmap); - } - // delimiter for 
bitsearch - __set_bit(MAX_PRIO, array->bitmap); - } } + /* * We have to do a little magic to get the first * thread right in SMP mode. @@ -3976,6 +4554,10 @@ void __init sched_init(void) rq->curr = current; rq->idle = current; set_task_cpu(current, smp_processor_id()); +#ifdef CONFIG_CKRM_CPU_SCHEDULE + current->cpu_class = get_default_cpu_class(); + current->array = NULL; +#endif wake_up_forked_process(current); /* @@ -4061,3 +4643,33 @@ int task_running_sys(struct task_struct *p) EXPORT_SYMBOL(task_running_sys); #endif +#ifdef CONFIG_CKRM_CPU_SCHEDULE +/** + * return the classqueue object of a certain processor + */ +struct classqueue_struct * get_cpu_classqueue(int cpu) +{ + return (& (cpu_rq(cpu)->classqueue) ); +} + +/** + * _ckrm_cpu_change_class - change the class of a task + */ +void _ckrm_cpu_change_class(task_t *tsk, struct ckrm_cpu_class *newcls) +{ + prio_array_t *array; + struct runqueue *rq; + unsigned long flags; + + rq = task_rq_lock(tsk,&flags); + array = tsk->array; + if (array) { + dequeue_task(tsk,array); + tsk->cpu_class = newcls; + enqueue_task(tsk,rq_active(tsk,rq)); + } else + tsk->cpu_class = newcls; + + task_rq_unlock(rq,&flags); +} +#endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0ccf1ee0a..6708f4f80 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -31,7 +31,6 @@ #include #include #include -#include #include @@ -269,7 +268,6 @@ free_pages_bulk(struct zone *zone, int count, /* have to delete it as __free_pages_bulk list manipulates */ list_del(&page->lru); __free_pages_bulk(page, base, zone, area, order); - ckrm_clear_page_class(page); ret++; } spin_unlock_irqrestore(&zone->lock, flags); @@ -612,10 +610,6 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, might_sleep_if(wait); - if (!ckrm_class_limit_ok((GET_MEM_CLASS(current)))) { - return NULL; - } - zones = zonelist->zones; /* the list of zones suitable for gfp_mask */ if (zones[0] == NULL) /* no zones in the zonelist */ return NULL; @@ -745,7 +739,6 @@ nopage: return NULL; got_pg: kernel_map_pages(page, 1 << order, 1); - ckrm_set_pages_class(page, 1 << order, GET_MEM_CLASS(current)); return page; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 4911729ce..8e3b69342 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -37,7 +37,6 @@ #include #include -#include /* possible outcome of pageout() */ typedef enum { @@ -72,9 +71,6 @@ struct scan_control { /* This context's GFP mask */ unsigned int gfp_mask; - /* Flag used by CKRM */ - unsigned int ckrm_flags; - int may_writepage; }; @@ -546,23 +542,19 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) { LIST_HEAD(page_list); struct pagevec pvec; - int max_scan = sc->nr_to_scan, nr_pass; - unsigned int ckrm_flags = sc->ckrm_flags, bit_flag; + int max_scan = sc->nr_to_scan; pagevec_init(&pvec, 1); lru_add_drain(); spin_lock_irq(&zone->lru_lock); -redo: - ckrm_get_reclaim_bits(&ckrm_flags, &bit_flag); - nr_pass = zone->nr_inactive; while (max_scan > 0) { struct page *page; int nr_taken = 0; int nr_scan = 0; int nr_freed; - while (nr_pass-- && nr_scan++ < SWAP_CLUSTER_MAX && + while (nr_scan++ < SWAP_CLUSTER_MAX && !list_empty(&zone->inactive_list)) { page = lru_to_page(&zone->inactive_list); @@ -580,25 +572,15 @@ redo: SetPageLRU(page); list_add(&page->lru, &zone->inactive_list); continue; - } else if (bit_flag && !ckrm_kick_page(page, bit_flag)) { - __put_page(page); - SetPageLRU(page); -#ifdef CONFIG_CKRM_MEM_LRUORDER_CHANGE - list_add_tail(&page->lru, &zone->inactive_list); -#else - list_add(&page->lru, &zone->inactive_list); 
-#endif - continue; } list_add(&page->lru, &page_list); - ckrm_mem_dec_inactive(page); nr_taken++; } zone->nr_inactive -= nr_taken; zone->pages_scanned += nr_taken; spin_unlock_irq(&zone->lru_lock); - if ((bit_flag == 0) && (nr_taken == 0)) + if (nr_taken == 0) goto done; max_scan -= nr_scan; @@ -631,9 +613,6 @@ redo: spin_lock_irq(&zone->lru_lock); } } - if (ckrm_flags && (nr_pass <= 0)) { - goto redo; - } } spin_unlock_irq(&zone->lru_lock); done: @@ -673,17 +652,11 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) long mapped_ratio; long distress; long swap_tendency; - unsigned int ckrm_flags = sc->ckrm_flags, bit_flag; - int nr_pass; lru_add_drain(); pgmoved = 0; spin_lock_irq(&zone->lru_lock); -redo: - ckrm_get_reclaim_bits(&ckrm_flags, &bit_flag); - nr_pass = zone->nr_active; - while (pgscanned < nr_pages && !list_empty(&zone->active_list) && - nr_pass) { + while (pgscanned < nr_pages && !list_empty(&zone->active_list)) { page = lru_to_page(&zone->active_list); prefetchw_prev_lru_page(page, &zone->active_list, flags); if (!TestClearPageLRU(page)) @@ -699,24 +672,11 @@ redo: __put_page(page); SetPageLRU(page); list_add(&page->lru, &zone->active_list); - pgscanned++; - } else if (bit_flag && !ckrm_kick_page(page, bit_flag)) { - __put_page(page); - SetPageLRU(page); -#ifdef CONFIG_CKRM_MEM_LRUORDER_CHANGE - list_add_tail(&page->lru, &zone->active_list); -#else - list_add(&page->lru, &zone->active_list); -#endif } else { list_add(&page->lru, &l_hold); - ckrm_mem_dec_active(page); pgmoved++; - pgscanned++; - } - if (!--nr_pass && ckrm_flags) { - goto redo; } + pgscanned++; } zone->nr_active -= pgmoved; spin_unlock_irq(&zone->lru_lock); @@ -790,7 +750,6 @@ redo: if (!TestClearPageActive(page)) BUG(); list_move(&page->lru, &zone->inactive_list); - ckrm_mem_inc_inactive(page); pgmoved++; if (!pagevec_add(&pvec, page)) { zone->nr_inactive += pgmoved; @@ -819,7 +778,6 @@ redo: BUG(); BUG_ON(!PageActive(page)); list_move(&page->lru, &zone->active_list); - ckrm_mem_inc_active(page); pgmoved++; if (!pagevec_add(&pvec, page)) { zone->nr_active += pgmoved; @@ -867,7 +825,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc) sc->nr_to_reclaim = SWAP_CLUSTER_MAX; while (nr_active || nr_inactive) { - sc->ckrm_flags = ckrm_setup_reclamation(); if (nr_active) { sc->nr_to_scan = min(nr_active, (unsigned long)SWAP_CLUSTER_MAX); @@ -883,113 +840,9 @@ shrink_zone(struct zone *zone, struct scan_control *sc) if (sc->nr_to_reclaim <= 0) break; } - ckrm_teardown_reclamation(); - } -} - -#ifdef CONFIG_CKRM_RES_MEM -// This function needs to be given more thought. -// Shrink the class to be at 90% of its limit -static void -ckrm_shrink_class(ckrm_mem_res_t *cls) -{ - struct scan_control sc; - struct zone *zone; - int zindex = 0, active_credit = 0, inactive_credit = 0; - - if (ckrm_test_set_shrink(cls)) { // set the SHRINK bit atomically - // if it is already set somebody is working on it. so... 
leave - return; - } - sc.nr_mapped = read_page_state(nr_mapped); - sc.nr_scanned = 0; - sc.ckrm_flags = ckrm_get_reclaim_flags(cls); - sc.nr_reclaimed = 0; - sc.priority = 0; // always very high priority - - for_each_zone(zone) { - int zone_total, zone_limit, active_limit, inactive_limit; - int active_over, inactive_over; - unsigned long nr_active, nr_inactive; - u64 temp; - - zone->temp_priority = zone->prev_priority; - zone->prev_priority = sc.priority; - - zone_total = zone->nr_active + zone->nr_inactive + zone->free_pages; - - temp = (u64) cls->pg_limit * zone_total; - do_div(temp, ckrm_tot_lru_pages); - zone_limit = (int) temp; - active_limit = (6 * zone_limit) / 10; // 2/3rd in active list - inactive_limit = (3 * zone_limit) / 10; // 1/3rd in inactive list - - active_over = cls->nr_active[zindex] - active_limit + active_credit; - inactive_over = active_over + - (cls->nr_inactive[zindex] - inactive_limit) + inactive_credit; - - if (active_over > 0) { - zone->nr_scan_active += active_over + 1; - nr_active = zone->nr_scan_active; - active_credit = 0; - } else { - active_credit += active_over; - nr_active = 0; - } - - if (inactive_over > 0) { - zone->nr_scan_inactive += inactive_over; - nr_inactive = zone->nr_scan_inactive; - inactive_credit = 0; - } else { - inactive_credit += inactive_over; - nr_inactive = 0; - } - while (nr_active || nr_inactive) { - if (nr_active) { - sc.nr_to_scan = min(nr_active, - (unsigned long)SWAP_CLUSTER_MAX); - nr_active -= sc.nr_to_scan; - refill_inactive_zone(zone, &sc); - } - - if (nr_inactive) { - sc.nr_to_scan = min(nr_inactive, - (unsigned long)SWAP_CLUSTER_MAX); - nr_inactive -= sc.nr_to_scan; - shrink_cache(zone, &sc); - if (sc.nr_to_reclaim <= 0) - break; - } - } - zone->prev_priority = zone->temp_priority; - zindex++; } - ckrm_clear_shrink(cls); } -static void -ckrm_shrink_classes(void) -{ - ckrm_mem_res_t *cls; - - spin_lock(&ckrm_mem_lock); - while (!ckrm_shrink_list_empty()) { - cls = list_entry(ckrm_shrink_list.next, ckrm_mem_res_t, - shrink_list); - spin_unlock(&ckrm_mem_lock); - ckrm_shrink_class(cls); - spin_lock(&ckrm_mem_lock); - list_del(&cls->shrink_list); - cls->flags &= ~MEM_AT_LIMIT; - } - spin_unlock(&ckrm_mem_lock); -} - -#else -#define ckrm_shrink_classes() do { } while(0) -#endif - /* * This is the direct reclaim path, for page-allocating processes. We only * try to reclaim pages from zones which will satisfy the caller's allocation @@ -1295,9 +1148,6 @@ static int kswapd(void *p) schedule(); finish_wait(&pgdat->kswapd_wait, &wait); - if (!ckrm_shrink_list_empty()) - ckrm_shrink_classes(); - else balance_pgdat(pgdat, 0); } return 0; @@ -1308,7 +1158,7 @@ static int kswapd(void *p) */ void wakeup_kswapd(struct zone *zone) { - if ((zone->free_pages > zone->pages_low) && ckrm_shrink_list_empty()) + if (zone->free_pages > zone->pages_low) return; if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait)) return;
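
Closing note on the CPU-side math earlier in this patch: adjust_lrq_weight() splits a class's weight across CPUs in proportion to each local runqueue's load (lrq->weight = pressure * class->weight / total_pressure, clamped to at least 1, with an idle local queue keeping the full class weight to help interactive wakeups). The sketch below reproduces that proportional split with made-up loads and a stand-in SHARE_MAX; it is a model of the calculation, not the kernel function.

/*
 * Sketch of the per-CPU weight split done by adjust_lrq_weight():
 * each CPU gets the class weight (scaled by the number of online CPUs)
 * in proportion to that CPU's share of the class's total load.
 * Numbers and SHARE_MAX are illustrative only.
 */
#include <stdio.h>

#define SHARE_MAX (1 << 13)     /* stand-in for CKRM_SHARE_MAX */

static void split_weight(unsigned long class_weight,
			 const unsigned long *lrq_load,
			 unsigned long *local_weight, int nr_cpus)
{
	unsigned long total = 0, scaled = class_weight * nr_cpus;
	int i;

	for (i = 0; i < nr_cpus; i++)
		total += lrq_load[i];
	if (!total)
		return;                 /* no load anywhere, nothing to do */

	for (i = 0; i < nr_cpus; i++) {
		unsigned long lw;

		if (!lrq_load[i]) {
			/* idle local queue keeps the full class weight */
			lw = class_weight;
		} else {
			lw = lrq_load[i] * scaled / total;
			if (!lw)
				lw = 1;                /* minimum is 1 */
			else if (lw > SHARE_MAX)
				lw = SHARE_MAX;
		}
		local_weight[i] = lw;
	}
}

int main(void)
{
	unsigned long load[4] = { 300, 100, 0, 600 };  /* made-up loads */
	unsigned long weight[4] = { 0, 0, 0, 0 };
	int i;

	split_weight(256, load, weight, 4);
	for (i = 0; i < 4; i++)
		printf("cpu%d: load %lu -> local weight %lu\n",
		       i, load[i], weight[i]);
	return 0;
}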