From fff615ac2fec079542bc1cc3a5fc5fe1ec4d09af Mon Sep 17 00:00:00 2001 From: Marc Fiuczynski Date: Mon, 10 Jan 2005 19:02:49 +0000 Subject: [PATCH] ckrm-e16 cpu controller v9rc1 --- init/Kconfig | 12 + kernel/ckrm/Makefile | 4 +- kernel/ckrm/ckrm.c | 10 +- kernel/ckrm/ckrm_cpu_class.c | 400 ++++++-- kernel/ckrm/ckrm_cpu_monitor.c | 1117 ++++++++++++++-------- kernel/ckrm/ckrm_listenaq.c | 12 +- kernel/ckrm/rbce/rbcemod.c | 2 +- kernel/ckrm_classqueue.c | 109 ++- kernel/ckrm_sched.c | 182 +++- kernel/sched.c | 1580 +++++++++++++++++++------------- 10 files changed, 2267 insertions(+), 1161 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index e5480f047..3db7aa02e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -181,6 +181,18 @@ config CKRM_CPU_SCHEDULE Say N if unsure, Y to use the feature. +config CKRM_CPU_SCHEDULE_AT_BOOT + bool "Turn on at boot time" + depends on CKRM_CPU_SCHEDULE + default n + help + Enable CKRM CPU Scheduler at boot time. Otherwise + it can be turned on dynamically at runtime. If not + turned on the default Linux Scheduler behavior + will be obtained. + + Say N if unsure, Y to use this feature + config CKRM_TYPE_SOCKETCLASS bool "Class Manager for socket groups" depends on CKRM diff --git a/kernel/ckrm/Makefile b/kernel/ckrm/Makefile index de490232b..0d5b1b776 100644 --- a/kernel/ckrm/Makefile +++ b/kernel/ckrm/Makefile @@ -8,5 +8,5 @@ endif obj-$(CONFIG_CKRM_TYPE_TASKCLASS) += ckrm_tc.o obj-$(CONFIG_CKRM_RES_NUMTASKS) += ckrm_numtasks.o obj-$(CONFIG_CKRM_TYPE_SOCKETCLASS) += ckrm_sockc.o - obj-$(CONFIG_CKRM_RES_LISTENAQ) += ckrm_laq.o - obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_cpu_class.o ckrm_cpu_monitor.o + obj-$(CONFIG_CKRM_RES_LISTENAQ) += ckrm_listenaq.o + obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_cpu_class.o ckrm_cpu_monitor.o diff --git a/kernel/ckrm/ckrm.c b/kernel/ckrm/ckrm.c index 2a611c1d0..12b7c2691 100644 --- a/kernel/ckrm/ckrm.c +++ b/kernel/ckrm/ckrm.c @@ -82,6 +82,7 @@ inline unsigned int is_res_regd(struct ckrm_classtype *clstype, int resid) ); } +static struct ckrm_res_ctlr *ckrm_resctlr_lookup(struct ckrm_classtype *clstype, const char *resname) { @@ -101,10 +102,8 @@ struct ckrm_res_ctlr *ckrm_resctlr_lookup(struct ckrm_classtype *clstype, return NULL; } -EXPORT_SYMBOL(ckrm_resctlr_lookup); - /* given a classname return the class handle and its classtype*/ -void *ckrm_classobj(char *classname, int *classTypeID) +void *ckrm_classobj(const char *classname, int *classTypeID) { int i; @@ -864,7 +863,10 @@ int ckrm_class_show_shares(struct ckrm_core_class *core, struct seq_file *seq) atomic_inc(&clstype->nr_resusers[i]); rcbs = clstype->res_ctlrs[i]; if (rcbs && rcbs->get_share_values) { - (*rcbs->get_share_values) (core->res_class[i], &shares); + int rc = (*rcbs->get_share_values)(core->res_class[i], + &shares); + if (rc == -ENOSYS) + continue; seq_printf(seq,"res=%s,guarantee=%d,limit=%d," "total_guarantee=%d,max_limit=%d\n", rcbs->res_name, shares.my_guarantee, diff --git a/kernel/ckrm/ckrm_cpu_class.c b/kernel/ckrm/ckrm_cpu_class.c index 09ea6ba80..3b4bd2a8c 100644 --- a/kernel/ckrm/ckrm_cpu_class.c +++ b/kernel/ckrm/ckrm_cpu_class.c @@ -22,9 +22,35 @@ #include #include #include +#include + +#define CPU_CTRL_NAME "cpu" struct ckrm_res_ctlr cpu_rcbs; +#define CKRM_CPU_USAGE_DETAIL_MAX 3 +static int usage_detail = 3; /* 0: show usage + * 1: show settings + * 2: show effectives + * 3: show per runqueue stats + */ + +static int ckrm_cpu_set_mode(enum ckrm_sched_mode mode); + +/* + * update effective share setting after: + * -- remove class + * -- 
change class share + * we don't need to call update_effectives() when add new class since + * the defaults grt of new class is 0 + * CAUTION: might need a lock here + */ +static inline void update_class_effectives(void) +{ + // update_effectives(); + ckrm_cpu_monitor(0); +} + /** * insert_cpu_class - insert a class to active_cpu_class list * @@ -38,49 +64,81 @@ static inline void insert_cpu_class(struct ckrm_cpu_class *cls) /* * initialize a class object and its local queues */ + +CVT_t get_min_cvt_locking(int cpu); +ckrm_lrq_t *rq_get_dflt_lrq(int cpu); + +static void init_cpu_class_lrq(struct ckrm_cpu_class *cls, + int cpu, int isdflt) +{ + int j,k; + ckrm_lrq_t *queue = cls->local_queues[cpu]; + + queue->active = queue->arrays; + queue->expired = queue->arrays+1; + + for (j = 0; j < 2; j++) { + prio_array_t *array = queue->arrays + j; + for (k = 0; k < MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(MAX_PRIO, array->bitmap); + array->nr_active = 0; + } + + queue->expired_timestamp = 0; + queue->best_expired_prio = MAX_PRIO; + + queue->cpu_class = cls; + queue->classqueue = get_cpu_classqueue(cpu); + queue->top_priority = MAX_PRIO; + cq_node_init(&queue->classqueue_linkobj); + queue->local_cvt = isdflt ? 0 : get_min_cvt_locking(cpu); + queue->lrq_load = 0; + queue->local_weight = cpu_class_weight(cls); + if (queue->local_weight == 0) + queue->local_weight = 1; + queue->over_weight = 0; + queue->skewed_weight = CKRM_MAX_WEIGHT/2; /*otherwise class might starve on start*/ + queue->uncounted_ns = 0; + queue->savings = 0; + queue->magic = CKRM_LRQ_MAGIC; +} + void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares) { - int i,j,k; - prio_array_t *array; - ckrm_lrq_t* queue; + int i; + int isdflt; + struct ckrm_cpu_class *dfltcls; + + dfltcls = get_default_cpu_class(); + + isdflt = (cls==dfltcls); cls->shares = *shares; cls->cnt_lock = SPIN_LOCK_UNLOCKED; - ckrm_cpu_stat_init(&cls->stat); + ckrm_cpu_stat_init(&cls->stat,isdflt ? 
CKRM_SHARE_MAX : 1); ckrm_usage_init(&cls->usage); cls->magic = CKRM_CPU_CLASS_MAGIC; - for (i = 0 ; i < NR_CPUS ; i++) { - queue = &cls->local_queues[i]; - queue->active = queue->arrays; - queue->expired = queue->arrays+1; - - for (j = 0; j < 2; j++) { - array = queue->arrays + j; - for (k = 0; k < MAX_PRIO; k++) { - INIT_LIST_HEAD(array->queue + k); - __clear_bit(k, array->bitmap); - } - // delimiter for bitsearch - __set_bit(MAX_PRIO, array->bitmap); - array->nr_active = 0; + memset(cls->local_queues,0,NR_CPUS*sizeof(ckrm_lrq_t*)); + + if (isdflt) { + for (i=0; i< NR_CPUS; i++) { + cls->local_queues[i] = rq_get_dflt_lrq(i); + init_cpu_class_lrq(cls,i,1); + } + } else { + for_each_cpu(i) { + cls->local_queues[i] = kmalloc(sizeof(ckrm_lrq_t), + GFP_KERNEL); + BUG_ON(cls->local_queues[i]==NULL); + init_cpu_class_lrq(cls,i,0); } - - queue->expired_timestamp = 0; - - queue->cpu_class = cls; - queue->classqueue = get_cpu_classqueue(i); - queue->top_priority = MAX_PRIO; - cq_node_init(&queue->classqueue_linkobj); - queue->local_cvt = 0; - queue->lrq_load = 0; - queue->local_weight = cpu_class_weight(cls); - queue->uncounted_ns = 0; - queue->savings = 0; - queue->magic = 0x43FF43D7; } - // add to class list write_lock(&class_list_lock); insert_cpu_class(cls); write_unlock(&class_list_lock); @@ -100,14 +158,14 @@ struct ckrm_cpu_class * ckrm_get_cpu_class(struct ckrm_core_class *core) { struct ckrm_cpu_class * cls; cls = ckrm_get_res_class(core, cpu_rcbs.resid, struct ckrm_cpu_class); - if (valid_cpu_class(cls)) - return cls; + if (valid_cpu_class(cls)) + return (ckrm_cpu_enabled() ? cls : get_default_cpu_class()); else return NULL; } - -void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class *parent) +void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, + struct ckrm_core_class *parent) { struct ckrm_cpu_class *cls; @@ -128,7 +186,7 @@ void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class set_default_share(&shares); init_cpu_class(cls,&shares); cls->core = core; - cls->parent = parent; + cls->parent = parent; } } else printk(KERN_ERR"alloc_cpu_class failed\n"); @@ -136,15 +194,14 @@ void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class return cls; } -/* - * hzheng: this is not a stable implementation - * need to check race condition issue here - */ +void ckrm_cpu_class_queue_delete_sync(struct ckrm_cpu_class *clsptr); + static void ckrm_free_cpu_class(void *my_res) { struct ckrm_cpu_class *cls = my_res, *parres, *childres; ckrm_core_class_t *child = NULL; int maxlimit; + int i; if (!cls) return; @@ -179,10 +236,19 @@ static void ckrm_free_cpu_class(void *my_res) list_del(&cls->links); write_unlock(&class_list_lock); + ckrm_cpu_class_queue_delete_sync(cls); + + for_each_cpu(i) { + ckrm_lrq_t *lrq = get_ckrm_lrq(cls,i); + if (!lrq) continue; + lrq->magic = -99; + kfree(lrq); + } kfree(cls); - //call ckrm_cpu_monitor after class removed - ckrm_cpu_monitor(0); + //call ckrm_cpu_monitor after class is removed + if (ckrm_cpu_enabled()) + update_class_effectives(); } /* @@ -194,8 +260,12 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share) struct ckrm_shares *cur = &cls->shares, *par; int rc = -EINVAL; - if (!cls) - return rc; + if (ckrm_cpu_disabled()) + return -ENOSYS; + if (!cls) + return rc; + if (new_share->total_guarantee > CKRM_SHARE_MAX) + return -E2BIG; if (cls->parent) { parres = ckrm_get_cpu_class(cls->parent); @@ -215,7 +285,7 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares 
*new_share) new_share->my_guarantee = 0; rc = set_shares(new_share, cur, par); - if (cur->my_limit == CKRM_SHARE_DONTCARE) + if (!rc && cur->my_limit == CKRM_SHARE_DONTCARE) cur->my_limit = cur->max_limit; @@ -225,7 +295,7 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share) } //call ckrm_cpu_monitor after changes are changed - ckrm_cpu_monitor(0); + update_class_effectives(); return rc; } @@ -235,22 +305,90 @@ static int ckrm_cpu_get_share(void *my_res, { struct ckrm_cpu_class *cls = my_res; - if (!cls) + if (ckrm_cpu_disabled()) + return -ENOSYS; + if (!cls) return -EINVAL; + *shares = cls->shares; return 0; } +/* + * get_ckrm_usage(): + * obtain a sequence of usage informations + * returns number of usages reported. + * + * report IN: specifies the sequence of jiffies for which to report + * must be ordered (smallest first) + * OUT: returns the usage in each field + * + */ + + +int ckrm_cpu_get_usage(struct ckrm_cpu_class* clsptr, + int num, ulong report[]) +{ + struct ckrm_usage* usage = &clsptr->usage; + unsigned long long total = 0; + int i, idx, cur, num_ofs; + + num_ofs = cur = i = 0; + idx = usage->sample_pointer; + + for ( num_ofs = 0; num_ofs < num ; num_ofs++ ) { + int nr_samples; + int duration = report[num_ofs]; + unsigned long long totval = 0; + + nr_samples = duration/USAGE_SAMPLE_FREQ?:1; + + if (nr_samples > USAGE_MAX_HISTORY) + nr_samples = USAGE_MAX_HISTORY; + + for ( ; i< nr_samples; i++) { + if (! idx) + idx = USAGE_MAX_HISTORY; + idx --; + total += usage->samples[idx]; + } + totval = total * 1000; + do_div(totval,NS_PER_SAMPLE); + do_div(totval,nr_samples * cpus_weight(cpu_online_map)); + report[num_ofs] = totval; + } + + return num; +} + int ckrm_cpu_get_stats(void *my_res, struct seq_file * sfile) { struct ckrm_cpu_class *cls = my_res; struct ckrm_cpu_class_stat* stat = &cls->stat; ckrm_lrq_t* lrq; int i; + ulong usage[3] = { 2*HZ, 10*HZ, 60*HZ }; - if (!cls) + if (!cls || ckrm_cpu_disabled()) return -EINVAL; + ckrm_cpu_get_usage(cls,3,usage); + + /* this will after full stabilization become the only cpu usage stats + */ + + seq_printf(sfile, "cpu-usage(2,10,60)= %lu %lu %lu\n", + usage[0],usage[1],usage[2]); + + if (usage_detail < 1) + return 0; + + /* the extended statistics we can decide whether we want to make the + * additional statistics available over config options + * eitherway they should be reported in a more concised form + * during stabilization, this is OK + */ + seq_printf(sfile, "-------- CPU Class Status Start---------\n"); seq_printf(sfile, "Share:\n\tgrt= %d limit= %d total_grt= %d max_limit= %d\n", cls->shares.my_guarantee, @@ -261,26 +399,35 @@ int ckrm_cpu_get_stats(void *my_res, struct seq_file * sfile) cls->shares.unused_guarantee, cls->shares.cur_max_limit); + if (usage_detail < 2) + goto out; + seq_printf(sfile, "Effective:\n\tegrt= %d\n",stat->egrt); seq_printf(sfile, "\tmegrt= %d\n",stat->megrt); seq_printf(sfile, "\tehl= %d\n",stat->ehl); seq_printf(sfile, "\tmehl= %d\n",stat->mehl); seq_printf(sfile, "\teshare= %d\n",stat->eshare); - seq_printf(sfile, "\tmeshare= %d\n",cpu_class_weight(cls)); + seq_printf(sfile, "\tmeshare= %d\n",stat->meshare); seq_printf(sfile, "\tmax_demand= %lu\n",stat->max_demand); seq_printf(sfile, "\ttotal_ns= %llu\n",stat->total_ns); - seq_printf(sfile, "\tusage(2,10,60)= %d %d %d\n", - get_ckrm_usage(cls,2*HZ), - get_ckrm_usage(cls,10*HZ), - get_ckrm_usage(cls,60*HZ) - ); + seq_printf(sfile, "\tusage(2,10,60)= %lu %lu %lu\n", + usage[0],usage[1],usage[2]); + + if (usage_detail < 3) + goto 
out; + + /* provide per run queue information */ for_each_online_cpu(i) { lrq = get_ckrm_lrq(cls,i); - seq_printf(sfile, "\tlrq %d demand= %lu weight= %d lrq_load= %lu cvt= %llu sav= %llu\n",i,stat->local_stats[i].cpu_demand,local_class_weight(lrq),lrq->lrq_load,lrq->local_cvt,lrq->savings); + seq_printf(sfile, "\tlrq %d demand= %lu weight= %d " + "lrq_load= %lu cvt= %llu sav= %llu\n", + i,stat->local_stats[i].cpu_demand, + local_class_weight(lrq),lrq->lrq_load, + lrq->local_cvt,lrq->savings); } +out: seq_printf(sfile, "-------- CPU Class Status END ---------\n"); - return 0; } @@ -296,10 +443,34 @@ void ckrm_cpu_change_class(void *task, void *old, void *new) if (!task || ! old || !new) return; + if (ckrm_cpu_disabled()) + newcls = get_default_cpu_class(); _ckrm_cpu_change_class(tsk,newcls); } -/*dummy function, not used*/ +enum config_token_t { + config_usage_detail, /* define usage level */ + config_disable, /* always use default linux scheduling */ + /* effectively disables the ckrm scheduler */ + config_enable, /* always uses ckrm scheduling behavior */ + config_err /* parsing error */ +}; + +#define CKRM_SCHED_MODE_DISABLED_STR "disabled" +#define CKRM_SCHED_MODE_ENABLED_STR "enabled" + +static char *ckrm_sched_mode_str[] = { + CKRM_SCHED_MODE_DISABLED_STR, + CKRM_SCHED_MODE_ENABLED_STR +}; + +static match_table_t config_tokens = { + { config_disable, "mode="CKRM_SCHED_MODE_DISABLED_STR }, + { config_enable, "mode="CKRM_SCHED_MODE_ENABLED_STR }, + { config_usage_detail, "usage_detail=%u" }, + { config_err, NULL } +}; + static int ckrm_cpu_show_config(void *my_res, struct seq_file *sfile) { struct ckrm_cpu_class *cls = my_res; @@ -307,23 +478,61 @@ static int ckrm_cpu_show_config(void *my_res, struct seq_file *sfile) if (!cls) return -EINVAL; - seq_printf(sfile, "cls=%s,parameter=somevalue\n","ckrm_cpu class"); + seq_printf(sfile, "res=%s,mode=%s", + CPU_CTRL_NAME,ckrm_sched_mode_str[ckrm_sched_mode]); + if (!ckrm_cpu_disabled()) /* enabled || mixed */ + seq_printf(sfile, ",usage_detail=%u",usage_detail); + seq_printf(sfile,"\n"); return 0; } -/*dummy function, not used*/ static int ckrm_cpu_set_config(void *my_res, const char *cfgstr) { struct ckrm_cpu_class *cls = my_res; + char *p; + char **cfgstr_p = (char**)&cfgstr; + substring_t args[MAX_OPT_ARGS]; + int option,rc; + enum ckrm_sched_mode new_sched_mode; if (!cls) return -EINVAL; - printk("ckrm_cpu config='%s'\n",cfgstr); - return 0; + + new_sched_mode = ckrm_sched_mode; + rc = 0; + + while ((p = strsep(cfgstr_p, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, config_tokens, args); + switch (token) { + case config_usage_detail: + if (ckrm_cpu_disabled() || + (match_int(&args[0], &option)) || + (option > CKRM_CPU_USAGE_DETAIL_MAX)) + { + return -EINVAL; + } + usage_detail = option; + break; + case config_disable: + new_sched_mode = CKRM_SCHED_MODE_DISABLED; + break; + case config_enable: + new_sched_mode = CKRM_SCHED_MODE_ENABLED; + break; + case config_err: + return -EINVAL; + } + } + rc = ckrm_cpu_set_mode(new_sched_mode); + return rc; } struct ckrm_res_ctlr cpu_rcbs = { - .res_name = "cpu", + .res_name = CPU_CTRL_NAME, .res_hdepth = 1, .resid = -1, .res_alloc = ckrm_alloc_cpu_class, @@ -364,14 +573,69 @@ void init_cpu_classes(void) //init classqueues for each processor for (i=0; i < NR_CPUS; i++) - classqueue_init(get_cpu_classqueue(i)); + classqueue_init(get_cpu_classqueue(i),ckrm_cpu_enabled()); - /* - * hzheng: initialize the default cpu class - * required for E14/E15 since ckrm_init is called 
after sched_init - */ ckrm_alloc_cpu_class(NULL,NULL); } +void ckrm_cpu_class_queue_update(int on); +void ckrm_cpu_start_monitor(void); +void ckrm_cpu_kill_monitor(void); + +static int ckrm_cpu_set_mode(enum ckrm_sched_mode mode) +{ + struct task_struct *proc, *tsk; + struct ckrm_cpu_class *new_cls = NULL; + int i; + + if (mode == ckrm_sched_mode) + return 0; + + printk("ckrm_cpu_set_mode from <%s> to <%s> pid=%d\n", + ckrm_sched_mode_str[ckrm_sched_mode], + ckrm_sched_mode_str[mode], + current->pid); + + if (mode == CKRM_SCHED_MODE_DISABLED) { + ckrm_cpu_kill_monitor(); + new_cls = get_default_cpu_class(); + } else { + ckrm_cpu_class_queue_update(1); + } + + /* run twice through the list to catch everyone, + * current and transient once + */ + + read_lock(&tasklist_lock); + + ckrm_sched_mode = mode; + /* we have to run through the list twice + * first catch all existing tasks + * and then deal with some potential race condition + */ + for ( i=2 ; i-- ; ) { + /* lock class_list_lock ? */ + + do_each_thread(proc, tsk) { + if (mode == CKRM_SCHED_MODE_ENABLED) { + new_cls = ckrm_get_res_class(class_core(tsk->taskclass), + cpu_rcbs.resid, + struct ckrm_cpu_class); + } + _ckrm_cpu_change_class(tsk,new_cls); + } while_each_thread(proc, tsk); + } + read_unlock(&tasklist_lock); + + if (mode == CKRM_SCHED_MODE_DISABLED) + ckrm_cpu_class_queue_update(0); + else + ckrm_cpu_start_monitor(); + return 0; +} EXPORT_SYMBOL(ckrm_get_cpu_class); + + + diff --git a/kernel/ckrm/ckrm_cpu_monitor.c b/kernel/ckrm/ckrm_cpu_monitor.c index 11f65d73b..3cb8225c4 100644 --- a/kernel/ckrm/ckrm_cpu_monitor.c +++ b/kernel/ckrm/ckrm_cpu_monitor.c @@ -28,21 +28,30 @@ #include #include +// #define CONFIG_CKRM_SUPPORT_MAXLIMITS + #define CPU_MONITOR_INTERVAL (HZ) /*how often do we adjust the shares*/ -#define CKRM_SHARE_MAX (1<shares.unused_guarantee; +} + static inline int get_soft_limit(struct ckrm_cpu_class *cls) { return cls->shares.my_limit; @@ -63,6 +72,57 @@ static inline int get_myhard_limit(struct ckrm_cpu_class *cls) return cls->shares.total_guarantee; } +static inline void set_eshare(struct ckrm_cpu_class_stat *stat, + int new_share) +{ + if (!new_share) + new_share = 1; + + BUG_ON(new_share < 0); + stat->eshare = new_share; +} + +static inline void set_meshare(struct ckrm_cpu_class_stat *stat, + int new_share) +{ + if (!new_share) + new_share = 1; + + BUG_ON(new_share < 0); + stat->meshare = new_share; +} + +/** + *get_self_cpu_demand - get cpu demand of the class itself (excluding children) + * + * self_cpu_demand = sum(cpu demand of all local queues) + */ +static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat) +{ + int cpu_demand = 0; + int i; + int cpuonline = 0; + + for_each_online_cpu(i) { + cpu_demand_check_sleep(stat,i); + cpu_demand += stat->local_stats[i].cpu_demand; + cpuonline ++; + } + + return (cpu_demand/cpuonline); +} + +/* + * my max demand = min(cpu_demand, my effective hard limit) + */ +static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat* stat) +{ + unsigned long mmax_demand = get_self_cpu_demand(stat); + if (mmax_demand > stat->mehl) + mmax_demand = stat->mehl; + + return mmax_demand; +} static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, int type) { @@ -85,7 +145,7 @@ static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat* local_stat, } } -void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat) +void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat, int eshares) { int i; @@ -93,7 +153,7 @@ void 
ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat) stat->total_ns = 0; stat->max_demand = 0; - for (i=0; i< NR_CPUS; i++) { + for (i=0; ilocal_stats[i],CPU_DEMAND_TP_CLASS); } @@ -102,10 +162,517 @@ void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat) stat->ehl = CKRM_SHARE_MAX; /*default: no limit*/ stat->mehl = CKRM_SHARE_MAX; /*default: no limit */ - stat->eshare = CKRM_SHARE_MAX; - stat->meshare = CKRM_SHARE_MAX; + stat->eshare = eshares; + stat->meshare = eshares; + + stat->has_savings = 0; + stat->demand_per_share = 0; + } +#if 0 // keep handy for debugging if necessary +void ckrm_cpu_class_dump(struct ckrm_cpu_class *clsptr,int num) +{ + struct ckrm_cpu_class_stat* stat = &clsptr->stat; + printk("%d> %p[%d] mg=%d lim=%d tg=%d maxlim=%d ug=%d\n",num, + clsptr, (clsptr == get_default_cpu_class()), + clsptr->shares.my_guarantee, + clsptr->shares.my_limit, + clsptr->shares.total_guarantee, + clsptr->shares.max_limit, + clsptr->shares.unused_guarantee); + printk(" egrt=%d megrt=%d ehl=%d mehl=%d esh=%d mesh=%d\n", + stat->egrt,stat->megrt,stat->ehl,stat->mehl, + stat->eshare,stat->meshare); +} +#endif + +/**********************************************/ +/* surplus allocation */ +/**********************************************/ + +/* + * surplus = egrt - demand + * if surplus < 0, surplus = 0 + */ +static inline int get_node_surplus(struct ckrm_cpu_class *cls) +{ + int surplus = cls->stat.egrt - cls->stat.max_demand; + + if (surplus < 0) + surplus = 0; + + return surplus; +} + +/* + * consume savings in advance because this class give surplus to others + * this is a quick hack, should be integrated with balance_savings() + */ +static inline void consumed_surplus_savings(struct ckrm_cpu_class *clsptr, + int savings_consumed) +{ + long long total_savings; + ckrm_lrq_t* lrq; + int i; + int cpu_online = 0; + + total_savings = 0; + for_each_online_cpu(i) { + lrq = get_ckrm_lrq(clsptr,i); + total_savings += lrq->savings; + cpu_online ++; + } + + total_savings -= savings_consumed; + if (total_savings < 0) + total_savings = 0; + + //get the average savings + do_div(total_savings,cpu_online); + for_each_online_cpu(i) { + lrq = get_ckrm_lrq(clsptr,i); + lrq->savings = total_savings; + } +} + +static inline int get_my_node_surplus(struct ckrm_cpu_class *cls) +{ + int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat); + int savings_consumed; + + if (surplus < 0) + surplus = 0; + + /* + * a quick hack about the hierarchy savings distribution + * may not be the right way to do + * + * since this node give its surplus to other nodes, + * it's savings should be consumed + * suppose CPU_MONITOR_INTERVAL = (HZ) + * savings_consumed is roughly how much savings will be consumed for the next second + */ + if (surplus) { + savings_consumed = surplus * HZ * (NSEC_PER_MS >> CKRM_SHARE_SHIFT); + consumed_surplus_savings(cls, savings_consumed) ; + } + + return surplus; +} + +/* + * all the class in the queue consume the surplus in order + * each class consume the amount propotional to its egrt + */ +static int consume_surplus_in_order(struct list_head* queue, + struct ckrm_cpu_class *p_cls, + int total_surplus) +{ + int total_grt = 0; + struct ckrm_cpu_class *clsptr; + + /* + * get total_grt of the classes in the queue + * total_grt can be maintained instead of re-calcuated each time + */ + list_for_each_entry(clsptr,queue,surplus_queue) { + if (unlikely(clsptr == p_cls)) + total_grt += clsptr->stat.megrt; + else + total_grt += clsptr->stat.egrt; + } + + if (! 
total_grt) + goto consume_out; + + //allocate in order + list_for_each_entry(clsptr,queue,surplus_queue) { + int surplus_per_share; + int consumed, my_grt; + + BUG_ON(! total_grt); + surplus_per_share = + (total_surplus << CKRM_SHARE_SHIFT) / total_grt; + + if (surplus_per_share <= 0) + break; + + if (unlikely(clsptr == p_cls)) //self_node consuming + my_grt = clsptr->stat.megrt; + else + my_grt = clsptr->stat.egrt; + + BUG_ON(clsptr->stat.demand_per_share <= 0); + + if (clsptr->stat.demand_per_share < surplus_per_share) + surplus_per_share = clsptr->stat.demand_per_share; + + consumed = surplus_per_share * my_grt; + consumed >>= CKRM_SHARE_SHIFT; + total_surplus -= consumed; + BUG_ON(total_surplus < 0); + total_grt -= my_grt; + + if (unlikely(clsptr == p_cls)) + set_meshare(&clsptr->stat,clsptr->stat.meshare + consumed); + else + set_eshare(&clsptr->stat,clsptr->stat.eshare + consumed); + } + consume_out: + if (total_surplus <= 1) //if total_suplus too small, no need to allocate again + total_surplus = 0; + return total_surplus; +} + +/* + * link all the children of parent and the parent itself using their surplus_queue field + * link the whole queue using src_queue + * if anything wrong return -1 + */ +static int get_class_surplus_queue(struct ckrm_core_class *parent, + struct list_head* src_queue) +{ + struct ckrm_core_class *child_core = NULL; + struct ckrm_cpu_class *p_cls,*c_cls; + int ret = -1; + + p_cls = ckrm_get_cpu_class(parent); + if (! p_cls) + goto link_out; + + INIT_LIST_HEAD(src_queue); + + //add the parent node itself + list_add(&p_cls->surplus_queue,src_queue); + do { + child_core = ckrm_get_next_child(parent, child_core); + if (child_core) { + c_cls = ckrm_get_cpu_class(child_core); + if (! c_cls) + goto link_out; + list_add(&c_cls->surplus_queue,src_queue); + } + } while (child_core); + + ret = 0; + + link_out: + return ret; +} + +/* + * insert the class to queue based on stat->demand_per_share + * status: tested + */ +static void insert_surplus_queue(struct list_head* queue, struct ckrm_cpu_class *clsptr) +{ + struct ckrm_cpu_class *cur_cls = NULL; + int end_of_queue = 1; + + list_for_each_entry(cur_cls,queue,surplus_queue) { + if (cur_cls->stat.demand_per_share >= clsptr->stat.demand_per_share) { + end_of_queue = 0; + break; + } + } + + //insert the clsptr + if (! 
cur_cls || end_of_queue) + list_add_tail(&clsptr->surplus_queue,queue); + else + list_add_tail(&clsptr->surplus_queue,&cur_cls->surplus_queue); +} + +/* + * copy all classes in src_queue to dst_queue, + * reorder the classes based on their normalized demand + * if a class already saturate (eshare >= demand), also remove it from src_queue + * return the total guarantee of the selected classes + * + * @src_queue: source queue + * @dst_queue: destination queue + * @check_sl: check soft limit + * @check_savings: only class has savings should be considered + */ + +static unsigned long reorder_surplus_queue(struct list_head* src_queue, + struct list_head* dst_queue, + int check_sl, int check_savings, + struct ckrm_cpu_class *p_cls) +{ + struct ckrm_cpu_class *clsptr, *tmp; + + INIT_LIST_HEAD(dst_queue); + + list_for_each_entry_safe(clsptr,tmp,src_queue,surplus_queue) { + struct ckrm_cpu_class_stat* stat = &clsptr->stat; + int inc_limit; + int max_demand, eshare, esl,grt; + + if (unlikely(clsptr == p_cls)) { + max_demand = get_mmax_demand(stat); + eshare = stat->meshare; + esl = get_mysoft_limit(clsptr); + grt = stat->megrt; + } else { + max_demand = stat->max_demand; + eshare = stat->eshare; + esl = get_soft_limit(clsptr); + grt = stat->egrt; + } + + //hard limit and demand limit + inc_limit = max_demand - eshare; + + //no additional share needed + if (inc_limit <= 0 || ! grt) { + list_del(&clsptr->surplus_queue); + continue; + } + + //or no more savings + if (check_savings && ! stat->has_savings) + continue; + + //check soft limit + if (check_sl) { + int soft_limit; + + soft_limit = p_cls->stat.eshare * esl + / p_cls->shares.total_guarantee; + + if (soft_limit < max_demand) + inc_limit = soft_limit - eshare; + if ( inc_limit <= 0) /* can turn negative */ + continue; + } + + BUG_ON(! grt); + //get the stat->demand_per_share + stat->demand_per_share = + (inc_limit << CKRM_SHARE_SHIFT) / grt; + + list_del_init(&clsptr->surplus_queue); + //insert the class to the queue + insert_surplus_queue(dst_queue,clsptr); + } + return 0; +} + +/* + * get all the surplus that should be reallocated to the children + */ +static inline int get_total_surplus(struct ckrm_cpu_class *p_cls, + struct ckrm_core_class *parent) +{ + struct ckrm_cpu_class *c_cls; + int total_surplus; + struct ckrm_core_class *child_core = NULL; + + //additional share assigned to this sub node from parent + total_surplus = p_cls->stat.eshare - p_cls->stat.egrt; + BUG_ON(total_surplus < 0); + + //surplus of this node + total_surplus += get_my_node_surplus(p_cls); + do { + child_core = ckrm_get_next_child(parent, child_core); + if (child_core) { + c_cls = ckrm_get_cpu_class(child_core); + if (! c_cls) { + total_surplus = 0; + break; + } + + total_surplus += get_node_surplus(c_cls); + } + } while (child_core); + + return total_surplus; +} +/** + * alloc_surplus_node: re-allocate the shares for a single level + * @parent: parent node + * return the remaining surplus + * + * The surplus reallocation policy is like below. + * -- the classes that have eshare >= demand don't need any additional share. + * So they don't participate the surplus allocation. + * -- all the other classes received share in this order: + * 1. has savings, not over soft limit + * 2. has savings, but over soft limit + * 3. no savings, not over soft limit + * 4. 
no savings, over soft limit + * + * In each of the 4 levels above, classes get surplus propotionally to its guarantee + */ +static int alloc_surplus_node(struct ckrm_core_class *parent) +{ + struct ckrm_cpu_class *p_cls; + int total_surplus; + int ret = -1; + struct list_head src_queue, dst_queue; + + p_cls = ckrm_get_cpu_class(parent); + if (! p_cls) //safty check + goto realloc_out; + + ret = 0; + total_surplus = get_total_surplus(p_cls,parent); + + if (! total_surplus) //no surplus to be allocated + goto realloc_out; + + /* + * first round, allocated to tasks with savings, check_sl + */ + get_class_surplus_queue(parent,&src_queue); + reorder_surplus_queue(&src_queue, &dst_queue, 1, 1,p_cls); + if (! list_empty(&dst_queue)) { + total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus); + if (! total_surplus) + goto realloc_out; + } + + /* + * second round, check savings, but no check_sl + */ + //merge the src_queue and dst_queue and reorder + list_splice(&dst_queue, &src_queue); + reorder_surplus_queue(&src_queue, &dst_queue, 0, 1,p_cls); + if (! list_empty(&dst_queue)) { + total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus); + if (! total_surplus) + goto realloc_out; + } + + /* + * third round, no check savings, but check_sl + */ + //merge the src_queue and dst_queue and reorder + list_splice(&dst_queue, &src_queue); + reorder_surplus_queue(&src_queue, &dst_queue, 1, 0,p_cls); + if (! list_empty(&dst_queue)) { + total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus); + if (! total_surplus) + goto realloc_out; + } + /* + * fourth round, no check savings, no check_sl + */ + //merge the src_queue and dst_queue and reorder + list_splice(&dst_queue, &src_queue); + reorder_surplus_queue(&src_queue, &dst_queue, 0, 0,p_cls); + if (! 
list_empty(&dst_queue)) + total_surplus = consume_surplus_in_order(&dst_queue,p_cls,total_surplus); + + realloc_out: + return ret; +} + +/* + * return true if the class total savings > MIN_SAVINGS + */ +static int balance_local_savings(struct ckrm_cpu_class *clsptr, int cpu_online) +{ + unsigned long long total_savings; + ckrm_lrq_t* lrq; + int i; +#define CLASS_MIN_SAVINGS (10 * NSEC_PER_MS) + + total_savings = 0; + for_each_online_cpu(i) { + lrq = get_ckrm_lrq(clsptr,i); + total_savings += lrq->savings; + } + + if (total_savings < CLASS_MIN_SAVINGS) + return 0; + + //get the average savings + do_div(total_savings,cpu_online); + for_each_online_cpu(i) { + lrq = get_ckrm_lrq(clsptr,i); + lrq->savings = total_savings; + } + + /* + * hzheng: this is another quick hack + * only say I have savings when this node has more demand + * ignoring the requirement of child classes + */ + if (clsptr->stat.megrt < get_mmax_demand(&clsptr->stat)) + return 1; + else + return 0; +} + +/* + * check savings status + * set has_savings field if the class or its sub class has savings + */ +static void check_savings_status(struct ckrm_core_class *root_core) +{ + struct ckrm_cpu_class *clsptr; + int cpu_online; + + cpu_online = cpus_weight(cpu_online_map); + + //class status: demand, share,total_ns prio, index + list_for_each_entry(clsptr,&active_cpu_classes,links) + clsptr->stat.has_savings = balance_local_savings(clsptr,cpu_online); +} + +/** + * alloc_surplus - reallocate unused shares + * + * class A's usused share should be allocated to its siblings + * the re-allocation goes downward from the top + */ +int alloc_surplus(struct ckrm_core_class *root_core) +{ + struct ckrm_core_class *cur_core, *child_core; + // struct ckrm_cpu_class *cls; + int ret = -1; + + check_savings_status(root_core); + + /*initialize*/ + cur_core = root_core; + child_core = NULL; + // cls = ckrm_get_cpu_class(cur_core); + + /*the ckrm idle tasks get all what's remaining*/ + /*hzheng: uncomment the following like for hard limit support */ + // update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand); + + repeat: + //check exit + if (!cur_core) + return 0; + + //visit this node only once + if (! 
child_core) + if ( alloc_surplus_node(cur_core) < 0 ) + return ret; + + //next child + child_core = ckrm_get_next_child(cur_core, child_core); + if (child_core) { + //go down + cur_core = child_core; + child_core = NULL; + goto repeat; + } else { //no more child, go back + child_core = cur_core; + cur_core = child_core->hnode.parent; + } + goto repeat; +} + + + /**********************************************/ /* cpu demand */ /**********************************************/ @@ -134,27 +701,29 @@ void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat) * how often should we recalculate the cpu demand * the number is in ns */ -static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat,int state, unsigned long long len) +static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat* local_stat, + int state, unsigned long long len) { local_stat->total += len; if (state == CKRM_CPU_DEMAND_RUN) local_stat->run += len; if (local_stat->total >= local_stat->recalc_interval) { - local_stat->total >>= CKRM_SHARE_ACCURACY; - if (unlikely(local_stat->run > 0xFFFFFFFF)) - local_stat->run = 0xFFFFFFFF; + local_stat->total >>= CKRM_SHARE_SHIFT; + if (unlikely(local_stat->run > ULONG_MAX)) + local_stat->run = ULONG_MAX; - if (local_stat->total > 0xFFFFFFFF) - local_stat->total = 0xFFFFFFFF; + if (unlikely(local_stat->total > ULONG_MAX)) + local_stat->total = ULONG_MAX; do_div(local_stat->run,(unsigned long)local_stat->total); - if (local_stat->total > 0xFFFFFFFF) //happens after very long sleep + if (unlikely(local_stat->total > ULONG_MAX)) { + //happens after very long sleep local_stat->cpu_demand = local_stat->run; - else { - local_stat->cpu_demand += local_stat->run; - local_stat->cpu_demand >>= 1; + } else { + local_stat->cpu_demand = + (local_stat->cpu_demand + local_stat->run) >> 1; } local_stat->total = 0; local_stat->run = 0; @@ -187,60 +756,28 @@ void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsign break; case CPU_DEMAND_INIT: //for task init only cpu_demand_stat_init(local_stat,CPU_DEMAND_TP_TASK); - break; - default: - BUG(); - } -} - -/** - * check all the class local queue - * - * to deal with excessive long run/sleep state - * -- whenever the the ckrm_cpu_monitor is called, check if the class is in sleep state, if yes, then update sleep record - */ -static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu) -{ - struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu]; - unsigned long long sleep,now; - if (local_stat->last_sleep) { - now = sched_clock(); - sleep = now - local_stat->last_sleep; - local_stat->last_sleep = now; - update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep); - } -} - -/** - *get_self_cpu_demand - get cpu demand of the class itself (excluding children) - * - * self_cpu_demand = sum(cpu demand of all local queues) - */ -static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat) -{ - int cpu_demand = 0; - int i; - int cpuonline = 0; - - for_each_online_cpu(i) { - cpu_demand_check_sleep(stat,i); - cpu_demand += stat->local_stats[i].cpu_demand; - cpuonline ++; + break; + default: + BUG(); } - - return (cpu_demand/cpuonline); } -/* - * my max demand = min(cpu_demand, my effective hard limit) +/** + * check all the class local queue + * + * to deal with excessive long run/sleep state + * -- whenever the the ckrm_cpu_monitor is called, check if the class is in sleep state, if yes, then update sleep record */ -static inline unsigned long 
get_mmax_demand(struct ckrm_cpu_class_stat* stat) +void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu) { - unsigned long mmax_demand = get_self_cpu_demand(stat); - if (mmax_demand > stat->mehl) - mmax_demand = stat->mehl; - - return mmax_demand; + struct ckrm_cpu_demand_stat * local_stat = &stat->local_stats[cpu]; + unsigned long long sleep,now; + if (local_stat->last_sleep) { + now = sched_clock(); + sleep = now - local_stat->last_sleep; + local_stat->last_sleep = now; + update_cpu_demand_stat(local_stat,CKRM_CPU_DEMAND_SLEEP,sleep); + } } /** @@ -301,26 +838,6 @@ static int update_max_demand(struct ckrm_core_class *root_core) /**********************************************/ /* effective guarantee & limit */ /**********************************************/ -static inline void set_eshare(struct ckrm_cpu_class_stat *stat, - int new_share) -{ - if (!new_share) - new_share = 1; - - BUG_ON(new_share < 0); - stat->eshare = new_share; -} - -static inline void set_meshare(struct ckrm_cpu_class_stat *stat, - int new_share) -{ - if (!new_share) - new_share = 1; - - BUG_ON(new_share < 0); - stat->meshare = new_share; -} - /** *update_child_effective - update egrt, ehl, mehl for all children of parent *@parent: the parent node @@ -346,7 +863,7 @@ static int update_child_effective(struct ckrm_core_class *parent) p_cls->stat.egrt * c_cls->shares.my_guarantee / p_cls->shares.total_guarantee; - c_cls->stat.megrt = c_cls->stat.egrt * c_cls->shares.unused_guarantee + c_cls->stat.megrt = c_cls->stat.egrt * get_my_grt(c_cls) / c_cls->shares.total_guarantee; c_cls->stat.ehl = @@ -372,8 +889,9 @@ static int update_child_effective(struct ckrm_core_class *parent) * * return -1 if anything wrong happened (eg: the structure changed during the process) */ -static int update_effectives(struct ckrm_core_class *root_core) +int update_effectives(void) { + struct ckrm_core_class *root_core = get_default_cpu_class()->core; struct ckrm_core_class *cur_core, *child_core; struct ckrm_cpu_class *cls; int ret = -1; @@ -384,7 +902,7 @@ static int update_effectives(struct ckrm_core_class *root_core) //initialize the effectives for root cls->stat.egrt = CKRM_SHARE_MAX; /*egrt of the root is always 100% */ - cls->stat.megrt = cls->stat.egrt * cls->shares.unused_guarantee + cls->stat.megrt = cls->stat.egrt * get_my_grt(cls) / cls->shares.total_guarantee; cls->stat.ehl = CKRM_SHARE_MAX * get_hard_limit(cls) / cls->shares.total_guarantee; @@ -418,288 +936,11 @@ static int update_effectives(struct ckrm_core_class *root_core) } /**********************************************/ -/* surplus allocation */ +/* CKRM Idle Tasks */ /**********************************************/ -/* - * surplus = egrt - demand - * if surplus < 0, surplus = 0 - */ -static inline int get_node_surplus(struct ckrm_cpu_class *cls) -{ - int surplus = cls->stat.egrt - cls->stat.max_demand; - - if (surplus < 0) - surplus = 0; - - return surplus; -} - -static inline int get_my_node_surplus(struct ckrm_cpu_class *cls) -{ - int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat); - - if (surplus < 0) - surplus = 0; - - return surplus; -} - -/** - * consume_surplus: decides how much surplus a node can consume - * @ckeck_sl: if check_sl is set, then check soft_limitx - * return how much consumed - * - * implements all the CKRM Scheduling Requirement - * assume c_cls is valid - */ -static inline int consume_surplus(int surplus, - struct ckrm_cpu_class *c_cls, - struct ckrm_cpu_class *p_cls, - int check_sl - ) -{ - int consumed = 0; - int inc_limit; 
- int total_grt = p_cls->shares.total_guarantee; - - BUG_ON(surplus < 0); - - /*can't consume more than demand or hard limit*/ - if (c_cls->stat.eshare >= c_cls->stat.max_demand) - goto out; - - //the surplus allocation is propotional to grt - consumed = - surplus * c_cls->shares.my_guarantee / total_grt; - - if (! consumed) //no more share - goto out; - - //hard limit and demand limit - inc_limit = c_cls->stat.max_demand - c_cls->stat.eshare; - - if (check_sl) { - int esl = p_cls->stat.eshare * get_soft_limit(c_cls) - /total_grt; - if (esl < c_cls->stat.max_demand) - inc_limit = esl - c_cls->stat.eshare; - } - - if (consumed > inc_limit) - consumed = inc_limit; - - BUG_ON(consumed < 0); - out: - return consumed; -} - -/* - * how much a node can consume for itself? - */ -static inline int consume_self_surplus(int surplus, - struct ckrm_cpu_class *p_cls, - int check_sl - ) -{ - int consumed = 0; - int inc_limit; - int total_grt = p_cls->shares.total_guarantee; - int max_demand = get_mmax_demand(&p_cls->stat); - - BUG_ON(surplus < 0); - - /*can't consume more than demand or hard limit*/ - if (p_cls->stat.meshare >= max_demand) - goto out; - - //the surplus allocation is propotional to grt - consumed = - surplus * p_cls->shares.unused_guarantee / total_grt; - - if (! consumed) //no more share - goto out; - - //hard limit and demand limit - inc_limit = max_demand - p_cls->stat.meshare; - - if (check_sl) { - int mesl = p_cls->stat.eshare * get_mysoft_limit(p_cls) - /total_grt; - if (mesl < max_demand) - inc_limit = mesl - p_cls->stat.meshare; - } - - if (consumed > inc_limit) - consumed = inc_limit; - - BUG_ON(consumed < 0); - out: - return consumed; -} - - -/* - * allocate surplus to all its children and also its default class - */ -static int alloc_surplus_single_round( - int surplus, - struct ckrm_core_class *parent, - struct ckrm_cpu_class *p_cls, - int check_sl) -{ - struct ckrm_cpu_class *c_cls; - struct ckrm_core_class *child_core = NULL; - int total_consumed = 0,consumed; - - //first allocate to the default class - consumed = - consume_self_surplus(surplus,p_cls,check_sl); - - if (consumed > 0) { - set_meshare(&p_cls->stat,p_cls->stat.meshare + consumed); - total_consumed += consumed; - } - - do { - child_core = ckrm_get_next_child(parent, child_core); - if (child_core) { - c_cls = ckrm_get_cpu_class(child_core); - if (! c_cls) - return -1; - - consumed = - consume_surplus(surplus, c_cls, - p_cls,check_sl); - if (consumed > 0) { - set_eshare(&c_cls->stat,c_cls->stat.eshare + consumed); - total_consumed += consumed; - } - } - } while (child_core); - - return total_consumed; -} - -/** - * alloc_surplus_node: re-allocate the shares for children under parent - * @parent: parent node - * return the remaining surplus - * - * task: - * 1. get total surplus - * 2. allocate surplus - * 3. set the effective_share of each node - */ -static int alloc_surplus_node(struct ckrm_core_class *parent) -{ - struct ckrm_cpu_class *p_cls,*c_cls; - int total_surplus,consumed; - int check_sl; - int ret = -1; - struct ckrm_core_class *child_core = NULL; - - p_cls = ckrm_get_cpu_class(parent); - if (! p_cls) - goto realloc_out; - - /* - * get total surplus - */ - total_surplus = p_cls->stat.eshare - p_cls->stat.egrt; - BUG_ON(total_surplus < 0); - total_surplus += get_my_node_surplus(p_cls); - - do { - child_core = ckrm_get_next_child(parent, child_core); - if (child_core) { - c_cls = ckrm_get_cpu_class(child_core); - if (! 
c_cls) - goto realloc_out; - - total_surplus += get_node_surplus(c_cls); - } - } while (child_core); - - - if (! total_surplus) { - ret = 0; - goto realloc_out; - } - - /* - * distributing the surplus - * first with the check_sl enabled - * once all the tasks has research the soft limit, disable check_sl and try again - */ - - check_sl = 1; - do { - consumed = alloc_surplus_single_round(total_surplus,parent,p_cls,check_sl); - if (consumed < 0) //something is wrong - goto realloc_out; - - if (! consumed) - check_sl = 0; - else - total_surplus -= consumed; - - } while ((total_surplus > 0) && (consumed || check_sl) ); - - ret = 0; - - realloc_out: - return ret; -} - -/** - * alloc_surplus - reallocate unused shares - * - * class A's usused share should be allocated to its siblings - * the re-allocation goes downward from the top - */ -static int alloc_surplus(struct ckrm_core_class *root_core) -{ - struct ckrm_core_class *cur_core, *child_core; - // struct ckrm_cpu_class *cls; - int ret = -1; - - /*initialize*/ - cur_core = root_core; - child_core = NULL; - // cls = ckrm_get_cpu_class(cur_core); - - /*the ckrm idle tasks get all what's remaining*/ - /*hzheng: uncomment the following like for hard limit support */ - // update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand); - - repeat: - //check exit - if (!cur_core) - return 0; - - //visit this node only once - if (! child_core) - if ( alloc_surplus_node(cur_core) < 0 ) - return ret; - - //next child - child_core = ckrm_get_next_child(cur_core, child_core); - if (child_core) { - //go down - cur_core = child_core; - child_core = NULL; - goto repeat; - } else { //no more child, go back - child_core = cur_core; - cur_core = child_core->hnode.parent; - } - goto repeat; -} +#ifdef CONFIG_CKRM_SUPPORT_MAXLIMITS -/**********************************************/ -/* CKRM Idle Tasks */ -/**********************************************/ struct ckrm_cpu_class ckrm_idle_class_obj, *ckrm_idle_class; struct task_struct* ckrm_idle_tasks[NR_CPUS]; @@ -710,7 +951,7 @@ static inline int get_nr_idle(unsigned long surplus) int nr_idle = 0; nr_idle = surplus * cpu_online; - nr_idle >>= CKRM_SHARE_ACCURACY; + nr_idle >>= CKRM_SHARE_SHIFT; if (surplus) nr_idle ++; @@ -722,7 +963,8 @@ static inline int get_nr_idle(unsigned long surplus) } /** - * update_ckrm_idle: update the status of the idle class according to the new surplus + * update_ckrm_idle: update the status of the idle class according + * to the new surplus * surplus: new system surplus * * Task: @@ -816,6 +1058,20 @@ void ckrm_start_ckrm_idle(void) } } +void ckrm_stop_ckrm_idle(void) +{ + BUG_ON(1); // not yet implemented +} + +#else + +static inline void ckrm_start_ckrm_idle(void) { }; +static inline void ckrm_stop_ckrm_idle(void) { }; +static inline void update_ckrm_idle(unsigned long surplus) { }; + +#endif + + /**********************************************/ /* Local Weight */ /**********************************************/ @@ -831,8 +1087,19 @@ static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online) int i; unsigned long class_weight; unsigned long long lw; - - //get total pressure + struct ckrm_cpu_class_stat *stat; + unsigned long oweight; + unsigned long skewed_limit; + /* + * if a local queue gets less than 1/SKEWED_SHARE_RATIO of the eshare + * then we set the skewed_share + */ +#define SKEWED_SHARE_RATIO 8 +#define SKEWED_WEIGHT_MIN 3 + + /* get total pressure of the class, if there is not pressure (.. 
class is + * idle, then leave the weights as is + */ for_each_online_cpu(i) { lrq = get_ckrm_lrq(clsptr,i); total_pressure += lrq->lrq_load; @@ -841,32 +1108,61 @@ static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online) if (! total_pressure) return; + stat = &clsptr->stat; + class_weight = cpu_class_weight(clsptr) * cpu_online; + /* calculate or skewed limit weight */ + skewed_limit = SHARE_TO_WEIGHT(stat->meshare/SKEWED_SHARE_RATIO); + if (skewed_limit < SKEWED_WEIGHT_MIN) + skewed_limit = SKEWED_WEIGHT_MIN; + + /* calculate over_weight */ + BUG_ON(stat->meshare < stat->megrt); + oweight = ((stat->meshare - stat->megrt) << CKRM_SHARE_SHIFT) / stat->meshare; + oweight = SHARE_TO_WEIGHT(oweight); + /* * update weight for each cpu, minimun is 1 */ for_each_online_cpu(i) { lrq = get_ckrm_lrq(clsptr,i); - if (! lrq->lrq_load) - /*give idle class a high share to boost interactiveness */ + lrq->over_weight = oweight; + if (! lrq->lrq_load) { + /* give idle class a high share to boost + * interactiveness + */ lw = cpu_class_weight(clsptr); - else { - lw = lrq->lrq_load * class_weight; + if (unlikely(lw==0)) + lw = 1; + } else { + lw = lrq->lrq_load; + lw *= class_weight; do_div(lw,total_pressure); - if (!lw) + if (unlikely(lw==0)) lw = 1; - else if (lw > CKRM_SHARE_MAX) - lw = CKRM_SHARE_MAX; - } - + else if (unlikely(lw > CKRM_MAX_WEIGHT)) + lw = CKRM_MAX_WEIGHT; + } + BUG_ON(lw > CKRM_MAX_WEIGHT); + + /* + * set is_skewed and local_weight in proper order + * to avoid race condition + */ lrq->local_weight = lw; + if (lw < skewed_limit) + lrq->skewed_weight = skewed_limit; + else + lrq->skewed_weight = 0; + BUG_ON((local_class_weight(lrq) == 1) && (! lrq->skewed_weight)); } } /* * assume called with class_list_lock read lock held */ + void adjust_local_weight(void) { static spinlock_t lock = SPIN_LOCK_UNLOCKED; @@ -904,9 +1200,11 @@ void ckrm_cpu_monitor(int check_min) static unsigned long long last_check = 0; struct ckrm_core_class *root_core = get_default_cpu_class()->core; unsigned long long now; -#define MIN_CPU_MONITOR_INTERVAL 100000000UL + int loc; + +#define MIN_CPU_MONITOR_INTERVAL (100*1000*1000) /* 100 MSEC */ - if (!root_core) + if (ckrm_cpu_disabled() || !root_core) return; //do nothing if someone already holding the lock @@ -919,24 +1217,35 @@ void ckrm_cpu_monitor(int check_min) //consecutive check should be at least 100ms apart if (check_min && (now - last_check < MIN_CPU_MONITOR_INTERVAL)) - goto outunlock; + goto outunlock_np; last_check = now; - if (update_effectives(root_core) != 0) + if (update_effectives() != 0) { + loc = 0; goto outunlock; + } - if (update_max_demand(root_core) != 0) + if (update_max_demand(root_core) != 0) { + loc = 1; goto outunlock; + } - if (alloc_surplus(root_core) != 0) + if (alloc_surplus(root_core) != 0) { + loc = 2; goto outunlock; + } adjust_local_weight(); - outunlock: + outunlock_np: read_unlock(&class_list_lock); spin_unlock(&lock); + return; + + outunlock: + printk("ckrm_cpu_monitor(%d) exits prematurely cause=%d\n",check_min,loc); + goto outunlock_np; } /*****************************************************/ @@ -948,6 +1257,8 @@ static int thread_exit = 0; static int ckrm_cpu_monitord(void *nothing) { daemonize("ckrm_cpu_ctrld"); + printk("cpu_monitord started\n"); + thread_exit = 0; for (;;) { /*sleep for sometime before next try*/ set_current_state(TASK_INTERRUPTIBLE); @@ -963,15 +1274,19 @@ static int ckrm_cpu_monitord(void *nothing) return 0; } -void ckrm_start_monitor(void) +void ckrm_cpu_start_monitor(void) { + if 
(cpu_monitor_pid != -1) { + /* already started ... */ + return; + } cpu_monitor_pid = kernel_thread(ckrm_cpu_monitord, 0, CLONE_KERNEL); if (cpu_monitor_pid < 0) { printk("ckrm_cpu_monitord for failed\n"); } } -void ckrm_kill_monitor(void) +void ckrm_cpu_kill_monitor(void) { printk("killing process %d\n", cpu_monitor_pid); if (cpu_monitor_pid > 0) { @@ -983,22 +1298,12 @@ void ckrm_kill_monitor(void) } } -int ckrm_cpu_monitor_init(void) +static int __init ckrm_cpu_init_monitor(void) { - ckrm_start_monitor(); - /*hzheng: uncomment the following like for hard limit support */ - // ckrm_start_ckrm_idle(); + if (ckrm_cpu_enabled()) + ckrm_cpu_start_monitor(); return 0; } -void ckrm_cpu_monitor_exit(void) -{ - ckrm_kill_monitor(); -} - -module_init(ckrm_cpu_monitor_init); -module_exit(ckrm_cpu_monitor_exit); +__initcall(ckrm_cpu_init_monitor); -MODULE_AUTHOR("Haoqiang Zheng "); -MODULE_DESCRIPTION("Hierarchical CKRM CPU Resource Monitor"); -MODULE_LICENSE("GPL"); diff --git a/kernel/ckrm/ckrm_listenaq.c b/kernel/ckrm/ckrm_listenaq.c index 0fe858633..103e3f957 100644 --- a/kernel/ckrm/ckrm_listenaq.c +++ b/kernel/ckrm/ckrm_listenaq.c @@ -1,4 +1,4 @@ -/* ckrm_socketaq.c - accept queue resource controller +/* ckrm_listenaq.c - accept queue resource controller * * Copyright (C) Vivek Kashyap, IBM Corp. 2004 * @@ -251,7 +251,7 @@ static int laq_set_share_values(void *my_res, struct ckrm_shares *shares) } parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t); - if (!parent) // socket_class does not have a share interface + if (!parent) // socketclass does not have a share interface return -EINVAL; // Ensure that we ignore limit values @@ -380,7 +380,7 @@ static int laq_get_stats(void *my_res, struct seq_file *sfile) } parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t); - if (!parent) { // socket_class does not have a stat interface + if (!parent) { // socketclass does not have a stat interface printk(KERN_ERR "socketaq internal fs inconsistency\n"); return -EINVAL; } @@ -451,7 +451,7 @@ static void laq_change_resclass(void *n, void *old, void *r) } struct ckrm_res_ctlr laq_rcbs = { - .res_name = "laq", + .res_name = "listenaq", .resid = -1, // dynamically assigned .res_alloc = laq_res_alloc, .res_free = laq_res_free, @@ -467,9 +467,9 @@ int __init init_ckrm_laq_res(void) struct ckrm_classtype *clstype; int resid; - clstype = ckrm_find_classtype_by_name("socket_class"); + clstype = ckrm_find_classtype_by_name("socketclass"); if (clstype == NULL) { - printk(KERN_INFO " Unknown ckrm classtype"); + printk(KERN_INFO " Unknown ckrm classtype"); return -ENOENT; } diff --git a/kernel/ckrm/rbce/rbcemod.c b/kernel/ckrm/rbce/rbcemod.c index 40d6407a3..37a2ae9f8 100644 --- a/kernel/ckrm/rbce/rbcemod.c +++ b/kernel/ckrm/rbce/rbcemod.c @@ -409,7 +409,7 @@ static struct rbce_class *create_rbce_class(const char *classname, return cls; } -static struct rbce_class *get_class(char *classname, int *classtype) +static struct rbce_class *get_class(const char *classname, int *classtype) { struct rbce_class *cls; void *classobj; diff --git a/kernel/ckrm_classqueue.c b/kernel/ckrm_classqueue.c index 0400844a3..fd7f8a2b4 100644 --- a/kernel/ckrm_classqueue.c +++ b/kernel/ckrm_classqueue.c @@ -27,14 +27,19 @@ #include #define cq_nr_member(cq) (cq->array.nr_active) +#define CLASSQUEUE_MASK (CLASSQUEUE_SIZE - 1) /** - * get_index - translate the logical priority to the real index in the queue + * get_node_index - + * translate the logical priority to the real index in the queue * * validate the 
position * a valid prio is [cq->base,cq->base + size -1] + * check whether node is supposed to be enqeued beyond above window and + * if so set the need_repos flag */ -static inline unsigned long get_index(struct classqueue_struct *cq, int *prio) +static inline unsigned long get_node_index(struct classqueue_struct *cq, + cq_node_t * node) { unsigned long index; int max_prio; @@ -43,22 +48,24 @@ static inline unsigned long get_index(struct classqueue_struct *cq, int *prio) return 0; max_prio = cq->base + (CLASSQUEUE_SIZE - 1); - if (*prio > max_prio) - *prio = max_prio; - if (*prio < cq->base) - *prio = cq->base; + if (unlikely(node->prio > max_prio)) { + node->real_prio = node->prio; + node->prio = max_prio; + node->need_repos = 1; + } else + node->need_repos = 0; - index = (cq->base_offset + (*prio - cq->base)) ; - if (index >= CLASSQUEUE_SIZE) - index -= CLASSQUEUE_SIZE; + if (unlikely(node->prio < cq->base)) + node->prio = cq->base; - return index; + index = (cq->base_offset + (node->prio - cq->base)) ; + return ( index & CLASSQUEUE_MASK ); // ensure its in limits } /** * initialize a class queue object */ -int classqueue_init(struct classqueue_struct *cq) +int classqueue_init(struct classqueue_struct *cq, int enabled) { int i; struct cq_prio_array *array; @@ -73,7 +80,8 @@ int classqueue_init(struct classqueue_struct *cq) array->nr_active = 0; cq->base = 0; - cq->base_offset = -1; //not valid yet + cq->base_offset = 0; + cq->enabled = enabled; return 0; } @@ -87,8 +95,8 @@ void classqueue_enqueue(struct classqueue_struct *cq, int index; //get real index - if (cq_nr_member(cq)) { - index = get_index(cq, &prio); + if (cq_nr_member(cq)) { + index = get_node_index(cq, node); } else { //the first one cq->base = prio; cq->base_offset = 0; @@ -123,8 +131,8 @@ void classqueue_update_prio(struct classqueue_struct *cq, if (! cls_in_classqueue(node)) return; - index = get_index(cq, &new_pos); node->prio = new_pos; + index = get_node_index(cq, node); //remove from the original position list_del_init(&(node->list)); @@ -137,10 +145,32 @@ void classqueue_update_prio(struct classqueue_struct *cq, node->index = index; } + +static inline void __classqueue_update_base(struct classqueue_struct *cq, + int new_base) +{ + int max_prio; + if (unlikely(new_base <= cq->base)) // base will never move back + return; + if (unlikely(!cq_nr_member(cq))) { + cq->base_offset = 0; + cq->base = new_base; // is this necessary ?? 
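+		/*
+		 * note: with no members queued no node index depends on the
+		 * old base, and classqueue_enqueue() re-anchors base and
+		 * base_offset on the next insertion anyway, so resetting
+		 * them directly here looks safe
+		 */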
+ return; + } + + max_prio = cq->base + (CLASSQUEUE_SIZE - 1); + if (unlikely(new_base > max_prio)) + new_base = max_prio; + + cq->base_offset = (cq->base_offset + (new_base - cq->base)) & CLASSQUEUE_MASK; + cq->base = new_base; +} + /** *classqueue_get_min_prio: return the priority of the last node in queue * * this function can be called without runqueue lock held + * return 0 if there's nothing in the queue */ static inline int classqueue_get_min_prio(struct classqueue_struct *cq) { @@ -171,9 +201,13 @@ static inline int classqueue_get_min_prio(struct classqueue_struct *cq) */ cq_node_t *classqueue_get_head(struct classqueue_struct *cq) { - cq_node_t *result = NULL; + cq_node_t *node; int pos; + int index; + int new_base; +search_again: + node = NULL; /* * search over the bitmap to get the first class in the queue */ @@ -183,10 +217,38 @@ cq_node_t *classqueue_get_head(struct classqueue_struct *cq) pos = find_first_bit(cq->array.bitmap, CLASSQUEUE_SIZE); if (pos < CLASSQUEUE_SIZE) { - BUG_ON(list_empty(&cq->array.queue[pos])); - result = list_entry(cq->array.queue[pos].next, cq_node_t, list); + //BUG_ON(list_empty(&cq->array.queue[pos])); + node = list_entry(cq->array.queue[pos].next, cq_node_t, list); } - return result; + + //check if the node need to be repositioned + if (likely(! node || ! node->need_repos)) + return node; + + // We need to reposition this node in the class queue + // BUG_ON(node->prio == node->real_prio); + + //remove from the original position + list_del_init(&(node->list)); + if (list_empty(&cq->array.queue[node->index])) + __clear_bit(node->index, cq->array.bitmap); + + new_base = classqueue_get_min_prio(cq); + node->prio = node->real_prio; + + if (! new_base) + new_base = node->real_prio; + else if (node->real_prio < new_base) + new_base = node->real_prio; + __classqueue_update_base(cq,new_base); + + index = get_node_index(cq, node); + //add to new positon, round robin for classes with same priority + list_add_tail(&(node->list), &cq->array.queue[index]); + __set_bit(index, cq->array.bitmap); + node->index = index; + + goto search_again; } /** @@ -198,14 +260,11 @@ void classqueue_update_base(struct classqueue_struct *cq) int new_base; if (! 
cq_nr_member(cq)) { - cq->base_offset = -1; //not defined + cq->base = 0; + cq->base_offset = 0; return; } new_base = classqueue_get_min_prio(cq); - - if (new_base > cq->base) { - cq->base_offset = get_index(cq, &new_base); - cq->base = new_base; - } + __classqueue_update_base(cq,new_base); } diff --git a/kernel/ckrm_sched.c b/kernel/ckrm_sched.c index 1ca2611dc..f16e990c1 100644 --- a/kernel/ckrm_sched.c +++ b/kernel/ckrm_sched.c @@ -20,6 +20,28 @@ LIST_HEAD(active_cpu_classes); // list of active cpu classes; anchor struct ckrm_cpu_class default_cpu_class_obj; +unsigned int ckrm_sched_mode __cacheline_aligned_in_smp = +#ifdef CONFIG_CKRM_CPU_SCHEDULE_AT_BOOT + CKRM_SCHED_MODE_ENABLED; +#else + CKRM_SCHED_MODE_DISABLED; +#endif + +static int __init ckrm_cpu_enabled_setup(char *str) +{ + ckrm_sched_mode = CKRM_SCHED_MODE_ENABLED; + return 1; +} + +static int __init ckrm_cpu_disabled_setup(char *str) +{ + ckrm_sched_mode = CKRM_SCHED_MODE_DISABLED; + return 1; +} + +__setup("ckrmcpu", ckrm_cpu_enabled_setup); +__setup("nockrmcpu",ckrm_cpu_disabled_setup); + struct ckrm_cpu_class * get_default_cpu_class(void) { return (&default_cpu_class_obj); } @@ -28,7 +50,10 @@ struct ckrm_cpu_class * get_default_cpu_class(void) { /* CVT Management */ /*******************************************************/ -static inline void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt) +//an absolute bonus of 200ms for classes when reactivated +#define INTERACTIVE_BONUS(lrq) ((200*NSEC_PER_MS)/local_class_weight(lrq)) + +static void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt) { CVT_t min_cvt; CVT_t bonus; @@ -44,47 +69,40 @@ static inline void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt) */ bonus = INTERACTIVE_BONUS(lrq); //cvt can't be negative - if (cur_cvt > bonus) + if (likely(cur_cvt > bonus)) min_cvt = cur_cvt - bonus; else min_cvt = 0; - - if (lrq->local_cvt < min_cvt) { + + if (lrq->local_cvt < min_cvt) { + // if (lrq->local_cvt < min_cvt && ! 
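/*
 * Editor's illustration (not part of the patch): a userspace model of what
 * the two __setup() hooks above provide -- scanning the kernel command line
 * for "ckrmcpu" / "nockrmcpu" and setting the scheduler mode before any
 * class exists. The enum values mirror the patch; the parsing loop is a
 * simplification of the kernel's boot-parameter handling, not its API.
 */
#include <stdio.h>
#include <string.h>

enum ckrm_sched_mode {
        CKRM_SCHED_MODE_DISABLED,
        CKRM_SCHED_MODE_ENABLED,
};

static enum ckrm_sched_mode sched_mode = CKRM_SCHED_MODE_DISABLED;

static void parse_cmdline(const char *cmdline)
{
        char buf[256];
        char *tok;

        snprintf(buf, sizeof(buf), "%s", cmdline);
        /* later options win, mirroring left-to-right parsing */
        for (tok = strtok(buf, " "); tok; tok = strtok(NULL, " ")) {
                if (strcmp(tok, "nockrmcpu") == 0)
                        sched_mode = CKRM_SCHED_MODE_DISABLED;
                else if (strcmp(tok, "ckrmcpu") == 0)
                        sched_mode = CKRM_SCHED_MODE_ENABLED;
        }
}

int main(void)
{
        parse_cmdline("root=/dev/sda1 ckrmcpu quiet");
        printf("mode=%d\n", sched_mode);   /* 1: CKRM CPU scheduling enabled */
        return 0;
}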
lrq_nr_running(lrq)) { CVT_t lost_cvt; - lost_cvt = scale_cvt(min_cvt - lrq->local_cvt,lrq); + if (unlikely(lrq->local_cvt == 0)) { + lrq->local_cvt = cur_cvt; + return; + } + lost_cvt = min_cvt - lrq->local_cvt; + lost_cvt *= local_class_weight(lrq); lrq->local_cvt = min_cvt; + BUG_ON(lost_cvt < 0); /* add what the class lost to its savings*/ - lrq->savings += lost_cvt; +#if 1 /*zhq debugging*/ + lrq->savings += lost_cvt; +#endif if (lrq->savings > MAX_SAVINGS) lrq->savings = MAX_SAVINGS; - } else if (lrq->savings) { - /* - *if a class saving and falling behind - * then start to use it saving in a leaking bucket way - */ - CVT_t savings_used; - - savings_used = scale_cvt((lrq->local_cvt - min_cvt),lrq); - if (savings_used > lrq->savings) - savings_used = lrq->savings; - - if (savings_used > SAVINGS_LEAK_SPEED) - savings_used = SAVINGS_LEAK_SPEED; - - BUG_ON(lrq->savings < savings_used); - lrq->savings -= savings_used; - unscale_cvt(savings_used,lrq); - BUG_ON(lrq->local_cvt < savings_used); - lrq->local_cvt -= savings_used; - } +#if 0 /* zhq debugging*/ + printk("lrq= %x savings: %llu lost= %llu\n",(int)lrq,lrq->savings,lost_cvt); +#endif + } } /* * return the max_cvt of all the classes */ -static inline CVT_t get_max_cvt(int this_cpu) +CVT_t get_max_cvt(int this_cpu) { struct ckrm_cpu_class *clsptr; ckrm_lrq_t * lrq; @@ -92,7 +110,6 @@ static inline CVT_t get_max_cvt(int this_cpu) max_cvt = 0; - /*update class time, at the same time get max_cvt */ list_for_each_entry(clsptr, &active_cpu_classes, links) { lrq = get_ckrm_lrq(clsptr, this_cpu); if (lrq->local_cvt > max_cvt) @@ -102,6 +119,23 @@ static inline CVT_t get_max_cvt(int this_cpu) return max_cvt; } +CVT_t get_min_cvt(int this_cpu) +{ + struct ckrm_cpu_class *clsptr; + ckrm_lrq_t * lrq; + CVT_t max_cvt; + + max_cvt = 0xFFFFFFFFFFFFFLLU; + + list_for_each_entry(clsptr, &active_cpu_classes, links) { + lrq = get_ckrm_lrq(clsptr, this_cpu); + if (lrq->local_cvt < max_cvt) + max_cvt = lrq->local_cvt; + } + + return max_cvt; +} + /** * update_class_cputime - updates cvt of inactive classes * -- an inactive class shouldn't starve others when it comes back @@ -110,7 +144,7 @@ static inline CVT_t get_max_cvt(int this_cpu) * * class_list_lock must have been acquired */ -void update_class_cputime(int this_cpu) +void update_class_cputime(int this_cpu, int idle) { struct ckrm_cpu_class *clsptr; ckrm_lrq_t * lrq; @@ -168,24 +202,45 @@ void update_class_cputime(int this_cpu) /*******************************************************/ /* PID load balancing stuff */ /*******************************************************/ -#define PID_SAMPLE_T 32 #define PID_KP 20 #define PID_KI 60 #define PID_KD 20 +/* + * runqueue load is the local_weight of all the classes on this cpu + * must be called with class_list_lock held + */ +static unsigned long ckrm_cpu_load(int cpu) +{ + struct ckrm_cpu_class *clsptr; + ckrm_lrq_t* lrq; + struct ckrm_cpu_demand_stat* l_stat; + int total_load = 0; + int load; + + list_for_each_entry(clsptr,&active_cpu_classes,links) { + lrq = get_ckrm_lrq(clsptr,cpu); + l_stat = get_cls_local_stat(clsptr,cpu); + + load = WEIGHT_TO_SHARE(lrq->local_weight); + + if (l_stat->cpu_demand < load) + load = l_stat->cpu_demand; + total_load += load; + } + return total_load; +} + + /** * sample pid load periodically */ + void ckrm_load_sample(ckrm_load_t* pid,int cpu) { long load; long err; - if (jiffies % PID_SAMPLE_T) - return; - - adjust_local_weight(); - load = ckrm_cpu_load(cpu); err = load - pid->load_p; pid->load_d = err; @@ -195,7 +250,7 
@@ void ckrm_load_sample(ckrm_load_t* pid,int cpu) pid->load_i /= 10; } -long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group) +long ckrm_get_pressure(ckrm_load_t* ckrm_load, int local_group) { long pressure; pressure = ckrm_load->load_p * PID_KP; @@ -204,3 +259,58 @@ long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group) pressure /= 100; return pressure; } + +/* + * called after a task is switched out. Update the local cvt accounting + * we need to stick with long instead of long long due to nonexistent + * 64-bit division + */ +void update_local_cvt(struct task_struct *p, unsigned long nsec) +{ + ckrm_lrq_t * lrq = get_task_lrq(p); + unsigned long cvt_inc; + + /* + * consume from savings if eshare is larger than egrt + */ + if (lrq->savings && lrq->over_weight) { + unsigned long savings_used; + + savings_used = nsec; + savings_used >>= CKRM_WEIGHT_SHIFT; + savings_used *= lrq->over_weight; + if (savings_used > lrq->savings) + savings_used = lrq->savings; + lrq->savings -= savings_used; + } + + //BUG_ON(local_class_weight(lrq) == 0); + cvt_inc = nsec / local_class_weight(lrq); + + /* + * For a certain processor, CKRM allocates CPU time propotional + * to the class's local_weight. So once a class consumed nsec, + * it will wait for X (nsec) for its next turn. + * + * X is calculated based on the following fomular + * nsec / local_weight < X / (CKRM_MAX_WEIGHT - local_weight) + * if local_weight is small, then approximated as + * nsec / local_weight < X / (CKRM_MAX_WEIGHT) + */ +#define CVT_STARVATION_LIMIT (200LL*NSEC_PER_MS) +#define CVT_STARVATION_INC_LIMIT (CVT_STARVATION_LIMIT >> CKRM_WEIGHT_SHIFT) + + if (unlikely(lrq->skewed_weight)) { + unsigned long long starvation_limit = CVT_STARVATION_INC_LIMIT; + + starvation_limit *= local_class_weight(lrq); + if (unlikely(cvt_inc > starvation_limit)) + cvt_inc = nsec / lrq->skewed_weight; + } + + /* now update the CVT accounting */ + + lrq->local_cvt += cvt_inc; + lrq->uncounted_ns += nsec; + update_class_priority(lrq); +} diff --git a/kernel/sched.c b/kernel/sched.c index 743e5bc8a..d65addbff 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -42,6 +42,8 @@ #include #include +#include +#include #ifdef CONFIG_NUMA #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) @@ -198,8 +200,6 @@ unsigned int task_timeslice(task_t *p) */ typedef struct runqueue runqueue_t; -#include -#include /* * This is the main, per-CPU runqueue data structure. 
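/*
 * Editor's illustration (not part of the patch): the core of the CVT
 * accounting in update_local_cvt() above -- after running for `nsec`, a
 * class advances its local virtual time by nsec / weight, so a class with
 * twice the weight advances half as fast and is therefore selected roughly
 * twice as often. Standalone sketch; the weight values are made up and the
 * savings/starvation refinements are omitted.
 */
#include <stdio.h>

struct lrq_model {
        unsigned long long local_cvt;
        unsigned long weight;        /* stands in for local_class_weight(lrq) */
};

static void model_update_local_cvt(struct lrq_model *lrq, unsigned long nsec)
{
        lrq->local_cvt += nsec / lrq->weight;
}

int main(void)
{
        struct lrq_model heavy = { 0, 8 }, light = { 0, 2 };
        int i;

        /* both classes run the same wall-clock time ... */
        for (i = 0; i < 4; i++) {
                model_update_local_cvt(&heavy, 1000000);
                model_update_local_cvt(&light, 1000000);
        }
        /* ... but the heavy class has accumulated 4x less virtual time */
        printf("heavy=%llu light=%llu\n", heavy.local_cvt, light.local_cvt);
        return 0;
}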
@@ -220,17 +220,21 @@ struct runqueue { unsigned long cpu_load; #endif unsigned long long nr_switches; - unsigned long expired_timestamp, nr_uninterruptible; + unsigned long nr_uninterruptible; +#ifndef CONFIG_CKRM_CPU_SCHEDULE + unsigned long expired_timestamp; + int best_expired_prio; +#endif unsigned long long timestamp_last_tick; task_t *curr, *idle; struct mm_struct *prev_mm; #ifdef CONFIG_CKRM_CPU_SCHEDULE struct classqueue_struct classqueue; ckrm_load_t ckrm_load; + ckrm_lrq_t dflt_lrq; /* local runqueue of the default class */ #else prio_array_t *active, *expired, arrays[2]; #endif - int best_expired_prio; atomic_t nr_iowait; #ifdef CONFIG_SMP @@ -308,10 +312,72 @@ static inline void rq_unlock(runqueue_t *rq) spin_unlock_irq(&rq->lock); } +static inline void idle_balance(int this_cpu, runqueue_t *this_rq); +static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq); + #ifdef CONFIG_CKRM_CPU_SCHEDULE + +#define ckrm_rq_cpu_disabled(rq) (!rq->classqueue.enabled) +#define ckrm_rq_cpu_enabled(rq) ( rq->classqueue.enabled) + +static inline void class_enqueue_task(struct task_struct *p, + prio_array_t * array) +{ + ckrm_lrq_t *lrq; + int effective_prio; + + if (ckrm_rq_cpu_disabled(task_rq(p))) + return; + + lrq = get_task_lrq(p); + // BUG_ON(lrq==NULL); + + cpu_demand_event(&p->demand_stat,CPU_DEMAND_ENQUEUE,0); + lrq->lrq_load += task_load(p); + + if ((p->prio < lrq->top_priority) && (array == lrq->active)) + set_top_priority(lrq, p->prio); + + if (! cls_in_classqueue(&lrq->classqueue_linkobj)) { + cpu_demand_event(get_task_lrq_stat(p),CPU_DEMAND_ENQUEUE,0); + effective_prio = get_effective_prio(lrq); + classqueue_enqueue(lrq->classqueue, &lrq->classqueue_linkobj, + effective_prio); + } + +} + +static inline void class_dequeue_task(struct task_struct *p, + prio_array_t * array) +{ + ckrm_lrq_t *lrq; + unsigned long load; + + if (ckrm_rq_cpu_disabled(task_rq(p))) + return; + + lrq = get_task_lrq(p); + load = task_load(p); + + // BUG_ON(lrq->lrq_load < load); + + lrq->lrq_load -= load; + + cpu_demand_event(&p->demand_stat,CPU_DEMAND_DEQUEUE,0); + + if ((array == lrq->active) && (p->prio == lrq->top_priority) + && list_empty(&(array->queue[p->prio]))) + set_top_priority(lrq,find_next_bit(array->bitmap, MAX_PRIO, + p->prio)); +} + static inline ckrm_lrq_t *rq_get_next_class(struct runqueue *rq) { - cq_node_t *node = classqueue_get_head(&rq->classqueue); + cq_node_t *node; + + if (ckrm_rq_cpu_disabled(rq)) + return &rq->dflt_lrq; + node = classqueue_get_head(&rq->classqueue); return ((node) ? class_list_entry(node) : NULL); } @@ -330,51 +396,189 @@ CVT_t get_local_cur_cvt(int cpu) return 0; } -static inline struct task_struct * rq_get_next_task(struct runqueue* rq) +static inline struct task_struct * rq_get_next_task(struct runqueue* rq, + int cpu) { prio_array_t *array; struct task_struct *next; ckrm_lrq_t *queue; int idx; - int cpu = smp_processor_id(); - // it is guaranteed be the ( rq->nr_running > 0 ) check in - // schedule that a task will be found. + if (ckrm_rq_cpu_disabled(rq)) { + /* original code from schedule(void) + * see also code in non CKRM configuration + */ + struct list_head *array_queue; + ckrm_lrq_t *lrq = get_ckrm_lrq(get_default_cpu_class(),cpu); + + if (unlikely(!rq->nr_running)) { + idle_balance(cpu, rq); + if (!rq->nr_running) { + rq->dflt_lrq.expired_timestamp = 0; + wake_sleeping_dependent(cpu, rq); + return NULL; + } + } + + array = lrq->active; + if (unlikely(!array->nr_active)) { + /* + * Switch the active and expired arrays. 
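/*
 * Editor's illustration (not part of the patch): how class_dequeue_task()
 * above keeps lrq->top_priority valid -- when the last task at the current
 * top priority leaves, the next set bit in the priority bitmap becomes the
 * new top. Standalone sketch with a trivial per-priority counter in place of
 * the real prio_array; MAX_PRIO and the bit search are simplified.
 */
#include <stdio.h>

#define MODEL_MAX_PRIO 16

struct lrq_model {
        unsigned int nr_at_prio[MODEL_MAX_PRIO]; /* stands in for the prio_array */
        int top_priority;
};

static int model_find_next_prio(struct lrq_model *lrq, int from)
{
        int p;

        for (p = from; p < MODEL_MAX_PRIO; p++)
                if (lrq->nr_at_prio[p])
                        return p;
        return MODEL_MAX_PRIO;                   /* empty: parked at MAX_PRIO */
}

static void model_enqueue(struct lrq_model *lrq, int prio)
{
        lrq->nr_at_prio[prio]++;
        if (prio < lrq->top_priority)
                lrq->top_priority = prio;
}

static void model_dequeue(struct lrq_model *lrq, int prio)
{
        lrq->nr_at_prio[prio]--;
        if (prio == lrq->top_priority && !lrq->nr_at_prio[prio])
                lrq->top_priority = model_find_next_prio(lrq, prio);
}

int main(void)
{
        struct lrq_model lrq = { .top_priority = MODEL_MAX_PRIO };

        model_enqueue(&lrq, 5);
        model_enqueue(&lrq, 9);
        model_dequeue(&lrq, 5);
        printf("top=%d\n", lrq.top_priority);    /* 9 */
        return 0;
}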
+ */ + lrq->active = lrq->expired; + lrq->expired = array; + array = lrq->active; + lrq->expired_timestamp = 0; + lrq->best_expired_prio = MAX_PRIO; + } + + idx = sched_find_first_bit(array->bitmap); + array_queue = array->queue + idx; + next = list_entry(array_queue->next, task_t, run_list); + return next; + } + /*-- CKRM SCHEDULER --*/ + retry_next_class: + /* we can't use (rq->nr_running == 0) to declare idleness + * first we have to make sure that the class runqueue is properly + * processed. This is due to two facts/requirements: + * (a) when the last task is removed form an lrq we do not remove + * the lrq from the class runqueue. As a result the lrq is + * selected again and we can perform necessary + * expired switches. + * (b) perform outstanding expired switches + * + */ + queue = rq_get_next_class(rq); - // BUG_ON( !queue ); + if (unlikely(queue == NULL)) { + idle_balance(cpu, rq); + if (!rq->nr_running) { + rq->dflt_lrq.expired_timestamp = 0; + wake_sleeping_dependent(cpu, rq); + return NULL; + } + goto retry_next_class; // try again + } array = queue->active; if (unlikely(!array->nr_active)) { queue->active = queue->expired; queue->expired = array; + array = queue->active; queue->expired_timestamp = 0; - if (queue->active->nr_active) + if (array->nr_active) set_top_priority(queue, - find_first_bit(queue->active->bitmap, MAX_PRIO)); + find_first_bit(array->bitmap,MAX_PRIO)); else { + /* since we do not dequeue a lrq when it becomes empty + * but rely on the switching mechanism, we must dequeue + * at this point + */ classqueue_dequeue(queue->classqueue, &queue->classqueue_linkobj); - cpu_demand_event(get_rq_local_stat(queue,cpu),CPU_DEMAND_DEQUEUE,0); + cpu_demand_event(get_rq_local_stat(queue,cpu), + CPU_DEMAND_DEQUEUE,0); } goto retry_next_class; } - // BUG_ON(!array->nr_active); idx = queue->top_priority; - // BUG_ON (idx == MAX_PRIO); + //BUG_ON(!array->nr_active); + //BUG_ON(idx == MAX_PRIO); + //BUG_ON(list_empty(array->queue+idx)); next = task_list_entry(array->queue[idx].next); return next; } + +static inline void ckrm_account_task(struct runqueue* rq, + struct task_struct *prev, + unsigned long long now) +{ + if ((prev != rq->idle) && ckrm_rq_cpu_enabled(rq) ) { + unsigned long long run = now - prev->timestamp; + ckrm_lrq_t * lrq = get_task_lrq(prev); + + lrq->lrq_load -= task_load(prev); + cpu_demand_event(&prev->demand_stat,CPU_DEMAND_DESCHEDULE,run); + lrq->lrq_load += task_load(prev); + + cpu_demand_event(get_task_lrq_stat(prev),CPU_DEMAND_DESCHEDULE,run); + update_local_cvt(prev, run); + } + +} + +#ifdef CONFIG_SMP +#define COND_SMP(dflt,cond) (cond) +#else +#define COND_SMP(dflt,cond) (dflt) +#endif + +static inline void ckrm_sched_tick(unsigned long j,int this_cpu, int idle, + runqueue_t *rq) +{ + /* first determine whether we have to do anything + * without grabing the global lock + */ + + int sample, update; + +#ifdef __SIMULATOR__ + if ((this_cpu == 0) && (j % 1000) == 0) { + ckrm_cpu_monitor(1); + } +#endif + + if (ckrm_rq_cpu_disabled(rq)) + return; + + update = (j % CVT_UPDATE_TICK); + sample = COND_SMP(1,(j % CPU_PID_CTRL_TICK)); + +// avoid taking the global class_list lock on every tick + if (likely(update && sample)) + return; // nothing to be done; + + read_lock(&class_list_lock); + +#ifdef CONFIG_SMP + if (sample==0) { + ckrm_load_sample(rq_ckrm_load(rq),this_cpu); + } +#endif + + if (update==0) { + classqueue_update_base(get_cpu_classqueue(this_cpu)); + update_class_cputime(this_cpu,idle); + // occasionally we need to call the weight adjustment + // 
for SMP systems + if (COND_SMP(0,(this_cpu==0))) + adjust_local_weight(); + } + + read_unlock(&class_list_lock); +} + #else /*! CONFIG_CKRM_CPU_SCHEDULE*/ -static inline struct task_struct * rq_get_next_task(struct runqueue* rq) +static inline struct task_struct * rq_get_next_task(struct runqueue* rq, + int cpu) { prio_array_t *array; struct list_head *queue; int idx; + if (unlikely(!rq->nr_running)) { + idle_balance(cpu, rq); + if (!rq->nr_running) { + rq->expired_timestamp = 0; + wake_sleeping_dependent(cpu, rq); + return NULL; + } + } array = rq->active; if (unlikely(!array->nr_active)) { /* @@ -392,11 +596,17 @@ static inline struct task_struct * rq_get_next_task(struct runqueue* rq) return list_entry(queue->next, task_t, run_list); } -static inline void class_enqueue_task(struct task_struct* p, prio_array_t *array) { } -static inline void class_dequeue_task(struct task_struct* p, prio_array_t *array) { } +static inline void class_enqueue_task(struct task_struct* p, + prio_array_t *array) { } +static inline void class_dequeue_task(struct task_struct* p, + prio_array_t *array) { } static inline void init_cpu_classes(void) { } +static inline void ckrm_sched_tick(int j,int this_cpu,int idle, void* arg) {} +static inline void ckrm_account_task(struct runqueue* rq, struct + task_struct *prev, + unsigned long long now) { } #define rq_ckrm_load(rq) NULL -static inline void ckrm_sched_tick(int j,int this_cpu,void* name) {} + #endif /* CONFIG_CKRM_CPU_SCHEDULE */ /* @@ -1533,261 +1743,129 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, return 1; } -#ifdef CONFIG_CKRM_CPU_SCHEDULE -static inline int ckrm_preferred_task(task_t *tmp,long min, long max, - int phase, enum idle_type idle) -{ - long pressure = task_load(tmp); - - if (pressure > max) - return 0; - - if ((idle == NOT_IDLE) && ! phase && (pressure <= min)) - return 0; - return 1; -} - /* - * move tasks for a specic local class - * return number of tasks pulled + * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, + * as part of a balancing operation within "domain". Returns the number of + * tasks moved. + * + * Called with both runqueues locked. */ -static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq, - runqueue_t *this_rq, - runqueue_t *busiest, - struct sched_domain *sd, - int this_cpu, - enum idle_type idle, - long* pressure_imbalance) +static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, + unsigned long max_nr_move, struct sched_domain *sd, + enum idle_type idle) { prio_array_t *array, *dst_array; struct list_head *head, *curr; + int idx, pulled = 0; task_t *tmp; - int idx; - int pulled = 0; - int phase = -1; - long pressure_min, pressure_max; - /*hzheng: magic : 90% balance is enough*/ - long balance_min = *pressure_imbalance / 10; -/* - * we don't want to migrate tasks that will reverse the balance - * or the tasks that make too small difference - */ -#define CKRM_BALANCE_MAX_RATIO 100 -#define CKRM_BALANCE_MIN_RATIO 1 - start: - phase ++; +#if CONFIG_CKRM_CPU_SCHEDULE + /* need to distinguish between the runqueues and the class + * local runqueues. + * we know we can get here only if the dflt class is present + */ + ckrm_lrq_t *l_this_rq = &this_rq->dflt_lrq; + ckrm_lrq_t *l_busiest = &busiest->dflt_lrq; +#else +#define l_busiest busiest +#define l_this_rq this_rq +#endif + + if (max_nr_move <= 0 || busiest->nr_running <= 1) + goto out; + /* * We first consider expired tasks. 
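/*
 * Editor's illustration (not part of the patch): the tick gating used by
 * ckrm_sched_tick() above -- the global class_list lock is only taken when
 * one of the periodic jobs is actually due, i.e. when jiffies is a multiple
 * of the respective period. Standalone sketch; the period values are
 * assumptions, not the patch's real CVT_UPDATE_TICK / CPU_PID_CTRL_TICK.
 */
#include <stdio.h>

#define MODEL_CVT_UPDATE_TICK   8   /* assumed period, in ticks */
#define MODEL_PID_CTRL_TICK     32  /* assumed period, in ticks */

static void model_sched_tick(unsigned long j)
{
        int update = (int)(j % MODEL_CVT_UPDATE_TICK);
        int sample = (int)(j % MODEL_PID_CTRL_TICK);

        if (update && sample)
                return;             /* common case: nothing due, no lock taken */

        /* read_lock(&class_list_lock) would be taken here */
        if (sample == 0)
                printf("tick %lu: sample per-cpu load for the PID controller\n", j);
        if (update == 0)
                printf("tick %lu: update classqueue base and inactive-class cvt\n", j);
        /* read_unlock(&class_list_lock) */
}

int main(void)
{
        unsigned long j;

        for (j = 1; j <= 64; j++)
                model_sched_tick(j);
        return 0;
}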
Those will likely not be * executed in the near future, and they are most likely to * be cache-cold, thus switching CPUs has the least effect * on them. */ - if (src_lrq->expired->nr_active) { - array = src_lrq->expired; - dst_array = dst_lrq->expired; + if (l_busiest->expired->nr_active) { + array = l_busiest->expired; + dst_array = l_this_rq->expired; } else { - array = src_lrq->active; - dst_array = dst_lrq->active; + array = l_busiest->active; + dst_array = l_this_rq->active; } - - new_array: + +new_array: /* Start searching at priority 0: */ idx = 0; - skip_bitmap: +skip_bitmap: if (!idx) idx = sched_find_first_bit(array->bitmap); else idx = find_next_bit(array->bitmap, MAX_PRIO, idx); if (idx >= MAX_PRIO) { - if (array == src_lrq->expired && src_lrq->active->nr_active) { - array = src_lrq->active; - dst_array = dst_lrq->active; + if (array == l_busiest->expired && l_busiest->active->nr_active) { + array = l_busiest->active; + dst_array = l_this_rq->active; goto new_array; } - if ((! phase) && (! pulled) && (idle != IDLE)) - goto start; //try again - else - goto out; //finished search for this lrq + goto out; } - + head = array->queue + idx; curr = head->prev; - skip_queue: +skip_queue: tmp = list_entry(curr, task_t, run_list); - + curr = curr->prev; - + if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { if (curr != head) goto skip_queue; idx++; goto skip_bitmap; } + pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); + pulled++; - pressure_min = *pressure_imbalance * CKRM_BALANCE_MIN_RATIO/100; - pressure_max = *pressure_imbalance * CKRM_BALANCE_MAX_RATIO/100; - /* - * skip the tasks that will reverse the balance too much - */ - if (ckrm_preferred_task(tmp,pressure_min,pressure_max,phase,idle)) { - *pressure_imbalance -= task_load(tmp); - pull_task(busiest, array, tmp, - this_rq, dst_array, this_cpu); - pulled++; - - if (*pressure_imbalance <= balance_min) - goto out; + /* We only want to steal up to the prescribed number of tasks. */ + if (pulled < max_nr_move) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; } - - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; - out: +out: return pulled; } -static inline long ckrm_rq_imbalance(runqueue_t *this_rq,runqueue_t *dst_rq) -{ - long imbalance; - /* - * make sure after balance, imbalance' > - imbalance/2 - * we don't want the imbalance be reversed too much - */ - imbalance = pid_get_pressure(rq_ckrm_load(dst_rq),0) - - pid_get_pressure(rq_ckrm_load(this_rq),1); - imbalance /= 2; - return imbalance; -} - /* - * try to balance the two runqueues - * - * Called with both runqueues locked. - * if move_tasks is called, it will try to move at least one task over + * find_busiest_group finds and returns the busiest CPU group within the + * domain. It calculates and returns the number of tasks which should be + * moved to restore balance via the imbalance parameter. 
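/*
 * Editor's illustration (not part of the patch): the scan order used by
 * move_tasks() above -- pull from the expired array first (those tasks will
 * not run soon and are likely cache-cold), and inside an array walk the
 * priorities from 0 upwards. Standalone sketch; the arrays are reduced to
 * per-priority counters and the bitmap search to a plain loop.
 */
#include <stdio.h>

#define MODEL_MAX_PRIO 8

struct array_model {
        int nr_at_prio[MODEL_MAX_PRIO];
        int nr_active;
};

/* take one task from the lowest-numbered (highest) priority, if any */
static int model_pull_one(struct array_model *a)
{
        int idx;

        for (idx = 0; idx < MODEL_MAX_PRIO; idx++) {
                if (a->nr_at_prio[idx]) {
                        a->nr_at_prio[idx]--;
                        a->nr_active--;
                        return idx;
                }
        }
        return -1;
}

int main(void)
{
        struct array_model expired = { { 0, 0, 2, 0, 0, 0, 0, 0 }, 2 };
        struct array_model active  = { { 1, 0, 0, 0, 0, 0, 0, 0 }, 1 };
        struct array_model *src;
        int prio, pulled = 0, max_nr_move = 2;

        while (pulled < max_nr_move) {
                src = expired.nr_active ? &expired : &active;   /* expired first */
                prio = model_pull_one(src);
                if (prio < 0)
                        break;
                printf("pulled a prio-%d task from the %s array\n",
                       prio, src == &expired ? "expired" : "active");
                pulled++;
        }
        return 0;
}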
*/ -static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, - unsigned long max_nr_move, struct sched_domain *sd, - enum idle_type idle) +static struct sched_group * +find_busiest_group(struct sched_domain *sd, int this_cpu, + unsigned long *imbalance, enum idle_type idle) { - struct ckrm_cpu_class *clsptr,*vip_cls = NULL; - ckrm_lrq_t* src_lrq,*dst_lrq; - long pressure_imbalance, pressure_imbalance_old; - int src_cpu = task_cpu(busiest->curr); - struct list_head *list; - int pulled = 0; - long imbalance; - - imbalance = ckrm_rq_imbalance(this_rq,busiest); + struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; + unsigned long max_load, avg_load, total_load, this_load, total_pwr; - if ((idle == NOT_IDLE && imbalance <= 0) || busiest->nr_running <= 1) - goto out; + max_load = this_load = total_load = total_pwr = 0; - //try to find the vip class - list_for_each_entry(clsptr,&active_cpu_classes,links) { - src_lrq = get_ckrm_lrq(clsptr,src_cpu); + do { + cpumask_t tmp; + unsigned long load; + int local_group; + int i, nr_cpus = 0; - if (! lrq_nr_running(src_lrq)) - continue; - - if (! vip_cls || cpu_class_weight(vip_cls) < cpu_class_weight(clsptr) ) - { - vip_cls = clsptr; - } - } - - /* - * do search from the most significant class - * hopefully, less tasks will be migrated this way - */ - clsptr = vip_cls; - - move_class: - if (! clsptr) - goto out; - - - src_lrq = get_ckrm_lrq(clsptr,src_cpu); - if (! lrq_nr_running(src_lrq)) - goto other_class; - - dst_lrq = get_ckrm_lrq(clsptr,this_cpu); - - //how much pressure for this class should be transferred - pressure_imbalance = src_lrq->lrq_load * imbalance/src_lrq->local_weight; - if (pulled && ! pressure_imbalance) - goto other_class; - - pressure_imbalance_old = pressure_imbalance; - - //move tasks - pulled += - ckrm_cls_move_tasks(src_lrq,dst_lrq, - this_rq, - busiest, - sd,this_cpu,idle, - &pressure_imbalance); - - /* - * hzheng: 2 is another magic number - * stop balancing if the imbalance is less than 25% of the orig - */ - if (pressure_imbalance <= (pressure_imbalance_old >> 2)) - goto out; - - //update imbalance - imbalance *= pressure_imbalance / pressure_imbalance_old; - other_class: - //who is next? - list = clsptr->links.next; - if (list == &active_cpu_classes) - list = list->next; - clsptr = list_entry(list, typeof(*clsptr), links); - if (clsptr != vip_cls) - goto move_class; - out: - return pulled; -} - -/** - * ckrm_check_balance - is load balancing necessary? 
- * return 0 if load balancing is not necessary - * otherwise return the average load of the system - * also, update nr_group - * - * heuristics: - * no load balancing if it's load is over average - * no load balancing if it's load is far more than the min - * task: - * read the status of all the runqueues - */ -static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu, - enum idle_type idle, int* nr_group) -{ - struct sched_group *group = sd->groups; - unsigned long min_load, max_load, avg_load; - unsigned long total_load, this_load, total_pwr; - - max_load = this_load = total_load = total_pwr = 0; - min_load = 0xFFFFFFFF; - *nr_group = 0; - - do { - cpumask_t tmp; - unsigned long load; - int local_group; - int i, nr_cpus = 0; + local_group = cpu_isset(this_cpu, group->cpumask); /* Tally up the load of all CPUs in the group */ + avg_load = 0; cpus_and(tmp, group->cpumask, cpu_online_map); if (unlikely(cpus_empty(tmp))) goto nextgroup; - avg_load = 0; - local_group = cpu_isset(this_cpu, group->cpumask); - for_each_cpu_mask(i, tmp) { - load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),local_group); + /* Bias balancing toward cpus of our domain */ + if (local_group) + load = target_load(i); + else + load = source_load(i); + nr_cpus++; avg_load += load; } @@ -1803,386 +1881,86 @@ static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu, if (local_group) { this_load = avg_load; + this = group; goto nextgroup; } else if (avg_load > max_load) { max_load = avg_load; - } - if (avg_load < min_load) { - min_load = avg_load; + busiest = group; } nextgroup: group = group->next; - *nr_group = *nr_group + 1; } while (group != sd->groups); - if (!max_load || this_load >= max_load) + if (!busiest || this_load >= max_load) goto out_balanced; avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; - /* hzheng: debugging: 105 is a magic number - * 100*max_load <= sd->imbalance_pct*this_load) - * should use imbalance_pct instead - */ - if (this_load > avg_load - || 100*max_load < 105*this_load - || 100*min_load < 70*this_load - ) + if (this_load >= avg_load || + 100*max_load <= sd->imbalance_pct*this_load) goto out_balanced; - return avg_load; - out_balanced: - return 0; -} - -/** - * any group that has above average load is considered busy - * find the busiest queue from any of busy group - */ -static runqueue_t * -ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu, - unsigned long avg_load, enum idle_type idle, - int nr_group) -{ - struct sched_group *group; - runqueue_t * busiest=NULL; - unsigned long rand; - - group = sd->groups; - rand = get_ckrm_rand(nr_group); - nr_group = 0; + /* + * We're trying to get all the cpus to the average_load, so we don't + * want to push ourselves above the average load, nor do we wish to + * reduce the max loaded cpu below the average load, as either of these + * actions would just result in more rebalancing later, and ping-pong + * tasks around. Thus we look for the minimum possible imbalance. + * Negative imbalances (*we* are more loaded than anyone else) will + * be counted as no imbalance for these purposes -- we can't fix that + * by pulling tasks to us. Be careful of negative numbers as they'll + * appear as very large values with unsigned longs. 
+ */ + *imbalance = min(max_load - avg_load, avg_load - this_load); - do { - unsigned long load,total_load,max_load; - cpumask_t tmp; - int i; - runqueue_t * grp_busiest; + /* How much load to actually move to equalise the imbalance */ + *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power)) + / SCHED_LOAD_SCALE; - cpus_and(tmp, group->cpumask, cpu_online_map); - if (unlikely(cpus_empty(tmp))) - goto find_nextgroup; + if (*imbalance < SCHED_LOAD_SCALE - 1) { + unsigned long pwr_now = 0, pwr_move = 0; + unsigned long tmp; - total_load = 0; - max_load = 0; - grp_busiest = NULL; - for_each_cpu_mask(i, tmp) { - load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),0); - total_load += load; - if (load > max_load) { - max_load = load; - grp_busiest = cpu_rq(i); - } + if (max_load - this_load >= SCHED_LOAD_SCALE*2) { + *imbalance = 1; + return busiest; } - total_load = (total_load * SCHED_LOAD_SCALE) / group->cpu_power; - if (total_load > avg_load) { - busiest = grp_busiest; - if (nr_group >= rand) - break; - } - find_nextgroup: - group = group->next; - nr_group ++; - } while (group != sd->groups); + /* + * OK, we don't have enough imbalance to justify moving tasks, + * however we may be able to increase total CPU power used by + * moving them. + */ - return busiest; -} + pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); + pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); + pwr_now /= SCHED_LOAD_SCALE; -/** - * load_balance - pressure based load balancing algorithm used by ckrm - */ -static int ckrm_load_balance(int this_cpu, runqueue_t *this_rq, - struct sched_domain *sd, enum idle_type idle) -{ - runqueue_t *busiest; - unsigned long avg_load; - int nr_moved,nr_group; + /* Amount of load we'd subtract */ + tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; + if (max_load > tmp) + pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, + max_load - tmp); - avg_load = ckrm_check_balance(sd, this_cpu, idle, &nr_group); - if (! avg_load) - goto out_balanced; + /* Amount of load we'd add */ + tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; + if (max_load < tmp) + tmp = max_load; + pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); + pwr_move /= SCHED_LOAD_SCALE; - busiest = ckrm_find_busy_queue(sd,this_cpu,avg_load,idle,nr_group); - if (! busiest) - goto out_balanced; - /* - * This should be "impossible", but since load - * balancing is inherently racy and statistical, - * it could happen in theory. - */ - if (unlikely(busiest == this_rq)) { - WARN_ON(1); - goto out_balanced; - } + /* Move if we gain another 8th of a CPU worth of throughput */ + if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8) + goto out_balanced; - nr_moved = 0; - if (busiest->nr_running > 1) { - /* - * Attempt to move tasks. If find_busiest_group has found - * an imbalance but busiest->nr_running <= 1, the group is - * still unbalanced. nr_moved simply stays zero, so it is - * correctly treated as an imbalance. 
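/*
 * Editor's illustration (not part of the patch): the imbalance formula in
 * find_busiest_group() above -- move only enough load to bring this cpu up
 * to the average without dragging the busiest group below it, i.e.
 * imbalance = min(max_load - avg_load, avg_load - this_load), then scale by
 * the smaller group power. Standalone arithmetic sketch; numbers are made up
 * and the low-imbalance power heuristic is omitted.
 */
#include <stdio.h>

#define MODEL_LOAD_SCALE 128UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

static unsigned long model_imbalance(unsigned long max_load,
                                     unsigned long this_load,
                                     unsigned long avg_load,
                                     unsigned long busiest_power,
                                     unsigned long this_power)
{
        unsigned long imbalance = min_ul(max_load - avg_load, avg_load - this_load);

        return (imbalance * min_ul(busiest_power, this_power)) / MODEL_LOAD_SCALE;
}

int main(void)
{
        /* busiest group at 3*scale, we are at 1*scale, average is 2*scale */
        unsigned long imb = model_imbalance(3 * MODEL_LOAD_SCALE,
                                            1 * MODEL_LOAD_SCALE,
                                            2 * MODEL_LOAD_SCALE,
                                            MODEL_LOAD_SCALE, MODEL_LOAD_SCALE);

        printf("move %lu units of load\n", imb);   /* 128: one scale unit */
        return 0;
}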
- */ - double_lock_balance(this_rq, busiest); - nr_moved = move_tasks(this_rq, this_cpu, busiest, - 0,sd, idle); - spin_unlock(&busiest->lock); - if (nr_moved) { - adjust_local_weight(); - } + *imbalance = 1; + return busiest; } - if (!nr_moved) - sd->nr_balance_failed ++; - else - sd->nr_balance_failed = 0; + /* Get rid of the scaling factor, rounding down as we divide */ + *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE; - /* We were unbalanced, so reset the balancing interval */ - sd->balance_interval = sd->min_interval; - - return nr_moved; - -out_balanced: - /* tune up the balancing interval */ - if (sd->balance_interval < sd->max_interval) - sd->balance_interval *= 2; - - return 0; -} - -/* - * this_rq->lock is already held - */ -static inline int load_balance_newidle(int this_cpu, runqueue_t *this_rq, - struct sched_domain *sd) -{ - int ret; - read_lock(&class_list_lock); - ret = ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE); - read_unlock(&class_list_lock); - return ret; -} - -static inline int load_balance(int this_cpu, runqueue_t *this_rq, - struct sched_domain *sd, enum idle_type idle) -{ - int ret; - - spin_lock(&this_rq->lock); - read_lock(&class_list_lock); - ret= ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE); - read_unlock(&class_list_lock); - spin_unlock(&this_rq->lock); - return ret; -} -#else /*! CONFIG_CKRM_CPU_SCHEDULE */ -/* - * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, - * as part of a balancing operation within "domain". Returns the number of - * tasks moved. - * - * Called with both runqueues locked. - */ -static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, - unsigned long max_nr_move, struct sched_domain *sd, - enum idle_type idle) -{ - prio_array_t *array, *dst_array; - struct list_head *head, *curr; - int idx, pulled = 0; - task_t *tmp; - - if (max_nr_move <= 0 || busiest->nr_running <= 1) - goto out; - - /* - * We first consider expired tasks. Those will likely not be - * executed in the near future, and they are most likely to - * be cache-cold, thus switching CPUs has the least effect - * on them. - */ - if (busiest->expired->nr_active) { - array = busiest->expired; - dst_array = this_rq->expired; - } else { - array = busiest->active; - dst_array = this_rq->active; - } - -new_array: - /* Start searching at priority 0: */ - idx = 0; -skip_bitmap: - if (!idx) - idx = sched_find_first_bit(array->bitmap); - else - idx = find_next_bit(array->bitmap, MAX_PRIO, idx); - if (idx >= MAX_PRIO) { - if (array == busiest->expired && busiest->active->nr_active) { - array = busiest->active; - dst_array = this_rq->active; - goto new_array; - } - goto out; - } - - head = array->queue + idx; - curr = head->prev; -skip_queue: - tmp = list_entry(curr, task_t, run_list); - - curr = curr->prev; - - if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; - } - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); - pulled++; - - /* We only want to steal up to the prescribed number of tasks. */ - if (pulled < max_nr_move) { - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; - } -out: - return pulled; -} - -/* - * find_busiest_group finds and returns the busiest CPU group within the - * domain. It calculates and returns the number of tasks which should be - * moved to restore balance via the imbalance parameter. 
- */ -static struct sched_group * -find_busiest_group(struct sched_domain *sd, int this_cpu, - unsigned long *imbalance, enum idle_type idle) -{ - struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; - unsigned long max_load, avg_load, total_load, this_load, total_pwr; - - max_load = this_load = total_load = total_pwr = 0; - - do { - cpumask_t tmp; - unsigned long load; - int local_group; - int i, nr_cpus = 0; - - local_group = cpu_isset(this_cpu, group->cpumask); - - /* Tally up the load of all CPUs in the group */ - avg_load = 0; - cpus_and(tmp, group->cpumask, cpu_online_map); - if (unlikely(cpus_empty(tmp))) - goto nextgroup; - - for_each_cpu_mask(i, tmp) { - /* Bias balancing toward cpus of our domain */ - if (local_group) - load = target_load(i); - else - load = source_load(i); - - nr_cpus++; - avg_load += load; - } - - if (!nr_cpus) - goto nextgroup; - - total_load += avg_load; - total_pwr += group->cpu_power; - - /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; - - if (local_group) { - this_load = avg_load; - this = group; - goto nextgroup; - } else if (avg_load > max_load) { - max_load = avg_load; - busiest = group; - } -nextgroup: - group = group->next; - } while (group != sd->groups); - - if (!busiest || this_load >= max_load) - goto out_balanced; - - avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; - - if (this_load >= avg_load || - 100*max_load <= sd->imbalance_pct*this_load) - goto out_balanced; - - /* - * We're trying to get all the cpus to the average_load, so we don't - * want to push ourselves above the average load, nor do we wish to - * reduce the max loaded cpu below the average load, as either of these - * actions would just result in more rebalancing later, and ping-pong - * tasks around. Thus we look for the minimum possible imbalance. - * Negative imbalances (*we* are more loaded than anyone else) will - * be counted as no imbalance for these purposes -- we can't fix that - * by pulling tasks to us. Be careful of negative numbers as they'll - * appear as very large values with unsigned longs. - */ - *imbalance = min(max_load - avg_load, avg_load - this_load); - - /* How much load to actually move to equalise the imbalance */ - *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power)) - / SCHED_LOAD_SCALE; - - if (*imbalance < SCHED_LOAD_SCALE - 1) { - unsigned long pwr_now = 0, pwr_move = 0; - unsigned long tmp; - - if (max_load - this_load >= SCHED_LOAD_SCALE*2) { - *imbalance = 1; - return busiest; - } - - /* - * OK, we don't have enough imbalance to justify moving tasks, - * however we may be able to increase total CPU power used by - * moving them. 
- */ - - pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); - pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); - pwr_now /= SCHED_LOAD_SCALE; - - /* Amount of load we'd subtract */ - tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; - if (max_load > tmp) - pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, - max_load - tmp); - - /* Amount of load we'd add */ - tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; - if (max_load < tmp) - tmp = max_load; - pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); - pwr_move /= SCHED_LOAD_SCALE; - - /* Move if we gain another 8th of a CPU worth of throughput */ - if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8) - goto out_balanced; - - *imbalance = 1; - return busiest; - } - - /* Get rid of the scaling factor, rounding down as we divide */ - *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE; - - return busiest; + return busiest; out_balanced: if (busiest && (idle == NEWLY_IDLE || @@ -2224,6 +2002,17 @@ static runqueue_t *find_busiest_queue(struct sched_group *group) * * Called with this_rq unlocked. */ + +static inline int ckrm_load_balance(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd, + enum idle_type idle) +#ifndef CONFIG_CKRM_CPU_SCHEDULE +{ + return -1; +} +#endif +; + static int load_balance(int this_cpu, runqueue_t *this_rq, struct sched_domain *sd, enum idle_type idle) { @@ -2234,6 +2023,9 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, spin_lock(&this_rq->lock); + if ((nr_moved = ckrm_load_balance(this_cpu,this_rq,sd,idle)) != -1) + goto out_balanced; + group = find_busiest_group(sd, this_cpu, &imbalance, idle); if (!group) goto out_balanced; @@ -2319,8 +2111,12 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, struct sched_group *group; runqueue_t *busiest = NULL; unsigned long imbalance; - int nr_moved = 0; + int nr_moved; + + if ((nr_moved = ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE)) != -1) + goto out; + nr_moved = 0; group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); if (!group) goto out; @@ -2340,8 +2136,6 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, out: return nr_moved; } -#endif /* CONFIG_CKRM_CPU_SCHEDULE*/ - /* * idle_balance is called by schedule() if this_cpu is about to become @@ -2447,6 +2241,8 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, unsigned long j = jiffies + CPU_OFFSET(this_cpu); struct sched_domain *sd; + ckrm_sched_tick(j,this_cpu,(idle != NOT_IDLE),this_rq); + /* Update our load */ old_load = this_rq->cpu_load; this_load = this_rq->nr_running * SCHED_LOAD_SCALE; @@ -2485,7 +2281,9 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, */ static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) { + ckrm_sched_tick(jiffies,cpu,(idle != NOT_IDLE),rq); } + static inline void idle_balance(int cpu, runqueue_t *rq) { } @@ -2522,15 +2320,19 @@ EXPORT_PER_CPU_SYMBOL(kstat); #ifndef CONFIG_CKRM_CPU_SCHEDULE #define EXPIRED_STARVING(rq) \ - ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ + ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ (jiffies - (rq)->expired_timestamp >= \ STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ ((rq)->curr->static_prio > (rq)->best_expired_prio)) #else +/* we need to scale the starvation based on weight + * classes with small weight have longer expiration starvation + */ #define EXPIRED_STARVING(rq) \ - (STARVATION_LIMIT && ((rq)->expired_timestamp && \ + ((STARVATION_LIMIT && 
((rq)->expired_timestamp && \ (jiffies - (rq)->expired_timestamp >= \ - STARVATION_LIMIT * (lrq_nr_running(rq)) + 1))) + (((STARVATION_LIMIT * (lrq_nr_running(rq)) + 1)*CKRM_MAX_WEIGHT)/rq->local_weight)))) || \ + (this_rq()->curr->static_prio > (rq)->best_expired_prio)) #endif /* @@ -2568,7 +2370,6 @@ void scheduler_tick(int user_ticks, int sys_ticks) cpustat->idle += sys_ticks; if (wake_priority_sleeper(rq)) goto out; - ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq)); rebalance_tick(cpu, rq, IDLE); return; } @@ -2609,8 +2410,11 @@ void scheduler_tick(int user_ticks, int sys_ticks) } if (!--p->time_slice) { #ifdef CONFIG_CKRM_CPU_SCHEDULE - /* Hubertus ... we can abstract this out */ - ckrm_lrq_t* rq = get_task_lrq(p); + /* we redefine RQ to be a local runqueue */ + ckrm_lrq_t* rq; + runqueue_t *cpu_rq = this_rq(); + rq = ckrm_rq_cpu_enabled(cpu_rq) ? get_task_lrq(p) + : &(cpu_rq->dflt_lrq); #endif dequeue_task(p, rq->active); set_tsk_need_resched(p); @@ -2622,8 +2426,8 @@ void scheduler_tick(int user_ticks, int sys_ticks) rq->expired_timestamp = jiffies; if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { enqueue_task(p, rq->expired); - if (p->static_prio < this_rq()->best_expired_prio) - this_rq()->best_expired_prio = p->static_prio; + if (p->static_prio < rq->best_expired_prio) + rq->best_expired_prio = p->static_prio; } else enqueue_task(p, rq->active); } else { @@ -2657,7 +2461,6 @@ void scheduler_tick(int user_ticks, int sys_ticks) out_unlock: spin_unlock(&rq->lock); out: - ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq)); rebalance_tick(cpu, rq, NOT_IDLE); } @@ -2793,19 +2596,8 @@ need_resched: spin_lock_irq(&rq->lock); -#ifdef CONFIG_CKRM_CPU_SCHEDULE - if (prev != rq->idle) { - unsigned long long run = now - prev->timestamp; - ckrm_lrq_t * lrq = get_task_lrq(prev); - - lrq->lrq_load -= task_load(prev); - cpu_demand_event(&prev->demand_stat,CPU_DEMAND_DESCHEDULE,run); - lrq->lrq_load += task_load(prev); + ckrm_account_task(rq,prev,now); - cpu_demand_event(get_task_lrq_stat(prev),CPU_DEMAND_DESCHEDULE,run); - update_local_cvt(prev, run); - } -#endif /* * if entering off of a kernel preemption go straight * to picking the next task. 
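/*
 * Editor's illustration (not part of the patch): the weight scaling added to
 * the CKRM variant of EXPIRED_STARVING above -- a class with a small weight
 * tolerates a longer expired-array starvation window before interactive
 * tasks stop being recycled into the active array, because a light class
 * gets the cpu rarely anyway. Standalone sketch with assumed constants; the
 * best_expired_prio clause is omitted.
 */
#include <stdio.h>

#define MODEL_STARVATION_LIMIT 10UL    /* ticks per runnable task, assumed */
#define MODEL_MAX_WEIGHT       128UL   /* stands in for CKRM_MAX_WEIGHT */

static int model_expired_starving(unsigned long jiffies_now,
                                  unsigned long expired_timestamp,
                                  unsigned long nr_running,
                                  unsigned long local_weight)
{
        unsigned long limit;

        if (!expired_timestamp)
                return 0;
        /* base limit grows with queue length, then is stretched for light classes */
        limit = ((MODEL_STARVATION_LIMIT * nr_running + 1) * MODEL_MAX_WEIGHT)
                / local_weight;
        return jiffies_now - expired_timestamp >= limit;
}

int main(void)
{
        /* same queue length and expired-array age, different weights */
        printf("%d\n", model_expired_starving(1000, 900, 4, 128)); /* 1: starving */
        printf("%d\n", model_expired_starving(1000, 900, 4, 32));  /* 0: not yet */
        return 0;
}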
@@ -2821,19 +2613,12 @@ need_resched: } cpu = smp_processor_id(); - if (unlikely(!rq->nr_running)) { - idle_balance(cpu, rq); - if (!rq->nr_running) { - next = rq->idle; -#ifdef CONFIG_CKRM_CPU_SCHEDULE - rq->expired_timestamp = 0; -#endif - wake_sleeping_dependent(cpu, rq); - goto switch_tasks; - } - } - next = rq_get_next_task(rq); + next = rq_get_next_task(rq,cpu); + if (unlikely(next == NULL)) { + next = rq->idle; + goto switch_tasks; + } if (dependent_sleeper(cpu, rq, next)) { next = rq->idle; @@ -4528,13 +4313,12 @@ void __init sched_init(void) rq->active = rq->arrays; rq->expired = rq->arrays + 1; + rq->best_expired_prio = MAX_PRIO; #else rq = cpu_rq(i); spin_lock_init(&rq->lock); #endif - rq->best_expired_prio = MAX_PRIO; - #ifdef CONFIG_SMP rq->sd = &sched_domain_init; rq->cpu_load = 0; @@ -4647,6 +4431,21 @@ int task_running_sys(struct task_struct *p) EXPORT_SYMBOL(task_running_sys); #endif + + +/******************************************************************** + * + * CKRM Scheduler additions + * + * (a) helper functions + * (b) load balancing code + * + * These are required here to avoid having to externalize many + * of the definitions in sched.c + * + * + ********************************************************************/ + #ifdef CONFIG_CKRM_CPU_SCHEDULE /** * return the classqueue object of a certain processor @@ -4676,4 +4475,559 @@ void _ckrm_cpu_change_class(task_t *tsk, struct ckrm_cpu_class *newcls) task_rq_unlock(rq,&flags); } + +/** + * get_min_cvt_locking - get the mininum cvt on a particular cpu under rqlock + */ + +CVT_t get_min_cvt(int cpu); + +CVT_t get_min_cvt_locking(int cpu) +{ + CVT_t cvt; + struct runqueue *rq = cpu_rq(cpu); + spin_lock(&rq->lock); + cvt = get_min_cvt(cpu); + spin_unlock(&rq->lock); + return cvt; +} + +ckrm_lrq_t *rq_get_dflt_lrq(int cpu) +{ + return &(cpu_rq(cpu)->dflt_lrq); +} + +#ifdef CONFIG_SMP + +/************** CKRM Load Balancing code ************************/ + +static inline int ckrm_preferred_task(task_t *tmp,long min, long max, + int phase, enum idle_type idle) +{ + long pressure = task_load(tmp); + + if (pressure > max) + return 0; + + if ((idle == NOT_IDLE) && ! phase && (pressure <= min)) + return 0; + return 1; +} + +/* + * move tasks for a specic local class + * return number of tasks pulled + */ +static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq, + runqueue_t *this_rq, + runqueue_t *busiest, + struct sched_domain *sd, + int this_cpu, + enum idle_type idle, + long* pressure_imbalance) +{ + prio_array_t *array, *dst_array; + struct list_head *head, *curr; + task_t *tmp; + int idx; + int pulled = 0; + int phase = -1; + long pressure_min, pressure_max; + /*hzheng: magic : 90% balance is enough*/ + long balance_min = *pressure_imbalance / 10; +/* + * we don't want to migrate tasks that will reverse the balance + * or the tasks that make too small difference + */ +#define CKRM_BALANCE_MAX_RATIO 100 +#define CKRM_BALANCE_MIN_RATIO 1 + start: + phase ++; + /* + * We first consider expired tasks. Those will likely not be + * executed in the near future, and they are most likely to + * be cache-cold, thus switching CPUs has the least effect + * on them. 
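/*
 * Editor's illustration (not part of the patch): the filter implemented by
 * ckrm_preferred_task() above -- a task is only worth migrating if its load
 * does not overshoot the remaining pressure imbalance (max), and, on the
 * first pass of a non-idle cpu, only if it is big enough to matter (min).
 * Standalone sketch; the ratios follow CKRM_BALANCE_MIN/MAX_RATIO, the
 * numbers are made up.
 */
#include <stdio.h>

static int model_preferred_task(long task_load, long min, long max,
                                int first_pass, int cpu_is_idle)
{
        if (task_load > max)                    /* would reverse the imbalance */
                return 0;
        if (!cpu_is_idle && first_pass && task_load <= min)
                return 0;                       /* too small, try bigger tasks first */
        return 1;
}

int main(void)
{
        long imbalance = 100;
        long min = imbalance * 1 / 100;         /* CKRM_BALANCE_MIN_RATIO */
        long max = imbalance * 100 / 100;       /* CKRM_BALANCE_MAX_RATIO */

        printf("%d\n", model_preferred_task(150, min, max, 1, 0)); /* 0: too big */
        printf("%d\n", model_preferred_task(1,   min, max, 1, 0)); /* 0: too small */
        printf("%d\n", model_preferred_task(1,   min, max, 0, 0)); /* 1: 2nd pass */
        printf("%d\n", model_preferred_task(40,  min, max, 1, 0)); /* 1: preferred */
        return 0;
}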
+ */ + if (src_lrq->expired->nr_active) { + array = src_lrq->expired; + dst_array = dst_lrq->expired; + } else { + array = src_lrq->active; + dst_array = dst_lrq->active; + } + + new_array: + /* Start searching at priority 0: */ + idx = 0; + skip_bitmap: + if (!idx) + idx = sched_find_first_bit(array->bitmap); + else + idx = find_next_bit(array->bitmap, MAX_PRIO, idx); + if (idx >= MAX_PRIO) { + if (array == src_lrq->expired && src_lrq->active->nr_active) { + array = src_lrq->active; + dst_array = dst_lrq->active; + goto new_array; + } + if ((! phase) && (! pulled) && (idle != IDLE)) + goto start; //try again + else + goto out; //finished search for this lrq + } + + head = array->queue + idx; + curr = head->prev; + skip_queue: + tmp = list_entry(curr, task_t, run_list); + + curr = curr->prev; + + if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } + + pressure_min = *pressure_imbalance * CKRM_BALANCE_MIN_RATIO/100; + pressure_max = *pressure_imbalance * CKRM_BALANCE_MAX_RATIO/100; + /* + * skip the tasks that will reverse the balance too much + */ + if (ckrm_preferred_task(tmp,pressure_min,pressure_max,phase,idle)) { + *pressure_imbalance -= task_load(tmp); + pull_task(busiest, array, tmp, + this_rq, dst_array, this_cpu); + pulled++; + + if (*pressure_imbalance <= balance_min) + goto out; + } + + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + out: + return pulled; +} + +static inline long ckrm_rq_imbalance(runqueue_t *this_rq,runqueue_t *dst_rq) +{ + long imbalance; + /* + * make sure after balance, imbalance' > - imbalance/2 + * we don't want the imbalance be reversed too much + */ + imbalance = ckrm_get_pressure(rq_ckrm_load(dst_rq),0) + - ckrm_get_pressure(rq_ckrm_load(this_rq),1); + imbalance /= 2; + return imbalance; +} + +/* + * try to balance the two runqueues + * + * Called with both runqueues locked. + * if move_tasks is called, it will try to move at least one task over + */ +static int ckrm_move_tasks(runqueue_t *this_rq, int this_cpu, + runqueue_t *busiest, + unsigned long max_nr_move, struct sched_domain *sd, + enum idle_type idle) +{ + struct ckrm_cpu_class *clsptr,*vip_cls = NULL; + ckrm_lrq_t* src_lrq,*dst_lrq; + long pressure_imbalance, pressure_imbalance_old; + int src_cpu = task_cpu(busiest->curr); + struct list_head *list; + int pulled = 0; + long imbalance; + + imbalance = ckrm_rq_imbalance(this_rq,busiest); + + if ((idle == NOT_IDLE && imbalance <= 0) || busiest->nr_running <= 1) + goto out; + + //try to find the vip class + list_for_each_entry(clsptr,&active_cpu_classes,links) { + src_lrq = get_ckrm_lrq(clsptr,src_cpu); + + if (! lrq_nr_running(src_lrq)) + continue; + + if (! vip_cls || cpu_class_weight(vip_cls) < cpu_class_weight(clsptr) ) + { + vip_cls = clsptr; + } + } + + /* + * do search from the most significant class + * hopefully, less tasks will be migrated this way + */ + clsptr = vip_cls; + + move_class: + if (! clsptr) + goto out; + + + src_lrq = get_ckrm_lrq(clsptr,src_cpu); + if (! lrq_nr_running(src_lrq)) + goto other_class; + + dst_lrq = get_ckrm_lrq(clsptr,this_cpu); + + //how much pressure for this class should be transferred + pressure_imbalance = (src_lrq->lrq_load * imbalance)/WEIGHT_TO_SHARE(src_lrq->local_weight); + if (pulled && ! 
pressure_imbalance) + goto other_class; + + pressure_imbalance_old = pressure_imbalance; + + //move tasks + pulled += + ckrm_cls_move_tasks(src_lrq,dst_lrq, + this_rq, + busiest, + sd,this_cpu,idle, + &pressure_imbalance); + + /* + * hzheng: 2 is another magic number + * stop balancing if the imbalance is less than 25% of the orig + */ + if (pressure_imbalance <= (pressure_imbalance_old >> 2)) + goto out; + + //update imbalance + imbalance *= pressure_imbalance / pressure_imbalance_old; + other_class: + //who is next? + list = clsptr->links.next; + if (list == &active_cpu_classes) + list = list->next; + clsptr = list_entry(list, typeof(*clsptr), links); + if (clsptr != vip_cls) + goto move_class; + out: + return pulled; +} + +/** + * ckrm_check_balance - is load balancing necessary? + * return 0 if load balancing is not necessary + * otherwise return the average load of the system + * also, update nr_group + * + * heuristics: + * no load balancing if it's load is over average + * no load balancing if it's load is far more than the min + * task: + * read the status of all the runqueues + */ +static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu, + enum idle_type idle, int* nr_group) +{ + struct sched_group *group = sd->groups; + unsigned long min_load, max_load, avg_load; + unsigned long total_load, this_load, total_pwr; + + max_load = this_load = total_load = total_pwr = 0; + min_load = 0xFFFFFFFF; + *nr_group = 0; + + do { + cpumask_t tmp; + unsigned long load; + int local_group; + int i, nr_cpus = 0; + + /* Tally up the load of all CPUs in the group */ + cpus_and(tmp, group->cpumask, cpu_online_map); + if (unlikely(cpus_empty(tmp))) + goto nextgroup; + + avg_load = 0; + local_group = cpu_isset(this_cpu, group->cpumask); + + for_each_cpu_mask(i, tmp) { + load = ckrm_get_pressure(rq_ckrm_load(cpu_rq(i)),local_group); + nr_cpus++; + avg_load += load; + } + + if (!nr_cpus) + goto nextgroup; + + total_load += avg_load; + total_pwr += group->cpu_power; + + /* Adjust by relative CPU power of the group */ + avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + + if (local_group) { + this_load = avg_load; + goto nextgroup; + } else if (avg_load > max_load) { + max_load = avg_load; + } + if (avg_load < min_load) { + min_load = avg_load; + } +nextgroup: + group = group->next; + *nr_group = *nr_group + 1; + } while (group != sd->groups); + + if (!max_load || this_load >= max_load) + goto out_balanced; + + avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; + + /* hzheng: debugging: 105 is a magic number + * 100*max_load <= sd->imbalance_pct*this_load) + * should use imbalance_pct instead + */ + if (this_load > avg_load + || 100*max_load < 105*this_load + || 100*min_load < 70*this_load + ) + goto out_balanced; + + return avg_load; + out_balanced: + return 0; +} + +/** + * any group that has above average load is considered busy + * find the busiest queue from any of busy group + */ +static runqueue_t * +ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu, + unsigned long avg_load, enum idle_type idle, + int nr_group) +{ + struct sched_group *group; + runqueue_t * busiest=NULL; + unsigned long rand; + + group = sd->groups; + rand = get_ckrm_rand(nr_group); + nr_group = 0; + + do { + unsigned long load,total_load,max_load; + cpumask_t tmp; + int i; + runqueue_t * grp_busiest; + + cpus_and(tmp, group->cpumask, cpu_online_map); + if (unlikely(cpus_empty(tmp))) + goto find_nextgroup; + + total_load = 0; + max_load = 0; + grp_busiest = NULL; + 
for_each_cpu_mask(i, tmp) { + load = ckrm_get_pressure(rq_ckrm_load(cpu_rq(i)),0); + total_load += load; + if (load > max_load) { + max_load = load; + grp_busiest = cpu_rq(i); + } + } + + total_load = (total_load * SCHED_LOAD_SCALE) / group->cpu_power; + if (total_load > avg_load) { + busiest = grp_busiest; + if (nr_group >= rand) + break; + } + find_nextgroup: + group = group->next; + nr_group ++; + } while (group != sd->groups); + + return busiest; +} + +/** + * load_balance - pressure based load balancing algorithm used by ckrm + */ +static int ckrm_load_balance_locked(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd, + enum idle_type idle) +{ + runqueue_t *busiest; + unsigned long avg_load; + int nr_moved,nr_group; + + avg_load = ckrm_check_balance(sd, this_cpu, idle, &nr_group); + if (! avg_load) + goto out_balanced; + + busiest = ckrm_find_busy_queue(sd,this_cpu,avg_load,idle,nr_group); + if (! busiest) + goto out_balanced; + /* + * This should be "impossible", but since load + * balancing is inherently racy and statistical, + * it could happen in theory. + */ + if (unlikely(busiest == this_rq)) { + WARN_ON(1); + goto out_balanced; + } + + nr_moved = 0; + if (busiest->nr_running > 1) { + /* + * Attempt to move tasks. If find_busiest_group has found + * an imbalance but busiest->nr_running <= 1, the group is + * still unbalanced. nr_moved simply stays zero, so it is + * correctly treated as an imbalance. + */ + double_lock_balance(this_rq, busiest); + nr_moved = ckrm_move_tasks(this_rq, this_cpu, busiest, + 0,sd, idle); + spin_unlock(&busiest->lock); + if (nr_moved) { + adjust_local_weight(); + } + } + + if (!nr_moved) + sd->nr_balance_failed ++; + else + sd->nr_balance_failed = 0; + + /* We were unbalanced, so reset the balancing interval */ + sd->balance_interval = sd->min_interval; + + return nr_moved; + +out_balanced: + /* tune up the balancing interval */ + if (sd->balance_interval < sd->max_interval) + sd->balance_interval *= 2; + + return 0; +} + +static inline int ckrm_load_balance(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd, + enum idle_type idle) +{ + int ret; + + if (ckrm_rq_cpu_disabled(this_rq)) + return -1; + //spin_lock(&this_rq->lock); + read_lock(&class_list_lock); + ret = ckrm_load_balance_locked(this_cpu,this_rq,sd,idle); + // ret = ckrm_load_balance_locked(this_cpu,this_rq,sd,NEWLY_IDLE); + read_unlock(&class_list_lock); + //spin_unlock(&this_rq->lock); + return ret; +} + +#endif // CONFIG_SMP + + +void ckrm_cpu_class_queue_update(int on) +{ + /* This is called when the mode changes from disabled + * to enabled (on=1) or vice versa (on=0). + * we make sure that all classqueues on all cpus + * either have the default class enqueued (on=1) or + * all classes dequeued (on=0). + * if not done a race condition will persist + * when flipping the ckrm_sched_mode. + * Otherwise will lead to more complicated code + * in rq_get_next_task, where we despite knowing of + * runnable tasks can not find an enqueued class. 
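/*
 * Editor's illustration (not part of the patch): how ckrm_move_tasks() above
 * splits the runqueue imbalance across classes -- each class is asked to
 * transfer pressure proportional to its own load relative to its share, and
 * the walk starts at the highest-weight ("vip") class so fewer tasks have to
 * move. Standalone arithmetic sketch; WEIGHT_TO_SHARE() and all numbers are
 * simplified assumptions.
 */
#include <stdio.h>

struct cls_model {
        const char *name;
        unsigned long lrq_load;     /* load of this class on the busiest cpu */
        unsigned long local_share;  /* stands in for WEIGHT_TO_SHARE(local_weight) */
};

static long model_class_pressure(const struct cls_model *cls, long rq_imbalance)
{
        return (long)(cls->lrq_load * rq_imbalance) / (long)cls->local_share;
}

int main(void)
{
        /* classes sorted so the highest-weight one is visited first */
        struct cls_model classes[] = {
                { "vip",   800, 80 },
                { "other", 100, 20 },
        };
        long imbalance = 10;    /* pressure difference between the two runqueues */
        int i;

        for (i = 0; i < 2; i++)
                printf("%s: move %ld units of pressure\n", classes[i].name,
                       model_class_pressure(&classes[i], imbalance));
        /* vip: 100, other: 50 -- more heavily loaded classes shed more pressure */
        return 0;
}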
+ */ + + int i; + runqueue_t *rq; + ckrm_lrq_t *lrq; + struct ckrm_cpu_class *clsptr; + + if (on) { + BUG_ON(ckrm_cpu_enabled()); + for_each_cpu(i) { + rq = cpu_rq(i); + BUG_ON(ckrm_rq_cpu_enabled(rq)); + lrq = &rq->dflt_lrq; + spin_lock(&rq->lock); + + BUG_ON(cls_in_classqueue(&lrq->classqueue_linkobj)); + + classqueue_init(&rq->classqueue,1); + lrq->top_priority = find_first_bit(lrq->active->bitmap, + MAX_PRIO), + classqueue_enqueue(lrq->classqueue, + &lrq->classqueue_linkobj, 0); + spin_unlock(&rq->lock); +#if 0 + printk("UPDATE(%d) run=%lu:%d:%d %d:%d->%d\n", i, + rq->nr_running,lrq->active->nr_active, + lrq->expired->nr_active, + find_first_bit(lrq->active->bitmap,MAX_PRIO), + find_first_bit(lrq->expired->bitmap,MAX_PRIO), + lrq->top_priority); #endif + } + } else { + for_each_cpu(i) { + rq = cpu_rq(i); + spin_lock(&rq->lock); + + /* walk through all classes and make sure they + * are not enqueued + */ + write_lock(&class_list_lock); + list_for_each_entry(clsptr,&active_cpu_classes,links) { + lrq = get_ckrm_lrq(clsptr,i); + BUG_ON((lrq != &rq->dflt_lrq) && lrq_nr_running(lrq)); // must be empty + if (cls_in_classqueue(&lrq->classqueue_linkobj)) + classqueue_dequeue(lrq->classqueue, + &lrq->classqueue_linkobj); + } + rq->classqueue.enabled = 0; + write_unlock(&class_list_lock); + spin_unlock(&rq->lock); + } + } +} + +/* + * callback when a class is getting deleted + * need to remove it from the class runqueue. see (class_queue_update) + */ + +void ckrm_cpu_class_queue_delete_sync(struct ckrm_cpu_class *clsptr) +{ + int i; + + for_each_cpu(i) { + runqueue_t *rq = cpu_rq(i); + ckrm_lrq_t *lrq = get_ckrm_lrq(clsptr,i); + + spin_lock(&rq->lock); + write_lock(&class_list_lock); + BUG_ON(lrq_nr_running(lrq)); // must be empty + if (cls_in_classqueue(&lrq->classqueue_linkobj)) + classqueue_dequeue(lrq->classqueue, + &lrq->classqueue_linkobj); + write_unlock(&class_list_lock); + spin_unlock(&rq->lock); + } +} + +#endif // CONFIG_CKRM_CPU_SCHEDULE -- 2.47.0