From: Marc Fiuczynski
Date: Sat, 17 Jul 2004 13:00:36 +0000 (+0000)
Subject: Merged ckrm-E15 CPU controller
X-Git-Tag: before-ipod-patch~14
X-Git-Url: http://git.onelab.eu/?a=commitdiff_plain;h=92ade5af0325d0597318bb21d38bb34c99726835;p=linux-2.6.git

Merged ckrm-E15 CPU controller
---

diff --git a/.config b/.config
index 7f9bc8d9d..a1a300be4 100644
--- a/.config
+++ b/.config
@@ -29,6 +29,8 @@ CONFIG_CKRM=y
 CONFIG_RCFS_FS=y
 CONFIG_CKRM_TYPE_TASKCLASS=y
 CONFIG_CKRM_RES_NUMTASKS=y
+CONFIG_CKRM_CPU_SCHEDULE=y
+CONFIG_CKRM_CPU_MONITOR=y
 CONFIG_CKRM_TYPE_SOCKETCLASS=y
 CONFIG_CKRM_RES_LISTENAQ=m
 CONFIG_CKRM_RBCE=m
diff --git a/.config.old b/.config.old
index 55ca63656..b22fb717d 100644
--- a/.config.old
+++ b/.config.old
@@ -29,8 +29,13 @@ CONFIG_CKRM=y
 CONFIG_RCFS_FS=y
 CONFIG_CKRM_TYPE_TASKCLASS=y
 CONFIG_CKRM_RES_NUMTASKS=y
+CONFIG_CKRM_CPU_SCHEDULE=y
+CONFIG_CKRM_CPU_MONITOR=m
 CONFIG_CKRM_TYPE_SOCKETCLASS=y
 CONFIG_CKRM_RES_LISTENAQ=m
+CONFIG_CKRM_RBCE=m
+CONFIG_CKRM_CRBCE=m
+# CONFIG_BSD_PROCESS_ACCT_V3 is not set
 CONFIG_SYSCTL=y
 CONFIG_AUDIT=y
 CONFIG_AUDITSYSCALL=y
@@ -414,7 +419,6 @@ CONFIG_BLK_DEV_IDE=y
 # CONFIG_BLK_DEV_HD_IDE is not set
 CONFIG_BLK_DEV_IDEDISK=y
 CONFIG_IDEDISK_MULTI_MODE=y
-# CONFIG_IDEDISK_STROKE is not set
 CONFIG_BLK_DEV_IDECS=m
 CONFIG_BLK_DEV_IDECD=y
 CONFIG_BLK_DEV_IDETAPE=m
@@ -506,6 +510,7 @@ CONFIG_SCSI_FC_ATTRS=m
 # SCSI low-level drivers
 #
 CONFIG_BLK_DEV_3W_XXXX_RAID=m
+# CONFIG_SCSI_3W_9XXX is not set
 # CONFIG_SCSI_7000FASST is not set
 # CONFIG_SCSI_ACARD is not set
 CONFIG_SCSI_AHA152X=m
@@ -542,7 +547,6 @@ CONFIG_SCSI_SATA_VIA=m
 CONFIG_SCSI_SATA_VITESSE=m
 CONFIG_SCSI_BUSLOGIC=m
 # CONFIG_SCSI_OMIT_FLASHPOINT is not set
-# CONFIG_SCSI_CPQFCTS is not set
 # CONFIG_SCSI_DMX3191D is not set
 # CONFIG_SCSI_DTC3280 is not set
 # CONFIG_SCSI_EATA is not set
@@ -613,6 +617,9 @@ CONFIG_MD_RAID6=m
 CONFIG_MD_MULTIPATH=m
 CONFIG_BLK_DEV_DM=m
 CONFIG_DM_CRYPT=m
+# CONFIG_DM_SNAPSHOT is not set
+# CONFIG_DM_MIRROR is not set
+# CONFIG_DM_ZERO is not set
 
 #
 # Fusion MPT device support
 #
@@ -1083,6 +1090,7 @@ CONFIG_SUNDANCE=m
 CONFIG_TLAN=m
 CONFIG_VIA_RHINE=m
 CONFIG_VIA_RHINE_MMIO=y
+# CONFIG_VIA_VELOCITY is not set
 CONFIG_NET_POCKET=y
 CONFIG_ATP=m
 CONFIG_DE600=m
@@ -1576,6 +1584,7 @@ CONFIG_DRM_SIS=m
 CONFIG_SYNCLINK_CS=m
 CONFIG_MWAVE=m
 # CONFIG_RAW_DRIVER is not set
+# CONFIG_HPET is not set
 CONFIG_HANGCHECK_TIMER=m
 
 #
@@ -1783,6 +1792,7 @@ CONFIG_VIDEO_SELECT=y
 CONFIG_FB_HGA=m
 # CONFIG_FB_HGA_ACCEL is not set
 CONFIG_FB_RIVA=m
+# CONFIG_FB_RIVA_I2C is not set
 CONFIG_FB_I810=m
 CONFIG_FB_I810_GTF=y
 CONFIG_FB_MATROX=m
@@ -1993,6 +2003,7 @@ CONFIG_USB_ACM=m
 CONFIG_USB_PRINTER=m
 CONFIG_USB_STORAGE=m
 # CONFIG_USB_STORAGE_DEBUG is not set
+# CONFIG_USB_STORAGE_RW_DETECT is not set
 CONFIG_USB_STORAGE_DATAFAB=y
 CONFIG_USB_STORAGE_FREECOM=y
 CONFIG_USB_STORAGE_ISD200=y
@@ -2284,7 +2295,6 @@ CONFIG_MINIX_SUBPARTITION=y
 CONFIG_SOLARIS_X86_PARTITION=y
 CONFIG_UNIXWARE_DISKLABEL=y
 # CONFIG_LDM_PARTITION is not set
-# CONFIG_NEC98_PARTITION is not set
 CONFIG_SGI_PARTITION=y
 # CONFIG_ULTRIX_PARTITION is not set
 CONFIG_SUN_PARTITION=y
@@ -2411,5 +2421,4 @@ CONFIG_LIBCRC32C=m
 CONFIG_ZLIB_INFLATE=y
 CONFIG_ZLIB_DEFLATE=m
 CONFIG_X86_BIOS_REBOOT=y
-CONFIG_X86_STD_RESOURCES=y
 CONFIG_PC=y
diff --git a/include/linux/autoconf.h b/include/linux/autoconf.h
index 5ee385a8a..307d630b3 100644
--- a/include/linux/autoconf.h
+++ b/include/linux/autoconf.h
@@ -30,6 +30,8 @@
 #define CONFIG_RCFS_FS 1
 #define CONFIG_CKRM_TYPE_TASKCLASS 1
 #define CONFIG_CKRM_RES_NUMTASKS 1
+#define CONFIG_CKRM_CPU_SCHEDULE 1
+#define CONFIG_CKRM_CPU_MONITOR 1
 #define CONFIG_CKRM_TYPE_SOCKETCLASS 1
 #define CONFIG_CKRM_RES_LISTENAQ_MODULE 1
 #define CONFIG_CKRM_RBCE_MODULE 1
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 385b390fe..0b1efea4e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -551,6 +551,9 @@ struct task_struct {
 	// .. Hubertus should change to CONFIG_CKRM_TYPE_TASKCLASS
 	struct ckrm_task_class *taskclass;
 	struct list_head taskclass_link;
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+	struct ckrm_cpu_class *cpu_class;
+#endif
 #endif // CONFIG_CKRM_TYPE_TASKCLASS
 #endif // CONFIG_CKRM
 
@@ -1009,8 +1012,7 @@ static inline struct mm_struct * get_task_mm(struct task_struct * task)
 	return mm;
 }
 
-
-
+ 
 /* set thread flags in other task's structures
  * - see asm/thread_info.h for TIF_xxxx flags available
  */
diff --git a/init/Kconfig b/init/Kconfig
index d9153c5aa..94af58d28 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -168,6 +168,24 @@ config CKRM_RES_NUMTASKS
 
 	  Say N if unsure, Y to use the feature.
 
+config CKRM_CPU_SCHEDULE
+	bool "CKRM CPU scheduler"
+	depends on CKRM_TYPE_TASKCLASS
+	default m
+	help
+	  Use CKRM CPU scheduler instead of Linux Scheduler
+
+	  Say N if unsure, Y to use the feature.
+
+config CKRM_CPU_MONITOR
+	tristate "CKRM CPU Resoure Monitor"
+	depends on CKRM_CPU_SCHEDULE
+	default m
+	help
+	  Monitor CPU Resource Usage of the classes
+
+	  Say N if unsure, Y to use the feature.
+
 config CKRM_TYPE_SOCKETCLASS
 	bool "Class Manager for socket groups"
 	depends on CKRM
diff --git a/init/main.c b/init/main.c
index 155d05f35..0dfaf47af 100644
--- a/init/main.c
+++ b/init/main.c
@@ -49,6 +49,8 @@
 #include 
 #include 
 
+int __init init_ckrm_sched_res(void);
+
 /*
  * This is one of the first .c files built. Error out early
@@ -419,7 +421,6 @@ asmlinkage void __init start_kernel(void)
 	 * printk() and can access its per-cpu storage.
 	 */
 	smp_prepare_boot_cpu();
-
 	/*
 	 * Set up the scheduler prior starting any interrupts (such as the
 	 * timer interrupt). Full topology setup happens at smp_init()
@@ -638,8 +639,8 @@ static int init(void * unused)
 	 * firmware files.
 	 */
 	populate_rootfs();
-
 	do_basic_setup();
+	init_ckrm_sched_res();
 
 	/*
 	 * check if there is an early userspace init. If yes, let it do all
diff --git a/kernel/Makefile b/kernel/Makefile
index e1b650130..e0a5febdb 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -26,8 +26,12 @@ obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_IKCONFIG_PROC) += configs.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
+obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_classqueue.o
+obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_sched.o
 obj-$(CONFIG_AUDIT) += audit.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
+obj-$(CONFIG_KGDB) += kgdbstub.o
+
 ifneq ($(CONFIG_IA64),y)
 # According to Alan Modra , the -fno-omit-frame-pointer is
diff --git a/kernel/sched.c b/kernel/sched.c
index a5058d6ee..9667edb46 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -17,7 +17,6 @@
  *  2003-09-03	Interactivity tuning by Con Kolivas.
  *  2004-04-02	Scheduler domains code by Nick Piggin
  */
-
 #include 
 #include 
 #include 
@@ -161,9 +160,6 @@
 #define LOW_CREDIT(p) \
 	((p)->interactive_credit < -CREDIT_LIMIT)
 
-#define TASK_PREEMPTS_CURR(p, rq) \
-	((p)->prio < (rq)->curr->prio)
-
 /*
  * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
  * to time slice values.
@@ -189,16 +185,24 @@ static unsigned int task_timeslice(task_t *p)
 /*
  * These are the runqueue data structures:
  */
+typedef struct runqueue runqueue_t;
 
-#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+#include 
+#endif
 
-typedef struct runqueue runqueue_t;
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
 
-struct prio_array {
-	unsigned int nr_active;
-	unsigned long bitmap[BITMAP_SIZE];
-	struct list_head queue[MAX_PRIO];
-};
+/**
+ * if belong to different class, compare class priority
+ * otherwise compare task priority
+ */
+#define TASK_PREEMPTS_CURR(p, rq) \
+	(((p)->cpu_class != (rq)->curr->cpu_class) && ((rq)->curr != (rq)->idle))? class_preempts_curr((p),(rq)->curr) : ((p)->prio < (rq)->curr->prio)
+#else
+#define TASK_PREEMPTS_CURR(p, rq) \
+	((p)->prio < (rq)->curr->prio)
+#endif
 
 /*
  * This is the main, per-CPU runqueue data structure.
@@ -215,7 +219,7 @@ struct runqueue {
 	 * remote CPUs use both these fields when doing load calculation.
 	 */
 	unsigned long nr_running;
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP)
 	unsigned long cpu_load;
 #endif
 	unsigned long long nr_switches;
@@ -223,7 +227,12 @@ struct runqueue {
 	unsigned long long timestamp_last_tick;
 	task_t *curr, *idle;
 	struct mm_struct *prev_mm;
-	prio_array_t *active, *expired, arrays[2];
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+	unsigned long ckrm_cpu_load;
+	struct classqueue_struct classqueue;
+#else
+	prio_array_t *active, *expired, arrays[2];
+#endif
 	int best_expired_prio;
 	atomic_t nr_iowait;
@@ -260,12 +269,158 @@ static DEFINE_PER_CPU(struct runqueue, runqueues);
 # define task_running(rq, p)		((rq)->curr == (p))
 #endif
 
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+#include 
+spinlock_t cvt_lock = SPIN_LOCK_UNLOCKED;
+rwlock_t class_list_lock = RW_LOCK_UNLOCKED;
+LIST_HEAD(active_cpu_classes);   // list of active cpu classes; anchor
+struct ckrm_cpu_class default_cpu_class_obj;
+
+/*
+ * the minimum CVT allowed is the base_cvt
+ * otherwise, it will starve others
+ */
+CVT_t get_min_cvt(int cpu)
+{
+	cq_node_t *node;
+	struct ckrm_local_runqueue * lrq;
+	CVT_t min_cvt;
+
+	node = classqueue_get_head(bpt_queue(cpu));
+	lrq = (node) ? class_list_entry(node) : NULL;
+
+	if (lrq)
+		min_cvt = lrq->local_cvt;
+	else
+		min_cvt = 0;
+
+	return min_cvt;
+}
+
+/*
+ * update the classueue base for all the runqueues
+ * TODO: we can only update half of the min_base to solve the movebackward issue
+ */
+static inline void check_update_class_base(int this_cpu) {
+	unsigned long min_base = 0xFFFFFFFF;
+	cq_node_t *node;
+	int i;
+
+	if (! cpu_online(this_cpu)) return;
+
+	/*
+	 * find the min_base across all the processors
+	 */
+	for_each_online_cpu(i) {
+		/*
+		 * I should change it to directly use bpt->base
+		 */
+		node = classqueue_get_head(bpt_queue(i));
+		if (node && node->prio < min_base) {
+			min_base = node->prio;
+		}
+	}
+	if (min_base != 0xFFFFFFFF)
+		classqueue_update_base(bpt_queue(this_cpu),min_base);
+}
+
+static inline void ckrm_rebalance_tick(int j,int this_cpu)
+{
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+	read_lock(&class_list_lock);
+	if (!(j % CVT_UPDATE_TICK))
+		update_global_cvts(this_cpu);
+
+#define CKRM_BASE_UPDATE_RATE 400
+	if (! (jiffies % CKRM_BASE_UPDATE_RATE))
+		check_update_class_base(this_cpu);
+
+	read_unlock(&class_list_lock);
+#endif
+}
+
+static inline struct ckrm_local_runqueue *rq_get_next_class(struct runqueue *rq)
+{
+	cq_node_t *node = classqueue_get_head(&rq->classqueue);
+	return ((node) ? class_list_entry(node) : NULL);
+}
+
+static inline struct task_struct * rq_get_next_task(struct runqueue* rq)
+{
+	prio_array_t *array;
+	struct task_struct *next;
+	struct ckrm_local_runqueue *queue;
+	int cpu = smp_processor_id();
+
+	next = rq->idle;
+ retry_next_class:
+	if ((queue = rq_get_next_class(rq))) {
+		array = queue->active;
+		//check switch active/expired queue
+		if (unlikely(!queue->active->nr_active)) {
+			queue->active = queue->expired;
+			queue->expired = array;
+			queue->expired_timestamp = 0;
+
+			if (queue->active->nr_active)
+				set_top_priority(queue,
+						 find_first_bit(queue->active->bitmap, MAX_PRIO));
+			else {
+				classqueue_dequeue(queue->classqueue,
+						   &queue->classqueue_linkobj);
+				cpu_demand_event(get_rq_local_stat(queue,cpu),CPU_DEMAND_DEQUEUE,0);
+			}
+
+			goto retry_next_class;
+		}
+		BUG_ON(!queue->active->nr_active);
+		next = task_list_entry(array->queue[queue->top_priority].next);
+	}
+	return next;
+}
+
+static inline void rq_load_inc(runqueue_t *rq, struct task_struct *p) { rq->ckrm_cpu_load += cpu_class_weight(p->cpu_class); }
+static inline void rq_load_dec(runqueue_t *rq, struct task_struct *p) { rq->ckrm_cpu_load -= cpu_class_weight(p->cpu_class); }
+
+#else /*CONFIG_CKRM_CPU_SCHEDULE*/
+
+static inline struct task_struct * rq_get_next_task(struct runqueue* rq)
+{
+	prio_array_t *array;
+	struct list_head *queue;
+	int idx;
+
+	array = rq->active;
+	if (unlikely(!array->nr_active)) {
+		/*
+		 * Switch the active and expired arrays.
+		 */
+		rq->active = rq->expired;
+		rq->expired = array;
+		array = rq->active;
+		rq->expired_timestamp = 0;
+		rq->best_expired_prio = MAX_PRIO;
+	}
+
+	idx = sched_find_first_bit(array->bitmap);
+	queue = array->queue + idx;
+	return list_entry(queue->next, task_t, run_list);
+}
+
+static inline void class_enqueue_task(struct task_struct* p, prio_array_t *array) { }
+static inline void class_dequeue_task(struct task_struct* p, prio_array_t *array) { }
+static inline void init_cpu_classes(void) { }
+static inline void rq_load_inc(runqueue_t *rq, struct task_struct *p) { }
+static inline void rq_load_dec(runqueue_t *rq, struct task_struct *p) { }
+#endif /* CONFIG_CKRM_CPU_SCHEDULE */
+
+
 /*
  * task_rq_lock - lock the runqueue a given task resides on and disable
  * interrupts. Note the ordering: we can safely lookup the task_rq without
 * explicitly disabling preemption.
 */
-static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
+runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
 {
 	struct runqueue *rq;
 
@@ -280,7 +435,7 @@ repeat_lock_task:
 	return rq;
 }
 
-static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
+void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
 {
 	spin_unlock_irqrestore(&rq->lock, *flags);
 }
@@ -307,20 +462,23 @@ static inline void rq_unlock(runqueue_t *rq)
 /*
  * Adding/removing a task to/from a priority array:
  */
-static void dequeue_task(struct task_struct *p, prio_array_t *array)
+void dequeue_task(struct task_struct *p, prio_array_t *array)
 {
+	BUG_ON(!array);
 	array->nr_active--;
 	list_del(&p->run_list);
 	if (list_empty(array->queue + p->prio))
 		__clear_bit(p->prio, array->bitmap);
+	class_dequeue_task(p,array);
 }
 
-static void enqueue_task(struct task_struct *p, prio_array_t *array)
+void enqueue_task(struct task_struct *p, prio_array_t *array)
 {
 	list_add_tail(&p->run_list, array->queue + p->prio);
 	__set_bit(p->prio, array->bitmap);
 	array->nr_active++;
 	p->array = array;
+	class_enqueue_task(p,array);
 }
 
 /*
@@ -334,6 +492,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
 	__set_bit(p->prio, array->bitmap);
 	array->nr_active++;
 	p->array = array;
+	class_enqueue_task(p,array);
 }
 
 /*
@@ -375,8 +534,9 @@ static int effective_prio(task_t *p)
  */
 static inline void __activate_task(task_t *p, runqueue_t *rq)
 {
-	enqueue_task(p, rq->active);
+	enqueue_task(p, rq_active(p,rq));
 	rq->nr_running++;
+	rq_load_inc(rq,p);
 }
 
 /*
@@ -384,8 +544,9 @@ static inline void __activate_task(task_t *p, runqueue_t *rq)
  */
 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 {
-	enqueue_task_head(p, rq->active);
+	enqueue_task_head(p, rq_active(p,rq));
 	rq->nr_running++;
+	rq_load_inc(rq,p);
 }
 
 static void recalc_task_prio(task_t *p, unsigned long long now)
@@ -517,6 +678,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
 static void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
 	rq->nr_running--;
+	rq_load_dec(rq,p);
 	if (p->state == TASK_UNINTERRUPTIBLE)
 		rq->nr_uninterruptible++;
 	dequeue_task(p, p->array);
@@ -966,6 +1128,7 @@ void fastcall wake_up_forked_process(task_t * p)
 		p->array = current->array;
 		p->array->nr_active++;
 		rq->nr_running++;
+		rq_load_inc(rq,p);
 	}
 	task_rq_unlock(rq, &flags);
 }
@@ -1288,6 +1451,7 @@ lock_again:
 			p->array = current->array;
 			p->array->nr_active++;
 			rq->nr_running++;
+			rq_load_inc(rq,p);
 		}
 	} else {
 		/* Not the local CPU - must adjust timestamp */
@@ -1392,9 +1556,13 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
 {
 	dequeue_task(p, src_array);
 	src_rq->nr_running--;
+	rq_load_dec(src_rq,p);
+
 	set_task_cpu(p, this_cpu);
 	this_rq->nr_running++;
+	rq_load_inc(this_rq,p);
 	enqueue_task(p, this_array);
+
 	p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
 				+ this_rq->timestamp_last_tick;
 	/*
@@ -1433,6 +1601,194 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
 	return 1;
 }
 
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+
+struct ckrm_cpu_class *find_unbalanced_class(int busiest_cpu, int this_cpu, unsigned long *cls_imbalance)
+{
+	struct ckrm_cpu_class *most_unbalanced_class = NULL;
+	struct ckrm_cpu_class *clsptr;
+	int max_unbalance = 0;
+
+	list_for_each_entry(clsptr,&active_cpu_classes,links) {
+		struct ckrm_local_runqueue *this_lrq    = get_ckrm_local_runqueue(clsptr,this_cpu);
+		struct ckrm_local_runqueue *busiest_lrq = get_ckrm_local_runqueue(clsptr,busiest_cpu);
+		int unbalance_degree;
+
+		unbalance_degree = (local_queue_nr_running(busiest_lrq) - local_queue_nr_running(this_lrq)) * cpu_class_weight(clsptr);
+		if (unbalance_degree >= *cls_imbalance)
+			continue; // already looked at this class
+
+		if (unbalance_degree > max_unbalance) {
+			max_unbalance = unbalance_degree;
+			most_unbalanced_class = clsptr;
+		}
+	}
+	*cls_imbalance = max_unbalance;
+	return most_unbalanced_class;
+}
+
+
+/*
+ * find_busiest_queue - find the busiest runqueue among the cpus in cpumask.
+ */
+static int find_busiest_cpu(runqueue_t *this_rq, int this_cpu, int idle,
+			    int *imbalance)
+{
+	int cpu_load, load, max_load, i, busiest_cpu;
+	runqueue_t *busiest, *rq_src;
+
+
+	/*Hubertus ... the concept of nr_running is replace with cpu_load */
+	cpu_load = this_rq->ckrm_cpu_load;
+
+	busiest = NULL;
+	busiest_cpu = -1;
+
+	max_load = -1;
+	for_each_online_cpu(i) {
+		rq_src = cpu_rq(i);
+		load = rq_src->ckrm_cpu_load;
+
+		if ((load > max_load) && (rq_src != this_rq)) {
+			busiest = rq_src;
+			busiest_cpu = i;
+			max_load = load;
+		}
+	}
+
+	if (likely(!busiest))
+		goto out;
+
+	*imbalance = max_load - cpu_load;
+
+	/* It needs an at least ~25% imbalance to trigger balancing. */
+	if (!idle && ((*imbalance)*4 < max_load)) {
+		busiest = NULL;
+		goto out;
+	}
+
+	double_lock_balance(this_rq, busiest);
+	/*
+	 * Make sure nothing changed since we checked the
+	 * runqueue length.
+	 */
+	if (busiest->ckrm_cpu_load <= cpu_load) {
+		spin_unlock(&busiest->lock);
+		busiest = NULL;
+	}
+out:
+	return (busiest ? busiest_cpu : -1);
+}
+
+static int load_balance(int this_cpu, runqueue_t *this_rq,
+			struct sched_domain *sd, enum idle_type idle)
+{
+	int imbalance, idx;
+	int busiest_cpu;
+	runqueue_t *busiest;
+	prio_array_t *array;
+	struct list_head *head, *curr;
+	task_t *tmp;
+	struct ckrm_local_runqueue * busiest_local_queue;
+	struct ckrm_cpu_class *clsptr;
+	int weight;
+	unsigned long cls_imbalance;      // so we can retry other classes
+
+	// need to update global CVT based on local accumulated CVTs
+	read_lock(&class_list_lock);
+	busiest_cpu = find_busiest_cpu(this_rq, this_cpu, idle, &imbalance);
+	if (busiest_cpu == -1)
+		goto out;
+
+	busiest = cpu_rq(busiest_cpu);
+
+	/*
+	 * We only want to steal a number of tasks equal to 1/2 the imbalance,
+	 * otherwise we'll just shift the imbalance to the new queue:
+	 */
+	imbalance /= 2;
+
+	/* now find class on that runqueue with largest inbalance */
+	cls_imbalance = 0xFFFFFFFF;
+
+ retry_other_class:
+	clsptr = find_unbalanced_class(busiest_cpu, this_cpu, &cls_imbalance);
+	if (!clsptr)
+		goto out_unlock;
+
+	busiest_local_queue = get_ckrm_local_runqueue(clsptr,busiest_cpu);
+	weight = cpu_class_weight(clsptr);
+
+	/*
+	 * We first consider expired tasks. Those will likely not be
+	 * executed in the near future, and they are most likely to
+	 * be cache-cold, thus switching CPUs has the least effect
+	 * on them.
+	 */
+	if (busiest_local_queue->expired->nr_active)
+		array = busiest_local_queue->expired;
+	else
+		array = busiest_local_queue->active;
+
+ new_array:
+	/* Start searching at priority 0: */
+	idx = 0;
+ skip_bitmap:
+	if (!idx)
+		idx = sched_find_first_bit(array->bitmap);
+	else
+		idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
+	if (idx >= MAX_PRIO) {
+		if (array == busiest_local_queue->expired && busiest_local_queue->active->nr_active) {
+			array = busiest_local_queue->active;
+			goto new_array;
+		}
+		goto retry_other_class;
+	}
+
+	head = array->queue + idx;
+	curr = head->prev;
+ skip_queue:
+	tmp = list_entry(curr, task_t, run_list);
+
+	curr = curr->prev;
+
+	if (!can_migrate_task(tmp, busiest, this_cpu, sd,idle)) {
+		if (curr != head)
+			goto skip_queue;
+		idx++;
+		goto skip_bitmap;
+	}
+	pull_task(busiest, array, tmp, this_rq, rq_active(tmp,this_rq),this_cpu);
+	/*
+	 * tmp BUG FIX: hzheng
+	 * load balancing can make the busiest local queue empty
+	 * thus it should be removed from bpt
+	 */
+	if (!local_queue_nr_running(busiest_local_queue)) {
+		classqueue_dequeue(busiest_local_queue->classqueue,&busiest_local_queue->classqueue_linkobj);
+		cpu_demand_event(get_rq_local_stat(busiest_local_queue,busiest_cpu),CPU_DEMAND_DEQUEUE,0);
+	}
+
+	imbalance -= weight;
+	if (!idle && (imbalance>0)) {
+		if (curr != head)
+			goto skip_queue;
+		idx++;
+		goto skip_bitmap;
+	}
+ out_unlock:
+	spin_unlock(&busiest->lock);
+ out:
+	read_unlock(&class_list_lock);
+	return 0;
+}
+
+
+static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
+{
+}
+#else /* CONFIG_CKRM_CPU_SCHEDULE */
 /*
  * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
  * as part of a balancing operation within "domain". Returns the number of
@@ -1883,6 +2239,7 @@ next_group:
 	} while (group != sd->groups);
 >>>>>>> 1.1.9.3
 }
+#endif /* CONFIG_CKRM_CPU_SCHEDULE*/
 
 /*
  * rebalance_tick will get called every timer tick, on every CPU.
@@ -1903,6 +2260,8 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
 	unsigned long j = jiffies + CPU_OFFSET(this_cpu);
 	struct sched_domain *sd;
 
+	ckrm_rebalance_tick(j,this_cpu);
+
 	/* Update our load */
 	old_load = this_rq->cpu_load;
 	this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
@@ -1935,13 +2294,15 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
 		}
 	}
 }
-#else
+#else /* SMP*/
 /*
  * on UP we do not need to balance between CPUs:
  */
 static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
 {
+	ckrm_rebalance_tick(jiffies,cpu);
 }
+
 static inline void idle_balance(int cpu, runqueue_t *rq)
 {
 }
@@ -1962,7 +2323,7 @@ static inline int wake_priority_sleeper(runqueue_t *rq)
 	return 0;
 }
 
-DEFINE_PER_CPU(struct kernel_stat, kstat);
+DEFINE_PER_CPU(struct kernel_stat, kstat) = { { 0 } };
 
 EXPORT_PER_CPU_SYMBOL(kstat);
 
@@ -1976,11 +2337,19 @@ EXPORT_PER_CPU_SYMBOL(kstat);
  * increasing number of running tasks. We also ignore the interactivity
  * if a better static_prio task has expired:
  */
+
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
 #define EXPIRED_STARVING(rq) \
 	((STARVATION_LIMIT && ((rq)->expired_timestamp && \
 		(jiffies - (rq)->expired_timestamp >= \
 			STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
 			((rq)->curr->static_prio > (rq)->best_expired_prio))
+#else
+#define EXPIRED_STARVING(rq) \
+	(STARVATION_LIMIT && ((rq)->expired_timestamp && \
+		(jiffies - (rq)->expired_timestamp >= \
+			STARVATION_LIMIT * (local_queue_nr_running(rq)) + 1)))
+#endif
 
 /*
  * This function gets called by the timer code, with HZ frequency.
@@ -2030,7 +2399,7 @@ void scheduler_tick(int user_ticks, int sys_ticks)
 		cpustat->system += sys_ticks;
 
 	/* Task might have expired already, but not scheduled off yet */
-	if (p->array != rq->active) {
+	if (p->array != rq_active(p,rq)) {
 		set_tsk_need_resched(p);
 		goto out;
 	}
@@ -2053,12 +2422,17 @@ void scheduler_tick(int user_ticks, int sys_ticks)
 			set_tsk_need_resched(p);
 
 			/* put it at the end of the queue: */
-			dequeue_task(p, rq->active);
-			enqueue_task(p, rq->active);
+			dequeue_task(p, rq_active(p,rq));
+			enqueue_task(p, rq_active(p,rq));
 		}
 		goto out_unlock;
 	}
+#warning MEF PLANETLAB: "if (vx_need_resched(p)) was if (!--p->time_slice) */"
 	if (vx_need_resched(p)) {
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+		/* Hubertus ... we can abstract this out */
+		struct ckrm_local_runqueue* rq = get_task_class_queue(p);
+#endif
 		dequeue_task(p, rq->active);
 		set_tsk_need_resched(p);
 		p->prio = effective_prio(p);
@@ -2069,8 +2443,8 @@ void scheduler_tick(int user_ticks, int sys_ticks)
 			rq->expired_timestamp = jiffies;
 		if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
 			enqueue_task(p, rq->expired);
-			if (p->static_prio < rq->best_expired_prio)
-				rq->best_expired_prio = p->static_prio;
+			if (p->static_prio < this_rq()->best_expired_prio)
+				this_rq()->best_expired_prio = p->static_prio;
 		} else
 			enqueue_task(p, rq->active);
 	} else {
@@ -2093,12 +2467,12 @@ void scheduler_tick(int user_ticks, int sys_ticks)
 		if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
 			p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
 			(p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
-			(p->array == rq->active)) {
+			(p->array == rq_active(p,rq))) {
 
-			dequeue_task(p, rq->active);
+			dequeue_task(p, rq_active(p,rq));
 			set_tsk_need_resched(p);
 			p->prio = effective_prio(p);
-			enqueue_task(p, rq->active);
+			enqueue_task(p, rq_active(p,rq));
 		}
 	}
 out_unlock:
@@ -2201,10 +2575,9 @@ asmlinkage void __sched schedule(void)
 	task_t *prev, *next;
 	runqueue_t *rq;
 	prio_array_t *array;
-	struct list_head *queue;
 	unsigned long long now;
 	unsigned long run_time;
-	int cpu, idx;
+	int cpu;
 #ifdef CONFIG_VSERVER_HARDCPU
 	struct vx_info *vxi;
 	int maxidle = -HZ;
@@ -2300,21 +2673,9 @@ pick_next:
 		}
 	}
 
-	array = rq->active;
-	if (unlikely(!array->nr_active)) {
-		/*
-		 * Switch the active and expired arrays.
-		 */
-		rq->active = rq->expired;
-		rq->expired = array;
-		array = rq->active;
-		rq->expired_timestamp = 0;
-		rq->best_expired_prio = MAX_PRIO;
-	}
-
-	idx = sched_find_first_bit(array->bitmap);
-	queue = array->queue + idx;
-	next = list_entry(queue->next, task_t, run_list);
+	next = rq_get_next_task(rq);
+	if (next == rq->idle)
+		goto switch_tasks;
 
 	if (dependent_sleeper(cpu, rq, next)) {
 		next = rq->idle;
@@ -2355,6 +2716,14 @@ switch_tasks:
 	clear_tsk_need_resched(prev);
 	RCU_qsctr(task_cpu(prev))++;
 
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+	if (prev != rq->idle) {
+		unsigned long long run = now - prev->timestamp;
+		cpu_demand_event(get_task_local_stat(prev),CPU_DEMAND_DESCHEDULE,run);
+		update_local_cvt(prev, run);
+	}
+#endif
+
 	prev->sleep_avg -= run_time;
 	if ((long)prev->sleep_avg <= 0) {
 		prev->sleep_avg = 0;
@@ -3062,7 +3431,7 @@ asmlinkage long sys_sched_yield(void)
 {
 	runqueue_t *rq = this_rq_lock();
 	prio_array_t *array = current->array;
-	prio_array_t *target = rq->expired;
+	prio_array_t *target = rq_expired(current,rq);
 
 	/*
 	 * We implement yielding by moving the task into the expired
 	 * queue.
 	 *
 	 * (special rule: RT tasks will just roundrobin in the active
 	 * array.)
 	 */
 	if (unlikely(rt_task(current)))
-		target = rq->active;
+		target = rq_active(current,rq);
 
 	dequeue_task(current, array);
 	enqueue_task(current, target);
@@ -3497,7 +3866,9 @@ static int migration_thread(void * data)
 		}
 
 		if (rq->active_balance) {
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
 			active_load_balance(rq, cpu);
+#endif
 			rq->active_balance = 0;
 		}
 
@@ -3973,7 +4344,10 @@ int in_sched_functions(unsigned long addr)
 void __init sched_init(void)
 {
 	runqueue_t *rq;
-	int i, j, k;
+	int i;
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
+	int j, k;
+#endif
 
 #ifdef CONFIG_SMP
 	/* Set up an initial dummy domain for early boot */
@@ -3993,13 +4367,21 @@ void __init sched_init(void)
 	sched_group_init.cpu_power = SCHED_LOAD_SCALE;
 #endif
 
+	init_cpu_classes();
+
 	for (i = 0; i < NR_CPUS; i++) {
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
 		prio_array_t *array;
-
+#endif
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
+
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
 		rq->active = rq->arrays;
 		rq->expired = rq->arrays + 1;
+#else
+		rq->ckrm_cpu_load = 0;
+#endif
 		rq->best_expired_prio = MAX_PRIO;
 
 #ifdef CONFIG_SMP
@@ -4013,6 +4395,7 @@ void __init sched_init(void)
 		INIT_LIST_HEAD(&rq->hold_queue);
 		atomic_set(&rq->nr_iowait, 0);
 
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
 		for (j = 0; j < 2; j++) {
 			array = rq->arrays + j;
 			for (k = 0; k < MAX_PRIO; k++) {
@@ -4022,7 +4405,9 @@ void __init sched_init(void)
 			// delimiter for bitsearch
 			__set_bit(MAX_PRIO, array->bitmap);
 		}
+#endif
 	}
+
 	/*
 	 * We have to do a little magic to get the first
 	 * thread right in SMP mode.
@@ -4031,6 +4416,10 @@ void __init sched_init(void)
 	rq->curr = current;
 	rq->idle = current;
 	set_task_cpu(current, smp_processor_id());
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+	current->cpu_class = default_cpu_class;
+	current->array = NULL;
+#endif
 	wake_up_forked_process(current);
 
 	/*
@@ -4116,3 +4505,13 @@ int task_running_sys(struct task_struct *p)
 EXPORT_SYMBOL(task_running_sys);
 #endif
 
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+/**
+ * return the classqueue object of a certain processor
+ * Note: not supposed to be used in performance sensitive functions
+ */
+struct classqueue_struct * get_cpu_classqueue(int cpu)
+{
+	return (& (cpu_rq(cpu)->classqueue) );
+}
+#endif
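
The behavioural core of this merge is that, under CONFIG_CKRM_CPU_SCHEDULE, picking the next task becomes a two-level decision: rq_get_next_task() first asks the per-CPU classqueue for the highest-priority class (rq_get_next_class()) and only then takes the best-priority task from that class's local runqueue, and the reworked TASK_PREEMPTS_CURR likewise compares class priority before task priority. The short user-space C program below is only a sketch of that selection order; its types, names and numbers are invented for illustration and are not code from this commit or from the CKRM tree.

/*
 * Toy illustration of the CKRM-style two-level pick:
 * first choose the best class, then the best task within it.
 */
#include <stdio.h>

#define MAX_PRIO  8   /* toy task-priority range   */
#define MAX_CLASS 4   /* toy number of CPU classes */

struct toy_class {
	const char *name;
	int class_prio;           /* classqueue key: lower runs first   */
	int nr_active[MAX_PRIO];  /* runnable tasks per task priority   */
};

/* level 1: pick the class with the lowest class_prio that has work */
static struct toy_class *pick_class(struct toy_class *cls, int n)
{
	struct toy_class *best = NULL;
	for (int i = 0; i < n; i++) {
		int has_work = 0;
		for (int p = 0; p < MAX_PRIO; p++)
			has_work |= cls[i].nr_active[p];
		if (has_work && (!best || cls[i].class_prio < best->class_prio))
			best = &cls[i];
	}
	return best;
}

/* level 2: within the chosen class, take the best non-empty priority */
static int pick_prio(const struct toy_class *c)
{
	for (int p = 0; p < MAX_PRIO; p++)
		if (c->nr_active[p])
			return p;
	return -1;
}

int main(void)
{
	struct toy_class classes[MAX_CLASS] = {
		{ "default", 3, { [5] = 2 } },  /* two tasks at prio 5 */
		{ "batch",   7, { [1] = 1 } },  /* one task at prio 1  */
		{ "gold",    1, { [6] = 1 } },  /* one task at prio 6  */
		{ "idleish", 9, { 0 } },        /* nothing runnable    */
	};

	struct toy_class *c = pick_class(classes, MAX_CLASS);
	if (c)
		printf("next task: class \"%s\", task prio %d\n",
		       c->name, pick_prio(c));
	return 0;
}

Built with any C99 compiler, the sketch selects the task from the "gold" class even though the "batch" class holds a task with a numerically better task priority, which is the ordering change the new TASK_PREEMPTS_CURR encodes.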