From: Marc Fiuczynski Date: Wed, 29 Sep 2004 11:36:16 +0000 (+0000) Subject: ckrm_E16rc1 cpu controller version 8.2 X-Git-Tag: ckrm_E16rc1-cpu-controller-v8_2~1 X-Git-Url: http://git.onelab.eu/?a=commitdiff_plain;h=cf90186e50f0d22add321b5bf3b1c8a8635620a6;p=linux-2.6.git ckrm_E16rc1 cpu controller version 8.2 --- diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index d66058134..7b6856363 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -886,7 +886,5 @@ ENTRY(sys_call_table) .long sys_mq_notify .long sys_mq_getsetattr .long sys_ni_syscall /* reserved for kexec */ - .long sys_ioprio_set - .long sys_ioprio_get /* 285 */ syscall_table_size=(.-sys_call_table) diff --git a/arch/ppc/kernel/misc.S b/arch/ppc/kernel/misc.S index 32e1e4059..873199e43 100644 --- a/arch/ppc/kernel/misc.S +++ b/arch/ppc/kernel/misc.S @@ -1450,5 +1450,3 @@ _GLOBAL(sys_call_table) .long sys_mq_notify .long sys_mq_getsetattr .long sys_ni_syscall /* 268 reserved for sys_kexec_load */ - .long sys_ioprio_set - .long sys_ioprio_get diff --git a/drivers/block/Makefile b/drivers/block/Makefile index c66498bad..2654b5b76 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -13,13 +13,12 @@ # kblockd threads # -obj-y := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o ckrm-iostub.o +obj-y := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_AS) += as-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o -obj-$(CONFIG_CKRM_RES_BLKIO) += ckrm-io.o obj-$(CONFIG_MAC_FLOPPY) += swim3.o obj-$(CONFIG_BLK_DEV_FD) += floppy.o obj-$(CONFIG_BLK_DEV_FD98) += floppy98.o diff --git a/drivers/block/cfq-iosched.c b/drivers/block/cfq-iosched.c index 7b45a805d..068f4eae0 100644 --- a/drivers/block/cfq-iosched.c +++ b/drivers/block/cfq-iosched.c @@ -6,18 +6,6 @@ * Based on ideas from a previously unfinished io * scheduler (round robin per-process 
disk scheduling) and Andrea Arcangeli. * - * IO priorities are supported, from 0% to 100% in 5% increments. Both of - * those values have special meaning - 0% class is allowed to do io if - * noone else wants to use the disk. 100% is considered real-time io, and - * always get priority. Default process io rate is 95%. In absence of other - * io, a class may consume 100% disk bandwidth regardless. Withing a class, - * bandwidth is distributed equally among the citizens. - * - * TODO: - * - cfq_select_requests() needs some work for 5-95% io - * - barriers not supported - * - export grace periods in ms, not jiffies - * * Copyright (C) 2003 Jens Axboe */ #include @@ -33,186 +21,78 @@ #include #include #include -#include - -#if IOPRIO_NR > BITS_PER_LONG -#error Cannot support this many io priority levels -#endif - -#define LIMIT_DEBUG 1 /* * tunables */ -static int cfq_quantum = 6; -static int cfq_quantum_io = 256; -static int cfq_idle_quantum = 1; -static int cfq_idle_quantum_io = 64; -static int cfq_queued = 4; -static int cfq_grace_rt = HZ / 100 ?: 1; -static int cfq_grace_idle = HZ / 10; +static int cfq_quantum = 4; +static int cfq_queued = 8; #define CFQ_QHASH_SHIFT 6 #define CFQ_QHASH_ENTRIES (1 << CFQ_QHASH_SHIFT) -#define list_entry_qhash(entry) hlist_entry((entry), struct cfq_queue, cfq_hash) +#define list_entry_qhash(entry) list_entry((entry), struct cfq_queue, cfq_hash) #define CFQ_MHASH_SHIFT 8 #define CFQ_MHASH_BLOCK(sec) ((sec) >> 3) #define CFQ_MHASH_ENTRIES (1 << CFQ_MHASH_SHIFT) #define CFQ_MHASH_FN(sec) (hash_long(CFQ_MHASH_BLOCK((sec)),CFQ_MHASH_SHIFT)) +#define ON_MHASH(crq) !list_empty(&(crq)->hash) #define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors) -#define list_entry_hash(ptr) hlist_entry((ptr), struct cfq_rq, hash) +#define list_entry_hash(ptr) list_entry((ptr), struct cfq_rq, hash) #define list_entry_cfqq(ptr) list_entry((ptr), struct cfq_queue, cfq_list) -#define list_entry_prio(ptr) list_entry((ptr), struct cfq_rq, prio_list) - -#define 
cfq_account_io(crq) \ - ((crq)->ioprio != IOPRIO_IDLE && (crq)->ioprio != IOPRIO_RT) - -/* define to be 50 ms for now; make tunable later */ -#define CFQ_EPOCH 50000 -/* Needs to be made tunable right away, in MiB/s */ -#define CFQ_DISKBW 10 -/* Temporary global limit, as percent of available b/w, for each "class" */ -#define CFQ_TEMPLIM 10 - -/* - * defines how we distribute bandwidth (can be tgid, uid, etc) - */ - -/* FIXME: change hash_key to be sizeof(void *) rather than sizeof(int) - * otherwise the cast of cki_tsk_icls will not work reliably on 64-bit arches. - * OR, change cki_tsk_icls to return ints (will need another id space to be - * managed) - */ - -#if defined(CONFIG_CKRM_RES_BLKIO) || defined(CONFIG_CKRM_RES_BLKIO_MODULE) -extern inline void *cki_hash_key(struct task_struct *tsk); -extern inline int cki_ioprio(struct task_struct *tsk); -#define cfq_hash_key(current) ((int)cki_hash_key((current))) -#define cfq_ioprio(current) (cki_ioprio((current))) - -#else -#define cfq_hash_key(current) ((current)->tgid) - -/* - * move to io_context - */ -#define cfq_ioprio(current) ((current)->ioprio) -#endif -#define CFQ_WAIT_RT 0 -#define CFQ_WAIT_NORM 1 +#define RQ_DATA(rq) ((struct cfq_rq *) (rq)->elevator_private) static kmem_cache_t *crq_pool; static kmem_cache_t *cfq_pool; static mempool_t *cfq_mpool; -/* - * defines an io priority level - */ -struct io_prio_data { - struct list_head rr_list; - int busy_queues; - int busy_rq; - unsigned long busy_sectors; - - /* requests, sectors and queues - * added(in),dispatched/deleted(out) - * at this priority level. 
- */ - atomic_t cum_rq_in,cum_rq_out; - atomic_t cum_sectors_in,cum_sectors_out; - atomic_t cum_queues_in,cum_queues_out; - -#ifdef LIMIT_DEBUG - int nskip; - unsigned long navsec; - unsigned long csectorate; - unsigned long lsectorate; -#endif - - struct list_head prio_list; - int last_rq; - int last_sectors; -}; - -/* - * per-request queue structure - */ struct cfq_data { struct list_head rr_list; struct list_head *dispatch; - struct hlist_head *cfq_hash; - struct hlist_head *crq_hash; - mempool_t *crq_pool; + struct list_head *cfq_hash; - struct io_prio_data cid[IOPRIO_NR]; + struct list_head *crq_hash; - /* - * total number of busy queues and requests - */ - int busy_rq; - int busy_queues; - unsigned long busy_sectors; + unsigned int busy_queues; + unsigned int max_queued; + mempool_t *crq_pool; request_queue_t *queue; - unsigned long rq_starved_mask; - - /* - * grace period handling - */ - struct timer_list timer; - unsigned long wait_end; - unsigned long flags; - struct work_struct work; /* * tunables */ unsigned int cfq_quantum; - unsigned int cfq_quantum_io; - unsigned int cfq_idle_quantum; - unsigned int cfq_idle_quantum_io; unsigned int cfq_queued; - unsigned int cfq_grace_rt; - unsigned int cfq_grace_idle; - - unsigned long cfq_epoch; /* duration for limit enforcement */ - unsigned long cfq_epochsectors; /* max sectors dispatchable/epoch */ }; -/* - * per-class structure - */ struct cfq_queue { + struct list_head cfq_hash; struct list_head cfq_list; - struct hlist_node cfq_hash; - int hash_key; struct rb_root sort_list; + int pid; int queued[2]; - int ioprio; - - unsigned long avsec; /* avg sectors dispatched/epoch */ - unsigned long long lastime; /* timestamp of last request served */ - unsigned long sectorate; /* limit for sectors served/epoch */ - int skipped; /* queue skipped at last dispatch ? */ +#if 0 + /* + * with a simple addition like this, we can do io priorities. almost. + * does need a split request free list, too. 
+ */ + int io_prio +#endif }; -/* - * per-request structure - */ struct cfq_rq { - struct cfq_queue *cfq_queue; struct rb_node rb_node; - struct hlist_node hash; sector_t rb_key; struct request *request; - struct list_head prio_list; - unsigned long nr_sectors; - int ioprio; + + struct cfq_queue *cfq_queue; + + struct list_head hash; }; static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq); @@ -223,13 +103,18 @@ static void cfq_dispatch_sort(struct cfq_data *cfqd, struct cfq_queue *cfqq, /* * lots of deadline iosched dupes, can be abstracted later... */ +static inline void __cfq_del_crq_hash(struct cfq_rq *crq) +{ + list_del_init(&crq->hash); +} + static inline void cfq_del_crq_hash(struct cfq_rq *crq) { - hlist_del_init(&crq->hash); + if (ON_MHASH(crq)) + __cfq_del_crq_hash(crq); } -static inline void -cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq) +static void cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq) { cfq_del_crq_hash(crq); @@ -240,26 +125,27 @@ cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq) static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq) { struct request *rq = crq->request; - const int hash_idx = CFQ_MHASH_FN(rq_hash_key(rq)); - BUG_ON(!hlist_unhashed(&crq->hash)); - - hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]); + BUG_ON(ON_MHASH(crq)); + + list_add(&crq->hash, &cfqd->crq_hash[CFQ_MHASH_FN(rq_hash_key(rq))]); } static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset) { - struct hlist_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)]; - struct hlist_node *entry, *next; + struct list_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)]; + struct list_head *entry, *next = hash_list->next; - hlist_for_each_safe(entry, next, hash_list) { + while ((entry = next) != hash_list) { struct cfq_rq *crq = list_entry_hash(entry); struct request *__rq = crq->request; - BUG_ON(hlist_unhashed(&crq->hash)); + next = entry->next; + + 
BUG_ON(!ON_MHASH(crq)); if (!rq_mergeable(__rq)) { - cfq_del_crq_hash(crq); + __cfq_del_crq_hash(crq); continue; } @@ -273,27 +159,20 @@ static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset) /* * rb tree support functions */ -#define RB_EMPTY(node) ((node)->rb_node == NULL) +#define RB_NONE (2) +#define RB_EMPTY(node) ((node)->rb_node == NULL) +#define RB_CLEAR(node) ((node)->rb_color = RB_NONE) +#define RB_CLEAR_ROOT(root) ((root)->rb_node = NULL) +#define ON_RB(node) ((node)->rb_color != RB_NONE) #define rb_entry_crq(node) rb_entry((node), struct cfq_rq, rb_node) #define rq_rb_key(rq) (rq)->sector -static void -cfq_del_crq_rb(struct cfq_data *cfqd, struct cfq_queue *cfqq,struct cfq_rq *crq) +static inline void cfq_del_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) { - if (crq->cfq_queue) { - crq->cfq_queue = NULL; - - if (cfq_account_io(crq)) { - cfqd->busy_rq--; - cfqd->busy_sectors -= crq->nr_sectors; - cfqd->cid[crq->ioprio].busy_rq--; - cfqd->cid[crq->ioprio].busy_sectors -= crq->nr_sectors; - } - atomic_inc(&(cfqd->cid[crq->ioprio].cum_rq_out)); - atomic_add(crq->nr_sectors, - &(cfqd->cid[crq->ioprio].cum_sectors_out)); + if (ON_RB(&crq->rb_node)) { cfqq->queued[rq_data_dir(crq->request)]--; rb_erase(&crq->rb_node, &cfqq->sort_list); + crq->cfq_queue = NULL; } } @@ -326,22 +205,12 @@ cfq_add_crq_rb(struct cfq_data *cfqd, struct cfq_queue *cfqq,struct cfq_rq *crq) struct request *rq = crq->request; struct cfq_rq *__alias; - + crq->rb_key = rq_rb_key(rq); cfqq->queued[rq_data_dir(rq)]++; - if (cfq_account_io(crq)) { - cfqd->busy_rq++; - cfqd->busy_sectors += crq->nr_sectors; - cfqd->cid[crq->ioprio].busy_rq++; - cfqd->cid[crq->ioprio].busy_sectors += crq->nr_sectors; - } - atomic_inc(&(cfqd->cid[crq->ioprio].cum_rq_in)); - atomic_add(crq->nr_sectors, - &(cfqd->cid[crq->ioprio].cum_sectors_in)); retry: __alias = __cfq_add_crq_rb(cfqq, crq); if (!__alias) { rb_insert_color(&crq->rb_node, &cfqq->sort_list); - crq->rb_key = 
rq_rb_key(rq); crq->cfq_queue = cfqq; return; } @@ -353,7 +222,7 @@ retry: static struct request * cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector) { - struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(current)); + struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->tgid); struct rb_node *n; if (!cfqq) @@ -378,31 +247,16 @@ out: static void cfq_remove_request(request_queue_t *q, struct request *rq) { struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_rq *crq = RQ_ELV_DATA(rq); + struct cfq_rq *crq = RQ_DATA(rq); if (crq) { + struct cfq_queue *cfqq = crq->cfq_queue; cfq_remove_merge_hints(q, crq); - list_del_init(&crq->prio_list); list_del_init(&rq->queuelist); - /* - * set a grace period timer to allow realtime io to make real - * progress, if we release an rt request. for normal request, - * set timer so idle io doesn't interfere with other io - */ - if (crq->ioprio == IOPRIO_RT) { - set_bit(CFQ_WAIT_RT, &cfqd->flags); - cfqd->wait_end = jiffies + cfqd->cfq_grace_rt; - } else if (crq->ioprio != IOPRIO_IDLE) { - set_bit(CFQ_WAIT_NORM, &cfqd->flags); - cfqd->wait_end = jiffies + cfqd->cfq_grace_idle; - } - - if (crq->cfq_queue) { - struct cfq_queue *cfqq = crq->cfq_queue; - - cfq_del_crq_rb(cfqd, cfqq, crq); + if (cfqq) { + cfq_del_crq_rb(cfqq, crq); if (RB_EMPTY(&cfqq->sort_list)) cfq_put_queue(cfqd, cfqq); @@ -452,26 +306,18 @@ out_insert: static void cfq_merged_request(request_queue_t *q, struct request *req) { struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_rq *crq = RQ_ELV_DATA(req); - int tmp; + struct cfq_rq *crq = RQ_DATA(req); cfq_del_crq_hash(crq); cfq_add_crq_hash(cfqd, crq); - if (crq->cfq_queue && (rq_rb_key(req) != crq->rb_key)) { + if (ON_RB(&crq->rb_node) && (rq_rb_key(req) != crq->rb_key)) { struct cfq_queue *cfqq = crq->cfq_queue; - cfq_del_crq_rb(cfqd, cfqq, crq); + cfq_del_crq_rb(cfqq, crq); cfq_add_crq_rb(cfqd, cfqq, crq); } - tmp = req->hard_nr_sectors - crq->nr_sectors; - cfqd->busy_sectors 
+= tmp; - cfqd->cid[crq->ioprio].busy_sectors += tmp; - atomic_add(tmp,&(cfqd->cid[crq->ioprio].cum_sectors_in)); - - crq->nr_sectors = req->hard_nr_sectors; - q->last_merge = req; } @@ -483,9 +329,6 @@ cfq_merged_requests(request_queue_t *q, struct request *req, cfq_remove_request(q, next); } -/* - * sort into dispatch list, in optimal ascending order - */ static void cfq_dispatch_sort(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_rq *crq) @@ -493,7 +336,7 @@ cfq_dispatch_sort(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct list_head *head = cfqd->dispatch, *entry = head; struct request *__rq; - cfq_del_crq_rb(cfqd, cfqq, crq); + cfq_del_crq_rb(cfqq, crq); cfq_remove_merge_hints(cfqd->queue, crq); if (!list_empty(head)) { @@ -516,219 +359,47 @@ link: list_add_tail(&crq->request->queuelist, entry); } -/* - * remove from io scheduler core and put on dispatch list for service - */ -static inline int +static inline void __cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd, struct cfq_queue *cfqq) { - struct cfq_rq *crq; - unsigned long long ts, gap; - unsigned long newavsec; - - crq = rb_entry_crq(rb_first(&cfqq->sort_list)); - -#if 1 - /* Determine if queue should be skipped for being overshare */ - ts = sched_clock(); - gap = ts - cfqq->lastime; -#ifdef LIMIT_DEBUG - cfqq->sectorate = (cfqd->cfq_epochsectors - * CFQ_TEMPLIM)/100; - -#endif - if ((gap >= cfqd->cfq_epoch) || (gap < 0)) { - cfqq->avsec = crq->nr_sectors ; - cfqq->lastime = ts; - } else { - u64 tmp; - /* Age old average and accumalate request to be served */ - -// tmp = (u64) (cfqq->avsec * gap) ; -// do_div(tmp, cfqd->cfq_epoch); - newavsec = (unsigned long)(cfqq->avsec >> 1) + crq->nr_sectors; -// if (crq->ioprio >= 0 && crq->ioprio <= 20) -// cfqd->cid[crq->ioprio].lsectorate = newavsec; -// atomic_set(&(cfqd->cid[crq->ioprio].lsectorate), -// newavsec); - - if ((newavsec < cfqq->sectorate) || cfqq->skipped) { - cfqq->avsec = newavsec ; - cfqq->lastime = ts; - 
cfqq->skipped = 0; - } else { - /* queue over share ; skip once */ - cfqq->skipped = 1; -#ifdef LIMIT_DEBUG -// atomic_inc(&(cfqd->cid[crq->ioprio].nskip)); -// if (crq->ioprio >= 0 && crq->ioprio <= 20) -// cfqd->cid[crq->ioprio].nskip++; -#endif - return 0; - } - } -#endif - -#ifdef LIMIT_DEBUG -// if (crq->ioprio >= 0 && crq->ioprio <= 20) { -// cfqd->cid[crq->ioprio].navsec = cfqq->avsec; -// cfqd->cid[crq->ioprio].csectorate = cfqq->sectorate; -// } + struct cfq_rq *crq = rb_entry_crq(rb_first(&cfqq->sort_list)); -// atomic_set(&(cfqd->cid[crq->ioprio].navsec),cfqq->avsec); -// atomic_set(&(cfqd->cid[crq->ioprio].csectorate),cfqq->sectorate); -#endif cfq_dispatch_sort(cfqd, cfqq, crq); - - /* - * technically, for IOPRIO_RT we don't need to add it to the list. - */ - list_add_tail(&crq->prio_list, &cfqd->cid[cfqq->ioprio].prio_list); - return crq->nr_sectors; } -static int -cfq_dispatch_requests(request_queue_t *q, int prio, int max_rq, int max_sectors) +static int cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd) { - struct cfq_data *cfqd = q->elevator.elevator_data; - struct list_head *plist = &cfqd->cid[prio].rr_list; - struct list_head *entry, *nxt; - int q_rq, q_io; - int ret ; + struct cfq_queue *cfqq; + struct list_head *entry, *tmp; + int ret, queued, good_queues; - /* - * for each queue at this prio level, dispatch a request - */ - q_rq = q_io = 0; - list_for_each_safe(entry, nxt, plist) { - struct cfq_queue *cfqq = list_entry_cfqq(entry); + if (list_empty(&cfqd->rr_list)) + return 0; + + queued = ret = 0; +restart: + good_queues = 0; + list_for_each_safe(entry, tmp, &cfqd->rr_list) { + cfqq = list_entry_cfqq(cfqd->rr_list.next); BUG_ON(RB_EMPTY(&cfqq->sort_list)); - ret = __cfq_dispatch_requests(q, cfqd, cfqq); - if (ret <= 0) { - continue; /* skip queue */ - /* can optimize more by moving q to end of plist ? 
*/ - } - q_io += ret ; - q_rq++ ; + __cfq_dispatch_requests(q, cfqd, cfqq); if (RB_EMPTY(&cfqq->sort_list)) cfq_put_queue(cfqd, cfqq); - /* - * if we hit the queue limit, put the string of serviced - * queues at the back of the pending list - */ - if (q_io >= max_sectors || q_rq >= max_rq) { - struct list_head *prv = nxt->prev; - - if (prv != plist) { - list_del(plist); - list_add(plist, prv); - } - break; - } - } - - cfqd->cid[prio].last_rq = q_rq; - cfqd->cid[prio].last_sectors = q_io; - return q_rq; -} - -/* - * try to move some requests to the dispatch list. return 0 on success - */ -static int cfq_select_requests(request_queue_t *q, struct cfq_data *cfqd) -{ - int queued, busy_rq, busy_sectors, i; - - /* - * if there's any realtime io, only schedule that - */ - if (cfq_dispatch_requests(q, IOPRIO_RT, cfqd->cfq_quantum, cfqd->cfq_quantum_io)) - return 1; - - /* - * if RT io was last serviced and grace time hasn't expired, - * arm the timer to restart queueing if no other RT io has been - * submitted in the mean time - */ - if (test_bit(CFQ_WAIT_RT, &cfqd->flags)) { - if (time_before(jiffies, cfqd->wait_end)) { - mod_timer(&cfqd->timer, cfqd->wait_end); - return 0; - } - clear_bit(CFQ_WAIT_RT, &cfqd->flags); - } - - /* - * for each priority level, calculate number of requests we - * are allowed to put into service. 
- */ - queued = 0; - busy_rq = cfqd->busy_rq; - busy_sectors = cfqd->busy_sectors; - for (i = IOPRIO_RT - 1; i > IOPRIO_IDLE; i--) { - const int o_rq = busy_rq - cfqd->cid[i].busy_rq; - const int o_sectors = busy_sectors - cfqd->cid[i].busy_sectors; - int q_rq = cfqd->cfq_quantum * (i + 1) / IOPRIO_NR; - int q_io = cfqd->cfq_quantum_io * (i + 1) / IOPRIO_NR; - - /* - * no need to keep iterating the list, if there are no - * requests pending anymore - */ - if (!cfqd->busy_rq) - break; - - /* - * find out how many requests and sectors we are allowed to - * service - */ - if (o_rq) - q_rq = o_sectors * (i + 1) / IOPRIO_NR; - if (q_rq > cfqd->cfq_quantum) - q_rq = cfqd->cfq_quantum; - - if (o_sectors) - q_io = o_sectors * (i + 1) / IOPRIO_NR; - if (q_io > cfqd->cfq_quantum_io) - q_io = cfqd->cfq_quantum_io; - - /* - * average with last dispatched for fairness - */ - if (cfqd->cid[i].last_rq != -1) - q_rq = (cfqd->cid[i].last_rq + q_rq) / 2; - if (cfqd->cid[i].last_sectors != -1) - q_io = (cfqd->cid[i].last_sectors + q_io) / 2; - - queued += cfq_dispatch_requests(q, i, q_rq, q_io); - } - - if (queued) - return 1; + else + good_queues++; - /* - * only allow dispatch of idle io, if the queue has been idle from - * servicing RT or normal io for the grace period - */ - if (test_bit(CFQ_WAIT_NORM, &cfqd->flags)) { - if (time_before(jiffies, cfqd->wait_end)) { - mod_timer(&cfqd->timer, cfqd->wait_end); - return 0; - } - clear_bit(CFQ_WAIT_NORM, &cfqd->flags); + queued++; + ret = 1; } - /* - * if we found nothing to do, allow idle io to be serviced - */ - if (cfq_dispatch_requests(q, IOPRIO_IDLE, cfqd->cfq_idle_quantum, cfqd->cfq_idle_quantum_io)) - return 1; + if ((queued < cfqd->cfq_quantum) && good_queues) + goto restart; - return 0; + return ret; } static struct request *cfq_next_request(request_queue_t *q) @@ -739,82 +410,61 @@ static struct request *cfq_next_request(request_queue_t *q) if (!list_empty(cfqd->dispatch)) { struct cfq_rq *crq; dispatch: - /* - * end grace 
period, we are servicing a request - */ - del_timer(&cfqd->timer); - clear_bit(CFQ_WAIT_RT, &cfqd->flags); - clear_bit(CFQ_WAIT_NORM, &cfqd->flags); - - BUG_ON(list_empty(cfqd->dispatch)); rq = list_entry_rq(cfqd->dispatch->next); - BUG_ON(q->last_merge == rq); - crq = RQ_ELV_DATA(rq); - if (crq) { - BUG_ON(!hlist_unhashed(&crq->hash)); - list_del_init(&crq->prio_list); - } + crq = RQ_DATA(rq); + if (crq) + cfq_remove_merge_hints(q, crq); return rq; } - /* - * we moved requests to dispatch list, go back end serve one - */ - if (cfq_select_requests(q, cfqd)) + if (cfq_dispatch_requests(q, cfqd)) goto dispatch; return NULL; } static inline struct cfq_queue * -__cfq_find_cfq_hash(struct cfq_data *cfqd, int hashkey, const int hashval) +__cfq_find_cfq_hash(struct cfq_data *cfqd, int pid, const int hashval) { - struct hlist_head *hash_list = &cfqd->cfq_hash[hashval]; - struct hlist_node *entry; + struct list_head *hash_list = &cfqd->cfq_hash[hashval]; + struct list_head *entry; - hlist_for_each(entry, hash_list) { + list_for_each(entry, hash_list) { struct cfq_queue *__cfqq = list_entry_qhash(entry); - if (__cfqq->hash_key == hashkey) + if (__cfqq->pid == pid) return __cfqq; } return NULL; } - -static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int hashkey) +static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid) { - const int hashval = hash_long(hashkey, CFQ_QHASH_SHIFT); + const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT); - return __cfq_find_cfq_hash(cfqd, hashkey, hashval); + return __cfq_find_cfq_hash(cfqd, pid, hashval); } static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { cfqd->busy_queues--; - WARN_ON(cfqd->busy_queues < 0); - - cfqd->cid[cfqq->ioprio].busy_queues--; - WARN_ON(cfqd->cid[cfqq->ioprio].busy_queues < 0); - atomic_inc(&(cfqd->cid[cfqq->ioprio].cum_queues_out)); - list_del(&cfqq->cfq_list); - hlist_del(&cfqq->cfq_hash); + list_del(&cfqq->cfq_hash); mempool_free(cfqq, cfq_mpool); } 
-static struct cfq_queue *__cfq_get_queue(struct cfq_data *cfqd, int hashkey, +static struct cfq_queue *__cfq_get_queue(struct cfq_data *cfqd, int pid, int gfp_mask) { - const int hashval = hash_long(hashkey, CFQ_QHASH_SHIFT); + const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT); struct cfq_queue *cfqq, *new_cfqq = NULL; request_queue_t *q = cfqd->queue; retry: - cfqq = __cfq_find_cfq_hash(cfqd, hashkey, hashval); + cfqq = __cfq_find_cfq_hash(cfqd, pid, hashval); if (!cfqq) { if (new_cfqq) { @@ -828,15 +478,13 @@ retry: } else return NULL; - memset(cfqq, 0, sizeof(*cfqq)); - INIT_HLIST_NODE(&cfqq->cfq_hash); + INIT_LIST_HEAD(&cfqq->cfq_hash); INIT_LIST_HEAD(&cfqq->cfq_list); - cfqq->hash_key = cfq_hash_key(current); - cfqq->ioprio = cfq_ioprio(current); - cfqq->avsec = 0 ; - cfqq->lastime = sched_clock(); - cfqq->sectorate = (cfqd->cfq_epochsectors * CFQ_TEMPLIM)/100; - hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); + RB_CLEAR_ROOT(&cfqq->sort_list); + + cfqq->pid = pid; + cfqq->queued[0] = cfqq->queued[1] = 0; + list_add(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); } if (new_cfqq) @@ -845,63 +493,31 @@ retry: return cfqq; } -static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, int hashkey, +static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, int pid, int gfp_mask) { request_queue_t *q = cfqd->queue; struct cfq_queue *cfqq; spin_lock_irq(q->queue_lock); - cfqq = __cfq_get_queue(cfqd, hashkey, gfp_mask); + cfqq = __cfq_get_queue(cfqd, pid, gfp_mask); spin_unlock_irq(q->queue_lock); return cfqq; } -static void -__cfq_enqueue(request_queue_t *q, struct cfq_data *cfqd, struct cfq_rq *crq) +static void cfq_enqueue(struct cfq_data *cfqd, struct cfq_rq *crq) { - const int prio = crq->ioprio; struct cfq_queue *cfqq; - cfqq = __cfq_get_queue(cfqd, cfq_hash_key(current), GFP_ATOMIC); + cfqq = __cfq_get_queue(cfqd, current->tgid, GFP_ATOMIC); if (cfqq) { - - /* - * not too good... 
- */ - if (prio > cfqq->ioprio) { - printk("prio hash collision %d %d\n", - prio, cfqq->ioprio); - if (!list_empty(&cfqq->cfq_list)) { - cfqd->cid[cfqq->ioprio].busy_queues--; - WARN_ON(cfqd->cid[cfqq->ioprio].busy_queues<0); - atomic_inc(&(cfqd->cid[cfqq->ioprio].cum_queues_out)); - cfqd->cid[prio].busy_queues++; - atomic_inc(&(cfqd->cid[prio].cum_queues_in)); - list_move_tail(&cfqq->cfq_list, - &cfqd->cid[prio].rr_list); - } - cfqq->ioprio = prio; - } - cfq_add_crq_rb(cfqd, cfqq, crq); if (list_empty(&cfqq->cfq_list)) { - list_add_tail(&cfqq->cfq_list, - &cfqd->cid[prio].rr_list); - cfqd->cid[prio].busy_queues++; - atomic_inc(&(cfqd->cid[prio].cum_queues_in)); + list_add(&cfqq->cfq_list, &cfqd->rr_list); cfqd->busy_queues++; } - - if (rq_mergeable(crq->request)) { - cfq_add_crq_hash(cfqd, crq); - - if (!q->last_merge) - q->last_merge = crq->request; - } - } else { /* * should can only happen if the request wasn't allocated @@ -912,57 +528,16 @@ __cfq_enqueue(request_queue_t *q, struct cfq_data *cfqd, struct cfq_rq *crq) } } -static void cfq_reenqueue(request_queue_t *q, struct cfq_data *cfqd, int prio) -{ - struct list_head *prio_list = &cfqd->cid[prio].prio_list; - struct list_head *entry, *tmp; - - list_for_each_safe(entry, tmp, prio_list) { - struct cfq_rq *crq = list_entry_prio(entry); - - list_del_init(entry); - list_del_init(&crq->request->queuelist); - __cfq_enqueue(q, cfqd, crq); - } -} - -static void -cfq_enqueue(request_queue_t *q, struct cfq_data *cfqd, struct cfq_rq *crq) -{ - const int prio = cfq_ioprio(current); - - crq->ioprio = prio; - crq->nr_sectors = crq->request->hard_nr_sectors; - __cfq_enqueue(q, cfqd, crq); - - if (prio == IOPRIO_RT) { - int i; - - /* - * realtime io gets priority, move all other io back - */ - for (i = IOPRIO_IDLE; i < IOPRIO_RT; i++) - cfq_reenqueue(q, cfqd, i); - } else if (prio != IOPRIO_IDLE) { - /* - * check if we need to move idle io back into queue - */ - cfq_reenqueue(q, cfqd, IOPRIO_IDLE); - } -} - static void 
cfq_insert_request(request_queue_t *q, struct request *rq, int where) { struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_rq *crq = RQ_ELV_DATA(rq); + struct cfq_rq *crq = RQ_DATA(rq); switch (where) { case ELEVATOR_INSERT_BACK: -#if 0 while (cfq_dispatch_requests(q, cfqd)) ; -#endif list_add_tail(&rq->queuelist, cfqd->dispatch); break; case ELEVATOR_INSERT_FRONT: @@ -970,20 +545,26 @@ cfq_insert_request(request_queue_t *q, struct request *rq, int where) break; case ELEVATOR_INSERT_SORT: BUG_ON(!blk_fs_request(rq)); - cfq_enqueue(q, cfqd, crq); + cfq_enqueue(cfqd, crq); break; default: - printk("%s: bad insert point %d\n", - __FUNCTION__,where); + printk("%s: bad insert point %d\n", __FUNCTION__,where); return; } + + if (rq_mergeable(rq)) { + cfq_add_crq_hash(cfqd, crq); + + if (!q->last_merge) + q->last_merge = rq; + } } static int cfq_queue_empty(request_queue_t *q) { struct cfq_data *cfqd = q->elevator.elevator_data; - if (list_empty(cfqd->dispatch) && !cfqd->busy_queues) + if (list_empty(cfqd->dispatch) && list_empty(&cfqd->rr_list)) return 1; return 0; @@ -992,7 +573,7 @@ static int cfq_queue_empty(request_queue_t *q) static struct request * cfq_former_request(request_queue_t *q, struct request *rq) { - struct cfq_rq *crq = RQ_ELV_DATA(rq); + struct cfq_rq *crq = RQ_DATA(rq); struct rb_node *rbprev = rb_prev(&crq->rb_node); if (rbprev) @@ -1004,7 +585,7 @@ cfq_former_request(request_queue_t *q, struct request *rq) static struct request * cfq_latter_request(request_queue_t *q, struct request *rq) { - struct cfq_rq *crq = RQ_ELV_DATA(rq); + struct cfq_rq *crq = RQ_DATA(rq); struct rb_node *rbnext = rb_next(&crq->rb_node); if (rbnext) @@ -1013,46 +594,27 @@ cfq_latter_request(request_queue_t *q, struct request *rq) return NULL; } -static void cfq_queue_congested(request_queue_t *q) -{ - struct cfq_data *cfqd = q->elevator.elevator_data; - - set_bit(cfq_ioprio(current), &cfqd->rq_starved_mask); -} - static int cfq_may_queue(request_queue_t *q, int 
rw) { struct cfq_data *cfqd = q->elevator.elevator_data; struct cfq_queue *cfqq; - const int prio = cfq_ioprio(current); - int limit, ret = 1; + int ret = 1; if (!cfqd->busy_queues) goto out; - cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(current)); - if (!cfqq) - goto out; - - cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(current)); - if (!cfqq) - goto out; - - /* - * if higher or equal prio io is sleeping waiting for a request, don't - * allow this one to allocate one. as long as ll_rw_blk does fifo - * waitqueue wakeups this should work... - */ - if (cfqd->rq_starved_mask & ~((1 << prio) - 1)) - goto out; + cfqq = cfq_find_cfq_hash(cfqd, current->tgid); + if (cfqq) { + int limit = (q->nr_requests - cfqd->cfq_queued) / cfqd->busy_queues; - if (cfqq->queued[rw] < cfqd->cfq_queued || !cfqd->cid[prio].busy_queues) - goto out; + if (limit < 3) + limit = 3; + else if (limit > cfqd->max_queued) + limit = cfqd->max_queued; - limit = q->nr_requests * (prio + 1) / IOPRIO_NR; - limit /= cfqd->cid[prio].busy_queues; - if (cfqq->queued[rw] > limit) - ret = 0; + if (cfqq->queued[rw] > limit) + ret = 0; + } out: return ret; } @@ -1060,13 +622,13 @@ out: static void cfq_put_request(request_queue_t *q, struct request *rq) { struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_rq *crq = RQ_ELV_DATA(rq); + struct cfq_rq *crq = RQ_DATA(rq); struct request_list *rl; int other_rw; if (crq) { BUG_ON(q->last_merge == rq); - BUG_ON(!hlist_unhashed(&crq->hash)); + BUG_ON(ON_MHASH(crq)); mempool_free(crq, cfqd->crq_pool); rq->elevator_private = NULL; @@ -1099,21 +661,17 @@ static int cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask) /* * prepare a queue up front, so cfq_enqueue() doesn't have to */ - cfqq = cfq_get_queue(cfqd, cfq_hash_key(current), gfp_mask); + cfqq = cfq_get_queue(cfqd, current->tgid, gfp_mask); if (!cfqq) return 1; crq = mempool_alloc(cfqd->crq_pool, gfp_mask); if (crq) { - /* - * process now has one request - */ - 
clear_bit(cfq_ioprio(current), &cfqd->rq_starved_mask); - memset(crq, 0, sizeof(*crq)); + RB_CLEAR(&crq->rb_node); crq->request = rq; - INIT_HLIST_NODE(&crq->hash); - INIT_LIST_HEAD(&crq->prio_list); + crq->cfq_queue = NULL; + INIT_LIST_HEAD(&crq->hash); rq->elevator_private = crq; return 0; } @@ -1132,26 +690,6 @@ static void cfq_exit(request_queue_t *q, elevator_t *e) kfree(cfqd); } -static void cfq_timer(unsigned long data) -{ - struct cfq_data *cfqd = (struct cfq_data *) data; - - clear_bit(CFQ_WAIT_RT, &cfqd->flags); - clear_bit(CFQ_WAIT_NORM, &cfqd->flags); - kblockd_schedule_work(&cfqd->work); -} - -static void cfq_work(void *data) -{ - request_queue_t *q = data; - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - if (cfq_next_request(q)) - q->request_fn(q); - spin_unlock_irqrestore(q->queue_lock, flags); -} - static int cfq_init(request_queue_t *q, elevator_t *e) { struct cfq_data *cfqd; @@ -1162,75 +700,38 @@ static int cfq_init(request_queue_t *q, elevator_t *e) return -ENOMEM; memset(cfqd, 0, sizeof(*cfqd)); - init_timer(&cfqd->timer); - cfqd->timer.function = cfq_timer; - cfqd->timer.data = (unsigned long) cfqd; - - INIT_WORK(&cfqd->work, cfq_work, q); - - for (i = 0; i < IOPRIO_NR; i++) { - struct io_prio_data *cid = &cfqd->cid[i]; - - INIT_LIST_HEAD(&cid->rr_list); - INIT_LIST_HEAD(&cid->prio_list); - cid->last_rq = -1; - cid->last_sectors = -1; - - atomic_set(&cid->cum_rq_in,0); - atomic_set(&cid->cum_rq_out,0); - atomic_set(&cid->cum_sectors_in,0); - atomic_set(&cid->cum_sectors_out,0); - atomic_set(&cid->cum_queues_in,0); - atomic_set(&cid->cum_queues_out,0); -#if 0 - atomic_set(&cid->nskip,0); - atomic_set(&cid->navsec,0); - atomic_set(&cid->csectorate,0); - atomic_set(&cid->lsectorate,0); -#endif - } + INIT_LIST_HEAD(&cfqd->rr_list); - cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, - GFP_KERNEL); + cfqd->crq_hash = kmalloc(sizeof(struct list_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL); if (!cfqd->crq_hash) 
goto out_crqhash; - cfqd->cfq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, - GFP_KERNEL); + cfqd->cfq_hash = kmalloc(sizeof(struct list_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL); if (!cfqd->cfq_hash) goto out_cfqhash; - cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, - mempool_free_slab, crq_pool); + cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, crq_pool); if (!cfqd->crq_pool) goto out_crqpool; for (i = 0; i < CFQ_MHASH_ENTRIES; i++) - INIT_HLIST_HEAD(&cfqd->crq_hash[i]); + INIT_LIST_HEAD(&cfqd->crq_hash[i]); for (i = 0; i < CFQ_QHASH_ENTRIES; i++) - INIT_HLIST_HEAD(&cfqd->cfq_hash[i]); - - cfqd->cfq_queued = cfq_queued; - cfqd->cfq_quantum = cfq_quantum; - cfqd->cfq_quantum_io = cfq_quantum_io; - cfqd->cfq_idle_quantum = cfq_idle_quantum; - cfqd->cfq_idle_quantum_io = cfq_idle_quantum_io; - cfqd->cfq_grace_rt = cfq_grace_rt; - cfqd->cfq_grace_idle = cfq_grace_idle; - - q->nr_requests <<= 2; + INIT_LIST_HEAD(&cfqd->cfq_hash[i]); cfqd->dispatch = &q->queue_head; e->elevator_data = cfqd; cfqd->queue = q; - cfqd->cfq_epoch = CFQ_EPOCH; - if (q->hardsect_size) - cfqd->cfq_epochsectors = ((CFQ_DISKBW * 1000000)/ - q->hardsect_size)* (1000000 / CFQ_EPOCH); - else - cfqd->cfq_epochsectors = ((CFQ_DISKBW * 1000000)/512) - * (1000000 / CFQ_EPOCH) ; + /* + * just set it to some high value, we want anyone to be able to queue + * some requests. 
fairness is handled differently + */ + cfqd->max_queued = q->nr_requests; + q->nr_requests = 8192; + + cfqd->cfq_queued = cfq_queued; + cfqd->cfq_quantum = cfq_quantum; return 0; out_crqpool: @@ -1296,12 +797,7 @@ static ssize_t __FUNC(struct cfq_data *cfqd, char *page) \ return cfq_var_show(__VAR, (page)); \ } SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum); -SHOW_FUNCTION(cfq_quantum_io_show, cfqd->cfq_quantum_io); -SHOW_FUNCTION(cfq_idle_quantum_show, cfqd->cfq_idle_quantum); -SHOW_FUNCTION(cfq_idle_quantum_io_show, cfqd->cfq_idle_quantum_io); SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued); -SHOW_FUNCTION(cfq_grace_rt_show, cfqd->cfq_grace_rt); -SHOW_FUNCTION(cfq_grace_idle_show, cfqd->cfq_grace_idle); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ @@ -1315,271 +811,23 @@ static ssize_t __FUNC(struct cfq_data *cfqd, const char *page, size_t count) \ return ret; \ } STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, INT_MAX); -STORE_FUNCTION(cfq_quantum_io_store, &cfqd->cfq_quantum_io, 4, INT_MAX); -STORE_FUNCTION(cfq_idle_quantum_store, &cfqd->cfq_idle_quantum, 1, INT_MAX); -STORE_FUNCTION(cfq_idle_quantum_io_store, &cfqd->cfq_idle_quantum_io, 4, INT_MAX); STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, INT_MAX); -STORE_FUNCTION(cfq_grace_rt_store, &cfqd->cfq_grace_rt, 0, INT_MAX); -STORE_FUNCTION(cfq_grace_idle_store, &cfqd->cfq_grace_idle, 0, INT_MAX); #undef STORE_FUNCTION - -static ssize_t cfq_epoch_show(struct cfq_data *cfqd, char *page) -{ - return sprintf(page, "%lu\n", cfqd->cfq_epoch); -} - -static ssize_t cfq_epoch_store(struct cfq_data *cfqd, const char *page, size_t count) -{ - char *p = (char *) page; - cfqd->cfq_epoch = simple_strtoul(p, &p, 10); - return count; -} - -static ssize_t cfq_epochsectors_show(struct cfq_data *cfqd, char *page) -{ - return sprintf(page, "%lu\n", cfqd->cfq_epochsectors); -} - -static ssize_t -cfq_epochsectors_store(struct cfq_data *cfqd, const char *page, size_t count) -{ - char 
*p = (char *) page; - cfqd->cfq_epochsectors = simple_strtoul(p, &p, 10); - return count; -} - -/* Additional entries to get priority level data */ -static ssize_t -cfq_prio_show(struct cfq_data *cfqd, char *page, unsigned int priolvl) -{ - int r1,r2,s1,s2,q1,q2; - - if (!(priolvl >= IOPRIO_IDLE && priolvl <= IOPRIO_RT)) - return 0; - - r1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_rq_in)); - r2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_rq_out)); - s1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_sectors_in)); - s2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_sectors_out)); - q1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_queues_in)); - q2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_queues_out)); - - return sprintf(page,"skip %d avsec %lu rate %lu new %lu" - "rq (%d,%d) sec (%d,%d) q (%d,%d)\n", - cfqd->cid[priolvl].nskip, - cfqd->cid[priolvl].navsec, - cfqd->cid[priolvl].csectorate, - cfqd->cid[priolvl].lsectorate, -// atomic_read(&cfqd->cid[priolvl].nskip), -// atomic_read(&cfqd->cid[priolvl].navsec), -// atomic_read(&cfqd->cid[priolvl].csectorate), -// atomic_read(&cfqd->cid[priolvl].lsectorate), - r1,r2, - s1,s2, - q1,q2); -} - -#define SHOW_PRIO_DATA(__PRIOLVL) \ -static ssize_t cfq_prio_##__PRIOLVL##_show(struct cfq_data *cfqd, char *page) \ -{ \ - return cfq_prio_show(cfqd,page,__PRIOLVL); \ -} -SHOW_PRIO_DATA(0); -SHOW_PRIO_DATA(1); -SHOW_PRIO_DATA(2); -SHOW_PRIO_DATA(3); -SHOW_PRIO_DATA(4); -SHOW_PRIO_DATA(5); -SHOW_PRIO_DATA(6); -SHOW_PRIO_DATA(7); -SHOW_PRIO_DATA(8); -SHOW_PRIO_DATA(9); -SHOW_PRIO_DATA(10); -SHOW_PRIO_DATA(11); -SHOW_PRIO_DATA(12); -SHOW_PRIO_DATA(13); -SHOW_PRIO_DATA(14); -SHOW_PRIO_DATA(15); -SHOW_PRIO_DATA(16); -SHOW_PRIO_DATA(17); -SHOW_PRIO_DATA(18); -SHOW_PRIO_DATA(19); -SHOW_PRIO_DATA(20); -#undef SHOW_PRIO_DATA - - -static ssize_t cfq_prio_store(struct cfq_data *cfqd, const char *page, size_t count, int priolvl) -{ - atomic_set(&(cfqd->cid[priolvl].cum_rq_in),0); - atomic_set(&(cfqd->cid[priolvl].cum_rq_out),0); - 
atomic_set(&(cfqd->cid[priolvl].cum_sectors_in),0); - atomic_set(&(cfqd->cid[priolvl].cum_sectors_out),0); - atomic_set(&(cfqd->cid[priolvl].cum_queues_in),0); - atomic_set(&(cfqd->cid[priolvl].cum_queues_out),0); - - return count; -} - - -#define STORE_PRIO_DATA(__PRIOLVL) \ -static ssize_t cfq_prio_##__PRIOLVL##_store(struct cfq_data *cfqd, const char *page, size_t count) \ -{ \ - return cfq_prio_store(cfqd,page,count,__PRIOLVL); \ -} -STORE_PRIO_DATA(0); -STORE_PRIO_DATA(1); -STORE_PRIO_DATA(2); -STORE_PRIO_DATA(3); -STORE_PRIO_DATA(4); -STORE_PRIO_DATA(5); -STORE_PRIO_DATA(6); -STORE_PRIO_DATA(7); -STORE_PRIO_DATA(8); -STORE_PRIO_DATA(9); -STORE_PRIO_DATA(10); -STORE_PRIO_DATA(11); -STORE_PRIO_DATA(12); -STORE_PRIO_DATA(13); -STORE_PRIO_DATA(14); -STORE_PRIO_DATA(15); -STORE_PRIO_DATA(16); -STORE_PRIO_DATA(17); -STORE_PRIO_DATA(18); -STORE_PRIO_DATA(19); -STORE_PRIO_DATA(20); -#undef STORE_PRIO_DATA - - static struct cfq_fs_entry cfq_quantum_entry = { .attr = {.name = "quantum", .mode = S_IRUGO | S_IWUSR }, .show = cfq_quantum_show, .store = cfq_quantum_store, }; -static struct cfq_fs_entry cfq_quantum_io_entry = { - .attr = {.name = "quantum_io", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_quantum_io_show, - .store = cfq_quantum_io_store, -}; -static struct cfq_fs_entry cfq_idle_quantum_entry = { - .attr = {.name = "idle_quantum", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_idle_quantum_show, - .store = cfq_idle_quantum_store, -}; -static struct cfq_fs_entry cfq_idle_quantum_io_entry = { - .attr = {.name = "idle_quantum_io", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_idle_quantum_io_show, - .store = cfq_idle_quantum_io_store, -}; static struct cfq_fs_entry cfq_queued_entry = { .attr = {.name = "queued", .mode = S_IRUGO | S_IWUSR }, .show = cfq_queued_show, .store = cfq_queued_store, }; -static struct cfq_fs_entry cfq_grace_rt_entry = { - .attr = {.name = "grace_rt", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_grace_rt_show, - .store = cfq_grace_rt_store, -}; 
-static struct cfq_fs_entry cfq_grace_idle_entry = { - .attr = {.name = "grace_idle", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_grace_idle_show, - .store = cfq_grace_idle_store, -}; -static struct cfq_fs_entry cfq_epoch_entry = { - .attr = {.name = "epoch", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_epoch_show, - .store = cfq_epoch_store, -}; -static struct cfq_fs_entry cfq_epochsectors_entry = { - .attr = {.name = "epochsectors", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_epochsectors_show, - .store = cfq_epochsectors_store, -}; - -#define P_0_STR "p0" -#define P_1_STR "p1" -#define P_2_STR "p2" -#define P_3_STR "p3" -#define P_4_STR "p4" -#define P_5_STR "p5" -#define P_6_STR "p6" -#define P_7_STR "p7" -#define P_8_STR "p8" -#define P_9_STR "p9" -#define P_10_STR "p10" -#define P_11_STR "p11" -#define P_12_STR "p12" -#define P_13_STR "p13" -#define P_14_STR "p14" -#define P_15_STR "p15" -#define P_16_STR "p16" -#define P_17_STR "p17" -#define P_18_STR "p18" -#define P_19_STR "p19" -#define P_20_STR "p20" - - -#define CFQ_PRIO_SYSFS_ENTRY(__PRIOLVL) \ -static struct cfq_fs_entry cfq_prio_##__PRIOLVL##_entry = { \ - .attr = {.name = P_##__PRIOLVL##_STR, .mode = S_IRUGO | S_IWUSR }, \ - .show = cfq_prio_##__PRIOLVL##_show, \ - .store = cfq_prio_##__PRIOLVL##_store, \ -}; -CFQ_PRIO_SYSFS_ENTRY(0); -CFQ_PRIO_SYSFS_ENTRY(1); -CFQ_PRIO_SYSFS_ENTRY(2); -CFQ_PRIO_SYSFS_ENTRY(3); -CFQ_PRIO_SYSFS_ENTRY(4); -CFQ_PRIO_SYSFS_ENTRY(5); -CFQ_PRIO_SYSFS_ENTRY(6); -CFQ_PRIO_SYSFS_ENTRY(7); -CFQ_PRIO_SYSFS_ENTRY(8); -CFQ_PRIO_SYSFS_ENTRY(9); -CFQ_PRIO_SYSFS_ENTRY(10); -CFQ_PRIO_SYSFS_ENTRY(11); -CFQ_PRIO_SYSFS_ENTRY(12); -CFQ_PRIO_SYSFS_ENTRY(13); -CFQ_PRIO_SYSFS_ENTRY(14); -CFQ_PRIO_SYSFS_ENTRY(15); -CFQ_PRIO_SYSFS_ENTRY(16); -CFQ_PRIO_SYSFS_ENTRY(17); -CFQ_PRIO_SYSFS_ENTRY(18); -CFQ_PRIO_SYSFS_ENTRY(19); -CFQ_PRIO_SYSFS_ENTRY(20); -#undef CFQ_PRIO_SYSFS_ENTRY static struct attribute *default_attrs[] = { &cfq_quantum_entry.attr, - &cfq_quantum_io_entry.attr, - 
&cfq_idle_quantum_entry.attr, - &cfq_idle_quantum_io_entry.attr, &cfq_queued_entry.attr, - &cfq_grace_rt_entry.attr, - &cfq_grace_idle_entry.attr, - &cfq_epoch_entry.attr, - &cfq_epochsectors_entry.attr, - &cfq_prio_0_entry.attr, - &cfq_prio_1_entry.attr, - &cfq_prio_2_entry.attr, - &cfq_prio_3_entry.attr, - &cfq_prio_4_entry.attr, - &cfq_prio_5_entry.attr, - &cfq_prio_6_entry.attr, - &cfq_prio_7_entry.attr, - &cfq_prio_8_entry.attr, - &cfq_prio_9_entry.attr, - &cfq_prio_10_entry.attr, - &cfq_prio_11_entry.attr, - &cfq_prio_12_entry.attr, - &cfq_prio_13_entry.attr, - &cfq_prio_14_entry.attr, - &cfq_prio_15_entry.attr, - &cfq_prio_16_entry.attr, - &cfq_prio_17_entry.attr, - &cfq_prio_18_entry.attr, - &cfq_prio_19_entry.attr, - &cfq_prio_20_entry.attr, NULL, }; @@ -1635,7 +883,6 @@ elevator_t iosched_cfq = { .elevator_set_req_fn = cfq_set_request, .elevator_put_req_fn = cfq_put_request, .elevator_may_queue_fn = cfq_may_queue, - .elevator_set_congested_fn = cfq_queue_congested, .elevator_init_fn = cfq_init, .elevator_exit_fn = cfq_exit, }; diff --git a/drivers/block/elevator.c b/drivers/block/elevator.c index 950eb9923..35c9385ac 100644 --- a/drivers/block/elevator.c +++ b/drivers/block/elevator.c @@ -339,14 +339,6 @@ void elv_put_request(request_queue_t *q, struct request *rq) e->elevator_put_req_fn(q, rq); } -void elv_set_congested(request_queue_t *q) -{ - elevator_t *e = &q->elevator; - - if (e->elevator_set_congested_fn) - e->elevator_set_congested_fn(q); -} - int elv_may_queue(request_queue_t *q, int rw) { elevator_t *e = &q->elevator; @@ -354,7 +346,7 @@ int elv_may_queue(request_queue_t *q, int rw) if (e->elevator_may_queue_fn) return e->elevator_may_queue_fn(q, rw); - return 1; + return 0; } void elv_completed_request(request_queue_t *q, struct request *rq) diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index b6ff3448b..17c403ebd 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -1594,10 +1594,6 @@ static struct 
request *get_request(request_queue_t *q, int rw, int gfp_mask) struct io_context *ioc = get_io_context(gfp_mask); spin_lock_irq(q->queue_lock); - - if (!elv_may_queue(q, rw)) - goto out_lock; - if (rl->count[rw]+1 >= q->nr_requests) { /* * The queue will fill after this allocation, so set it as @@ -1611,12 +1607,15 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) } } - /* - * The queue is full and the allocating process is not a - * "batcher", and not exempted by the IO scheduler - */ - if (blk_queue_full(q, rw) && !ioc_batching(ioc)) - goto out_lock; + if (blk_queue_full(q, rw) + && !ioc_batching(ioc) && !elv_may_queue(q, rw)) { + /* + * The queue is full and the allocating process is not a + * "batcher", and not exempted by the IO scheduler + */ + spin_unlock_irq(q->queue_lock); + goto out; + } rl->count[rw]++; if (rl->count[rw] >= queue_congestion_on_threshold(q)) @@ -1634,7 +1633,8 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) */ spin_lock_irq(q->queue_lock); freed_request(q, rw); - goto out_lock; + spin_unlock_irq(q->queue_lock); + goto out; } if (ioc_batching(ioc)) @@ -1664,11 +1664,6 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) out: put_io_context(ioc); return rq; -out_lock: - if (!rq) - elv_set_congested(q); - spin_unlock_irq(q->queue_lock); - goto out; } /* @@ -3172,21 +3167,3 @@ void blk_unregister_queue(struct gendisk *disk) kobject_put(&disk->kobj); } } - -asmlinkage int sys_ioprio_set(int ioprio) -{ - if (ioprio < IOPRIO_IDLE || ioprio > IOPRIO_RT) - return -EINVAL; - if (ioprio == IOPRIO_RT && !capable(CAP_SYS_ADMIN)) - return -EACCES; - - printk("%s: set ioprio %d\n", current->comm, ioprio); - current->ioprio = ioprio; - return 0; -} - -asmlinkage int sys_ioprio_get(void) -{ - return current->ioprio; -} - diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index 30bbe7fcd..ef936b861 100644 --- a/include/asm-i386/unistd.h +++ 
b/include/asm-i386/unistd.h @@ -289,10 +289,8 @@ #define __NR_mq_notify (__NR_mq_open+4) #define __NR_mq_getsetattr (__NR_mq_open+5) #define __NR_sys_kexec_load 283 -#define __NR_ioprio_set 284 -#define __NR_ioprio_get 285 -#define NR_syscalls 286 +#define NR_syscalls 284 /* user-visible error numbers are in the range -1 - -124: see */ diff --git a/include/asm-ppc/unistd.h b/include/asm-ppc/unistd.h index bdf4ebe9b..57fb02c6c 100644 --- a/include/asm-ppc/unistd.h +++ b/include/asm-ppc/unistd.h @@ -273,10 +273,8 @@ #define __NR_mq_notify 266 #define __NR_mq_getsetattr 267 #define __NR_kexec_load 268 -#define __NR_ioprio_set 269 -#define __NR_ioprio_get 270 -#define __NR_syscalls 271 +#define __NR_syscalls 269 #define __NR(n) #n diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h index 0b0a6a10d..26e0aa30b 100644 --- a/include/asm-x86_64/unistd.h +++ b/include/asm-x86_64/unistd.h @@ -554,12 +554,8 @@ __SYSCALL(__NR_mq_notify, sys_mq_notify) __SYSCALL(__NR_mq_getsetattr, sys_mq_getsetattr) #define __NR_kexec_load 246 __SYSCALL(__NR_kexec_load, sys_ni_syscall) -#define __NR_ioprio_set 247 -__SYSCALL(__NR_ioprio_set, sys_ioprio_set); -#define __NR_ioprio_get 248 -__SYSCALL(__NR_ioprio_get, sys_ioprio_get); -#define __NR_syscall_max __NR_ioprio_get +#define __NR_syscall_max __NR_kexec_load #ifndef __NO_STUBS /* user-visible error numbers are in the range -1 - -4095 */ diff --git a/include/linux/ckrm_sched.h b/include/linux/ckrm_sched.h index fc62d99cd..45e778bd5 100644 --- a/include/linux/ckrm_sched.h +++ b/include/linux/ckrm_sched.h @@ -71,8 +71,6 @@ struct ckrm_runqueue { * * initialized to be 0 * a class can't accumulate more than SAVING_THRESHOLD of savings - * savings are kept in normalized form (like cvt) - * so when task share change the savings should be scaled accordingly */ unsigned long long savings; @@ -256,7 +254,7 @@ void ckrm_cpu_change_class(void *task, void *old, void *new); #define CPU_DEMAND_INIT 3 /*functions exported by 
ckrm_cpu_monitor.c*/ -void ckrm_cpu_monitor(void); +void ckrm_cpu_monitor(int check_min); int ckrm_cpu_monitor_init(void); void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat); void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len); @@ -274,18 +272,21 @@ void adjust_local_weight(void); * * CLASS_QUANTIZER: * - * A class with 5% share, can execute 50M nsecs / per sec ~ 2^28. + * A class with 50% share, can execute 500 ms / per sec ~ 2^29 ns. * It's share will be set to 512 = 2^9. The globl CLASSQUEUE_SIZE is set to 2^7. * With CLASS_QUANTIZER=16, the local_cvt of this class will increase - * by 2^28/2^9 = 2^19 = 512K. - * Setting CLASS_QUANTIZER to 16, 2^(19-16) = 8 slots / per second. - * A class with 5% shares, will cover 80 slots / per second. + * by 2^29/2^9 = 2^20 = 1024K. + * Setting CLASS_QUANTIZER to 16, 2^(20-16) = 16 slots / per second. + * Do the same math, a class with any share value, will cover 16 slots / per second. + * So 2^8 total slots is good track for 8 seconds of system execution * * PRIORITY_QUANTIZER: * * How much can top priorities of class impact slot bonus. - * There are 40 nice priorities. "2" will allow upto 10 slots improvement - * in the RQ thus for 50% class it can perform ~1sec starvation. + * There are 40 nice priorities, range from -20 to 19, with default nice = 0 + * "2" will allow upto 5 slots improvement + * when certain task within the class has a nice value of -20 + * in the RQ thus for 50% class it can perform ~300 msec starvation. 
* *******************************************************************/ @@ -322,7 +323,7 @@ void adjust_local_weight(void); /* * to improve system responsiveness * an inactive class is put a little bit ahead of the current class when it wakes up - * the amount is set in normalized termis to simplify the calculation + * the amount is set in normalized term to simplify the calculation * for class with 100% share, it can be 2s ahead * while for class with 10% share, it can be 200ms ahead */ diff --git a/include/linux/elevator.h b/include/linux/elevator.h index b42a9c4e2..27e8183f4 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -17,7 +17,6 @@ typedef void (elevator_requeue_req_fn) (request_queue_t *, struct request *); typedef struct request *(elevator_request_list_fn) (request_queue_t *, struct request *); typedef void (elevator_completed_req_fn) (request_queue_t *, struct request *); typedef int (elevator_may_queue_fn) (request_queue_t *, int); -typedef void (elevator_set_congested_fn) (request_queue_t *); typedef int (elevator_set_req_fn) (request_queue_t *, struct request *, int); typedef void (elevator_put_req_fn) (request_queue_t *, struct request *); @@ -46,7 +45,6 @@ struct elevator_s elevator_put_req_fn *elevator_put_req_fn; elevator_may_queue_fn *elevator_may_queue_fn; - elevator_set_congested_fn *elevator_set_congested_fn; elevator_init_fn *elevator_init_fn; elevator_exit_fn *elevator_exit_fn; @@ -76,7 +74,6 @@ extern struct request *elv_latter_request(request_queue_t *, struct request *); extern int elv_register_queue(request_queue_t *q); extern void elv_unregister_queue(request_queue_t *q); extern int elv_may_queue(request_queue_t *, int); -extern void elv_set_congested(request_queue_t *); extern void elv_completed_request(request_queue_t *, struct request *); extern int elv_set_request(request_queue_t *, struct request *, int); extern void elv_put_request(request_queue_t *, struct request *); @@ -122,6 +119,4 @@ extern int 
elv_try_last_merge(request_queue_t *, struct bio *); #define ELEVATOR_INSERT_BACK 2 #define ELEVATOR_INSERT_SORT 3 -#define RQ_ELV_DATA(rq) (rq)->elevator_private - #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 0b4e2114a..7e10a252a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1570,17 +1570,5 @@ static inline void free_secdata(void *secdata) { } #endif /* CONFIG_SECURITY */ -/* io priorities */ - -#define IOPRIO_NR 21 - -#define IOPRIO_IDLE 0 -#define IOPRIO_NORM 10 -#define IOPRIO_RT 20 - -asmlinkage int sys_ioprio_set(int ioprio); -asmlinkage int sys_ioprio_get(void); - - #endif /* __KERNEL__ */ #endif /* _LINUX_FS_H */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 5d6206327..9937c8df8 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -112,7 +112,6 @@ extern struct group_info init_groups; .proc_lock = SPIN_LOCK_UNLOCKED, \ .switch_lock = SPIN_LOCK_UNLOCKED, \ .journal_info = NULL, \ - .ioprio = IOPRIO_NORM, \ } diff --git a/include/linux/sched.h b/include/linux/sched.h index 4dd9fbded..c1bd9eaf6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -388,6 +388,24 @@ int set_current_groups(struct group_info *group_info); struct audit_context; /* See audit.c */ struct mempolicy; +#ifdef CONFIG_CKRM_CPU_SCHEDULE +/** + * ckrm_cpu_demand_stat - used to track the cpu demand of a task/class + * @run: how much time it has been running since the counter started + * @total: total time since the counter started + * @last_sleep: the last time it sleeps, last_sleep = 0 when not sleeping + * @recalc_interval: how often do we recalculate the cpu_demand + * @cpu_demand: moving average of run/total + */ +struct ckrm_cpu_demand_stat { + unsigned long long run; + unsigned long long total; + unsigned long long last_sleep; + unsigned long long recalc_interval; + unsigned long cpu_demand; /*estimated cpu demand */ +}; +#endif + struct task_struct { volatile long state; /* -1 unrunnable, 
0 runnable, >0 stopped */ struct thread_info *thread_info; @@ -487,7 +505,6 @@ struct task_struct { /* signal handlers */ struct signal_struct *signal; struct sighand_struct *sighand; - sigset_t blocked, real_blocked; struct sigpending pending; @@ -521,8 +538,6 @@ struct task_struct { struct io_context *io_context; - int ioprio; - unsigned long ptrace_message; siginfo_t *last_siginfo; /* For ptrace use. */ @@ -538,6 +553,11 @@ struct task_struct { // .. Hubertus should change to CONFIG_CKRM_TYPE_TASKCLASS struct ckrm_task_class *taskclass; struct list_head taskclass_link; +#ifdef CONFIG_CKRM_CPU_SCHEDULE + struct ckrm_cpu_class *cpu_class; + //track cpu demand of this task + struct ckrm_cpu_demand_stat demand_stat; +#endif //CONFIG_CKRM_CPU_SCHEDULE #endif // CONFIG_CKRM_TYPE_TASKCLASS #endif // CONFIG_CKRM @@ -861,6 +881,7 @@ static inline int capable(int cap) } #endif + /* * Routines for handling mm_structs */ @@ -995,7 +1016,6 @@ static inline struct mm_struct * get_task_mm(struct task_struct * task) return mm; } - /* set thread flags in other task's structures * - see asm/thread_info.h for TIF_xxxx flags available */ diff --git a/init/Kconfig b/init/Kconfig index 45a39b1ad..e5480f047 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -172,16 +172,12 @@ config CKRM_RES_NUMTASKS Say N if unsure, Y to use the feature. -config CKRM_RES_BLKIO - tristate " Disk I/O Resource Controller" - depends on CKRM_TYPE_TASKCLASS && IOSCHED_CFQ - default m +config CKRM_CPU_SCHEDULE + bool "CKRM CPU scheduler" + depends on CKRM_TYPE_TASKCLASS + default y help - Provides a resource controller for best-effort block I/O - bandwidth control. The controller attempts this by proportional - servicing of requests in the I/O scheduler. However, seek - optimizations and reordering by device drivers/disk controllers may - alter the actual bandwidth delivered to a class. + Use CKRM CPU scheduler instead of Linux Scheduler Say N if unsure, Y to use the feature. 
diff --git a/init/main.c b/init/main.c index 44a43d447..7a93e4edf 100644 --- a/init/main.c +++ b/init/main.c @@ -50,6 +50,7 @@ #include #include +#include /* * This is one of the first .c files built. Error out early @@ -680,6 +681,7 @@ static int init(void * unused) do_basic_setup(); + init_ckrm_sched_res(); /* * check if there is an early userspace init. If yes, let it do all * the work diff --git a/kernel/Makefile b/kernel/Makefile index 97364d362..2038a7247 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -21,6 +21,7 @@ obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_IKCONFIG_PROC) += configs.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o +obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_classqueue.o ckrm_sched.o obj-$(CONFIG_AUDIT) += audit.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o diff --git a/kernel/ckrm/Makefile b/kernel/ckrm/Makefile index 008b6c6e0..de490232b 100644 --- a/kernel/ckrm/Makefile +++ b/kernel/ckrm/Makefile @@ -9,3 +9,4 @@ endif obj-$(CONFIG_CKRM_RES_NUMTASKS) += ckrm_numtasks.o obj-$(CONFIG_CKRM_TYPE_SOCKETCLASS) += ckrm_sockc.o obj-$(CONFIG_CKRM_RES_LISTENAQ) += ckrm_laq.o + obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_cpu_class.o ckrm_cpu_monitor.o diff --git a/kernel/ckrm/ckrm_cpu_class.c b/kernel/ckrm/ckrm_cpu_class.c index ad45380ee..09ea6ba80 100644 --- a/kernel/ckrm/ckrm_cpu_class.c +++ b/kernel/ckrm/ckrm_cpu_class.c @@ -180,6 +180,9 @@ static void ckrm_free_cpu_class(void *my_res) write_unlock(&class_list_lock); kfree(cls); + + //call ckrm_cpu_monitor after class removed + ckrm_cpu_monitor(0); } /* @@ -220,6 +223,10 @@ int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share) if (cls->parent) { spin_unlock(&parres->cnt_lock); } + + //call ckrm_cpu_monitor after shares are changed + ckrm_cpu_monitor(0); + return rc; } @@ -269,7 +276,7 @@ int ckrm_cpu_get_stats(void *my_res, struct seq_file * sfile) ); for_each_online_cpu(i) { lrq = get_ckrm_lrq(cls,i); - seq_printf(sfile, "\tlrq %d demand= %lu
weight= %d lrq_load= %lu cvt= %llu sav=%lu\n",i,stat->local_stats[i].cpu_demand,local_class_weight(lrq),lrq->lrq_load,lrq->local_cvt,lrq->savings); + seq_printf(sfile, "\tlrq %d demand= %lu weight= %d lrq_load= %lu cvt= %llu sav= %llu\n",i,stat->local_stats[i].cpu_demand,local_class_weight(lrq),lrq->lrq_load,lrq->local_cvt,lrq->savings); } seq_printf(sfile, "-------- CPU Class Status END ---------\n"); diff --git a/kernel/ckrm/ckrm_cpu_monitor.c b/kernel/ckrm/ckrm_cpu_monitor.c index c83c83fca..11f65d73b 100644 --- a/kernel/ckrm/ckrm_cpu_monitor.c +++ b/kernel/ckrm/ckrm_cpu_monitor.c @@ -357,6 +357,10 @@ static int update_child_effective(struct ckrm_core_class *parent) c_cls->stat.ehl * get_myhard_limit(c_cls) / c_cls->shares.total_guarantee; + set_eshare(&c_cls->stat,c_cls->stat.egrt); + set_meshare(&c_cls->stat,c_cls->stat.megrt); + + child_core = ckrm_get_next_child(parent, child_core); }; return 0; @@ -386,15 +390,18 @@ static int update_effectives(struct ckrm_core_class *root_core) / cls->shares.total_guarantee; cls->stat.mehl = cls->stat.ehl * get_myhard_limit(cls) / cls->shares.total_guarantee; - + set_eshare(&cls->stat,cls->stat.egrt); + set_meshare(&cls->stat,cls->stat.megrt); + repeat: //check exit if (!cur_core) return 0; - //visit this node - if (update_child_effective(cur_core) < 0) - return ret; //invalid cur_core node + //visit this node only once + if (! 
child_core) + if (update_child_effective(cur_core) < 0) + return ret; //invalid cur_core node //next child child_core = ckrm_get_next_child(cur_core, child_core); @@ -439,37 +446,30 @@ static inline int get_my_node_surplus(struct ckrm_cpu_class *cls) } /** - * node_surplus_consume: consume the surplus - * @ckeck_sl: if check_sl is set, then check soft_limit - * @total_grt: total guarantee + * consume_surplus: decides how much surplus a node can consume + * @check_sl: if check_sl is set, then check soft_limit * return how much consumed - * return -1 on error * * implements all the CKRM Scheduling Requirement - * update total_grt if necessary + * assume c_cls is valid */ -static inline int node_surplus_consume(int surplus, - struct ckrm_core_class *child_core, +static inline int consume_surplus(int surplus, + struct ckrm_cpu_class *c_cls, struct ckrm_cpu_class *p_cls, int check_sl ) { int consumed = 0; int inc_limit; - int glut = 1; - - struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core); int total_grt = p_cls->shares.total_guarantee; BUG_ON(surplus < 0); - if (! c_cls || ! total_grt) - goto out; - /*can't consume more than demand or hard limit*/ if (c_cls->stat.eshare >= c_cls->stat.max_demand) goto out; + //the surplus allocation is propotional to grt consumed = surplus * c_cls->shares.my_guarantee / total_grt; @@ -481,25 +481,106 @@ static inline int node_surplus_consume(int surplus, if (check_sl) { int esl = p_cls->stat.eshare * get_soft_limit(c_cls) - /p_cls->shares.total_guarantee; + /total_grt; if (esl < c_cls->stat.max_demand) inc_limit = esl - c_cls->stat.eshare; } - if (consumed > inc_limit) consumed = inc_limit; - else - glut = 0; BUG_ON(consumed < 0); - set_eshare(&c_cls->stat,c_cls->stat.eshare + consumed); - BUG_ON(c_cls->stat.eshare < 0); + out: + return consumed; +} + +/* + * how much a node can consume for itself?
+ */ +static inline int consume_self_surplus(int surplus, + struct ckrm_cpu_class *p_cls, + int check_sl + ) +{ + int consumed = 0; + int inc_limit; + int total_grt = p_cls->shares.total_guarantee; + int max_demand = get_mmax_demand(&p_cls->stat); + + BUG_ON(surplus < 0); + /*can't consume more than demand or hard limit*/ if (p_cls->stat.meshare >= max_demand) + goto out; + + //the surplus allocation is proportional to grt + consumed = + surplus * p_cls->shares.unused_guarantee / total_grt; + + if (! consumed) //no more share + goto out; + + //hard limit and demand limit + inc_limit = max_demand - p_cls->stat.meshare; + + if (check_sl) { + int mesl = p_cls->stat.eshare * get_mysoft_limit(p_cls) + /total_grt; + if (mesl < max_demand) + inc_limit = mesl - p_cls->stat.meshare; + } + + if (consumed > inc_limit) + consumed = inc_limit; + + BUG_ON(consumed < 0); out: return consumed; } + +/* + * allocate surplus to all its children and also its default class + */ +static int alloc_surplus_single_round( + int surplus, + struct ckrm_core_class *parent, + struct ckrm_cpu_class *p_cls, + int check_sl) +{ + struct ckrm_cpu_class *c_cls; + struct ckrm_core_class *child_core = NULL; + int total_consumed = 0,consumed; + + //first allocate to the default class + consumed = + consume_self_surplus(surplus,p_cls,check_sl); + + if (consumed > 0) { + set_meshare(&p_cls->stat,p_cls->stat.meshare + consumed); + total_consumed += consumed; + } + + do { + child_core = ckrm_get_next_child(parent, child_core); + if (child_core) { + c_cls = ckrm_get_cpu_class(child_core); + if (!
c_cls) + return -1; + + consumed = + consume_surplus(surplus, c_cls, + p_cls,check_sl); + if (consumed > 0) { + set_eshare(&c_cls->stat,c_cls->stat.eshare + consumed); + total_consumed += consumed; + } + } + } while (child_core); + + return total_consumed; +} + /** * alloc_surplus_node: re-allocate the shares for children under parent * @parent: parent node @@ -512,80 +593,63 @@ static inline int node_surplus_consume(int surplus, */ static int alloc_surplus_node(struct ckrm_core_class *parent) { - int total_surplus , old_surplus; - struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent); - struct ckrm_core_class *child_core = NULL; - int self_share; + struct ckrm_cpu_class *p_cls,*c_cls; + int total_surplus,consumed; int check_sl; int ret = -1; + struct ckrm_core_class *child_core = NULL; + p_cls = ckrm_get_cpu_class(parent); if (! p_cls) - return ret; - - total_surplus = get_my_node_surplus(p_cls); + goto realloc_out; /* - * initialize effective_share + * get total surplus */ + total_surplus = p_cls->stat.eshare - p_cls->stat.egrt; + BUG_ON(total_surplus < 0); + total_surplus += get_my_node_surplus(p_cls); + do { child_core = ckrm_get_next_child(parent, child_core); if (child_core) { - struct ckrm_cpu_class *c_cls; - c_cls = ckrm_get_cpu_class(child_core); if (! c_cls) - return ret; + goto realloc_out; total_surplus += get_node_surplus(c_cls); - - set_eshare(&c_cls->stat, c_cls->stat.egrt); } } while (child_core); - if (! total_surplus) + + if (! 
total_surplus) { + ret = 0; goto realloc_out; + } - /* distribute the surplus */ - child_core = NULL; + /* + * distributing the surplus + * first with the check_sl enabled + * once all the tasks have reached the soft limit, disable check_sl and try again + */ + check_sl = 1; - old_surplus = 0; do { - if (!child_core) {//start a new round + consumed = alloc_surplus_single_round(total_surplus,parent,p_cls,check_sl); + if (consumed < 0) //something is wrong + goto realloc_out; - //ok, everybody reached the soft limit - if (old_surplus == total_surplus) - check_sl = 0; - old_surplus = total_surplus; - } + if (! consumed) + check_sl = 0; + else + total_surplus -= consumed; - child_core = ckrm_get_next_child(parent, child_core); - if (child_core) { - int consumed = 0; - consumed -= - node_surplus_consume(old_surplus, child_core, - p_cls,check_sl); - if (consumed >= 0) - total_surplus -= consumed; - else - return ret; - } - //start a new round if something is allocated in the last round - } while (child_core || check_sl || total_surplus != old_surplus); + } while ((total_surplus > 0) && (consumed || check_sl) ); - realloc_out: - /*how much for itself*/ - self_share = p_cls->stat.eshare * - p_cls->shares.unused_guarantee / p_cls->shares.total_guarantee; - - if (self_share < p_cls->stat.max_demand) { - /*any remaining surplus goes to the default class*/ - self_share += total_surplus; - if (self_share > p_cls->stat.max_demand) - self_share = p_cls->stat.max_demand; - } + ret = 0; - set_meshare(&p_cls->stat, self_share); - return 0; + realloc_out: + return ret; } /** @@ -597,29 +661,27 @@ static int alloc_surplus_node(struct ckrm_core_class *parent) static int alloc_surplus(struct ckrm_core_class *root_core) { struct ckrm_core_class *cur_core, *child_core; - struct ckrm_cpu_class *cls; + // struct ckrm_cpu_class *cls; int ret = -1; /*initialize*/ cur_core = root_core; child_core = NULL; - cls = ckrm_get_cpu_class(cur_core); - - //set root eshare - set_eshare(&cls->stat,
cls->stat.egrt); + // cls = ckrm_get_cpu_class(cur_core); /*the ckrm idle tasks get all what's remaining*/ /*hzheng: uncomment the following like for hard limit support */ // update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand); - repeat: + repeat: //check exit if (!cur_core) return 0; - //visit this node - if ( alloc_surplus_node(cur_core) < 0 ) - return ret; + //visit this node only once + if (! child_core) + if ( alloc_surplus_node(cur_core) < 0 ) + return ret; //next child child_core = ckrm_get_next_child(cur_core, child_core); @@ -708,7 +770,7 @@ static int ckrm_cpu_idled(void *nothing) /*similar to cpu_idle */ while (1) { while (!need_resched()) { - ckrm_cpu_monitor(); + ckrm_cpu_monitor(1); if (current_cpu_data.hlt_works_ok) { local_irq_disable(); if (!need_resched()) { @@ -830,12 +892,13 @@ void adjust_local_weight(void) /**********************************************/ /** *ckrm_cpu_monitor - adjust relative shares of the classes based on their progress + *@check_min: if check_min is set, the call can't be within 100ms of last call * * this function is called every CPU_MONITOR_INTERVAL * it computes the cpu demand of each class * and re-allocate the un-used shares to other classes */ -void ckrm_cpu_monitor(void) +void ckrm_cpu_monitor(int check_min) { static spinlock_t lock = SPIN_LOCK_UNLOCKED; static unsigned long long last_check = 0; @@ -855,9 +918,9 @@ void ckrm_cpu_monitor(void) now = sched_clock(); //consecutive check should be at least 100ms apart - if (now - last_check < MIN_CPU_MONITOR_INTERVAL) { + if (check_min && (now - last_check < MIN_CPU_MONITOR_INTERVAL)) goto outunlock; - } + last_check = now; if (update_effectives(root_core) != 0) @@ -889,7 +952,7 @@ static int ckrm_cpu_monitord(void *nothing) /*sleep for sometime before next try*/ set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(CPU_MONITOR_INTERVAL); - ckrm_cpu_monitor(); + ckrm_cpu_monitor(1); if (thread_exit) { break; } @@ -910,8 +973,6 @@ void ckrm_start_monitor(void) void 
ckrm_kill_monitor(void) { - int interval = HZ; - printk("killing process %d\n", cpu_monitor_pid); if (cpu_monitor_pid > 0) { thread_exit = 1; diff --git a/kernel/ckrm_sched.c b/kernel/ckrm_sched.c index 9c653a3b6..1ca2611dc 100644 --- a/kernel/ckrm_sched.c +++ b/kernel/ckrm_sched.c @@ -77,7 +77,7 @@ static inline void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt) lrq->savings -= savings_used; unscale_cvt(savings_used,lrq); BUG_ON(lrq->local_cvt < savings_used); - // lrq->local_cvt -= savings_used; + lrq->local_cvt -= savings_used; } } diff --git a/kernel/fork.c b/kernel/fork.c index d665090c8..37c727ae1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1096,7 +1096,6 @@ struct task_struct *copy_process(unsigned long clone_flags, } else link_pid(p, p->pids + PIDTYPE_TGID, &p->group_leader->pids[PIDTYPE_TGID].pid); - p->ioprio = current->ioprio; nr_threads++; write_unlock_irq(&tasklist_lock); retval = 0; diff --git a/kernel/sched.c b/kernel/sched.c index 0e1d0a2ed..85fb705c1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -17,7 +17,6 @@ * 2003-09-03 Interactivity tuning by Con Kolivas. * 2004-04-02 Scheduler domains code by Nick Piggin */ - #include #include #include @@ -157,8 +156,20 @@ #define LOW_CREDIT(p) \ ((p)->interactive_credit < -CREDIT_LIMIT) +#ifdef CONFIG_CKRM_CPU_SCHEDULE +/* + * if belong to different class, compare class priority + * otherwise compare task priority + */ +#define TASK_PREEMPTS_CURR(p, rq) \ + ( ((p)->cpu_class != (rq)->curr->cpu_class) \ + && ((rq)->curr != (rq)->idle) && ((p) != (rq)->idle )) \ + ? class_preempts_curr((p),(rq)->curr) \ + : ((p)->prio < (rq)->curr->prio) +#else #define TASK_PREEMPTS_CURR(p, rq) \ ((p)->prio < (rq)->curr->prio) +#endif /* * BASE_TIMESLICE scales user-nice values [ -20 ... 
19 ] @@ -175,7 +186,7 @@ ((MAX_TIMESLICE - MIN_TIMESLICE) * \ (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1))) -static unsigned int task_timeslice(task_t *p) +unsigned int task_timeslice(task_t *p) { return BASE_TIMESLICE(p); } @@ -186,15 +197,9 @@ static unsigned int task_timeslice(task_t *p) * These are the runqueue data structures: */ -#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) - typedef struct runqueue runqueue_t; - -struct prio_array { - unsigned int nr_active; - unsigned long bitmap[BITMAP_SIZE]; - struct list_head queue[MAX_PRIO]; -}; +#include +#include /* * This is the main, per-CPU runqueue data structure. @@ -219,7 +224,12 @@ struct runqueue { unsigned long long timestamp_last_tick; task_t *curr, *idle; struct mm_struct *prev_mm; - prio_array_t *active, *expired, arrays[2]; +#ifdef CONFIG_CKRM_CPU_SCHEDULE + struct classqueue_struct classqueue; + ckrm_load_t ckrm_load; +#else + prio_array_t *active, *expired, arrays[2]; +#endif int best_expired_prio; atomic_t nr_iowait; @@ -298,15 +308,108 @@ static inline void rq_unlock(runqueue_t *rq) spin_unlock_irq(&rq->lock); } +#ifdef CONFIG_CKRM_CPU_SCHEDULE +static inline ckrm_lrq_t *rq_get_next_class(struct runqueue *rq) +{ + cq_node_t *node = classqueue_get_head(&rq->classqueue); + return ((node) ? 
class_list_entry(node) : NULL); +} + +/* + * return the cvt of the current running class + * if no current running class, return 0 + * assume cpu is valid (cpu_online(cpu) == 1) + */ +CVT_t get_local_cur_cvt(int cpu) +{ + ckrm_lrq_t * lrq = rq_get_next_class(cpu_rq(cpu)); + + if (lrq) + return lrq->local_cvt; + else + return 0; +} + +static inline struct task_struct * rq_get_next_task(struct runqueue* rq) +{ + prio_array_t *array; + struct task_struct *next; + ckrm_lrq_t *queue; + int idx; + int cpu = smp_processor_id(); + + next = rq->idle; + retry_next_class: + if ((queue = rq_get_next_class(rq))) { + //check switch active/expired queue + array = queue->active; + if (unlikely(!array->nr_active)) { + queue->active = queue->expired; + queue->expired = array; + queue->expired_timestamp = 0; + + if (queue->active->nr_active) + set_top_priority(queue, + find_first_bit(queue->active->bitmap, MAX_PRIO)); + else { + classqueue_dequeue(queue->classqueue, + &queue->classqueue_linkobj); + cpu_demand_event(get_rq_local_stat(queue,cpu),CPU_DEMAND_DEQUEUE,0); + } + goto retry_next_class; + } + BUG_ON(!array->nr_active); + + idx = queue->top_priority; + if (queue->top_priority == MAX_PRIO) { + BUG_ON(1); + } + + next = task_list_entry(array->queue[idx].next); + } + return next; +} +#else /*! CONFIG_CKRM_CPU_SCHEDULE*/ +static inline struct task_struct * rq_get_next_task(struct runqueue* rq) +{ + prio_array_t *array; + struct list_head *queue; + int idx; + + array = rq->active; + if (unlikely(!array->nr_active)) { + /* + * Switch the active and expired arrays. 
+ */ + rq->active = rq->expired; + rq->expired = array; + array = rq->active; + rq->expired_timestamp = 0; + } + + idx = sched_find_first_bit(array->bitmap); + queue = array->queue + idx; + return list_entry(queue->next, task_t, run_list); +} + +static inline void class_enqueue_task(struct task_struct* p, prio_array_t *array) { } +static inline void class_dequeue_task(struct task_struct* p, prio_array_t *array) { } +static inline void init_cpu_classes(void) { } +#define rq_ckrm_load(rq) NULL +static inline void ckrm_sched_tick(int j,int this_cpu,void* name) {} +#endif /* CONFIG_CKRM_CPU_SCHEDULE */ + /* * Adding/removing a task to/from a priority array: */ static void dequeue_task(struct task_struct *p, prio_array_t *array) { + BUG_ON(! array); array->nr_active--; list_del(&p->run_list); if (list_empty(array->queue + p->prio)) __clear_bit(p->prio, array->bitmap); + class_dequeue_task(p,array); } static void enqueue_task(struct task_struct *p, prio_array_t *array) @@ -315,6 +418,7 @@ static void enqueue_task(struct task_struct *p, prio_array_t *array) __set_bit(p->prio, array->bitmap); array->nr_active++; p->array = array; + class_enqueue_task(p,array); } /* @@ -328,6 +432,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) __set_bit(p->prio, array->bitmap); array->nr_active++; p->array = array; + class_enqueue_task(p,array); } /* @@ -366,7 +471,7 @@ static int effective_prio(task_t *p) */ static inline void __activate_task(task_t *p, runqueue_t *rq) { - enqueue_task(p, rq->active); + enqueue_task(p, rq_active(p,rq)); rq->nr_running++; } @@ -375,7 +480,7 @@ static inline void __activate_task(task_t *p, runqueue_t *rq) */ static inline void __activate_idle_task(task_t *p, runqueue_t *rq) { - enqueue_task_head(p, rq->active); + enqueue_task_head(p, rq_active(p,rq)); rq->nr_running++; } @@ -881,6 +986,10 @@ void fastcall sched_fork(task_t *p) INIT_LIST_HEAD(&p->run_list); p->array = NULL; spin_lock_init(&p->switch_lock); +#ifdef 
CONFIG_CKRM_CPU_SCHEDULE + cpu_demand_event(&p->demand_stat,CPU_DEMAND_INIT,0); +#endif + #ifdef CONFIG_PREEMPT /* * During context-switch we hold precisely one spinlock, which @@ -956,6 +1065,7 @@ void fastcall wake_up_forked_process(task_t * p) p->array = current->array; p->array->nr_active++; rq->nr_running++; + class_enqueue_task(p,p->array); } task_rq_unlock(rq, &flags); } @@ -1278,6 +1388,7 @@ lock_again: p->array = current->array; p->array->nr_active++; rq->nr_running++; + class_enqueue_task(p,p->array); } } else { /* Not the local CPU - must adjust timestamp */ @@ -1423,6 +1534,449 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, return 1; } +#ifdef CONFIG_CKRM_CPU_SCHEDULE +static inline int ckrm_preferred_task(task_t *tmp,long min, long max, + int phase, enum idle_type idle) +{ + long pressure = task_load(tmp); + + if (pressure > max) + return 0; + + if ((idle == NOT_IDLE) && ! phase && (pressure <= min)) + return 0; + return 1; +} + +/* + * move tasks for a specic local class + * return number of tasks pulled + */ +static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq, + runqueue_t *this_rq, + runqueue_t *busiest, + struct sched_domain *sd, + int this_cpu, + enum idle_type idle, + long* pressure_imbalance) +{ + prio_array_t *array, *dst_array; + struct list_head *head, *curr; + task_t *tmp; + int idx; + int pulled = 0; + int phase = -1; + long pressure_min, pressure_max; + /*hzheng: magic : 90% balance is enough*/ + long balance_min = *pressure_imbalance / 10; +/* + * we don't want to migrate tasks that will reverse the balance + * or the tasks that make too small difference + */ +#define CKRM_BALANCE_MAX_RATIO 100 +#define CKRM_BALANCE_MIN_RATIO 1 + start: + phase ++; + /* + * We first consider expired tasks. Those will likely not be + * executed in the near future, and they are most likely to + * be cache-cold, thus switching CPUs has the least effect + * on them. 
+ */ + if (src_lrq->expired->nr_active) { + array = src_lrq->expired; + dst_array = dst_lrq->expired; + } else { + array = src_lrq->active; + dst_array = dst_lrq->active; + } + + new_array: + /* Start searching at priority 0: */ + idx = 0; + skip_bitmap: + if (!idx) + idx = sched_find_first_bit(array->bitmap); + else + idx = find_next_bit(array->bitmap, MAX_PRIO, idx); + if (idx >= MAX_PRIO) { + if (array == src_lrq->expired && src_lrq->active->nr_active) { + array = src_lrq->active; + dst_array = dst_lrq->active; + goto new_array; + } + if ((! phase) && (! pulled) && (idle != IDLE)) + goto start; //try again + else + goto out; //finished search for this lrq + } + + head = array->queue + idx; + curr = head->prev; + skip_queue: + tmp = list_entry(curr, task_t, run_list); + + curr = curr->prev; + + if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } + + pressure_min = *pressure_imbalance * CKRM_BALANCE_MIN_RATIO/100; + pressure_max = *pressure_imbalance * CKRM_BALANCE_MAX_RATIO/100; + /* + * skip the tasks that will reverse the balance too much + */ + if (ckrm_preferred_task(tmp,pressure_min,pressure_max,phase,idle)) { + *pressure_imbalance -= task_load(tmp); + pull_task(busiest, array, tmp, + this_rq, dst_array, this_cpu); + pulled++; + + if (*pressure_imbalance <= balance_min) + goto out; + } + + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + out: + return pulled; +} + +static inline long ckrm_rq_imbalance(runqueue_t *this_rq,runqueue_t *dst_rq) +{ + long imbalance; + /* + * make sure after balance, imbalance' > - imbalance/2 + * we don't want the imbalance be reversed too much + */ + imbalance = pid_get_pressure(rq_ckrm_load(dst_rq),0) + - pid_get_pressure(rq_ckrm_load(this_rq),1); + imbalance /= 2; + return imbalance; +} + +/* + * try to balance the two runqueues + * + * Called with both runqueues locked. 
+ * if move_tasks is called, it will try to move at least one task over + */ +static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, + unsigned long max_nr_move, struct sched_domain *sd, + enum idle_type idle) +{ + struct ckrm_cpu_class *clsptr,*vip_cls = NULL; + ckrm_lrq_t* src_lrq,*dst_lrq; + long pressure_imbalance, pressure_imbalance_old; + int src_cpu = task_cpu(busiest->curr); + struct list_head *list; + int pulled = 0; + long imbalance; + + imbalance = ckrm_rq_imbalance(this_rq,busiest); + + if ((idle == NOT_IDLE && imbalance <= 0) || busiest->nr_running <= 1) + goto out; + + //try to find the vip class + list_for_each_entry(clsptr,&active_cpu_classes,links) { + src_lrq = get_ckrm_lrq(clsptr,src_cpu); + + if (! lrq_nr_running(src_lrq)) + continue; + + if (! vip_cls || cpu_class_weight(vip_cls) < cpu_class_weight(clsptr) ) + { + vip_cls = clsptr; + } + } + + /* + * do search from the most significant class + * hopefully, less tasks will be migrated this way + */ + clsptr = vip_cls; + + move_class: + if (! clsptr) + goto out; + + + src_lrq = get_ckrm_lrq(clsptr,src_cpu); + if (! lrq_nr_running(src_lrq)) + goto other_class; + + dst_lrq = get_ckrm_lrq(clsptr,this_cpu); + + //how much pressure for this class should be transferred + pressure_imbalance = src_lrq->lrq_load * imbalance/src_lrq->local_weight; + if (pulled && ! pressure_imbalance) + goto other_class; + + pressure_imbalance_old = pressure_imbalance; + + //move tasks + pulled += + ckrm_cls_move_tasks(src_lrq,dst_lrq, + this_rq, + busiest, + sd,this_cpu,idle, + &pressure_imbalance); + + /* + * hzheng: 2 is another magic number + * stop balancing if the imbalance is less than 25% of the orig + */ + if (pressure_imbalance <= (pressure_imbalance_old >> 2)) + goto out; + + //update imbalance + imbalance *= pressure_imbalance / pressure_imbalance_old; + other_class: + //who is next? 
+ list = clsptr->links.next; + if (list == &active_cpu_classes) + list = list->next; + clsptr = list_entry(list, typeof(*clsptr), links); + if (clsptr != vip_cls) + goto move_class; + out: + return pulled; +} + +/** + * ckrm_check_balance - is load balancing necessary? + * return 0 if load balancing is not necessary + * otherwise return the average load of the system + * also, update nr_group + * + * heuristics: + * no load balancing if it's load is over average + * no load balancing if it's load is far more than the min + * task: + * read the status of all the runqueues + */ +static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu, + enum idle_type idle, int* nr_group) +{ + struct sched_group *group = sd->groups; + unsigned long min_load, max_load, avg_load; + unsigned long total_load, this_load, total_pwr; + + max_load = this_load = total_load = total_pwr = 0; + min_load = 0xFFFFFFFF; + *nr_group = 0; + + do { + cpumask_t tmp; + unsigned long load; + int local_group; + int i, nr_cpus = 0; + + /* Tally up the load of all CPUs in the group */ + cpus_and(tmp, group->cpumask, cpu_online_map); + if (unlikely(cpus_empty(tmp))) + goto nextgroup; + + avg_load = 0; + local_group = cpu_isset(this_cpu, group->cpumask); + + for_each_cpu_mask(i, tmp) { + load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),local_group); + nr_cpus++; + avg_load += load; + } + + if (!nr_cpus) + goto nextgroup; + + total_load += avg_load; + total_pwr += group->cpu_power; + + /* Adjust by relative CPU power of the group */ + avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + + if (local_group) { + this_load = avg_load; + goto nextgroup; + } else if (avg_load > max_load) { + max_load = avg_load; + } + if (avg_load < min_load) { + min_load = avg_load; + } +nextgroup: + group = group->next; + *nr_group = *nr_group + 1; + } while (group != sd->groups); + + if (!max_load || this_load >= max_load) + goto out_balanced; + + avg_load = (SCHED_LOAD_SCALE * total_load) / 
total_pwr; + + /* hzheng: debugging: 105 is a magic number + * 100*max_load <= sd->imbalance_pct*this_load) + * should use imbalance_pct instead + */ + if (this_load > avg_load + || 100*max_load < 105*this_load + || 100*min_load < 70*this_load + ) + goto out_balanced; + + return avg_load; + out_balanced: + return 0; +} + +/** + * any group that has above average load is considered busy + * find the busiest queue from any of busy group + */ +static runqueue_t * +ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu, + unsigned long avg_load, enum idle_type idle, + int nr_group) +{ + struct sched_group *group; + runqueue_t * busiest=NULL; + unsigned long rand; + + group = sd->groups; + rand = get_ckrm_rand(nr_group); + nr_group = 0; + + do { + unsigned long load,total_load,max_load; + cpumask_t tmp; + int i; + runqueue_t * grp_busiest; + + cpus_and(tmp, group->cpumask, cpu_online_map); + if (unlikely(cpus_empty(tmp))) + goto find_nextgroup; + + total_load = 0; + max_load = 0; + grp_busiest = NULL; + for_each_cpu_mask(i, tmp) { + load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),0); + total_load += load; + if (load > max_load) { + max_load = load; + grp_busiest = cpu_rq(i); + } + } + + total_load = (total_load * SCHED_LOAD_SCALE) / group->cpu_power; + if (total_load > avg_load) { + busiest = grp_busiest; + if (nr_group >= rand) + break; + } + find_nextgroup: + group = group->next; + nr_group ++; + } while (group != sd->groups); + + return busiest; +} + +/** + * load_balance - pressure based load balancing algorithm used by ckrm + */ +static int ckrm_load_balance(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd, enum idle_type idle) +{ + runqueue_t *busiest; + unsigned long avg_load; + int nr_moved,nr_group; + + avg_load = ckrm_check_balance(sd, this_cpu, idle, &nr_group); + if (! avg_load) + goto out_balanced; + + busiest = ckrm_find_busy_queue(sd,this_cpu,avg_load,idle,nr_group); + if (! 
busiest) + goto out_balanced; + /* + * This should be "impossible", but since load + * balancing is inherently racy and statistical, + * it could happen in theory. + */ + if (unlikely(busiest == this_rq)) { + WARN_ON(1); + goto out_balanced; + } + + nr_moved = 0; + if (busiest->nr_running > 1) { + /* + * Attempt to move tasks. If find_busiest_group has found + * an imbalance but busiest->nr_running <= 1, the group is + * still unbalanced. nr_moved simply stays zero, so it is + * correctly treated as an imbalance. + */ + double_lock_balance(this_rq, busiest); + nr_moved = move_tasks(this_rq, this_cpu, busiest, + 0,sd, idle); + spin_unlock(&busiest->lock); + if (nr_moved) { + adjust_local_weight(); + } + } + + if (!nr_moved) + sd->nr_balance_failed ++; + else + sd->nr_balance_failed = 0; + + /* We were unbalanced, so reset the balancing interval */ + sd->balance_interval = sd->min_interval; + + return nr_moved; + +out_balanced: + /* tune up the balancing interval */ + if (sd->balance_interval < sd->max_interval) + sd->balance_interval *= 2; + + return 0; +} + +/* + * this_rq->lock is already held + */ +static inline int load_balance_newidle(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd) +{ + int ret; + read_lock(&class_list_lock); + ret = ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE); + read_unlock(&class_list_lock); + return ret; +} + +static inline int load_balance(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd, enum idle_type idle) +{ + int ret; + + spin_lock(&this_rq->lock); + read_lock(&class_list_lock); + ret= ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE); + read_unlock(&class_list_lock); + spin_unlock(&this_rq->lock); + return ret; +} +#else /*! CONFIG_CKRM_CPU_SCHEDULE */ /* * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, * as part of a balancing operation within "domain". 
Returns the number of @@ -1787,6 +2341,8 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, out: return nr_moved; } +#endif /* CONFIG_CKRM_CPU_SCHEDULE*/ + /* * idle_balance is called by schedule() if this_cpu is about to become @@ -1924,7 +2480,7 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, } } } -#else +#else /* SMP*/ /* * on UP we do not need to balance between CPUs: */ @@ -1951,8 +2507,7 @@ static inline int wake_priority_sleeper(runqueue_t *rq) return 0; } -DEFINE_PER_CPU(struct kernel_stat, kstat); - +DEFINE_PER_CPU(struct kernel_stat, kstat) = { { 0 } }; EXPORT_PER_CPU_SYMBOL(kstat); /* @@ -1965,11 +2520,19 @@ EXPORT_PER_CPU_SYMBOL(kstat); * increasing number of running tasks. We also ignore the interactivity * if a better static_prio task has expired: */ + +#ifndef CONFIG_CKRM_CPU_SCHEDULE #define EXPIRED_STARVING(rq) \ ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ (jiffies - (rq)->expired_timestamp >= \ STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ ((rq)->curr->static_prio > (rq)->best_expired_prio)) +#else +#define EXPIRED_STARVING(rq) \ + (STARVATION_LIMIT && ((rq)->expired_timestamp && \ + (jiffies - (rq)->expired_timestamp >= \ + STARVATION_LIMIT * (lrq_nr_running(rq)) + 1))) +#endif /* * This function gets called by the timer code, with HZ frequency. 
@@ -2006,6 +2569,7 @@ void scheduler_tick(int user_ticks, int sys_ticks) cpustat->idle += sys_ticks; if (wake_priority_sleeper(rq)) goto out; + ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq)); rebalance_tick(cpu, rq, IDLE); return; } @@ -2016,7 +2580,7 @@ void scheduler_tick(int user_ticks, int sys_ticks) cpustat->system += sys_ticks; /* Task might have expired already, but not scheduled off yet */ - if (p->array != rq->active) { + if (p->array != rq_active(p,rq)) { set_tsk_need_resched(p); goto out; } @@ -2039,12 +2603,16 @@ void scheduler_tick(int user_ticks, int sys_ticks) set_tsk_need_resched(p); /* put it at the end of the queue: */ - dequeue_task(p, rq->active); - enqueue_task(p, rq->active); + dequeue_task(p, rq_active(p,rq)); + enqueue_task(p, rq_active(p,rq)); } goto out_unlock; } if (!--p->time_slice) { +#ifdef CONFIG_CKRM_CPU_SCHEDULE + /* Hubertus ... we can abstract this out */ + ckrm_lrq_t* rq = get_task_lrq(p); +#endif dequeue_task(p, rq->active); set_tsk_need_resched(p); p->prio = effective_prio(p); @@ -2055,8 +2623,8 @@ void scheduler_tick(int user_ticks, int sys_ticks) rq->expired_timestamp = jiffies; if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { enqueue_task(p, rq->expired); - if (p->static_prio < rq->best_expired_prio) - rq->best_expired_prio = p->static_prio; + if (p->static_prio < this_rq()->best_expired_prio) + this_rq()->best_expired_prio = p->static_prio; } else enqueue_task(p, rq->active); } else { @@ -2079,17 +2647,18 @@ void scheduler_tick(int user_ticks, int sys_ticks) if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - p->time_slice) % TIMESLICE_GRANULARITY(p)) && (p->time_slice >= TIMESLICE_GRANULARITY(p)) && - (p->array == rq->active)) { + (p->array == rq_active(p,rq))) { - dequeue_task(p, rq->active); + dequeue_task(p, rq_active(p,rq)); set_tsk_need_resched(p); p->prio = effective_prio(p); - enqueue_task(p, rq->active); + enqueue_task(p, rq_active(p,rq)); } } out_unlock: spin_unlock(&rq->lock); out: + 
ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq)); rebalance_tick(cpu, rq, NOT_IDLE); } @@ -2187,10 +2756,9 @@ asmlinkage void __sched schedule(void) task_t *prev, *next; runqueue_t *rq; prio_array_t *array; - struct list_head *queue; unsigned long long now; unsigned long run_time; - int cpu, idx; + int cpu; /* * Test if we are atomic. Since do_exit() needs to call into @@ -2226,6 +2794,19 @@ need_resched: spin_lock_irq(&rq->lock); +#ifdef CONFIG_CKRM_CPU_SCHEDULE + if (prev != rq->idle) { + unsigned long long run = now - prev->timestamp; + ckrm_lrq_t * lrq = get_task_lrq(prev); + + lrq->lrq_load -= task_load(prev); + cpu_demand_event(&prev->demand_stat,CPU_DEMAND_DESCHEDULE,run); + lrq->lrq_load += task_load(prev); + + cpu_demand_event(get_task_lrq_stat(prev),CPU_DEMAND_DESCHEDULE,run); + update_local_cvt(prev, run); + } +#endif /* * if entering off of a kernel preemption go straight * to picking the next task. @@ -2243,30 +2824,15 @@ need_resched: cpu = smp_processor_id(); if (unlikely(!rq->nr_running)) { idle_balance(cpu, rq); - if (!rq->nr_running) { - next = rq->idle; - rq->expired_timestamp = 0; - wake_sleeping_dependent(cpu, rq); - goto switch_tasks; - } } - array = rq->active; - if (unlikely(!array->nr_active)) { - /* - * Switch the active and expired arrays. 
- */ - rq->active = rq->expired; - rq->expired = array; - array = rq->active; + next = rq_get_next_task(rq); + if (next == rq->idle) { rq->expired_timestamp = 0; - rq->best_expired_prio = MAX_PRIO; + wake_sleeping_dependent(cpu, rq); + goto switch_tasks; } - idx = sched_find_first_bit(array->bitmap); - queue = array->queue + idx; - next = list_entry(queue->next, task_t, run_list); - if (dependent_sleeper(cpu, rq, next)) { next = rq->idle; goto switch_tasks; @@ -2321,7 +2887,6 @@ switch_tasks: } EXPORT_SYMBOL(schedule); - #ifdef CONFIG_PREEMPT /* * this is is the entry point to schedule() from in-kernel preemption @@ -3009,7 +3574,7 @@ asmlinkage long sys_sched_yield(void) { runqueue_t *rq = this_rq_lock(); prio_array_t *array = current->array; - prio_array_t *target = rq->expired; + prio_array_t *target = rq_expired(current,rq); /* * We implement yielding by moving the task into the expired @@ -3019,7 +3584,7 @@ asmlinkage long sys_sched_yield(void) * array.) */ if (unlikely(rt_task(current))) - target = rq->active; + target = rq_active(current,rq); dequeue_task(current, array); enqueue_task(current, target); @@ -3396,7 +3961,6 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (!cpu_isset(dest_cpu, p->cpus_allowed)) goto out; - set_task_cpu(p, dest_cpu); if (p->array) { /* * Sync timestamp with rq_dest's before activating. 
@@ -3407,10 +3971,12 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) p->timestamp = p->timestamp - rq_src->timestamp_last_tick + rq_dest->timestamp_last_tick; deactivate_task(p, rq_src); + set_task_cpu(p, dest_cpu); activate_task(p, rq_dest, 0); if (TASK_PREEMPTS_CURR(p, rq_dest)) resched_task(rq_dest->curr); - } + } else + set_task_cpu(p, dest_cpu); out: double_rq_unlock(rq_src, rq_dest); @@ -3919,7 +4485,7 @@ int in_sched_functions(unsigned long addr) void __init sched_init(void) { runqueue_t *rq; - int i, j, k; + int i; #ifdef CONFIG_SMP /* Set up an initial dummy domain for early boot */ @@ -3938,36 +4504,49 @@ void __init sched_init(void) sched_group_init.next = &sched_group_init; sched_group_init.cpu_power = SCHED_LOAD_SCALE; #endif + init_cpu_classes(); for (i = 0; i < NR_CPUS; i++) { +#ifndef CONFIG_CKRM_CPU_SCHEDULE + int j, k; prio_array_t *array; rq = cpu_rq(i); spin_lock_init(&rq->lock); + + for (j = 0; j < 2; j++) { + array = rq->arrays + j; + for (k = 0; k < MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(MAX_PRIO, array->bitmap); + } + rq->active = rq->arrays; rq->expired = rq->arrays + 1; +#else + rq = cpu_rq(i); + spin_lock_init(&rq->lock); +#endif + rq->best_expired_prio = MAX_PRIO; #ifdef CONFIG_SMP rq->sd = &sched_domain_init; rq->cpu_load = 0; +#ifdef CONFIG_CKRM_CPU_SCHEDULE + ckrm_load_init(rq_ckrm_load(rq)); +#endif rq->active_balance = 0; rq->push_cpu = 0; rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); #endif atomic_set(&rq->nr_iowait, 0); - - for (j = 0; j < 2; j++) { - array = rq->arrays + j; - for (k = 0; k < MAX_PRIO; k++) { - INIT_LIST_HEAD(array->queue + k); - __clear_bit(k, array->bitmap); - } - // delimiter for bitsearch - __set_bit(MAX_PRIO, array->bitmap); - } } + /* * We have to do a little magic to get the first * thread right in SMP mode. 
@@ -3976,6 +4555,10 @@ void __init sched_init(void) rq->curr = current; rq->idle = current; set_task_cpu(current, smp_processor_id()); +#ifdef CONFIG_CKRM_CPU_SCHEDULE + current->cpu_class = get_default_cpu_class(); + current->array = NULL; +#endif wake_up_forked_process(current); /* @@ -4061,3 +4644,33 @@ int task_running_sys(struct task_struct *p) EXPORT_SYMBOL(task_running_sys); #endif +#ifdef CONFIG_CKRM_CPU_SCHEDULE +/** + * return the classqueue object of a certain processor + */ +struct classqueue_struct * get_cpu_classqueue(int cpu) +{ + return (& (cpu_rq(cpu)->classqueue) ); +} + +/** + * _ckrm_cpu_change_class - change the class of a task + */ +void _ckrm_cpu_change_class(task_t *tsk, struct ckrm_cpu_class *newcls) +{ + prio_array_t *array; + struct runqueue *rq; + unsigned long flags; + + rq = task_rq_lock(tsk,&flags); + array = tsk->array; + if (array) { + dequeue_task(tsk,array); + tsk->cpu_class = newcls; + enqueue_task(tsk,rq_active(tsk,rq)); + } else + tsk->cpu_class = newcls; + + task_rq_unlock(rq,&flags); +} +#endif