.long sys_mq_notify
.long sys_mq_getsetattr
.long sys_ni_syscall /* reserved for kexec */
- .long sys_ioprio_set
- .long sys_ioprio_get /* 285 */
syscall_table_size=(.-sys_call_table)
.long sys_mq_notify
.long sys_mq_getsetattr
.long sys_ni_syscall /* 268 reserved for sys_kexec_load */
- .long sys_ioprio_set
- .long sys_ioprio_get
# kblockd threads
#
-obj-y := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o ckrm-iostub.o
+obj-y := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o
obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
obj-$(CONFIG_IOSCHED_AS) += as-iosched.o
obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
-obj-$(CONFIG_CKRM_RES_BLKIO) += ckrm-io.o
obj-$(CONFIG_MAC_FLOPPY) += swim3.o
obj-$(CONFIG_BLK_DEV_FD) += floppy.o
obj-$(CONFIG_BLK_DEV_FD98) += floppy98.o
* Based on ideas from a previously unfinished io
* scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
*
- * IO priorities are supported, from 0% to 100% in 5% increments. Both of
- * those values have special meaning - 0% class is allowed to do io if
- * noone else wants to use the disk. 100% is considered real-time io, and
- * always get priority. Default process io rate is 95%. In absence of other
- * io, a class may consume 100% disk bandwidth regardless. Withing a class,
- * bandwidth is distributed equally among the citizens.
- *
- * TODO:
- * - cfq_select_requests() needs some work for 5-95% io
- * - barriers not supported
- * - export grace periods in ms, not jiffies
- *
* Copyright (C) 2003 Jens Axboe <axboe@suse.de>
*/
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/rbtree.h>
#include <linux/mempool.h>
-#include <asm/div64.h>
-
-#if IOPRIO_NR > BITS_PER_LONG
-#error Cannot support this many io priority levels
-#endif
-
-#define LIMIT_DEBUG 1
/*
* tunables
*/
-static int cfq_quantum = 6;
-static int cfq_quantum_io = 256;
-static int cfq_idle_quantum = 1;
-static int cfq_idle_quantum_io = 64;
-static int cfq_queued = 4;
-static int cfq_grace_rt = HZ / 100 ?: 1;
-static int cfq_grace_idle = HZ / 10;
+static int cfq_quantum = 4;
+static int cfq_queued = 8;
#define CFQ_QHASH_SHIFT 6
#define CFQ_QHASH_ENTRIES (1 << CFQ_QHASH_SHIFT)
-#define list_entry_qhash(entry) hlist_entry((entry), struct cfq_queue, cfq_hash)
+#define list_entry_qhash(entry) list_entry((entry), struct cfq_queue, cfq_hash)
#define CFQ_MHASH_SHIFT 8
#define CFQ_MHASH_BLOCK(sec) ((sec) >> 3)
#define CFQ_MHASH_ENTRIES (1 << CFQ_MHASH_SHIFT)
#define CFQ_MHASH_FN(sec) (hash_long(CFQ_MHASH_BLOCK((sec)),CFQ_MHASH_SHIFT))
+/* true while crq is linked on a merge-hash bucket; parenthesised so the expansion composes safely */
+#define ON_MHASH(crq) (!list_empty(&(crq)->hash))
#define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors)
-#define list_entry_hash(ptr) hlist_entry((ptr), struct cfq_rq, hash)
+#define list_entry_hash(ptr) list_entry((ptr), struct cfq_rq, hash)
#define list_entry_cfqq(ptr) list_entry((ptr), struct cfq_queue, cfq_list)
-#define list_entry_prio(ptr) list_entry((ptr), struct cfq_rq, prio_list)
-
-#define cfq_account_io(crq) \
- ((crq)->ioprio != IOPRIO_IDLE && (crq)->ioprio != IOPRIO_RT)
-
-/* define to be 50 ms for now; make tunable later */
-#define CFQ_EPOCH 50000
-/* Needs to be made tunable right away, in MiB/s */
-#define CFQ_DISKBW 10
-/* Temporary global limit, as percent of available b/w, for each "class" */
-#define CFQ_TEMPLIM 10
-
-/*
- * defines how we distribute bandwidth (can be tgid, uid, etc)
- */
-
-/* FIXME: change hash_key to be sizeof(void *) rather than sizeof(int)
- * otherwise the cast of cki_tsk_icls will not work reliably on 64-bit arches.
- * OR, change cki_tsk_icls to return ints (will need another id space to be
- * managed)
- */
-
-#if defined(CONFIG_CKRM_RES_BLKIO) || defined(CONFIG_CKRM_RES_BLKIO_MODULE)
-extern inline void *cki_hash_key(struct task_struct *tsk);
-extern inline int cki_ioprio(struct task_struct *tsk);
-#define cfq_hash_key(current) ((int)cki_hash_key((current)))
-#define cfq_ioprio(current) (cki_ioprio((current)))
-
-#else
-#define cfq_hash_key(current) ((current)->tgid)
-
-/*
- * move to io_context
- */
-#define cfq_ioprio(current) ((current)->ioprio)
-#endif
-#define CFQ_WAIT_RT 0
-#define CFQ_WAIT_NORM 1
+#define RQ_DATA(rq) ((struct cfq_rq *) (rq)->elevator_private)
static kmem_cache_t *crq_pool;
static kmem_cache_t *cfq_pool;
static mempool_t *cfq_mpool;
-/*
- * defines an io priority level
- */
-struct io_prio_data {
- struct list_head rr_list;
- int busy_queues;
- int busy_rq;
- unsigned long busy_sectors;
-
- /* requests, sectors and queues
- * added(in),dispatched/deleted(out)
- * at this priority level.
- */
- atomic_t cum_rq_in,cum_rq_out;
- atomic_t cum_sectors_in,cum_sectors_out;
- atomic_t cum_queues_in,cum_queues_out;
-
-#ifdef LIMIT_DEBUG
- int nskip;
- unsigned long navsec;
- unsigned long csectorate;
- unsigned long lsectorate;
-#endif
-
- struct list_head prio_list;
- int last_rq;
- int last_sectors;
-};
-
-/*
- * per-request queue structure
- */
+/*
+ * per-request_queue scheduler state: the round-robin list of busy
+ * cfq_queues, the two hash tables and the tunables copied in at init time
+ */
struct cfq_data {
struct list_head rr_list;
struct list_head *dispatch;
- struct hlist_head *cfq_hash;
- struct hlist_head *crq_hash;
- mempool_t *crq_pool;
+ struct list_head *cfq_hash; /* CFQ_QHASH_ENTRIES buckets of cfq_queues */
- struct io_prio_data cid[IOPRIO_NR];
+ struct list_head *crq_hash; /* CFQ_MHASH_ENTRIES buckets of cfq_rqs (merge hash) */
- /*
- * total number of busy queues and requests
- */
- int busy_rq;
- int busy_queues;
- unsigned long busy_sectors;
+ unsigned int busy_queues; /* bumped when a queue joins rr_list, dropped in cfq_put_queue() */
+ unsigned int max_queued; /* upper clamp for the per-queue limit in cfq_may_queue() */
+ mempool_t *crq_pool; /* backing pool for struct cfq_rq */
request_queue_t *queue;
- unsigned long rq_starved_mask;
-
- /*
- * grace period handling
- */
- struct timer_list timer;
- unsigned long wait_end;
- unsigned long flags;
- struct work_struct work;
/*
* tunables
*/
unsigned int cfq_quantum;
- unsigned int cfq_quantum_io;
- unsigned int cfq_idle_quantum;
- unsigned int cfq_idle_quantum_io;
unsigned int cfq_queued;
- unsigned int cfq_grace_rt;
- unsigned int cfq_grace_idle;
-
- unsigned long cfq_epoch; /* duration for limit enforcement */
- unsigned long cfq_epochsectors; /* max sectors dispatchable/epoch */
};
-/*
- * per-class structure
- */
+/*
+ * one queue per pid key: hashed on cfq_hash for lookup, linked on the
+ * round-robin list through cfq_list while it has requests, with the
+ * pending requests kept in the sort_list rbtree
+ */
struct cfq_queue {
+ struct list_head cfq_hash;
struct list_head cfq_list;
- struct hlist_node cfq_hash;
- int hash_key;
struct rb_root sort_list;
+ int pid; /* lookup key compared in __cfq_find_cfq_hash() */
int queued[2];
- int ioprio;
-
- unsigned long avsec; /* avg sectors dispatched/epoch */
- unsigned long long lastime; /* timestamp of last request served */
- unsigned long sectorate; /* limit for sectors served/epoch */
- int skipped; /* queue skipped at last dispatch ? */
+#if 0
+ /*
+ * with a simple addition like this, we can do io priorities. almost.
+ * does need a split request free list, too.
+ */
+ int io_prio;
+#endif
};
-/*
- * per-request structure
- */
+/*
+ * scheduler-private data attached to each request (reached through
+ * RQ_DATA()): rbtree linkage into the owning queue plus merge-hash linkage
+ */
struct cfq_rq {
- struct cfq_queue *cfq_queue;
struct rb_node rb_node;
- struct hlist_node hash;
sector_t rb_key;
struct request *request;
- struct list_head prio_list;
- unsigned long nr_sectors;
- int ioprio;
+
+ struct cfq_queue *cfq_queue; /* set on rbtree insert, NULLed by cfq_del_crq_rb() */
+
+ struct list_head hash; /* merge-hash link; ON_MHASH() is false while unlinked */
};
static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq);
/*
* lots of deadline iosched dupes, can be abstracted later...
*/
+/*
+ * unlink crq from its merge-hash bucket without checking that it is hashed;
+ * list_del_init() leaves the node self-linked so ON_MHASH() reads false
+ */
+static inline void __cfq_del_crq_hash(struct cfq_rq *crq)
+{
+ list_del_init(&crq->hash);
+}
+
+/*
+ * checked variant of __cfq_del_crq_hash(): does nothing when crq is not
+ * currently on the merge hash
+ */
static inline void cfq_del_crq_hash(struct cfq_rq *crq)
{
- hlist_del_init(&crq->hash);
+ if (ON_MHASH(crq))
+ __cfq_del_crq_hash(crq);
}
-static inline void
-cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq)
+static void cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq)
{
cfq_del_crq_hash(crq);
+/*
+ * insert crq into the merge hash, bucketed by the request's end sector
+ * (rq_hash_key() is sector + nr_sectors). crq must not already be hashed.
+ */
static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq)
{
struct request *rq = crq->request;
- const int hash_idx = CFQ_MHASH_FN(rq_hash_key(rq));
- BUG_ON(!hlist_unhashed(&crq->hash));
-
- hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]);
+ BUG_ON(ON_MHASH(crq));
+
+ list_add(&crq->hash, &cfqd->crq_hash[CFQ_MHASH_FN(rq_hash_key(rq))]);
}
static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset)
{
- struct hlist_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)];
- struct hlist_node *entry, *next;
+ struct list_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)];
+ struct list_head *entry, *next = hash_list->next;
- hlist_for_each_safe(entry, next, hash_list) {
+ while ((entry = next) != hash_list) {
struct cfq_rq *crq = list_entry_hash(entry);
struct request *__rq = crq->request;
- BUG_ON(hlist_unhashed(&crq->hash));
+ next = entry->next;
+
+ BUG_ON(!ON_MHASH(crq));
if (!rq_mergeable(__rq)) {
- cfq_del_crq_hash(crq);
+ __cfq_del_crq_hash(crq);
continue;
}
/*
* rb tree support functions
*/
-#define RB_EMPTY(node) ((node)->rb_node == NULL)
+#define RB_NONE (2)
+#define RB_EMPTY(node) ((node)->rb_node == NULL)
+#define RB_CLEAR(node) ((node)->rb_color = RB_NONE)
+#define RB_CLEAR_ROOT(root) ((root)->rb_node = NULL)
+#define ON_RB(node) ((node)->rb_color != RB_NONE)
#define rb_entry_crq(node) rb_entry((node), struct cfq_rq, rb_node)
#define rq_rb_key(rq) (rq)->sector
-static void
-cfq_del_crq_rb(struct cfq_data *cfqd, struct cfq_queue *cfqq,struct cfq_rq *crq)
+/*
+ * take crq off cfqq's sort_list rbtree, if it is on one: drops the per-
+ * direction queued count, erases the node and detaches crq from the queue.
+ * The node is marked RB_NONE afterwards so a later ON_RB() test does not
+ * mistake the erased node for one that is still linked.
+ */
+static inline void cfq_del_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
{
- if (crq->cfq_queue) {
- crq->cfq_queue = NULL;
-
- if (cfq_account_io(crq)) {
- cfqd->busy_rq--;
- cfqd->busy_sectors -= crq->nr_sectors;
- cfqd->cid[crq->ioprio].busy_rq--;
- cfqd->cid[crq->ioprio].busy_sectors -= crq->nr_sectors;
- }
- atomic_inc(&(cfqd->cid[crq->ioprio].cum_rq_out));
- atomic_add(crq->nr_sectors,
- &(cfqd->cid[crq->ioprio].cum_sectors_out));
+ if (ON_RB(&crq->rb_node)) {
cfqq->queued[rq_data_dir(crq->request)]--;
rb_erase(&crq->rb_node, &cfqq->sort_list);
+ RB_CLEAR(&crq->rb_node);
+ crq->cfq_queue = NULL;
}
}
struct request *rq = crq->request;
struct cfq_rq *__alias;
-
+ crq->rb_key = rq_rb_key(rq);
cfqq->queued[rq_data_dir(rq)]++;
- if (cfq_account_io(crq)) {
- cfqd->busy_rq++;
- cfqd->busy_sectors += crq->nr_sectors;
- cfqd->cid[crq->ioprio].busy_rq++;
- cfqd->cid[crq->ioprio].busy_sectors += crq->nr_sectors;
- }
- atomic_inc(&(cfqd->cid[crq->ioprio].cum_rq_in));
- atomic_add(crq->nr_sectors,
- &(cfqd->cid[crq->ioprio].cum_sectors_in));
retry:
__alias = __cfq_add_crq_rb(cfqq, crq);
if (!__alias) {
rb_insert_color(&crq->rb_node, &cfqq->sort_list);
- crq->rb_key = rq_rb_key(rq);
crq->cfq_queue = cfqq;
return;
}
static struct request *
cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector)
{
- struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(current));
+ struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->tgid);
struct rb_node *n;
if (!cfqq)
static void cfq_remove_request(request_queue_t *q, struct request *rq)
{
struct cfq_data *cfqd = q->elevator.elevator_data;
- struct cfq_rq *crq = RQ_ELV_DATA(rq);
+ struct cfq_rq *crq = RQ_DATA(rq);
if (crq) {
+ struct cfq_queue *cfqq = crq->cfq_queue;
cfq_remove_merge_hints(q, crq);
- list_del_init(&crq->prio_list);
list_del_init(&rq->queuelist);
- /*
- * set a grace period timer to allow realtime io to make real
- * progress, if we release an rt request. for normal request,
- * set timer so idle io doesn't interfere with other io
- */
- if (crq->ioprio == IOPRIO_RT) {
- set_bit(CFQ_WAIT_RT, &cfqd->flags);
- cfqd->wait_end = jiffies + cfqd->cfq_grace_rt;
- } else if (crq->ioprio != IOPRIO_IDLE) {
- set_bit(CFQ_WAIT_NORM, &cfqd->flags);
- cfqd->wait_end = jiffies + cfqd->cfq_grace_idle;
- }
-
- if (crq->cfq_queue) {
- struct cfq_queue *cfqq = crq->cfq_queue;
-
- cfq_del_crq_rb(cfqd, cfqq, crq);
+ if (cfqq) {
+ cfq_del_crq_rb(cfqq, crq);
if (RB_EMPTY(&cfqq->sort_list))
cfq_put_queue(cfqd, cfqq);
+/*
+ * called after req has absorbed a merge: re-hash it under its new end
+ * sector, re-sort it in its queue's rbtree if the start sector moved, and
+ * remember it as the queue's last merge candidate
+ */
static void cfq_merged_request(request_queue_t *q, struct request *req)
{
struct cfq_data *cfqd = q->elevator.elevator_data;
- struct cfq_rq *crq = RQ_ELV_DATA(req);
- int tmp;
+ struct cfq_rq *crq = RQ_DATA(req);
cfq_del_crq_hash(crq);
cfq_add_crq_hash(cfqd, crq);
- if (crq->cfq_queue && (rq_rb_key(req) != crq->rb_key)) {
+ if (ON_RB(&crq->rb_node) && (rq_rb_key(req) != crq->rb_key)) {
struct cfq_queue *cfqq = crq->cfq_queue;
- cfq_del_crq_rb(cfqd, cfqq, crq);
+ cfq_del_crq_rb(cfqq, crq);
cfq_add_crq_rb(cfqd, cfqq, crq);
}
- tmp = req->hard_nr_sectors - crq->nr_sectors;
- cfqd->busy_sectors += tmp;
- cfqd->cid[crq->ioprio].busy_sectors += tmp;
- atomic_add(tmp,&(cfqd->cid[crq->ioprio].cum_sectors_in));
-
- crq->nr_sectors = req->hard_nr_sectors;
-
q->last_merge = req;
}
cfq_remove_request(q, next);
}
-/*
- * sort into dispatch list, in optimal ascending order
- */
static void
cfq_dispatch_sort(struct cfq_data *cfqd, struct cfq_queue *cfqq,
struct cfq_rq *crq)
struct list_head *head = cfqd->dispatch, *entry = head;
struct request *__rq;
- cfq_del_crq_rb(cfqd, cfqq, crq);
+ cfq_del_crq_rb(cfqq, crq);
cfq_remove_merge_hints(cfqd->queue, crq);
if (!list_empty(head)) {
list_add_tail(&crq->request->queuelist, entry);
}
-/*
- * remove from io scheduler core and put on dispatch list for service
- */
-static inline int
+/*
+ * move the first request in cfqq's sort_list (rb_first) to the dispatch
+ * list via cfq_dispatch_sort(). The caller must ensure sort_list is not
+ * empty.
+ */
+static inline void
__cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd,
struct cfq_queue *cfqq)
{
- struct cfq_rq *crq;
- unsigned long long ts, gap;
- unsigned long newavsec;
-
- crq = rb_entry_crq(rb_first(&cfqq->sort_list));
-
-#if 1
- /* Determine if queue should be skipped for being overshare */
- ts = sched_clock();
- gap = ts - cfqq->lastime;
-#ifdef LIMIT_DEBUG
- cfqq->sectorate = (cfqd->cfq_epochsectors
- * CFQ_TEMPLIM)/100;
-
-#endif
- if ((gap >= cfqd->cfq_epoch) || (gap < 0)) {
- cfqq->avsec = crq->nr_sectors ;
- cfqq->lastime = ts;
- } else {
- u64 tmp;
- /* Age old average and accumalate request to be served */
-
-// tmp = (u64) (cfqq->avsec * gap) ;
-// do_div(tmp, cfqd->cfq_epoch);
- newavsec = (unsigned long)(cfqq->avsec >> 1) + crq->nr_sectors;
-// if (crq->ioprio >= 0 && crq->ioprio <= 20)
-// cfqd->cid[crq->ioprio].lsectorate = newavsec;
-// atomic_set(&(cfqd->cid[crq->ioprio].lsectorate),
-// newavsec);
-
- if ((newavsec < cfqq->sectorate) || cfqq->skipped) {
- cfqq->avsec = newavsec ;
- cfqq->lastime = ts;
- cfqq->skipped = 0;
- } else {
- /* queue over share ; skip once */
- cfqq->skipped = 1;
-#ifdef LIMIT_DEBUG
-// atomic_inc(&(cfqd->cid[crq->ioprio].nskip));
-// if (crq->ioprio >= 0 && crq->ioprio <= 20)
-// cfqd->cid[crq->ioprio].nskip++;
-#endif
- return 0;
- }
- }
-#endif
-
-#ifdef LIMIT_DEBUG
-// if (crq->ioprio >= 0 && crq->ioprio <= 20) {
-// cfqd->cid[crq->ioprio].navsec = cfqq->avsec;
-// cfqd->cid[crq->ioprio].csectorate = cfqq->sectorate;
-// }
+ struct cfq_rq *crq = rb_entry_crq(rb_first(&cfqq->sort_list));
-// atomic_set(&(cfqd->cid[crq->ioprio].navsec),cfqq->avsec);
-// atomic_set(&(cfqd->cid[crq->ioprio].csectorate),cfqq->sectorate);
-#endif
cfq_dispatch_sort(cfqd, cfqq, crq);
-
- /*
- * technically, for IOPRIO_RT we don't need to add it to the list.
- */
- list_add_tail(&crq->prio_list, &cfqd->cid[cfqq->ioprio].prio_list);
- return crq->nr_sectors;
}
-static int
-cfq_dispatch_requests(request_queue_t *q, int prio, int max_rq, int max_sectors)
+/*
+ * round-robin dispatch: each pass over rr_list moves one request from
+ * every busy queue to the dispatch list, releasing queues that run dry
+ * through cfq_put_queue(). Passes repeat until at least cfq_quantum
+ * requests were moved or no queue has requests left.
+ *
+ * returns 1 if anything was dispatched, 0 if rr_list was empty.
+ */
+static int cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd)
{
- struct cfq_data *cfqd = q->elevator.elevator_data;
- struct list_head *plist = &cfqd->cid[prio].rr_list;
- struct list_head *entry, *nxt;
- int q_rq, q_io;
- int ret ;
+ struct cfq_queue *cfqq;
+ struct list_head *entry, *tmp;
+ int ret, queued, good_queues;
- /*
- * for each queue at this prio level, dispatch a request
- */
- q_rq = q_io = 0;
- list_for_each_safe(entry, nxt, plist) {
- struct cfq_queue *cfqq = list_entry_cfqq(entry);
+ if (list_empty(&cfqd->rr_list))
+ return 0;
+
+ queued = ret = 0;
+restart:
+ good_queues = 0;
+ list_for_each_safe(entry, tmp, &cfqd->rr_list) {
+ /*
+ * use the iterator, not rr_list.next: taking the list head on
+ * every iteration drains the first queue while the others wait
+ */
+ cfqq = list_entry_cfqq(entry);
BUG_ON(RB_EMPTY(&cfqq->sort_list));
- ret = __cfq_dispatch_requests(q, cfqd, cfqq);
- if (ret <= 0) {
- continue; /* skip queue */
- /* can optimize more by moving q to end of plist ? */
- }
- q_io += ret ;
- q_rq++ ;
+ __cfq_dispatch_requests(q, cfqd, cfqq);
if (RB_EMPTY(&cfqq->sort_list))
cfq_put_queue(cfqd, cfqq);
- /*
- * if we hit the queue limit, put the string of serviced
- * queues at the back of the pending list
- */
- if (q_io >= max_sectors || q_rq >= max_rq) {
- struct list_head *prv = nxt->prev;
-
- if (prv != plist) {
- list_del(plist);
- list_add(plist, prv);
- }
- break;
- }
- }
-
- cfqd->cid[prio].last_rq = q_rq;
- cfqd->cid[prio].last_sectors = q_io;
- return q_rq;
-}
-
-/*
- * try to move some requests to the dispatch list. return 0 on success
- */
-static int cfq_select_requests(request_queue_t *q, struct cfq_data *cfqd)
-{
- int queued, busy_rq, busy_sectors, i;
-
- /*
- * if there's any realtime io, only schedule that
- */
- if (cfq_dispatch_requests(q, IOPRIO_RT, cfqd->cfq_quantum, cfqd->cfq_quantum_io))
- return 1;
-
- /*
- * if RT io was last serviced and grace time hasn't expired,
- * arm the timer to restart queueing if no other RT io has been
- * submitted in the mean time
- */
- if (test_bit(CFQ_WAIT_RT, &cfqd->flags)) {
- if (time_before(jiffies, cfqd->wait_end)) {
- mod_timer(&cfqd->timer, cfqd->wait_end);
- return 0;
- }
- clear_bit(CFQ_WAIT_RT, &cfqd->flags);
- }
-
- /*
- * for each priority level, calculate number of requests we
- * are allowed to put into service.
- */
- queued = 0;
- busy_rq = cfqd->busy_rq;
- busy_sectors = cfqd->busy_sectors;
- for (i = IOPRIO_RT - 1; i > IOPRIO_IDLE; i--) {
- const int o_rq = busy_rq - cfqd->cid[i].busy_rq;
- const int o_sectors = busy_sectors - cfqd->cid[i].busy_sectors;
- int q_rq = cfqd->cfq_quantum * (i + 1) / IOPRIO_NR;
- int q_io = cfqd->cfq_quantum_io * (i + 1) / IOPRIO_NR;
-
- /*
- * no need to keep iterating the list, if there are no
- * requests pending anymore
- */
- if (!cfqd->busy_rq)
- break;
-
- /*
- * find out how many requests and sectors we are allowed to
- * service
- */
- if (o_rq)
- q_rq = o_sectors * (i + 1) / IOPRIO_NR;
- if (q_rq > cfqd->cfq_quantum)
- q_rq = cfqd->cfq_quantum;
-
- if (o_sectors)
- q_io = o_sectors * (i + 1) / IOPRIO_NR;
- if (q_io > cfqd->cfq_quantum_io)
- q_io = cfqd->cfq_quantum_io;
-
- /*
- * average with last dispatched for fairness
- */
- if (cfqd->cid[i].last_rq != -1)
- q_rq = (cfqd->cid[i].last_rq + q_rq) / 2;
- if (cfqd->cid[i].last_sectors != -1)
- q_io = (cfqd->cid[i].last_sectors + q_io) / 2;
-
- queued += cfq_dispatch_requests(q, i, q_rq, q_io);
- }
-
- if (queued)
- return 1;
+ else
+ good_queues++;
- /*
- * only allow dispatch of idle io, if the queue has been idle from
- * servicing RT or normal io for the grace period
- */
- if (test_bit(CFQ_WAIT_NORM, &cfqd->flags)) {
- if (time_before(jiffies, cfqd->wait_end)) {
- mod_timer(&cfqd->timer, cfqd->wait_end);
- return 0;
- }
- clear_bit(CFQ_WAIT_NORM, &cfqd->flags);
+ queued++;
+ ret = 1;
}
- /*
- * if we found nothing to do, allow idle io to be serviced
- */
- if (cfq_dispatch_requests(q, IOPRIO_IDLE, cfqd->cfq_idle_quantum, cfqd->cfq_idle_quantum_io))
- return 1;
+ if ((queued < cfqd->cfq_quantum) && good_queues)
+ goto restart;
- return 0;
+ return ret;
}
static struct request *cfq_next_request(request_queue_t *q)
if (!list_empty(cfqd->dispatch)) {
struct cfq_rq *crq;
dispatch:
- /*
- * end grace period, we are servicing a request
- */
- del_timer(&cfqd->timer);
- clear_bit(CFQ_WAIT_RT, &cfqd->flags);
- clear_bit(CFQ_WAIT_NORM, &cfqd->flags);
-
- BUG_ON(list_empty(cfqd->dispatch));
rq = list_entry_rq(cfqd->dispatch->next);
- BUG_ON(q->last_merge == rq);
- crq = RQ_ELV_DATA(rq);
- if (crq) {
- BUG_ON(!hlist_unhashed(&crq->hash));
- list_del_init(&crq->prio_list);
- }
+ crq = RQ_DATA(rq);
+ if (crq)
+ cfq_remove_merge_hints(q, crq);
return rq;
}
- /*
- * we moved requests to dispatch list, go back end serve one
- */
- if (cfq_select_requests(q, cfqd))
+ if (cfq_dispatch_requests(q, cfqd))
goto dispatch;
return NULL;
}
+/*
+ * scan bucket hashval of the queue hash for a cfq_queue whose pid matches;
+ * returns the queue, or NULL when the bucket holds no match
+ */
static inline struct cfq_queue *
-__cfq_find_cfq_hash(struct cfq_data *cfqd, int hashkey, const int hashval)
+__cfq_find_cfq_hash(struct cfq_data *cfqd, int pid, const int hashval)
{
- struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
- struct hlist_node *entry;
+ struct list_head *hash_list = &cfqd->cfq_hash[hashval];
+ struct list_head *entry;
- hlist_for_each(entry, hash_list) {
+ list_for_each(entry, hash_list) {
struct cfq_queue *__cfqq = list_entry_qhash(entry);
- if (__cfqq->hash_key == hashkey)
+ if (__cfqq->pid == pid)
return __cfqq;
}
return NULL;
}
-
-static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int hashkey)
+/*
+ * look up the cfq_queue keyed by pid; returns NULL if there is none.
+ * The bucket is derived from the pid argument itself, so the lookup stays
+ * correct for any key and not only for the calling task's tgid.
+ */
+static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid)
{
- const int hashval = hash_long(hashkey, CFQ_QHASH_SHIFT);
+ const int hashval = hash_long(pid, CFQ_QHASH_SHIFT);
- return __cfq_find_cfq_hash(cfqd, hashkey, hashval);
+ return __cfq_find_cfq_hash(cfqd, pid, hashval);
}
+/*
+ * retire a queue: drop the busy count, unlink it from the round-robin
+ * list and from the queue hash, and return it to cfq_mpool
+ */
static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
cfqd->busy_queues--;
- WARN_ON(cfqd->busy_queues < 0);
-
- cfqd->cid[cfqq->ioprio].busy_queues--;
- WARN_ON(cfqd->cid[cfqq->ioprio].busy_queues < 0);
- atomic_inc(&(cfqd->cid[cfqq->ioprio].cum_queues_out));
-
list_del(&cfqq->cfq_list);
- hlist_del(&cfqq->cfq_hash);
+ list_del(&cfqq->cfq_hash);
mempool_free(cfqq, cfq_mpool);
}
-static struct cfq_queue *__cfq_get_queue(struct cfq_data *cfqd, int hashkey,
+static struct cfq_queue *__cfq_get_queue(struct cfq_data *cfqd, int pid,
int gfp_mask)
{
- const int hashval = hash_long(hashkey, CFQ_QHASH_SHIFT);
+ const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT);
struct cfq_queue *cfqq, *new_cfqq = NULL;
request_queue_t *q = cfqd->queue;
retry:
- cfqq = __cfq_find_cfq_hash(cfqd, hashkey, hashval);
+ cfqq = __cfq_find_cfq_hash(cfqd, pid, hashval);
if (!cfqq) {
if (new_cfqq) {
} else
return NULL;
- memset(cfqq, 0, sizeof(*cfqq));
- INIT_HLIST_NODE(&cfqq->cfq_hash);
+ INIT_LIST_HEAD(&cfqq->cfq_hash);
INIT_LIST_HEAD(&cfqq->cfq_list);
- cfqq->hash_key = cfq_hash_key(current);
- cfqq->ioprio = cfq_ioprio(current);
- cfqq->avsec = 0 ;
- cfqq->lastime = sched_clock();
- cfqq->sectorate = (cfqd->cfq_epochsectors * CFQ_TEMPLIM)/100;
- hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
+ RB_CLEAR_ROOT(&cfqq->sort_list);
+
+ cfqq->pid = pid;
+ cfqq->queued[0] = cfqq->queued[1] = 0;
+ list_add(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
}
if (new_cfqq)
return cfqq;
}
-static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, int hashkey,
+/*
+ * locked wrapper around __cfq_get_queue(): takes q->queue_lock with
+ * interrupts disabled for the duration of the call and returns its result
+ */
+static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, int pid,
int gfp_mask)
{
request_queue_t *q = cfqd->queue;
struct cfq_queue *cfqq;
spin_lock_irq(q->queue_lock);
- cfqq = __cfq_get_queue(cfqd, hashkey, gfp_mask);
+ cfqq = __cfq_get_queue(cfqd, pid, gfp_mask);
spin_unlock_irq(q->queue_lock);
return cfqq;
}
-static void
-__cfq_enqueue(request_queue_t *q, struct cfq_data *cfqd, struct cfq_rq *crq)
+static void cfq_enqueue(struct cfq_data *cfqd, struct cfq_rq *crq)
{
- const int prio = crq->ioprio;
struct cfq_queue *cfqq;
- cfqq = __cfq_get_queue(cfqd, cfq_hash_key(current), GFP_ATOMIC);
+ cfqq = __cfq_get_queue(cfqd, current->tgid, GFP_ATOMIC);
if (cfqq) {
-
- /*
- * not too good...
- */
- if (prio > cfqq->ioprio) {
- printk("prio hash collision %d %d\n",
- prio, cfqq->ioprio);
- if (!list_empty(&cfqq->cfq_list)) {
- cfqd->cid[cfqq->ioprio].busy_queues--;
- WARN_ON(cfqd->cid[cfqq->ioprio].busy_queues<0);
- atomic_inc(&(cfqd->cid[cfqq->ioprio].cum_queues_out));
- cfqd->cid[prio].busy_queues++;
- atomic_inc(&(cfqd->cid[prio].cum_queues_in));
- list_move_tail(&cfqq->cfq_list,
- &cfqd->cid[prio].rr_list);
- }
- cfqq->ioprio = prio;
- }
-
cfq_add_crq_rb(cfqd, cfqq, crq);
if (list_empty(&cfqq->cfq_list)) {
- list_add_tail(&cfqq->cfq_list,
- &cfqd->cid[prio].rr_list);
- cfqd->cid[prio].busy_queues++;
- atomic_inc(&(cfqd->cid[prio].cum_queues_in));
+ list_add(&cfqq->cfq_list, &cfqd->rr_list);
cfqd->busy_queues++;
}
-
- if (rq_mergeable(crq->request)) {
- cfq_add_crq_hash(cfqd, crq);
-
- if (!q->last_merge)
- q->last_merge = crq->request;
- }
-
} else {
/*
* should can only happen if the request wasn't allocated
}
}
-static void cfq_reenqueue(request_queue_t *q, struct cfq_data *cfqd, int prio)
-{
- struct list_head *prio_list = &cfqd->cid[prio].prio_list;
- struct list_head *entry, *tmp;
-
- list_for_each_safe(entry, tmp, prio_list) {
- struct cfq_rq *crq = list_entry_prio(entry);
-
- list_del_init(entry);
- list_del_init(&crq->request->queuelist);
- __cfq_enqueue(q, cfqd, crq);
- }
-}
-
-static void
-cfq_enqueue(request_queue_t *q, struct cfq_data *cfqd, struct cfq_rq *crq)
-{
- const int prio = cfq_ioprio(current);
-
- crq->ioprio = prio;
- crq->nr_sectors = crq->request->hard_nr_sectors;
- __cfq_enqueue(q, cfqd, crq);
-
- if (prio == IOPRIO_RT) {
- int i;
-
- /*
- * realtime io gets priority, move all other io back
- */
- for (i = IOPRIO_IDLE; i < IOPRIO_RT; i++)
- cfq_reenqueue(q, cfqd, i);
- } else if (prio != IOPRIO_IDLE) {
- /*
- * check if we need to move idle io back into queue
- */
- cfq_reenqueue(q, cfqd, IOPRIO_IDLE);
- }
-}
-
static void
cfq_insert_request(request_queue_t *q, struct request *rq, int where)
{
struct cfq_data *cfqd = q->elevator.elevator_data;
- struct cfq_rq *crq = RQ_ELV_DATA(rq);
+ struct cfq_rq *crq = RQ_DATA(rq);
switch (where) {
case ELEVATOR_INSERT_BACK:
-#if 0
while (cfq_dispatch_requests(q, cfqd))
;
-#endif
list_add_tail(&rq->queuelist, cfqd->dispatch);
break;
case ELEVATOR_INSERT_FRONT:
break;
case ELEVATOR_INSERT_SORT:
BUG_ON(!blk_fs_request(rq));
- cfq_enqueue(q, cfqd, crq);
+ cfq_enqueue(cfqd, crq);
break;
default:
- printk("%s: bad insert point %d\n",
- __FUNCTION__,where);
+ printk("%s: bad insert point %d\n", __FUNCTION__,where);
return;
}
+
+ if (rq_mergeable(rq)) {
+ cfq_add_crq_hash(cfqd, crq);
+
+ if (!q->last_merge)
+ q->last_merge = rq;
+ }
}
static int cfq_queue_empty(request_queue_t *q)
{
struct cfq_data *cfqd = q->elevator.elevator_data;
- if (list_empty(cfqd->dispatch) && !cfqd->busy_queues)
+ if (list_empty(cfqd->dispatch) && list_empty(&cfqd->rr_list))
return 1;
return 0;
static struct request *
cfq_former_request(request_queue_t *q, struct request *rq)
{
- struct cfq_rq *crq = RQ_ELV_DATA(rq);
+ struct cfq_rq *crq = RQ_DATA(rq);
struct rb_node *rbprev = rb_prev(&crq->rb_node);
if (rbprev)
static struct request *
cfq_latter_request(request_queue_t *q, struct request *rq)
{
- struct cfq_rq *crq = RQ_ELV_DATA(rq);
+ struct cfq_rq *crq = RQ_DATA(rq);
struct rb_node *rbnext = rb_next(&crq->rb_node);
if (rbnext)
return NULL;
}
-static void cfq_queue_congested(request_queue_t *q)
-{
- struct cfq_data *cfqd = q->elevator.elevator_data;
-
- set_bit(cfq_ioprio(current), &cfqd->rq_starved_mask);
-}
-
+/*
+ * returns 0 when the current task's queue already holds more than its
+ * share of requests in direction rw, 1 otherwise (presumably 1 = allocation
+ * allowed; confirm against the elevator core). The share is
+ * (nr_requests - cfq_queued) / busy_queues, clamped to [3, max_queued].
+ */
static int cfq_may_queue(request_queue_t *q, int rw)
{
struct cfq_data *cfqd = q->elevator.elevator_data;
struct cfq_queue *cfqq;
- const int prio = cfq_ioprio(current);
- int limit, ret = 1;
+ int ret = 1;
if (!cfqd->busy_queues)
goto out;
- cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(current));
- if (!cfqq)
- goto out;
-
- cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(current));
- if (!cfqq)
- goto out;
-
- /*
- * if higher or equal prio io is sleeping waiting for a request, don't
- * allow this one to allocate one. as long as ll_rw_blk does fifo
- * waitqueue wakeups this should work...
- */
- if (cfqd->rq_starved_mask & ~((1 << prio) - 1))
- goto out;
+ cfqq = cfq_find_cfq_hash(cfqd, current->tgid);
+ if (cfqq) {
+ int limit = (q->nr_requests - cfqd->cfq_queued) / cfqd->busy_queues;
- if (cfqq->queued[rw] < cfqd->cfq_queued || !cfqd->cid[prio].busy_queues)
- goto out;
+ if (limit < 3)
+ limit = 3;
+ else if (limit > cfqd->max_queued)
+ limit = cfqd->max_queued;
- limit = q->nr_requests * (prio + 1) / IOPRIO_NR;
- limit /= cfqd->cid[prio].busy_queues;
- if (cfqq->queued[rw] > limit)
- ret = 0;
+ if (cfqq->queued[rw] > limit)
+ ret = 0;
+ }
out:
return ret;
}
static void cfq_put_request(request_queue_t *q, struct request *rq)
{
struct cfq_data *cfqd = q->elevator.elevator_data;
- struct cfq_rq *crq = RQ_ELV_DATA(rq);
+ struct cfq_rq *crq = RQ_DATA(rq);
struct request_list *rl;
int other_rw;
if (crq) {
BUG_ON(q->last_merge == rq);
- BUG_ON(!hlist_unhashed(&crq->hash));
+ BUG_ON(ON_MHASH(crq));
mempool_free(crq, cfqd->crq_pool);
rq->elevator_private = NULL;
/*
* prepare a queue up front, so cfq_enqueue() doesn't have to
*/
- cfqq = cfq_get_queue(cfqd, cfq_hash_key(current), gfp_mask);
+ cfqq = cfq_get_queue(cfqd, current->tgid, gfp_mask);
if (!cfqq)
return 1;
crq = mempool_alloc(cfqd->crq_pool, gfp_mask);
if (crq) {
- /*
- * process now has one request
- */
- clear_bit(cfq_ioprio(current), &cfqd->rq_starved_mask);
-
memset(crq, 0, sizeof(*crq));
+ RB_CLEAR(&crq->rb_node);
crq->request = rq;
- INIT_HLIST_NODE(&crq->hash);
- INIT_LIST_HEAD(&crq->prio_list);
+ crq->cfq_queue = NULL;
+ INIT_LIST_HEAD(&crq->hash);
rq->elevator_private = crq;
return 0;
}
kfree(cfqd);
}
-static void cfq_timer(unsigned long data)
-{
- struct cfq_data *cfqd = (struct cfq_data *) data;
-
- clear_bit(CFQ_WAIT_RT, &cfqd->flags);
- clear_bit(CFQ_WAIT_NORM, &cfqd->flags);
- kblockd_schedule_work(&cfqd->work);
-}
-
-static void cfq_work(void *data)
-{
- request_queue_t *q = data;
- unsigned long flags;
-
- spin_lock_irqsave(q->queue_lock, flags);
- if (cfq_next_request(q))
- q->request_fn(q);
- spin_unlock_irqrestore(q->queue_lock, flags);
-}
-
static int cfq_init(request_queue_t *q, elevator_t *e)
{
struct cfq_data *cfqd;
return -ENOMEM;
memset(cfqd, 0, sizeof(*cfqd));
- init_timer(&cfqd->timer);
- cfqd->timer.function = cfq_timer;
- cfqd->timer.data = (unsigned long) cfqd;
-
- INIT_WORK(&cfqd->work, cfq_work, q);
-
- for (i = 0; i < IOPRIO_NR; i++) {
- struct io_prio_data *cid = &cfqd->cid[i];
-
- INIT_LIST_HEAD(&cid->rr_list);
- INIT_LIST_HEAD(&cid->prio_list);
- cid->last_rq = -1;
- cid->last_sectors = -1;
-
- atomic_set(&cid->cum_rq_in,0);
- atomic_set(&cid->cum_rq_out,0);
- atomic_set(&cid->cum_sectors_in,0);
- atomic_set(&cid->cum_sectors_out,0);
- atomic_set(&cid->cum_queues_in,0);
- atomic_set(&cid->cum_queues_out,0);
-#if 0
- atomic_set(&cid->nskip,0);
- atomic_set(&cid->navsec,0);
- atomic_set(&cid->csectorate,0);
- atomic_set(&cid->lsectorate,0);
-#endif
- }
+ INIT_LIST_HEAD(&cfqd->rr_list);
- cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES,
- GFP_KERNEL);
+ cfqd->crq_hash = kmalloc(sizeof(struct list_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL);
if (!cfqd->crq_hash)
goto out_crqhash;
- cfqd->cfq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES,
- GFP_KERNEL);
+ cfqd->cfq_hash = kmalloc(sizeof(struct list_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL);
if (!cfqd->cfq_hash)
goto out_cfqhash;
- cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab,
- mempool_free_slab, crq_pool);
+ cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, crq_pool);
if (!cfqd->crq_pool)
goto out_crqpool;
for (i = 0; i < CFQ_MHASH_ENTRIES; i++)
- INIT_HLIST_HEAD(&cfqd->crq_hash[i]);
+ INIT_LIST_HEAD(&cfqd->crq_hash[i]);
for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
- INIT_HLIST_HEAD(&cfqd->cfq_hash[i]);
-
- cfqd->cfq_queued = cfq_queued;
- cfqd->cfq_quantum = cfq_quantum;
- cfqd->cfq_quantum_io = cfq_quantum_io;
- cfqd->cfq_idle_quantum = cfq_idle_quantum;
- cfqd->cfq_idle_quantum_io = cfq_idle_quantum_io;
- cfqd->cfq_grace_rt = cfq_grace_rt;
- cfqd->cfq_grace_idle = cfq_grace_idle;
-
- q->nr_requests <<= 2;
+ INIT_LIST_HEAD(&cfqd->cfq_hash[i]);
cfqd->dispatch = &q->queue_head;
e->elevator_data = cfqd;
cfqd->queue = q;
- cfqd->cfq_epoch = CFQ_EPOCH;
- if (q->hardsect_size)
- cfqd->cfq_epochsectors = ((CFQ_DISKBW * 1000000)/
- q->hardsect_size)* (1000000 / CFQ_EPOCH);
- else
- cfqd->cfq_epochsectors = ((CFQ_DISKBW * 1000000)/512)
- * (1000000 / CFQ_EPOCH) ;
+ /*
+ * just set it to some high value, we want anyone to be able to queue
+ * some requests. fairness is handled differently
+ */
+ cfqd->max_queued = q->nr_requests;
+ q->nr_requests = 8192;
+
+ cfqd->cfq_queued = cfq_queued;
+ cfqd->cfq_quantum = cfq_quantum;
return 0;
out_crqpool:
return cfq_var_show(__VAR, (page)); \
}
SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum);
-SHOW_FUNCTION(cfq_quantum_io_show, cfqd->cfq_quantum_io);
-SHOW_FUNCTION(cfq_idle_quantum_show, cfqd->cfq_idle_quantum);
-SHOW_FUNCTION(cfq_idle_quantum_io_show, cfqd->cfq_idle_quantum_io);
SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued);
-SHOW_FUNCTION(cfq_grace_rt_show, cfqd->cfq_grace_rt);
-SHOW_FUNCTION(cfq_grace_idle_show, cfqd->cfq_grace_idle);
#undef SHOW_FUNCTION
#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \
return ret; \
}
STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, INT_MAX);
-STORE_FUNCTION(cfq_quantum_io_store, &cfqd->cfq_quantum_io, 4, INT_MAX);
-STORE_FUNCTION(cfq_idle_quantum_store, &cfqd->cfq_idle_quantum, 1, INT_MAX);
-STORE_FUNCTION(cfq_idle_quantum_io_store, &cfqd->cfq_idle_quantum_io, 4, INT_MAX);
STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, INT_MAX);
-STORE_FUNCTION(cfq_grace_rt_store, &cfqd->cfq_grace_rt, 0, INT_MAX);
-STORE_FUNCTION(cfq_grace_idle_store, &cfqd->cfq_grace_idle, 0, INT_MAX);
#undef STORE_FUNCTION
-
-static ssize_t cfq_epoch_show(struct cfq_data *cfqd, char *page)
-{
- return sprintf(page, "%lu\n", cfqd->cfq_epoch);
-}
-
-static ssize_t cfq_epoch_store(struct cfq_data *cfqd, const char *page, size_t count)
-{
- char *p = (char *) page;
- cfqd->cfq_epoch = simple_strtoul(p, &p, 10);
- return count;
-}
-
-static ssize_t cfq_epochsectors_show(struct cfq_data *cfqd, char *page)
-{
- return sprintf(page, "%lu\n", cfqd->cfq_epochsectors);
-}
-
-static ssize_t
-cfq_epochsectors_store(struct cfq_data *cfqd, const char *page, size_t count)
-{
- char *p = (char *) page;
- cfqd->cfq_epochsectors = simple_strtoul(p, &p, 10);
- return count;
-}
-
-/* Additional entries to get priority level data */
-static ssize_t
-cfq_prio_show(struct cfq_data *cfqd, char *page, unsigned int priolvl)
-{
- int r1,r2,s1,s2,q1,q2;
-
- if (!(priolvl >= IOPRIO_IDLE && priolvl <= IOPRIO_RT))
- return 0;
-
- r1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_rq_in));
- r2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_rq_out));
- s1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_sectors_in));
- s2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_sectors_out));
- q1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_queues_in));
- q2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_queues_out));
-
- return sprintf(page,"skip %d avsec %lu rate %lu new %lu"
- "rq (%d,%d) sec (%d,%d) q (%d,%d)\n",
- cfqd->cid[priolvl].nskip,
- cfqd->cid[priolvl].navsec,
- cfqd->cid[priolvl].csectorate,
- cfqd->cid[priolvl].lsectorate,
-// atomic_read(&cfqd->cid[priolvl].nskip),
-// atomic_read(&cfqd->cid[priolvl].navsec),
-// atomic_read(&cfqd->cid[priolvl].csectorate),
-// atomic_read(&cfqd->cid[priolvl].lsectorate),
- r1,r2,
- s1,s2,
- q1,q2);
-}
-
-#define SHOW_PRIO_DATA(__PRIOLVL) \
-static ssize_t cfq_prio_##__PRIOLVL##_show(struct cfq_data *cfqd, char *page) \
-{ \
- return cfq_prio_show(cfqd,page,__PRIOLVL); \
-}
-SHOW_PRIO_DATA(0);
-SHOW_PRIO_DATA(1);
-SHOW_PRIO_DATA(2);
-SHOW_PRIO_DATA(3);
-SHOW_PRIO_DATA(4);
-SHOW_PRIO_DATA(5);
-SHOW_PRIO_DATA(6);
-SHOW_PRIO_DATA(7);
-SHOW_PRIO_DATA(8);
-SHOW_PRIO_DATA(9);
-SHOW_PRIO_DATA(10);
-SHOW_PRIO_DATA(11);
-SHOW_PRIO_DATA(12);
-SHOW_PRIO_DATA(13);
-SHOW_PRIO_DATA(14);
-SHOW_PRIO_DATA(15);
-SHOW_PRIO_DATA(16);
-SHOW_PRIO_DATA(17);
-SHOW_PRIO_DATA(18);
-SHOW_PRIO_DATA(19);
-SHOW_PRIO_DATA(20);
-#undef SHOW_PRIO_DATA
-
-
-static ssize_t cfq_prio_store(struct cfq_data *cfqd, const char *page, size_t count, int priolvl)
-{
- atomic_set(&(cfqd->cid[priolvl].cum_rq_in),0);
- atomic_set(&(cfqd->cid[priolvl].cum_rq_out),0);
- atomic_set(&(cfqd->cid[priolvl].cum_sectors_in),0);
- atomic_set(&(cfqd->cid[priolvl].cum_sectors_out),0);
- atomic_set(&(cfqd->cid[priolvl].cum_queues_in),0);
- atomic_set(&(cfqd->cid[priolvl].cum_queues_out),0);
-
- return count;
-}
-
-
-#define STORE_PRIO_DATA(__PRIOLVL) \
-static ssize_t cfq_prio_##__PRIOLVL##_store(struct cfq_data *cfqd, const char *page, size_t count) \
-{ \
- return cfq_prio_store(cfqd,page,count,__PRIOLVL); \
-}
-STORE_PRIO_DATA(0);
-STORE_PRIO_DATA(1);
-STORE_PRIO_DATA(2);
-STORE_PRIO_DATA(3);
-STORE_PRIO_DATA(4);
-STORE_PRIO_DATA(5);
-STORE_PRIO_DATA(6);
-STORE_PRIO_DATA(7);
-STORE_PRIO_DATA(8);
-STORE_PRIO_DATA(9);
-STORE_PRIO_DATA(10);
-STORE_PRIO_DATA(11);
-STORE_PRIO_DATA(12);
-STORE_PRIO_DATA(13);
-STORE_PRIO_DATA(14);
-STORE_PRIO_DATA(15);
-STORE_PRIO_DATA(16);
-STORE_PRIO_DATA(17);
-STORE_PRIO_DATA(18);
-STORE_PRIO_DATA(19);
-STORE_PRIO_DATA(20);
-#undef STORE_PRIO_DATA
-
-
static struct cfq_fs_entry cfq_quantum_entry = {
.attr = {.name = "quantum", .mode = S_IRUGO | S_IWUSR },
.show = cfq_quantum_show,
.store = cfq_quantum_store,
};
-static struct cfq_fs_entry cfq_quantum_io_entry = {
- .attr = {.name = "quantum_io", .mode = S_IRUGO | S_IWUSR },
- .show = cfq_quantum_io_show,
- .store = cfq_quantum_io_store,
-};
-static struct cfq_fs_entry cfq_idle_quantum_entry = {
- .attr = {.name = "idle_quantum", .mode = S_IRUGO | S_IWUSR },
- .show = cfq_idle_quantum_show,
- .store = cfq_idle_quantum_store,
-};
-static struct cfq_fs_entry cfq_idle_quantum_io_entry = {
- .attr = {.name = "idle_quantum_io", .mode = S_IRUGO | S_IWUSR },
- .show = cfq_idle_quantum_io_show,
- .store = cfq_idle_quantum_io_store,
-};
static struct cfq_fs_entry cfq_queued_entry = {
.attr = {.name = "queued", .mode = S_IRUGO | S_IWUSR },
.show = cfq_queued_show,
.store = cfq_queued_store,
};
-static struct cfq_fs_entry cfq_grace_rt_entry = {
- .attr = {.name = "grace_rt", .mode = S_IRUGO | S_IWUSR },
- .show = cfq_grace_rt_show,
- .store = cfq_grace_rt_store,
-};
-static struct cfq_fs_entry cfq_grace_idle_entry = {
- .attr = {.name = "grace_idle", .mode = S_IRUGO | S_IWUSR },
- .show = cfq_grace_idle_show,
- .store = cfq_grace_idle_store,
-};
-static struct cfq_fs_entry cfq_epoch_entry = {
- .attr = {.name = "epoch", .mode = S_IRUGO | S_IWUSR },
- .show = cfq_epoch_show,
- .store = cfq_epoch_store,
-};
-static struct cfq_fs_entry cfq_epochsectors_entry = {
- .attr = {.name = "epochsectors", .mode = S_IRUGO | S_IWUSR },
- .show = cfq_epochsectors_show,
- .store = cfq_epochsectors_store,
-};
-
-#define P_0_STR "p0"
-#define P_1_STR "p1"
-#define P_2_STR "p2"
-#define P_3_STR "p3"
-#define P_4_STR "p4"
-#define P_5_STR "p5"
-#define P_6_STR "p6"
-#define P_7_STR "p7"
-#define P_8_STR "p8"
-#define P_9_STR "p9"
-#define P_10_STR "p10"
-#define P_11_STR "p11"
-#define P_12_STR "p12"
-#define P_13_STR "p13"
-#define P_14_STR "p14"
-#define P_15_STR "p15"
-#define P_16_STR "p16"
-#define P_17_STR "p17"
-#define P_18_STR "p18"
-#define P_19_STR "p19"
-#define P_20_STR "p20"
-
-
-#define CFQ_PRIO_SYSFS_ENTRY(__PRIOLVL) \
-static struct cfq_fs_entry cfq_prio_##__PRIOLVL##_entry = { \
- .attr = {.name = P_##__PRIOLVL##_STR, .mode = S_IRUGO | S_IWUSR }, \
- .show = cfq_prio_##__PRIOLVL##_show, \
- .store = cfq_prio_##__PRIOLVL##_store, \
-};
-CFQ_PRIO_SYSFS_ENTRY(0);
-CFQ_PRIO_SYSFS_ENTRY(1);
-CFQ_PRIO_SYSFS_ENTRY(2);
-CFQ_PRIO_SYSFS_ENTRY(3);
-CFQ_PRIO_SYSFS_ENTRY(4);
-CFQ_PRIO_SYSFS_ENTRY(5);
-CFQ_PRIO_SYSFS_ENTRY(6);
-CFQ_PRIO_SYSFS_ENTRY(7);
-CFQ_PRIO_SYSFS_ENTRY(8);
-CFQ_PRIO_SYSFS_ENTRY(9);
-CFQ_PRIO_SYSFS_ENTRY(10);
-CFQ_PRIO_SYSFS_ENTRY(11);
-CFQ_PRIO_SYSFS_ENTRY(12);
-CFQ_PRIO_SYSFS_ENTRY(13);
-CFQ_PRIO_SYSFS_ENTRY(14);
-CFQ_PRIO_SYSFS_ENTRY(15);
-CFQ_PRIO_SYSFS_ENTRY(16);
-CFQ_PRIO_SYSFS_ENTRY(17);
-CFQ_PRIO_SYSFS_ENTRY(18);
-CFQ_PRIO_SYSFS_ENTRY(19);
-CFQ_PRIO_SYSFS_ENTRY(20);
-#undef CFQ_PRIO_SYSFS_ENTRY
static struct attribute *default_attrs[] = {
&cfq_quantum_entry.attr,
- &cfq_quantum_io_entry.attr,
- &cfq_idle_quantum_entry.attr,
- &cfq_idle_quantum_io_entry.attr,
&cfq_queued_entry.attr,
- &cfq_grace_rt_entry.attr,
- &cfq_grace_idle_entry.attr,
- &cfq_epoch_entry.attr,
- &cfq_epochsectors_entry.attr,
- &cfq_prio_0_entry.attr,
- &cfq_prio_1_entry.attr,
- &cfq_prio_2_entry.attr,
- &cfq_prio_3_entry.attr,
- &cfq_prio_4_entry.attr,
- &cfq_prio_5_entry.attr,
- &cfq_prio_6_entry.attr,
- &cfq_prio_7_entry.attr,
- &cfq_prio_8_entry.attr,
- &cfq_prio_9_entry.attr,
- &cfq_prio_10_entry.attr,
- &cfq_prio_11_entry.attr,
- &cfq_prio_12_entry.attr,
- &cfq_prio_13_entry.attr,
- &cfq_prio_14_entry.attr,
- &cfq_prio_15_entry.attr,
- &cfq_prio_16_entry.attr,
- &cfq_prio_17_entry.attr,
- &cfq_prio_18_entry.attr,
- &cfq_prio_19_entry.attr,
- &cfq_prio_20_entry.attr,
NULL,
};
.elevator_set_req_fn = cfq_set_request,
.elevator_put_req_fn = cfq_put_request,
.elevator_may_queue_fn = cfq_may_queue,
- .elevator_set_congested_fn = cfq_queue_congested,
.elevator_init_fn = cfq_init,
.elevator_exit_fn = cfq_exit,
};
e->elevator_put_req_fn(q, rq);
}
-void elv_set_congested(request_queue_t *q)
-{
- elevator_t *e = &q->elevator;
-
- if (e->elevator_set_congested_fn)
- e->elevator_set_congested_fn(q);
-}
-
int elv_may_queue(request_queue_t *q, int rw)
{
elevator_t *e = &q->elevator;
if (e->elevator_may_queue_fn)
return e->elevator_may_queue_fn(q, rw);
- return 1;
+ return 0;
}
void elv_completed_request(request_queue_t *q, struct request *rq)
struct io_context *ioc = get_io_context(gfp_mask);
spin_lock_irq(q->queue_lock);
-
- if (!elv_may_queue(q, rw))
- goto out_lock;
-
if (rl->count[rw]+1 >= q->nr_requests) {
/*
* The queue will fill after this allocation, so set it as
}
}
- /*
- * The queue is full and the allocating process is not a
- * "batcher", and not exempted by the IO scheduler
- */
- if (blk_queue_full(q, rw) && !ioc_batching(ioc))
- goto out_lock;
+ if (blk_queue_full(q, rw)
+ && !ioc_batching(ioc) && !elv_may_queue(q, rw)) {
+ /*
+ * The queue is full and the allocating process is not a
+ * "batcher", and not exempted by the IO scheduler
+ */
+ spin_unlock_irq(q->queue_lock);
+ goto out;
+ }
rl->count[rw]++;
if (rl->count[rw] >= queue_congestion_on_threshold(q))
*/
spin_lock_irq(q->queue_lock);
freed_request(q, rw);
- goto out_lock;
+ spin_unlock_irq(q->queue_lock);
+ goto out;
}
if (ioc_batching(ioc))
out:
put_io_context(ioc);
return rq;
-out_lock:
- if (!rq)
- elv_set_congested(q);
- spin_unlock_irq(q->queue_lock);
- goto out;
}
/*
kobject_put(&disk->kobj);
}
}
-
-asmlinkage int sys_ioprio_set(int ioprio)
-{
- if (ioprio < IOPRIO_IDLE || ioprio > IOPRIO_RT)
- return -EINVAL;
- if (ioprio == IOPRIO_RT && !capable(CAP_SYS_ADMIN))
- return -EACCES;
-
- printk("%s: set ioprio %d\n", current->comm, ioprio);
- current->ioprio = ioprio;
- return 0;
-}
-
-asmlinkage int sys_ioprio_get(void)
-{
- return current->ioprio;
-}
-
#define __NR_mq_notify (__NR_mq_open+4)
#define __NR_mq_getsetattr (__NR_mq_open+5)
#define __NR_sys_kexec_load 283
-#define __NR_ioprio_set 284
-#define __NR_ioprio_get 285
-#define NR_syscalls 286
+#define NR_syscalls 284
/* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
#define __NR_mq_notify 266
#define __NR_mq_getsetattr 267
#define __NR_kexec_load 268
-#define __NR_ioprio_set 269
-#define __NR_ioprio_get 270
-#define __NR_syscalls 271
+#define __NR_syscalls 269
#define __NR(n) #n
__SYSCALL(__NR_mq_getsetattr, sys_mq_getsetattr)
#define __NR_kexec_load 246
__SYSCALL(__NR_kexec_load, sys_ni_syscall)
-#define __NR_ioprio_set 247
-__SYSCALL(__NR_ioprio_set, sys_ioprio_set);
-#define __NR_ioprio_get 248
-__SYSCALL(__NR_ioprio_get, sys_ioprio_get);
-#define __NR_syscall_max __NR_ioprio_get
+#define __NR_syscall_max __NR_kexec_load
#ifndef __NO_STUBS
/* user-visible error numbers are in the range -1 - -4095 */
*
* initialized to be 0
* a class can't accumulate more than SAVING_THRESHOLD of savings
- * savings are kept in normalized form (like cvt)
- * so when task share change the savings should be scaled accordingly
*/
unsigned long long savings;
#define CPU_DEMAND_INIT 3
/*functions exported by ckrm_cpu_monitor.c*/
-void ckrm_cpu_monitor(void);
+void ckrm_cpu_monitor(int check_min);
int ckrm_cpu_monitor_init(void);
void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat);
void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len);
*
* CLASS_QUANTIZER:
*
- * A class with 5% share, can execute 50M nsecs / per sec ~ 2^28.
+ * A class with 50% share, can execute 500 ms / per sec ~ 2^29 ns.
* It's share will be set to 512 = 2^9. The globl CLASSQUEUE_SIZE is set to 2^7.
* With CLASS_QUANTIZER=16, the local_cvt of this class will increase
- * by 2^28/2^9 = 2^19 = 512K.
- * Setting CLASS_QUANTIZER to 16, 2^(19-16) = 8 slots / per second.
- * A class with 5% shares, will cover 80 slots / per second.
+ * by 2^29/2^9 = 2^20 = 1024K.
+ * Setting CLASS_QUANTIZER to 16, 2^(20-16) = 16 slots / per second.
+ * By the same math, a class with any share value will cover 16 slots per second.
+ * So 2^7 total slots is enough to track 8 seconds of system execution
*
* PRIORITY_QUANTIZER:
*
* How much can top priorities of class impact slot bonus.
- * There are 40 nice priorities. "2" will allow upto 10 slots improvement
- * in the RQ thus for 50% class it can perform ~1sec starvation.
+ * There are 40 nice priorities, range from -20 to 19, with default nice = 0
+ * "2" will allow up to 5 slots of improvement
+ * when certain task within the class has a nice value of -20
+ * in the RQ thus for 50% class it can perform ~300 msec starvation.
*
*******************************************************************/
/*
* to improve system responsiveness
* an inactive class is put a little bit ahead of the current class when it wakes up
- * the amount is set in normalized termis to simplify the calculation
+ * the amount is set in normalized term to simplify the calculation
* for class with 100% share, it can be 2s ahead
* while for class with 10% share, it can be 200ms ahead
*/
typedef struct request *(elevator_request_list_fn) (request_queue_t *, struct request *);
typedef void (elevator_completed_req_fn) (request_queue_t *, struct request *);
typedef int (elevator_may_queue_fn) (request_queue_t *, int);
-typedef void (elevator_set_congested_fn) (request_queue_t *);
typedef int (elevator_set_req_fn) (request_queue_t *, struct request *, int);
typedef void (elevator_put_req_fn) (request_queue_t *, struct request *);
elevator_put_req_fn *elevator_put_req_fn;
elevator_may_queue_fn *elevator_may_queue_fn;
- elevator_set_congested_fn *elevator_set_congested_fn;
elevator_init_fn *elevator_init_fn;
elevator_exit_fn *elevator_exit_fn;
extern int elv_register_queue(request_queue_t *q);
extern void elv_unregister_queue(request_queue_t *q);
extern int elv_may_queue(request_queue_t *, int);
-extern void elv_set_congested(request_queue_t *);
extern void elv_completed_request(request_queue_t *, struct request *);
extern int elv_set_request(request_queue_t *, struct request *, int);
extern void elv_put_request(request_queue_t *, struct request *);
#define ELEVATOR_INSERT_BACK 2
#define ELEVATOR_INSERT_SORT 3
-#define RQ_ELV_DATA(rq) (rq)->elevator_private
-
#endif
{ }
#endif /* CONFIG_SECURITY */
-/* io priorities */
-
-#define IOPRIO_NR 21
-
-#define IOPRIO_IDLE 0
-#define IOPRIO_NORM 10
-#define IOPRIO_RT 20
-
-asmlinkage int sys_ioprio_set(int ioprio);
-asmlinkage int sys_ioprio_get(void);
-
-
#endif /* __KERNEL__ */
#endif /* _LINUX_FS_H */
.proc_lock = SPIN_LOCK_UNLOCKED, \
.switch_lock = SPIN_LOCK_UNLOCKED, \
.journal_info = NULL, \
- .ioprio = IOPRIO_NORM, \
}
struct audit_context; /* See audit.c */
struct mempolicy;
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+/**
+ * ckrm_cpu_demand_stat - used to track the cpu demand of a task/class
+ * @run: how much time it has been running since the counter started
+ * @total: total time since the counter started
+ * @last_sleep: the last time it sleeps, last_sleep = 0 when not sleeping
+ * @recalc_interval: how often do we recalculate the cpu_demand
+ * @cpu_demand: moving average of run/total
+ */
+struct ckrm_cpu_demand_stat {
+ unsigned long long run;
+ unsigned long long total;
+ unsigned long long last_sleep;
+ unsigned long long recalc_interval;
+ unsigned long cpu_demand; /*estimated cpu demand */
+};
+#endif
+
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
struct thread_info *thread_info;
/* signal handlers */
struct signal_struct *signal;
struct sighand_struct *sighand;
-
sigset_t blocked, real_blocked;
struct sigpending pending;
struct io_context *io_context;
- int ioprio;
-
unsigned long ptrace_message;
siginfo_t *last_siginfo; /* For ptrace use. */
// .. Hubertus should change to CONFIG_CKRM_TYPE_TASKCLASS
struct ckrm_task_class *taskclass;
struct list_head taskclass_link;
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+ struct ckrm_cpu_class *cpu_class;
+ //track cpu demand of this task
+ struct ckrm_cpu_demand_stat demand_stat;
+#endif //CONFIG_CKRM_CPU_SCHEDULE
#endif // CONFIG_CKRM_TYPE_TASKCLASS
#endif // CONFIG_CKRM
}
#endif
+
/*
* Routines for handling mm_structs
*/
return mm;
}
-
/* set thread flags in other task's structures
* - see asm/thread_info.h for TIF_xxxx flags available
*/
Say N if unsure, Y to use the feature.
-config CKRM_RES_BLKIO
- tristate " Disk I/O Resource Controller"
- depends on CKRM_TYPE_TASKCLASS && IOSCHED_CFQ
- default m
+config CKRM_CPU_SCHEDULE
+ bool "CKRM CPU scheduler"
+ depends on CKRM_TYPE_TASKCLASS
+ default y
help
- Provides a resource controller for best-effort block I/O
- bandwidth control. The controller attempts this by proportional
- servicing of requests in the I/O scheduler. However, seek
- optimizations and reordering by device drivers/disk controllers may
- alter the actual bandwidth delivered to a class.
+ Use CKRM CPU scheduler instead of Linux Scheduler
Say N if unsure, Y to use the feature.
#include <asm/setup.h>
#include <linux/ckrm.h>
+#include <linux/ckrm_sched.h>
/*
* This is one of the first .c files built. Error out early
do_basic_setup();
+ init_ckrm_sched_res();
/*
* check if there is an early userspace init. If yes, let it do all
* the work
obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_IKCONFIG_PROC) += configs.o
obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
+obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_classqueue.o ckrm_sched.o
obj-$(CONFIG_AUDIT) += audit.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
obj-$(CONFIG_CKRM_RES_NUMTASKS) += ckrm_numtasks.o
obj-$(CONFIG_CKRM_TYPE_SOCKETCLASS) += ckrm_sockc.o
obj-$(CONFIG_CKRM_RES_LISTENAQ) += ckrm_laq.o
+ obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_cpu_class.o ckrm_cpu_monitor.o
write_unlock(&class_list_lock);
kfree(cls);
+
+ //call ckrm_cpu_monitor after class removed
+ ckrm_cpu_monitor(0);
}
/*
if (cls->parent) {
spin_unlock(&parres->cnt_lock);
}
+
+ //call ckrm_cpu_monitor after the changes are applied
+ ckrm_cpu_monitor(0);
+
return rc;
}
);
for_each_online_cpu(i) {
lrq = get_ckrm_lrq(cls,i);
- seq_printf(sfile, "\tlrq %d demand= %lu weight= %d lrq_load= %lu cvt= %llu sav=%lu\n",i,stat->local_stats[i].cpu_demand,local_class_weight(lrq),lrq->lrq_load,lrq->local_cvt,lrq->savings);
+ seq_printf(sfile, "\tlrq %d demand= %lu weight= %d lrq_load= %lu cvt= %llu sav= %llu\n",i,stat->local_stats[i].cpu_demand,local_class_weight(lrq),lrq->lrq_load,lrq->local_cvt,lrq->savings);
}
seq_printf(sfile, "-------- CPU Class Status END ---------\n");
c_cls->stat.ehl *
get_myhard_limit(c_cls) / c_cls->shares.total_guarantee;
+ set_eshare(&c_cls->stat,c_cls->stat.egrt);
+ set_meshare(&c_cls->stat,c_cls->stat.megrt);
+
+
child_core = ckrm_get_next_child(parent, child_core);
};
return 0;
/ cls->shares.total_guarantee;
cls->stat.mehl = cls->stat.ehl * get_myhard_limit(cls)
/ cls->shares.total_guarantee;
-
+ set_eshare(&cls->stat,cls->stat.egrt);
+ set_meshare(&cls->stat,cls->stat.megrt);
+
repeat:
//check exit
if (!cur_core)
return 0;
- //visit this node
- if (update_child_effective(cur_core) < 0)
- return ret; //invalid cur_core node
+ //visit this node only once
+ if (! child_core)
+ if (update_child_effective(cur_core) < 0)
+ return ret; //invalid cur_core node
//next child
child_core = ckrm_get_next_child(cur_core, child_core);
}
/**
- * node_surplus_consume: consume the surplus
- * @ckeck_sl: if check_sl is set, then check soft_limit
- * @total_grt: total guarantee
+ * consume_surplus: decides how much surplus a node can consume
+ * @check_sl: if check_sl is set, then check soft_limit
* return how much consumed
- * return -1 on error
*
* implements all the CKRM Scheduling Requirement
- * update total_grt if necessary
+ * assume c_cls is valid
*/
-static inline int node_surplus_consume(int surplus,
- struct ckrm_core_class *child_core,
+static inline int consume_surplus(int surplus,
+ struct ckrm_cpu_class *c_cls,
struct ckrm_cpu_class *p_cls,
int check_sl
)
{
int consumed = 0;
int inc_limit;
- int glut = 1;
-
- struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core);
int total_grt = p_cls->shares.total_guarantee;
BUG_ON(surplus < 0);
- if (! c_cls || ! total_grt)
- goto out;
-
/*can't consume more than demand or hard limit*/
if (c_cls->stat.eshare >= c_cls->stat.max_demand)
goto out;
+ //the surplus allocation is proportional to grt
consumed =
surplus * c_cls->shares.my_guarantee / total_grt;
if (check_sl) {
int esl = p_cls->stat.eshare * get_soft_limit(c_cls)
- /p_cls->shares.total_guarantee;
+ /total_grt;
if (esl < c_cls->stat.max_demand)
inc_limit = esl - c_cls->stat.eshare;
}
-
if (consumed > inc_limit)
consumed = inc_limit;
- else
- glut = 0;
BUG_ON(consumed < 0);
- set_eshare(&c_cls->stat,c_cls->stat.eshare + consumed);
- BUG_ON(c_cls->stat.eshare < 0);
+ out:
+ return consumed;
+}
+
+/*
+ * consume_self_surplus: how much of the surplus a node can consume for itself
+ * @surplus: surplus on offer in this round, must not be negative
+ * @p_cls: the node whose own portion (stat.meshare) may grow
+ * @check_sl: if set, also cap the growth using get_mysoft_limit()
+ *
+ * returns the amount consumed, 0 when nothing can be taken
+ */
+static inline int consume_self_surplus(int surplus,
+ struct ckrm_cpu_class *p_cls,
+ int check_sl
+ )
+{
+ int consumed = 0;
+ int inc_limit;
+ int total_grt = p_cls->shares.total_guarantee;
+ int max_demand = get_mmax_demand(&p_cls->stat);
+
+ BUG_ON(surplus < 0);
+ /*can't consume more than demand or hard limit*/
+ if (p_cls->stat.meshare >= max_demand)
+ goto out;
+
+ //the surplus allocation is proportional to unused_guarantee / total_grt
+ consumed =
+ surplus * p_cls->shares.unused_guarantee / total_grt;
+
+ if (! consumed) //no more share
+ goto out;
+
+ //default cap: the room left between meshare and max_demand
+ inc_limit = max_demand - p_cls->stat.meshare;
+
+ if (check_sl) {
+ int mesl = p_cls->stat.eshare * get_mysoft_limit(p_cls)
+ /total_grt;
+ if (mesl < max_demand) //the soft limit is the tighter cap
+ inc_limit = mesl - p_cls->stat.meshare;
+ }
+
+ if (consumed > inc_limit)
+ consumed = inc_limit;
+
+ BUG_ON(consumed < 0);
out:
return consumed;
}
+
+/*
+ * alloc_surplus_single_round: one pass of surplus distribution
+ * the node's own default class is served first, then every child of @parent
+ * returns the total consumed in this pass, or -1 if a child has no cpu class
+ */
+static int alloc_surplus_single_round(
+	int surplus,
+	struct ckrm_core_class *parent,
+	struct ckrm_cpu_class *p_cls,
+	int check_sl)
+{
+	struct ckrm_core_class *child;
+	struct ckrm_cpu_class *child_cls;
+	int taken;
+	int round_total = 0;
+
+	//the default class goes first
+	taken = consume_self_surplus(surplus,p_cls,check_sl);
+	if (taken > 0) {
+		set_meshare(&p_cls->stat,p_cls->stat.meshare + taken);
+		round_total += taken;
+	}
+
+	//then each child in turn
+	for (child = ckrm_get_next_child(parent, NULL); child;
+	     child = ckrm_get_next_child(parent, child)) {
+		child_cls = ckrm_get_cpu_class(child);
+		if (! child_cls)
+			return -1;
+
+		taken = consume_surplus(surplus, child_cls, p_cls, check_sl);
+		if (taken > 0) {
+			set_eshare(&child_cls->stat,child_cls->stat.eshare + taken);
+			round_total += taken;
+		}
+	}
+
+	return round_total;
+}
+
+
/**
* alloc_surplus_node: re-allocate the shares for children under parent
* @parent: parent node
*/
static int alloc_surplus_node(struct ckrm_core_class *parent)
{
- int total_surplus , old_surplus;
- struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent);
- struct ckrm_core_class *child_core = NULL;
- int self_share;
+ struct ckrm_cpu_class *p_cls,*c_cls;
+ int total_surplus,consumed;
int check_sl;
int ret = -1;
+ struct ckrm_core_class *child_core = NULL;
+ p_cls = ckrm_get_cpu_class(parent);
if (! p_cls)
- return ret;
-
- total_surplus = get_my_node_surplus(p_cls);
+ goto realloc_out;
/*
- * initialize effective_share
+ * get total surplus
*/
+ total_surplus = p_cls->stat.eshare - p_cls->stat.egrt;
+ BUG_ON(total_surplus < 0);
+ total_surplus += get_my_node_surplus(p_cls);
+
do {
child_core = ckrm_get_next_child(parent, child_core);
if (child_core) {
- struct ckrm_cpu_class *c_cls;
-
c_cls = ckrm_get_cpu_class(child_core);
if (! c_cls)
- return ret;
+ goto realloc_out;
total_surplus += get_node_surplus(c_cls);
-
- set_eshare(&c_cls->stat, c_cls->stat.egrt);
}
} while (child_core);
- if (! total_surplus)
+
+ if (! total_surplus) {
+ ret = 0;
goto realloc_out;
+ }
- /* distribute the surplus */
- child_core = NULL;
+ /*
+ * distributing the surplus
+ * first with the check_sl enabled
+ * once a round consumes nothing (all the classes have reached the soft limit),
+ * disable check_sl and try again
+ * stop when the surplus is used up, or when a round without check_sl
+ * consumes nothing either
+ */
+
check_sl = 1;
- old_surplus = 0;
do {
- if (!child_core) {//start a new round
+ consumed = alloc_surplus_single_round(total_surplus,parent,p_cls,check_sl);
+ if (consumed < 0) //something is wrong
+ goto realloc_out;
- //ok, everybody reached the soft limit
- if (old_surplus == total_surplus)
- check_sl = 0;
- old_surplus = total_surplus;
- }
+ if (consumed)
+ total_surplus -= consumed;
+ else if (check_sl)
+ check_sl = 0; //nothing fits under the soft limits: retry without them
+ else
+ break; //nothing consumed even without the soft limits
- child_core = ckrm_get_next_child(parent, child_core);
- if (child_core) {
- int consumed = 0;
- consumed -=
- node_surplus_consume(old_surplus, child_core,
- p_cls,check_sl);
- if (consumed >= 0)
- total_surplus -= consumed;
- else
- return ret;
- }
- //start a new round if something is allocated in the last round
- } while (child_core || check_sl || total_surplus != old_surplus);
+ } while (total_surplus > 0);
- realloc_out:
- /*how much for itself*/
- self_share = p_cls->stat.eshare *
- p_cls->shares.unused_guarantee / p_cls->shares.total_guarantee;
-
- if (self_share < p_cls->stat.max_demand) {
- /*any remaining surplus goes to the default class*/
- self_share += total_surplus;
- if (self_share > p_cls->stat.max_demand)
- self_share = p_cls->stat.max_demand;
- }
+ ret = 0;
- set_meshare(&p_cls->stat, self_share);
- return 0;
+ realloc_out:
+ return ret;
}
/**
static int alloc_surplus(struct ckrm_core_class *root_core)
{
struct ckrm_core_class *cur_core, *child_core;
- struct ckrm_cpu_class *cls;
+ // struct ckrm_cpu_class *cls;
int ret = -1;
/*initialize*/
cur_core = root_core;
child_core = NULL;
- cls = ckrm_get_cpu_class(cur_core);
-
- //set root eshare
- set_eshare(&cls->stat, cls->stat.egrt);
+ // cls = ckrm_get_cpu_class(cur_core);
/*the ckrm idle tasks get all what's remaining*/
/*hzheng: uncomment the following like for hard limit support */
// update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand);
- repeat:
+ repeat:
//check exit
if (!cur_core)
return 0;
- //visit this node
- if ( alloc_surplus_node(cur_core) < 0 )
- return ret;
+ //visit this node only once
+ if (! child_core)
+ if ( alloc_surplus_node(cur_core) < 0 )
+ return ret;
//next child
child_core = ckrm_get_next_child(cur_core, child_core);
/*similar to cpu_idle */
while (1) {
while (!need_resched()) {
- ckrm_cpu_monitor();
+ ckrm_cpu_monitor(1);
if (current_cpu_data.hlt_works_ok) {
local_irq_disable();
if (!need_resched()) {
/**********************************************/
/**
*ckrm_cpu_monitor - adjust relative shares of the classes based on their progress
+ *@check_min: if check_min is set, the call can't be within 100ms of last call
*
* this function is called every CPU_MONITOR_INTERVAL
* it computes the cpu demand of each class
* and re-allocate the un-used shares to other classes
*/
-void ckrm_cpu_monitor(void)
+void ckrm_cpu_monitor(int check_min)
{
static spinlock_t lock = SPIN_LOCK_UNLOCKED;
static unsigned long long last_check = 0;
now = sched_clock();
//consecutive check should be at least 100ms apart
- if (now - last_check < MIN_CPU_MONITOR_INTERVAL) {
+ if (check_min && (now - last_check < MIN_CPU_MONITOR_INTERVAL))
goto outunlock;
- }
+
last_check = now;
if (update_effectives(root_core) != 0)
/*sleep for sometime before next try*/
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(CPU_MONITOR_INTERVAL);
- ckrm_cpu_monitor();
+ ckrm_cpu_monitor(1);
if (thread_exit) {
break;
}
void ckrm_kill_monitor(void)
{
- int interval = HZ;
-
printk("killing process %d\n", cpu_monitor_pid);
if (cpu_monitor_pid > 0) {
thread_exit = 1;
lrq->savings -= savings_used;
unscale_cvt(savings_used,lrq);
BUG_ON(lrq->local_cvt < savings_used);
- // lrq->local_cvt -= savings_used;
+ lrq->local_cvt -= savings_used;
}
}
} else
link_pid(p, p->pids + PIDTYPE_TGID, &p->group_leader->pids[PIDTYPE_TGID].pid);
- p->ioprio = current->ioprio;
nr_threads++;
write_unlock_irq(&tasklist_lock);
retval = 0;
* 2003-09-03 Interactivity tuning by Con Kolivas.
* 2004-04-02 Scheduler domains code by Nick Piggin
*/
-
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#define LOW_CREDIT(p) \
((p)->interactive_credit < -CREDIT_LIMIT)
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+/*
+ * if belong to different class, compare class priority
+ * otherwise compare task priority
+ *
+ * class priority is only compared when neither p nor the running task
+ * is the idle task of rq
+ *
+ * the whole expansion is parenthesized so the macro stays one expression
+ * when the caller combines it with !, && or ||
+ */
+#define TASK_PREEMPTS_CURR(p, rq) \
+ (( ((p)->cpu_class != (rq)->curr->cpu_class) \
+ && ((rq)->curr != (rq)->idle) && ((p) != (rq)->idle )) \
+ ? class_preempts_curr((p),(rq)->curr) \
+ : ((p)->prio < (rq)->curr->prio))
+#else
#define TASK_PREEMPTS_CURR(p, rq) \
((p)->prio < (rq)->curr->prio)
+#endif
/*
* BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
((MAX_TIMESLICE - MIN_TIMESLICE) * \
(MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1)))
-static unsigned int task_timeslice(task_t *p)
+unsigned int task_timeslice(task_t *p)
{
return BASE_TIMESLICE(p);
}
* These are the runqueue data structures:
*/
-#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
-
typedef struct runqueue runqueue_t;
-
-struct prio_array {
- unsigned int nr_active;
- unsigned long bitmap[BITMAP_SIZE];
- struct list_head queue[MAX_PRIO];
-};
+#include <linux/ckrm_classqueue.h>
+#include <linux/ckrm_sched.h>
/*
* This is the main, per-CPU runqueue data structure.
unsigned long long timestamp_last_tick;
task_t *curr, *idle;
struct mm_struct *prev_mm;
- prio_array_t *active, *expired, arrays[2];
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+ struct classqueue_struct classqueue;
+ ckrm_load_t ckrm_load;
+#else
+ prio_array_t *active, *expired, arrays[2];
+#endif
int best_expired_prio;
atomic_t nr_iowait;
spin_unlock_irq(&rq->lock);
}
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+/*
+ * rq_get_next_class - local runqueue of the class at the head of rq's classqueue
+ * returns NULL when classqueue_get_head() yields no node
+ */
+static inline ckrm_lrq_t *rq_get_next_class(struct runqueue *rq)
+{
+	cq_node_t *head = classqueue_get_head(&rq->classqueue);
+
+	if (!head)
+		return NULL;
+	return class_list_entry(head);
+}
+
+/*
+ * get_local_cur_cvt - local_cvt of the class at the head of this cpu's classqueue
+ * returns 0 when the classqueue has no class
+ * the caller must pass an online cpu (cpu_online(cpu) == 1)
+ */
+CVT_t get_local_cur_cvt(int cpu)
+{
+	ckrm_lrq_t *head_lrq = rq_get_next_class(cpu_rq(cpu));
+
+	return head_lrq ? head_lrq->local_cvt : 0;
+}
+
+/*
+ * rq_get_next_task - pick the next task from the class at the head of the classqueue
+ * returns rq->idle when no class is queued
+ */
+static inline struct task_struct * rq_get_next_task(struct runqueue* rq)
+{
+	int this_cpu = smp_processor_id();
+	ckrm_lrq_t *lrq;
+	prio_array_t *cur;
+
+	for (;;) {
+		lrq = rq_get_next_class(rq);
+		if (!lrq)
+			return rq->idle;
+
+		cur = lrq->active;
+		if (likely(cur->nr_active))
+			break;
+
+		//active array is empty: swap it with the expired one
+		lrq->active = lrq->expired;
+		lrq->expired = cur;
+		lrq->expired_timestamp = 0;
+
+		if (lrq->active->nr_active) {
+			set_top_priority(lrq,
+					 find_first_bit(lrq->active->bitmap, MAX_PRIO));
+		} else {
+			//both arrays are empty: take the class off the classqueue
+			classqueue_dequeue(lrq->classqueue,
+					   &lrq->classqueue_linkobj);
+			cpu_demand_event(get_rq_local_stat(lrq,this_cpu),CPU_DEMAND_DEQUEUE,0);
+		}
+		//then look at the head of the classqueue again
+	}
+
+	BUG_ON(!cur->nr_active);
+	BUG_ON(lrq->top_priority == MAX_PRIO);
+
+	return task_list_entry(cur->queue[lrq->top_priority].next);
+}
+#else /*! CONFIG_CKRM_CPU_SCHEDULE*/
+/*
+ * rq_get_next_task - first task of the highest-priority non-empty list
+ * in the active array; the arrays are swapped first if active is empty
+ */
+static inline struct task_struct * rq_get_next_task(struct runqueue* rq)
+{
+	prio_array_t *cur = rq->active;
+	int first;
+
+	if (unlikely(!cur->nr_active)) {
+		/*
+		 * Switch the active and expired arrays.
+		 */
+		prio_array_t *drained = cur;
+
+		cur = rq->expired;
+		rq->active = cur;
+		rq->expired = drained;
+		rq->expired_timestamp = 0;
+	}
+
+	first = sched_find_first_bit(cur->bitmap);
+	return list_entry(cur->queue[first].next, task_t, run_list);
+}
+
+static inline void class_enqueue_task(struct task_struct* p, prio_array_t *array) { }
+static inline void class_dequeue_task(struct task_struct* p, prio_array_t *array) { }
+static inline void init_cpu_classes(void) { }
+#define rq_ckrm_load(rq) NULL
+static inline void ckrm_sched_tick(int j,int this_cpu,void* name) {}
+#endif /* CONFIG_CKRM_CPU_SCHEDULE */
+
/*
* Adding/removing a task to/from a priority array:
*/
static void dequeue_task(struct task_struct *p, prio_array_t *array)
{
+ /* trap callers that pass a NULL array before it is dereferenced */
+ BUG_ON(! array);
array->nr_active--;
list_del(&p->run_list);
if (list_empty(array->queue + p->prio))
__clear_bit(p->prio, array->bitmap);
+ /* class-level hook; an empty stub when CONFIG_CKRM_CPU_SCHEDULE is off */
+ class_dequeue_task(p,array);
}
static void enqueue_task(struct task_struct *p, prio_array_t *array)
__set_bit(p->prio, array->bitmap);
array->nr_active++;
p->array = array;
+ class_enqueue_task(p,array);
}
/*
__set_bit(p->prio, array->bitmap);
array->nr_active++;
p->array = array;
+ class_enqueue_task(p,array);
}
/*
*/
static inline void __activate_task(task_t *p, runqueue_t *rq)
{
- enqueue_task(p, rq->active);
+ enqueue_task(p, rq_active(p,rq));
rq->nr_running++;
}
*/
static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
{
- enqueue_task_head(p, rq->active);
+ enqueue_task_head(p, rq_active(p,rq));
rq->nr_running++;
}
INIT_LIST_HEAD(&p->run_list);
p->array = NULL;
spin_lock_init(&p->switch_lock);
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+ cpu_demand_event(&p->demand_stat,CPU_DEMAND_INIT,0);
+#endif
+
#ifdef CONFIG_PREEMPT
/*
* During context-switch we hold precisely one spinlock, which
p->array = current->array;
p->array->nr_active++;
rq->nr_running++;
+ class_enqueue_task(p,p->array);
}
task_rq_unlock(rq, &flags);
}
p->array = current->array;
p->array->nr_active++;
rq->nr_running++;
+ class_enqueue_task(p,p->array);
}
} else {
/* Not the local CPU - must adjust timestamp */
return 1;
}
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+/*
+ * ckrm_preferred_task - should @tmp be pulled given the allowed load window?
+ *
+ * A task whose load exceeds @max is never preferred. A task whose load is
+ * at or below @min is rejected only on the first pass (@phase == 0) of a
+ * NOT_IDLE balance. Returns 1 if the task may be pulled, 0 otherwise.
+ */
+static inline int ckrm_preferred_task(task_t *tmp,long min, long max,
+		int phase, enum idle_type idle)
+{
+	long load = task_load(tmp);
+	/* the lower bound only applies to the first pass on a busy CPU */
+	int enforce_min = (idle == NOT_IDLE) && !phase;
+
+	if (load > max)
+		return 0;
+
+	return !(enforce_min && load <= min);
+}
+
+/*
+ * move tasks for a specific local class
+ * return number of tasks pulled
+ *
+ * Tasks are taken from src_lrq (on busiest) and queued on the matching
+ * array of dst_lrq (on this_rq) via pull_task().
+ *
+ * pressure_imbalance is in/out: it is reduced by task_load() of every
+ * task pulled, and the scan stops once it drops to 10% of its initial
+ * value or below.
+ */
+static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq,
+ runqueue_t *this_rq,
+ runqueue_t *busiest,
+ struct sched_domain *sd,
+ int this_cpu,
+ enum idle_type idle,
+ long* pressure_imbalance)
+{
+ prio_array_t *array, *dst_array;
+ struct list_head *head, *curr;
+ task_t *tmp;
+ int idx;
+ int pulled = 0;
+ int phase = -1;
+ long pressure_min, pressure_max;
+ /*hzheng: magic : 90% balance is enough*/
+ long balance_min = *pressure_imbalance / 10;
+/*
+ * we don't want to migrate tasks that will reverse the balance
+ * or the tasks that make too small difference
+ */
+#define CKRM_BALANCE_MAX_RATIO 100
+#define CKRM_BALANCE_MIN_RATIO 1
+ /* phase is 0 on the first pass and 1 on the single retry pass */
+ start:
+ phase ++;
+ /*
+ * We first consider expired tasks. Those will likely not be
+ * executed in the near future, and they are most likely to
+ * be cache-cold, thus switching CPUs has the least effect
+ * on them.
+ */
+ if (src_lrq->expired->nr_active) {
+ array = src_lrq->expired;
+ dst_array = dst_lrq->expired;
+ } else {
+ array = src_lrq->active;
+ dst_array = dst_lrq->active;
+ }
+
+ new_array:
+ /* Start searching at priority 0: */
+ idx = 0;
+ skip_bitmap:
+ if (!idx)
+ idx = sched_find_first_bit(array->bitmap);
+ else
+ idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
+ if (idx >= MAX_PRIO) {
+ /* expired array exhausted: continue with the active array */
+ if (array == src_lrq->expired && src_lrq->active->nr_active) {
+ array = src_lrq->active;
+ dst_array = dst_lrq->active;
+ goto new_array;
+ }
+ /*
+ * Nothing pulled on the first pass: scan once more unless idle is
+ * IDLE. phase is then non-zero, which disables the minimum-load
+ * test in ckrm_preferred_task().
+ */
+ if ((! phase) && (! pulled) && (idle != IDLE))
+ goto start; //try again
+ else
+ goto out; //finished search for this lrq
+ }
+
+ /* walk this priority's list from its tail towards its head */
+ head = array->queue + idx;
+ curr = head->prev;
+ skip_queue:
+ tmp = list_entry(curr, task_t, run_list);
+
+ /* step to the previous entry before tmp can be unlinked by pull_task() */
+ curr = curr->prev;
+
+ if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
+ if (curr != head)
+ goto skip_queue;
+ idx++;
+ goto skip_bitmap;
+ }
+
+ /* acceptable task load: 1% .. 100% of the remaining imbalance */
+ pressure_min = *pressure_imbalance * CKRM_BALANCE_MIN_RATIO/100;
+ pressure_max = *pressure_imbalance * CKRM_BALANCE_MAX_RATIO/100;
+ /*
+ * skip the tasks that will reverse the balance too much
+ */
+ if (ckrm_preferred_task(tmp,pressure_min,pressure_max,phase,idle)) {
+ *pressure_imbalance -= task_load(tmp);
+ pull_task(busiest, array, tmp,
+ this_rq, dst_array, this_cpu);
+ pulled++;
+
+ /* close enough to balanced for this class */
+ if (*pressure_imbalance <= balance_min)
+ goto out;
+ }
+
+ if (curr != head)
+ goto skip_queue;
+ idx++;
+ goto skip_bitmap;
+ out:
+ return pulled;
+}
+
+/*
+ * ckrm_rq_imbalance - how much pressure should move from @dst_rq to @this_rq.
+ *
+ * Returns half of the pressure difference between the two runqueues
+ * (@dst_rq minus @this_rq). Only half is targeted so that a balance
+ * pass does not overshoot and reverse the imbalance. The result may be
+ * negative.
+ */
+static inline long ckrm_rq_imbalance(runqueue_t *this_rq,runqueue_t *dst_rq)
+{
+	long pressure_gap = pid_get_pressure(rq_ckrm_load(dst_rq),0)
+		- pid_get_pressure(rq_ckrm_load(this_rq),1);
+
+	return pressure_gap / 2;
+}
+
+/*
+ * move_tasks - try to balance two runqueues, one CKRM class at a time
+ *
+ * Called with both runqueues locked.
+ * if move_tasks is called, it will try to move at least one task over
+ *
+ * @this_rq/@this_cpu: destination runqueue and its CPU
+ * @busiest: source runqueue
+ * @max_nr_move: not used by this implementation
+ * @sd, @idle: passed on to ckrm_cls_move_tasks()
+ *
+ * Starts with the running class that has the largest cpu_class_weight()
+ * on the source CPU and then walks the rest of active_cpu_classes in
+ * list order, wrapping around, until it is back at the starting class.
+ *
+ * Returns the number of tasks pulled.
+ */
+static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
+		      unsigned long max_nr_move, struct sched_domain *sd,
+		      enum idle_type idle)
+{
+	struct ckrm_cpu_class *clsptr,*vip_cls = NULL;
+	ckrm_lrq_t* src_lrq,*dst_lrq;
+	long pressure_imbalance, pressure_imbalance_old;
+	int src_cpu = task_cpu(busiest->curr);
+	struct list_head *list;
+	int pulled = 0;
+	long imbalance;
+
+	imbalance = ckrm_rq_imbalance(this_rq,busiest);
+
+	if ((idle == NOT_IDLE && imbalance <= 0) || busiest->nr_running <= 1)
+		goto out;
+
+	//try to find the vip class
+	list_for_each_entry(clsptr,&active_cpu_classes,links) {
+		src_lrq = get_ckrm_lrq(clsptr,src_cpu);
+
+		if (! lrq_nr_running(src_lrq))
+			continue;
+
+		if (! vip_cls || cpu_class_weight(vip_cls) < cpu_class_weight(clsptr) )
+		{
+			vip_cls = clsptr;
+		}
+	}
+
+	/*
+	 * do search from the most significant class
+	 * hopefully, less tasks will be migrated this way
+	 */
+	clsptr = vip_cls;
+
+ move_class:
+	if (! clsptr)
+		goto out;
+
+	src_lrq = get_ckrm_lrq(clsptr,src_cpu);
+	if (! lrq_nr_running(src_lrq))
+		goto other_class;
+
+	dst_lrq = get_ckrm_lrq(clsptr,this_cpu);
+
+	//how much pressure for this class should be transferred
+	pressure_imbalance = src_lrq->lrq_load * imbalance/src_lrq->local_weight;
+	if (pulled && ! pressure_imbalance)
+		goto other_class;
+
+	pressure_imbalance_old = pressure_imbalance;
+
+	//move tasks
+	pulled +=
+		ckrm_cls_move_tasks(src_lrq,dst_lrq,
+				    this_rq,
+				    busiest,
+				    sd,this_cpu,idle,
+				    &pressure_imbalance);
+
+	/*
+	 * stop balancing once this class's remaining imbalance is down to
+	 * 25% of what it started with (>> 2 is a tuning constant)
+	 */
+	if (pressure_imbalance <= (pressure_imbalance_old >> 2))
+		goto out;
+
+	/*
+	 * Scale the overall imbalance by the fraction that is still left.
+	 * Multiply before dividing: the remaining/original ratio is at most
+	 * 1, so dividing first would truncate it to 0 as soon as any task
+	 * had been pulled. pressure_imbalance_old is positive here, since
+	 * zero or negative values leave through the check above.
+	 */
+	imbalance = imbalance * pressure_imbalance / pressure_imbalance_old;
+ other_class:
+	//who is next?
+	list = clsptr->links.next;
+	/* skip the list head itself when wrapping around */
+	if (list == &active_cpu_classes)
+		list = list->next;
+	clsptr = list_entry(list, typeof(*clsptr), links);
+	if (clsptr != vip_cls)
+		goto move_class;
+ out:
+	return pulled;
+}
+
+/**
+ * ckrm_check_balance - is load balancing necessary?
+ * return 0 if load balancing is not necessary
+ * otherwise return the average load of the system
+ * also, store the number of groups in the domain in *nr_group
+ *
+ * heuristics:
+ * no load balancing if this group's load is over the average
+ * no load balancing if the busiest group is less than 5% above this one
+ * no load balancing if the least loaded other group is below 70% of
+ * this group's load
+ * task:
+ * read the status of all the runqueues
+ *
+ * The idle argument is not used here.
+ */
+static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu,
+ enum idle_type idle, int* nr_group)
+{
+ struct sched_group *group = sd->groups;
+ unsigned long min_load, max_load, avg_load;
+ unsigned long total_load, this_load, total_pwr;
+
+ max_load = this_load = total_load = total_pwr = 0;
+ min_load = 0xFFFFFFFF;
+ *nr_group = 0;
+
+ do {
+ cpumask_t tmp;
+ unsigned long load;
+ int local_group;
+ int i, nr_cpus = 0;
+
+ /* Tally up the load of all CPUs in the group */
+ cpus_and(tmp, group->cpumask, cpu_online_map);
+ if (unlikely(cpus_empty(tmp)))
+ goto nextgroup;
+
+ avg_load = 0;
+ local_group = cpu_isset(this_cpu, group->cpumask);
+
+ for_each_cpu_mask(i, tmp) {
+ load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),local_group);
+ nr_cpus++;
+ avg_load += load;
+ }
+
+ if (!nr_cpus)
+ goto nextgroup;
+
+ total_load += avg_load;
+ total_pwr += group->cpu_power;
+
+ /* Adjust by relative CPU power of the group */
+ avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+
+ /* the local group is excluded from the max/min bookkeeping */
+ if (local_group) {
+ this_load = avg_load;
+ goto nextgroup;
+ } else if (avg_load > max_load) {
+ max_load = avg_load;
+ }
+ if (avg_load < min_load) {
+ min_load = avg_load;
+ }
+nextgroup:
+ group = group->next;
+ /* counts every group visited, including ones with no online CPU */
+ *nr_group = *nr_group + 1;
+ } while (group != sd->groups);
+
+ if (!max_load || this_load >= max_load)
+ goto out_balanced;
+
+ avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
+
+ /* hzheng: debugging: 105 is a magic number
+ * 100*max_load <= sd->imbalance_pct*this_load)
+ * should use imbalance_pct instead
+ */
+ if (this_load > avg_load
+ || 100*max_load < 105*this_load
+ || 100*min_load < 70*this_load
+ )
+ goto out_balanced;
+
+ return avg_load;
+ out_balanced:
+ return 0;
+}
+
+/**
+ * ckrm_find_busy_queue - pick a runqueue to pull tasks from
+ *
+ * any group that has above average load is considered busy
+ * find the busiest queue from any of the busy groups
+ *
+ * For every busy group the runqueue with the highest pressure in that
+ * group is remembered. The walk stops at the first busy group whose
+ * index is >= rand; otherwise the last busy group seen is used.
+ * Returns NULL when no group is above avg_load.
+ *
+ * this_cpu and idle are not used here.
+ */
+static runqueue_t *
+ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu,
+ unsigned long avg_load, enum idle_type idle,
+ int nr_group)
+{
+ struct sched_group *group;
+ runqueue_t * busiest=NULL;
+ unsigned long rand;
+
+ group = sd->groups;
+ /* NOTE(review): presumably a pseudo-random value bounded by nr_group - confirm */
+ rand = get_ckrm_rand(nr_group);
+ /* nr_group is reused from here on as the index of the current group */
+ nr_group = 0;
+
+ do {
+ unsigned long load,total_load,max_load;
+ cpumask_t tmp;
+ int i;
+ runqueue_t * grp_busiest;
+
+ cpus_and(tmp, group->cpumask, cpu_online_map);
+ if (unlikely(cpus_empty(tmp)))
+ goto find_nextgroup;
+
+ total_load = 0;
+ max_load = 0;
+ grp_busiest = NULL;
+ for_each_cpu_mask(i, tmp) {
+ load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),0);
+ total_load += load;
+ if (load > max_load) {
+ max_load = load;
+ grp_busiest = cpu_rq(i);
+ }
+ }
+
+ /* scale by the group's CPU power before comparing with avg_load */
+ total_load = (total_load * SCHED_LOAD_SCALE) / group->cpu_power;
+ if (total_load > avg_load) {
+ busiest = grp_busiest;
+ if (nr_group >= rand)
+ break;
+ }
+ find_nextgroup:
+ group = group->next;
+ nr_group ++;
+ } while (group != sd->groups);
+
+ return busiest;
+}
+
+/**
+ * ckrm_load_balance - pressure based load balancing algorithm used by ckrm
+ *
+ * Asks ckrm_check_balance() whether balancing is needed, picks a source
+ * runqueue with ckrm_find_busy_queue() and pulls tasks from it with
+ * move_tasks(). Also maintains sd->nr_balance_failed and
+ * sd->balance_interval.
+ *
+ * Returns the number of tasks moved, or 0 when already balanced.
+ */
+static int ckrm_load_balance(int this_cpu, runqueue_t *this_rq,
+ struct sched_domain *sd, enum idle_type idle)
+{
+ runqueue_t *busiest;
+ unsigned long avg_load;
+ int nr_moved,nr_group;
+
+ avg_load = ckrm_check_balance(sd, this_cpu, idle, &nr_group);
+ if (! avg_load)
+ goto out_balanced;
+
+ busiest = ckrm_find_busy_queue(sd,this_cpu,avg_load,idle,nr_group);
+ if (! busiest)
+ goto out_balanced;
+ /*
+ * This should be "impossible", but since load
+ * balancing is inherently racy and statistical,
+ * it could happen in theory.
+ */
+ if (unlikely(busiest == this_rq)) {
+ WARN_ON(1);
+ goto out_balanced;
+ }
+
+ nr_moved = 0;
+ if (busiest->nr_running > 1) {
+ /*
+ * Attempt to move tasks. If ckrm_find_busy_queue has found
+ * a busy queue but busiest->nr_running <= 1, the group is
+ * still unbalanced. nr_moved simply stays zero, so it is
+ * correctly treated as an imbalance.
+ */
+ double_lock_balance(this_rq, busiest);
+ /* the max_nr_move argument is passed as 0 */
+ nr_moved = move_tasks(this_rq, this_cpu, busiest,
+ 0,sd, idle);
+ /* only busiest->lock is dropped here; this_rq->lock is left alone */
+ spin_unlock(&busiest->lock);
+ if (nr_moved) {
+ adjust_local_weight();
+ }
+ }
+
+ if (!nr_moved)
+ sd->nr_balance_failed ++;
+ else
+ sd->nr_balance_failed = 0;
+
+ /* We were unbalanced, so reset the balancing interval */
+ sd->balance_interval = sd->min_interval;
+
+ return nr_moved;
+
+out_balanced:
+ /* tune up the balancing interval */
+ if (sd->balance_interval < sd->max_interval)
+ sd->balance_interval *= 2;
+
+ return 0;
+}
+
+/*
+ * load_balance_newidle - balance on behalf of a CPU that is about to idle.
+ *
+ * this_rq->lock is already held by the caller, so only class_list_lock
+ * is taken (for reading) around the call into ckrm_load_balance().
+ * Returns whatever ckrm_load_balance() returns.
+ */
+static inline int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
+		struct sched_domain *sd)
+{
+	int nr_moved;
+
+	read_lock(&class_list_lock);
+	nr_moved = ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE);
+	read_unlock(&class_list_lock);
+
+	return nr_moved;
+}
+
+/*
+ * load_balance - balancing entry point for CKRM CPU scheduling.
+ *
+ * Takes this_rq->lock and class_list_lock (read side) around
+ * ckrm_load_balance(). The caller's @idle state is passed through, so
+ * that the NOT_IDLE / IDLE checks in the balancing code see the real
+ * state rather than always NEWLY_IDLE.
+ *
+ * Returns the number of tasks moved.
+ */
+static inline int load_balance(int this_cpu, runqueue_t *this_rq,
+		struct sched_domain *sd, enum idle_type idle)
+{
+	int ret;
+
+	spin_lock(&this_rq->lock);
+	read_lock(&class_list_lock);
+	ret = ckrm_load_balance(this_cpu,this_rq,sd,idle);
+	read_unlock(&class_list_lock);
+	spin_unlock(&this_rq->lock);
+	return ret;
+}
+#else /*! CONFIG_CKRM_CPU_SCHEDULE */
/*
* move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
* as part of a balancing operation within "domain". Returns the number of
out:
return nr_moved;
}
+#endif /* CONFIG_CKRM_CPU_SCHEDULE*/
+
/*
* idle_balance is called by schedule() if this_cpu is about to become
}
}
}
-#else
+#else /* SMP*/
/*
* on UP we do not need to balance between CPUs:
*/
return 0;
}
-DEFINE_PER_CPU(struct kernel_stat, kstat);
-
+DEFINE_PER_CPU(struct kernel_stat, kstat) = { { 0 } };
EXPORT_PER_CPU_SYMBOL(kstat);
/*
* increasing number of running tasks. We also ignore the interactivity
* if a better static_prio task has expired:
*/
+
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
#define EXPIRED_STARVING(rq) \
((STARVATION_LIMIT && ((rq)->expired_timestamp && \
(jiffies - (rq)->expired_timestamp >= \
STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
((rq)->curr->static_prio > (rq)->best_expired_prio))
+#else
+#define EXPIRED_STARVING(rq) \
+ (STARVATION_LIMIT && ((rq)->expired_timestamp && \
+ (jiffies - (rq)->expired_timestamp >= \
+ STARVATION_LIMIT * (lrq_nr_running(rq)) + 1)))
+#endif
/*
* This function gets called by the timer code, with HZ frequency.
cpustat->idle += sys_ticks;
if (wake_priority_sleeper(rq))
goto out;
+ ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq));
rebalance_tick(cpu, rq, IDLE);
return;
}
cpustat->system += sys_ticks;
/* Task might have expired already, but not scheduled off yet */
- if (p->array != rq->active) {
+ if (p->array != rq_active(p,rq)) {
set_tsk_need_resched(p);
goto out;
}
set_tsk_need_resched(p);
/* put it at the end of the queue: */
- dequeue_task(p, rq->active);
- enqueue_task(p, rq->active);
+ dequeue_task(p, rq_active(p,rq));
+ enqueue_task(p, rq_active(p,rq));
}
goto out_unlock;
}
if (!--p->time_slice) {
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+ /* Hubertus ... we can abstract this out */
+ ckrm_lrq_t* rq = get_task_lrq(p);
+#endif
dequeue_task(p, rq->active);
set_tsk_need_resched(p);
p->prio = effective_prio(p);
rq->expired_timestamp = jiffies;
if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
enqueue_task(p, rq->expired);
- if (p->static_prio < rq->best_expired_prio)
- rq->best_expired_prio = p->static_prio;
+ if (p->static_prio < this_rq()->best_expired_prio)
+ this_rq()->best_expired_prio = p->static_prio;
} else
enqueue_task(p, rq->active);
} else {
if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
(p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
- (p->array == rq->active)) {
+ (p->array == rq_active(p,rq))) {
- dequeue_task(p, rq->active);
+ dequeue_task(p, rq_active(p,rq));
set_tsk_need_resched(p);
p->prio = effective_prio(p);
- enqueue_task(p, rq->active);
+ enqueue_task(p, rq_active(p,rq));
}
}
out_unlock:
spin_unlock(&rq->lock);
out:
+ ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq));
rebalance_tick(cpu, rq, NOT_IDLE);
}
task_t *prev, *next;
runqueue_t *rq;
prio_array_t *array;
- struct list_head *queue;
unsigned long long now;
unsigned long run_time;
- int cpu, idx;
+ int cpu;
/*
* Test if we are atomic. Since do_exit() needs to call into
spin_lock_irq(&rq->lock);
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+ if (prev != rq->idle) {
+ unsigned long long run = now - prev->timestamp;
+ ckrm_lrq_t * lrq = get_task_lrq(prev);
+
+ lrq->lrq_load -= task_load(prev);
+ cpu_demand_event(&prev->demand_stat,CPU_DEMAND_DESCHEDULE,run);
+ lrq->lrq_load += task_load(prev);
+
+ cpu_demand_event(get_task_lrq_stat(prev),CPU_DEMAND_DESCHEDULE,run);
+ update_local_cvt(prev, run);
+ }
+#endif
/*
* if entering off of a kernel preemption go straight
* to picking the next task.
cpu = smp_processor_id();
if (unlikely(!rq->nr_running)) {
idle_balance(cpu, rq);
- if (!rq->nr_running) {
- next = rq->idle;
- rq->expired_timestamp = 0;
- wake_sleeping_dependent(cpu, rq);
- goto switch_tasks;
- }
}
- array = rq->active;
- if (unlikely(!array->nr_active)) {
- /*
- * Switch the active and expired arrays.
- */
- rq->active = rq->expired;
- rq->expired = array;
- array = rq->active;
+ next = rq_get_next_task(rq);
+ if (next == rq->idle) {
rq->expired_timestamp = 0;
- rq->best_expired_prio = MAX_PRIO;
+ wake_sleeping_dependent(cpu, rq);
+ goto switch_tasks;
}
- idx = sched_find_first_bit(array->bitmap);
- queue = array->queue + idx;
- next = list_entry(queue->next, task_t, run_list);
-
if (dependent_sleeper(cpu, rq, next)) {
next = rq->idle;
goto switch_tasks;
}
EXPORT_SYMBOL(schedule);
-
#ifdef CONFIG_PREEMPT
/*
* this is is the entry point to schedule() from in-kernel preemption
{
runqueue_t *rq = this_rq_lock();
prio_array_t *array = current->array;
- prio_array_t *target = rq->expired;
+ prio_array_t *target = rq_expired(current,rq);
/*
* We implement yielding by moving the task into the expired
* array.)
*/
if (unlikely(rt_task(current)))
- target = rq->active;
+ target = rq_active(current,rq);
dequeue_task(current, array);
enqueue_task(current, target);
if (!cpu_isset(dest_cpu, p->cpus_allowed))
goto out;
- set_task_cpu(p, dest_cpu);
if (p->array) {
/*
* Sync timestamp with rq_dest's before activating.
p->timestamp = p->timestamp - rq_src->timestamp_last_tick
+ rq_dest->timestamp_last_tick;
deactivate_task(p, rq_src);
+ set_task_cpu(p, dest_cpu);
activate_task(p, rq_dest, 0);
if (TASK_PREEMPTS_CURR(p, rq_dest))
resched_task(rq_dest->curr);
- }
+ } else
+ set_task_cpu(p, dest_cpu);
out:
double_rq_unlock(rq_src, rq_dest);
void __init sched_init(void)
{
runqueue_t *rq;
- int i, j, k;
+ int i;
#ifdef CONFIG_SMP
/* Set up an initial dummy domain for early boot */
sched_group_init.next = &sched_group_init;
sched_group_init.cpu_power = SCHED_LOAD_SCALE;
#endif
+ init_cpu_classes();
for (i = 0; i < NR_CPUS; i++) {
+#ifndef CONFIG_CKRM_CPU_SCHEDULE
+ int j, k;
prio_array_t *array;
rq = cpu_rq(i);
spin_lock_init(&rq->lock);
+
+ for (j = 0; j < 2; j++) {
+ array = rq->arrays + j;
+ for (k = 0; k < MAX_PRIO; k++) {
+ INIT_LIST_HEAD(array->queue + k);
+ __clear_bit(k, array->bitmap);
+ }
+ // delimiter for bitsearch
+ __set_bit(MAX_PRIO, array->bitmap);
+ }
+
rq->active = rq->arrays;
rq->expired = rq->arrays + 1;
+#else
+ rq = cpu_rq(i);
+ spin_lock_init(&rq->lock);
+#endif
+
rq->best_expired_prio = MAX_PRIO;
#ifdef CONFIG_SMP
rq->sd = &sched_domain_init;
rq->cpu_load = 0;
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+ ckrm_load_init(rq_ckrm_load(rq));
+#endif
rq->active_balance = 0;
rq->push_cpu = 0;
rq->migration_thread = NULL;
INIT_LIST_HEAD(&rq->migration_queue);
#endif
atomic_set(&rq->nr_iowait, 0);
-
- for (j = 0; j < 2; j++) {
- array = rq->arrays + j;
- for (k = 0; k < MAX_PRIO; k++) {
- INIT_LIST_HEAD(array->queue + k);
- __clear_bit(k, array->bitmap);
- }
- // delimiter for bitsearch
- __set_bit(MAX_PRIO, array->bitmap);
- }
}
+
/*
* We have to do a little magic to get the first
* thread right in SMP mode.
rq->curr = current;
rq->idle = current;
set_task_cpu(current, smp_processor_id());
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+ current->cpu_class = get_default_cpu_class();
+ current->array = NULL;
+#endif
wake_up_forked_process(current);
/*
EXPORT_SYMBOL(task_running_sys);
#endif
+#ifdef CONFIG_CKRM_CPU_SCHEDULE
+/**
+ * get_cpu_classqueue - look up the classqueue embedded in a CPU's runqueue
+ * @cpu: processor whose runqueue is consulted
+ *
+ * Returns a pointer to the classqueue member of cpu_rq(@cpu).
+ */
+struct classqueue_struct * get_cpu_classqueue(int cpu)
+{
+	return &cpu_rq(cpu)->classqueue;
+}
+
+/**
+ * _ckrm_cpu_change_class - change the class of a task
+ * @tsk: task to move
+ * @newcls: class the task should belong to afterwards
+ *
+ * Runs under the task's runqueue lock. A task that is currently on a
+ * priority array is dequeued, switched to @newcls and then enqueued on
+ * the array rq_active() yields for it; a task that is not queued only
+ * has its cpu_class pointer updated.
+ */
+void _ckrm_cpu_change_class(task_t *tsk, struct ckrm_cpu_class *newcls)
+{
+	unsigned long flags;
+	struct runqueue *rq = task_rq_lock(tsk,&flags);
+	prio_array_t *queued_on = tsk->array;
+
+	if (!queued_on) {
+		tsk->cpu_class = newcls;
+	} else {
+		/* requeue so that the array lookup uses the new class */
+		dequeue_task(tsk,queued_on);
+		tsk->cpu_class = newcls;
+		enqueue_task(tsk,rq_active(tsk,rq));
+	}
+
+	task_rq_unlock(rq,&flags);
+}
+#endif