/*
 * linux/drivers/block/cfq-iosched.c
 *
 * CFQ, or complete fairness queueing, disk scheduler.
 *
 * Based on ideas from a previously unfinished io
 * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
 *
 * IO priorities are supported, from 0% to 100% in 5% increments. Both of
 * those values have special meaning - the 0% class is only allowed to do
 * io if no one else wants to use the disk, while 100% is considered
 * real-time io and always gets priority. The default process io rate is
 * 95%. In the absence of other io, a class may consume 100% of the disk
 * bandwidth regardless. Within a class, bandwidth is distributed equally
 * among the citizens.
 *
 * TODO:
 * - cfq_select_requests() needs some work for 5-95% io
 * - barriers not supported
 * - export grace periods in ms, not jiffies
 *
 * Copyright (C) 2003 Jens Axboe <axboe@suse.de>
 */
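/*
 * As an illustration of the levels (assuming IOPRIO_NR == 21, matching
 * the p0..p20 sysfs entries below): level 0 (IOPRIO_IDLE) is the 0%
 * class, level 20 (IOPRIO_RT) the 100% class, and each level in between
 * adds 5%. The proportional levels 1-19 are sized in
 * cfq_select_requests() at roughly (level + 1) / IOPRIO_NR of the
 * configured quantum.
 */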
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/config.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/hash.h>
#include <linux/rbtree.h>
#include <linux/mempool.h>
#if IOPRIO_NR > BITS_PER_LONG
#error Cannot support this many io priority levels
#endif
static int cfq_quantum = 6;
static int cfq_quantum_io = 256;
static int cfq_idle_quantum = 1;
static int cfq_idle_quantum_io = 64;
static int cfq_queued = 4;
static int cfq_grace_rt = HZ / 100 ?: 1;
static int cfq_grace_idle = HZ / 10;
#define CFQ_QHASH_SHIFT		6
#define CFQ_QHASH_ENTRIES	(1 << CFQ_QHASH_SHIFT)
#define list_entry_qhash(entry)	hlist_entry((entry), struct cfq_queue, cfq_hash)

#define CFQ_MHASH_SHIFT		8
#define CFQ_MHASH_BLOCK(sec)	((sec) >> 3)
#define CFQ_MHASH_ENTRIES	(1 << CFQ_MHASH_SHIFT)
#define CFQ_MHASH_FN(sec)	(hash_long(CFQ_MHASH_BLOCK((sec)), CFQ_MHASH_SHIFT))
#define rq_hash_key(rq)		((rq)->sector + (rq)->nr_sectors)
#define list_entry_hash(ptr)	hlist_entry((ptr), struct cfq_rq, hash)

#define list_entry_cfqq(ptr)	list_entry((ptr), struct cfq_queue, cfq_list)
#define list_entry_prio(ptr)	list_entry((ptr), struct cfq_rq, prio_list)

#define cfq_account_io(crq)	\
	((crq)->ioprio != IOPRIO_IDLE && (crq)->ioprio != IOPRIO_RT)
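/*
 * rq_hash_key() is the *end* sector of a request, so the merge hash
 * finds candidates for back merges - requests that end exactly where a
 * new bio begins. RT and idle io are kept out of cfq_account_io()
 * because they take no part in the proportional-share arithmetic: RT is
 * always dispatched first and idle io only runs when nothing else is
 * pending.
 */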
/*
 * defines how we distribute bandwidth (can be tgid, uid, etc)
 */

/* FIXME: change hash_key to be sizeof(void *) rather than sizeof(int),
 * otherwise the cast of cki_tsk_icls will not work reliably on 64-bit
 * arches. OR, change cki_tsk_icls to return ints (will need another id
 * space to be devised).
 */
#if defined(CONFIG_CKRM_RES_BLKIO) || defined(CONFIG_CKRM_RES_BLKIO_MODULE)
extern inline void *cki_hash_key(struct task_struct *tsk);
extern inline int cki_ioprio(struct task_struct *tsk);
#define cfq_hash_key(current)	((int)cki_hash_key((current)))
#define cfq_ioprio(current)	(cki_ioprio((current)))
#else
#define cfq_hash_key(current)	((current)->tgid)
#define cfq_ioprio(current)	((current)->ioprio)
#endif
#define CFQ_WAIT_RT	0
#define CFQ_WAIT_NORM	1

static kmem_cache_t *crq_pool;
static kmem_cache_t *cfq_pool;
static mempool_t *cfq_mpool;
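/*
 * CFQ_WAIT_RT and CFQ_WAIT_NORM are bit numbers in cfqd->flags. They
 * are set in cfq_remove_request() when an RT or normal request leaves
 * the scheduler; until the corresponding grace period (cfq_grace_rt or
 * cfq_grace_idle) expires, cfq_select_requests() holds back
 * lower-priority io and arms cfqd->timer to resume dispatching later.
 */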
/*
 * defines an io priority level
 */
struct io_prio_data {
	struct list_head rr_list;
	int busy_queues;
	int busy_rq;
	unsigned long busy_sectors;

	/* Statistics on requests, sectors and queues
	 * added to (in) and dispatched from (out)
	 * this priority level. Reinsertion of previously
	 * dispatched crq's into cfq's results in double counting
	 * which is ignored for now as in-out should
	 * give a reasonable approximation anyway.
	 */
	atomic_t cum_rq_in, cum_rq_out;
	atomic_t cum_sectors_in, cum_sectors_out;
	atomic_t cum_queues_in, cum_queues_out;

	struct list_head prio_list;
	int last_rq;
	int last_sectors;
};
/*
 * per-request queue structure
 */
struct cfq_data {
	struct list_head *dispatch;
	struct hlist_head *cfq_hash;
	struct hlist_head *crq_hash;
	mempool_t *crq_pool;

	struct io_prio_data cid[IOPRIO_NR];

	/*
	 * total number of busy queues and requests
	 */
	int busy_rq;
	int busy_queues;
	unsigned long busy_sectors;

	unsigned long rq_starved_mask;

	/*
	 * grace period handling
	 */
	struct timer_list timer;
	unsigned long wait_end;
	unsigned long flags;
	struct work_struct work;

	/*
	 * tunables
	 */
	unsigned int cfq_quantum;
	unsigned int cfq_quantum_io;
	unsigned int cfq_idle_quantum;
	unsigned int cfq_idle_quantum_io;
	unsigned int cfq_queued;
	unsigned int cfq_grace_rt;
	unsigned int cfq_grace_idle;
};
/*
 * per-class structure
 */
struct cfq_queue {
	struct list_head cfq_list;
	struct hlist_node cfq_hash;
	int hash_key;
	struct rb_root sort_list;
	int queued[2];
	int ioprio;
};
/*
 * per-request structure
 */
struct cfq_rq {
	struct cfq_queue *cfq_queue;
	struct rb_node rb_node;
	struct hlist_node hash;
	sector_t rb_key;

	struct request *request;

	struct list_head prio_list;
	unsigned long nr_sectors;
	int ioprio;
};
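/*
 * A cfq_rq can sit in three structures at once: the per-queue rb tree
 * (sorted by start sector, used for front merges and dispatch order),
 * the global merge hash (keyed by end sector, used for back merges) and
 * the per-priority prio_list (so already-dispatched requests can be
 * pulled back by cfq_reenqueue() when higher-priority io arrives). The
 * cfq_queues themselves hang off cfqd->cfq_hash, keyed by
 * cfq_hash_key(), and sit on their priority level's rr_list while they
 * have pending requests.
 */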
static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq);
static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid);
static void cfq_dispatch_sort(struct list_head *head, struct cfq_rq *crq);
/*
 * lots of deadline iosched dupes, can be abstracted later...
 */
static inline void cfq_del_crq_hash(struct cfq_rq *crq)
{
	hlist_del_init(&crq->hash);
}

static inline void
cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq)
{
	cfq_del_crq_hash(crq);

	if (q->last_merge == crq->request)
		q->last_merge = NULL;
}
static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq)
{
	struct request *rq = crq->request;
	const int hash_idx = CFQ_MHASH_FN(rq_hash_key(rq));

	BUG_ON(!hlist_unhashed(&crq->hash));

	hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]);
}
static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset)
{
	struct hlist_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)];
	struct hlist_node *entry, *next;

	hlist_for_each_safe(entry, next, hash_list) {
		struct cfq_rq *crq = list_entry_hash(entry);
		struct request *__rq = crq->request;

		BUG_ON(hlist_unhashed(&crq->hash));

		/*
		 * requests that became unmergeable are pruned lazily here
		 */
		if (!rq_mergeable(__rq)) {
			cfq_del_crq_hash(crq);
			continue;
		}

		if (rq_hash_key(__rq) == offset)
			return __rq;
	}

	return NULL;
}
/*
 * rb tree support functions
 */
#define RB_EMPTY(node)		((node)->rb_node == NULL)
#define rb_entry_crq(node)	rb_entry((node), struct cfq_rq, rb_node)
#define rq_rb_key(rq)		(rq)->sector
static void
cfq_del_crq_rb(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_rq *crq)
{
	if (crq->cfq_queue) {
		crq->cfq_queue = NULL;

		if (cfq_account_io(crq)) {
			cfqd->busy_rq--;
			cfqd->busy_sectors -= crq->nr_sectors;
			cfqd->cid[crq->ioprio].busy_rq--;
			atomic_inc(&(cfqd->cid[crq->ioprio].cum_rq_out));
			cfqd->cid[crq->ioprio].busy_sectors -= crq->nr_sectors;
			atomic_add(crq->nr_sectors,
				   &(cfqd->cid[crq->ioprio].cum_sectors_out));
		}

		cfqq->queued[rq_data_dir(crq->request)]--;
		rb_erase(&crq->rb_node, &cfqq->sort_list);
	}
}
static struct cfq_rq *
__cfq_add_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
{
	struct rb_node **p = &cfqq->sort_list.rb_node;
	struct rb_node *parent = NULL;
	struct cfq_rq *__crq;

	while (*p) {
		parent = *p;
		__crq = rb_entry_crq(parent);

		if (crq->rb_key < __crq->rb_key)
			p = &(*p)->rb_left;
		else if (crq->rb_key > __crq->rb_key)
			p = &(*p)->rb_right;
		else
			return __crq;
	}

	rb_link_node(&crq->rb_node, parent, p);
	return NULL;
}
static void
cfq_add_crq_rb(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_rq *crq)
{
	struct request *rq = crq->request;
	struct cfq_rq *__alias;

	cfqq->queued[rq_data_dir(rq)]++;
	if (cfq_account_io(crq)) {
		cfqd->busy_rq++;
		cfqd->busy_sectors += crq->nr_sectors;
		cfqd->cid[crq->ioprio].busy_rq++;
		atomic_inc(&(cfqd->cid[crq->ioprio].cum_rq_in));
		cfqd->cid[crq->ioprio].busy_sectors += crq->nr_sectors;
		atomic_add(crq->nr_sectors,
			   &(cfqd->cid[crq->ioprio].cum_sectors_in));
	}

retry:
	__alias = __cfq_add_crq_rb(cfqq, crq);
	if (!__alias) {
		rb_insert_color(&crq->rb_node, &cfqq->sort_list);
		crq->rb_key = rq_rb_key(rq);
		crq->cfq_queue = cfqq;
		return;
	}

	/*
	 * a request with the same sector is already queued - move the
	 * alias straight to the dispatch list and retry the insert
	 */
	cfq_del_crq_rb(cfqd, cfqq, __alias);
	cfq_dispatch_sort(cfqd->dispatch, __alias);
	goto retry;
}
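/*
 * cfq_add_crq_rb() and cfq_del_crq_rb() mirror each other: every
 * busy_rq/busy_sectors increment here has a matching decrement there,
 * so the per-class busy counters always reflect the io currently held
 * by the scheduler. The cum_* statistics, by contrast, only grow and
 * are reset solely through the sysfs p0-p20 store hooks below.
 */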
static struct request *
cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector)
{
	struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(current));
	struct rb_node *n;

	if (!cfqq)
		return NULL;

	n = cfqq->sort_list.rb_node;
	while (n) {
		struct cfq_rq *crq = rb_entry_crq(n);

		if (sector < crq->rb_key)
			n = n->rb_left;
		else if (sector > crq->rb_key)
			n = n->rb_right;
		else
			return crq->request;
	}

	return NULL;
}
static void cfq_remove_request(request_queue_t *q, struct request *rq)
{
	struct cfq_data *cfqd = q->elevator.elevator_data;
	struct cfq_rq *crq = RQ_ELV_DATA(rq);

	if (crq) {
		cfq_remove_merge_hints(q, crq);
		list_del_init(&crq->prio_list);
		list_del_init(&rq->queuelist);

		/*
		 * set a grace period timer to allow realtime io to make real
		 * progress, if we release an rt request. for a normal
		 * request, set a timer so idle io doesn't interfere with
		 * other io
		 */
		if (crq->ioprio == IOPRIO_RT) {
			set_bit(CFQ_WAIT_RT, &cfqd->flags);
			cfqd->wait_end = jiffies + cfqd->cfq_grace_rt;
		} else if (crq->ioprio != IOPRIO_IDLE) {
			set_bit(CFQ_WAIT_NORM, &cfqd->flags);
			cfqd->wait_end = jiffies + cfqd->cfq_grace_idle;
		}

		if (crq->cfq_queue) {
			struct cfq_queue *cfqq = crq->cfq_queue;

			cfq_del_crq_rb(cfqd, cfqq, crq);

			if (RB_EMPTY(&cfqq->sort_list))
				cfq_put_queue(cfqd, cfqq);
		}
	}
}
static int
cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
{
	struct cfq_data *cfqd = q->elevator.elevator_data;
	struct request *__rq;
	int ret;

	ret = elv_try_last_merge(q, bio);
	if (ret != ELEVATOR_NO_MERGE) {
		__rq = q->last_merge;
		goto out_insert;
	}

	__rq = cfq_find_rq_hash(cfqd, bio->bi_sector);
	if (__rq) {
		BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector);

		if (elv_rq_merge_ok(__rq, bio)) {
			ret = ELEVATOR_BACK_MERGE;
			goto out;
		}
	}

	__rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio));
	if (__rq) {
		if (elv_rq_merge_ok(__rq, bio)) {
			ret = ELEVATOR_FRONT_MERGE;
			goto out;
		}
	}

	return ELEVATOR_NO_MERGE;
out:
	q->last_merge = __rq;
out_insert:
	*req = __rq;
	return ret;
}
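/*
 * The two lookups above implement the two merge directions: a back
 * merge appends the bio to a request ending at bio->bi_sector, found
 * through the end-sector merge hash, while a front merge prepends it to
 * a request starting at bio->bi_sector + bio_sectors(bio), found
 * through the start-sector rb tree of the submitting process' queue.
 */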
static void cfq_merged_request(request_queue_t *q, struct request *req)
{
	struct cfq_data *cfqd = q->elevator.elevator_data;
	struct cfq_rq *crq = RQ_ELV_DATA(req);

	cfq_del_crq_hash(crq);
	cfq_add_crq_hash(cfqd, crq);

	if (crq->cfq_queue && (rq_rb_key(req) != crq->rb_key)) {
		struct cfq_queue *cfqq = crq->cfq_queue;

		cfq_del_crq_rb(cfqd, cfqq, crq);
		cfq_add_crq_rb(cfqd, cfqq, crq);
	}

	cfqd->busy_sectors += req->hard_nr_sectors - crq->nr_sectors;
	cfqd->cid[crq->ioprio].busy_sectors += req->hard_nr_sectors - crq->nr_sectors;
	crq->nr_sectors = req->hard_nr_sectors;

	q->last_merge = req;
}
static void
cfq_merged_requests(request_queue_t *q, struct request *req,
		    struct request *next)
{
	cfq_merged_request(q, req);
	cfq_remove_request(q, next);
}
/*
 * sort into dispatch list, in optimal ascending order
 */
static void cfq_dispatch_sort(struct list_head *head, struct cfq_rq *crq)
{
	struct list_head *entry = head;
	struct request *__rq;

	if (!list_empty(head)) {
		__rq = list_entry_rq(head->next);

		if (crq->request->sector < __rq->sector) {
			entry = head->prev;
			goto link;
		}
	}

	while ((entry = entry->prev) != head) {
		__rq = list_entry_rq(entry);

		if (crq->request->sector <= __rq->sector)
			break;
	}

link:
	list_add_tail(&crq->request->queuelist, entry);
}
/*
 * remove from io scheduler core and put on dispatch list for service
 */
static inline int
__cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd,
			struct cfq_queue *cfqq)
{
	struct cfq_rq *crq;

	crq = rb_entry_crq(rb_first(&cfqq->sort_list));

	cfq_del_crq_rb(cfqd, cfqq, crq);
	cfq_remove_merge_hints(q, crq);
	cfq_dispatch_sort(cfqd->dispatch, crq);

	/*
	 * technically, for IOPRIO_RT we don't need to add it to the list.
	 */
	list_add_tail(&crq->prio_list, &cfqd->cid[cfqq->ioprio].prio_list);
	return crq->nr_sectors;
}
static int
cfq_dispatch_requests(request_queue_t *q, int prio, int max_rq, int max_sectors)
{
	struct cfq_data *cfqd = q->elevator.elevator_data;
	struct list_head *plist = &cfqd->cid[prio].rr_list;
	struct list_head *entry, *nxt;
	int q_rq = 0, q_io = 0;

	/*
	 * for each queue at this prio level, dispatch a request
	 */
	list_for_each_safe(entry, nxt, plist) {
		struct cfq_queue *cfqq = list_entry_cfqq(entry);

		BUG_ON(RB_EMPTY(&cfqq->sort_list));

		q_io += __cfq_dispatch_requests(q, cfqd, cfqq);
		q_rq++;

		if (RB_EMPTY(&cfqq->sort_list))
			cfq_put_queue(cfqd, cfqq);

		/*
		 * if we hit the queue limit, put the string of serviced
		 * queues at the back of the pending list
		 */
		if (q_io >= max_sectors || q_rq >= max_rq) {
			struct list_head *prv = nxt->prev;

			if (prv != plist) {
				list_del(plist);
				list_add(plist, prv);
			}
			break;
		}
	}

	cfqd->cid[prio].last_rq = q_rq;
	cfqd->cid[prio].last_sectors = q_io;
	return q_rq;
}
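/*
 * Moving the list head itself (list_del() plus list_add() after prv)
 * rotates the round robin list: the queues just serviced end up behind
 * those not yet reached, so the next call resumes the scan where this
 * one stopped.
 */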
/*
 * try to move some requests to the dispatch list. returns non-zero if
 * any were moved
 */
static int cfq_select_requests(request_queue_t *q, struct cfq_data *cfqd)
{
	int queued, busy_rq, busy_sectors, i;

	/*
	 * if there's any realtime io, only schedule that
	 */
	if (cfq_dispatch_requests(q, IOPRIO_RT, cfqd->cfq_quantum, cfqd->cfq_quantum_io))
		return 1;

	/*
	 * if RT io was last serviced and grace time hasn't expired,
	 * arm the timer to restart queueing if no other RT io has been
	 * submitted in the mean time
	 */
	if (test_bit(CFQ_WAIT_RT, &cfqd->flags)) {
		if (time_before(jiffies, cfqd->wait_end)) {
			mod_timer(&cfqd->timer, cfqd->wait_end);
			return 0;
		}
		clear_bit(CFQ_WAIT_RT, &cfqd->flags);
	}

	/*
	 * for each priority level, calculate the number of requests we
	 * are allowed to put into service.
	 */
	queued = 0;
	busy_rq = cfqd->busy_rq;
	busy_sectors = cfqd->busy_sectors;
	for (i = IOPRIO_RT - 1; i > IOPRIO_IDLE; i--) {
		const int o_rq = busy_rq - cfqd->cid[i].busy_rq;
		const int o_sectors = busy_sectors - cfqd->cid[i].busy_sectors;
		int q_rq = cfqd->cfq_quantum * (i + 1) / IOPRIO_NR;
		int q_io = cfqd->cfq_quantum_io * (i + 1) / IOPRIO_NR;

		/*
		 * no need to keep iterating the list, if there are no
		 * requests pending anymore
		 */
		if (!cfqd->busy_rq)
			break;

		/*
		 * find out how many requests and sectors we are allowed to
		 * service
		 */
		if (o_rq) {
			q_rq = o_rq * (i + 1) / IOPRIO_NR;
			if (q_rq > cfqd->cfq_quantum)
				q_rq = cfqd->cfq_quantum;
		}

		if (o_sectors) {
			q_io = o_sectors * (i + 1) / IOPRIO_NR;
			if (q_io > cfqd->cfq_quantum_io)
				q_io = cfqd->cfq_quantum_io;
		}

		/*
		 * average with last dispatched for fairness
		 */
		if (cfqd->cid[i].last_rq != -1)
			q_rq = (cfqd->cid[i].last_rq + q_rq) / 2;
		if (cfqd->cid[i].last_sectors != -1)
			q_io = (cfqd->cid[i].last_sectors + q_io) / 2;

		queued += cfq_dispatch_requests(q, i, q_rq, q_io);
	}

	if (queued)
		return 1;

	/*
	 * only allow dispatch of idle io, if the queue has been idle from
	 * servicing RT or normal io for the grace period
	 */
	if (test_bit(CFQ_WAIT_NORM, &cfqd->flags)) {
		if (time_before(jiffies, cfqd->wait_end)) {
			mod_timer(&cfqd->timer, cfqd->wait_end);
			return 0;
		}
		clear_bit(CFQ_WAIT_NORM, &cfqd->flags);
	}

	/*
	 * if we found nothing to do, allow idle io to be serviced
	 */
	if (cfq_dispatch_requests(q, IOPRIO_IDLE, cfqd->cfq_idle_quantum, cfqd->cfq_idle_quantum_io))
		return 1;

	return 0;
}
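/*
 * Worked example for the quantum arithmetic above (illustration only,
 * assuming IOPRIO_NR == 21 and the defaults cfq_quantum == 6,
 * cfq_quantum_io == 256): with no other io pending, a class at prio 19
 * may dispatch q_rq = 6 * 20 / 21 = 5 requests and q_io = 256 * 20 / 21
 * = 243 sectors per pass, while prio 10 gets 6 * 11 / 21 = 3 requests
 * and 256 * 11 / 21 = 134 sectors. Averaging with last_rq/last_sectors
 * then smooths the allowance across consecutive passes.
 */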
static struct request *cfq_next_request(request_queue_t *q)
{
	struct cfq_data *cfqd = q->elevator.elevator_data;
	struct request *rq;

	if (!list_empty(cfqd->dispatch)) {
		struct cfq_rq *crq;
dispatch:
		/*
		 * end grace period, we are servicing a request
		 */
		del_timer(&cfqd->timer);
		clear_bit(CFQ_WAIT_RT, &cfqd->flags);
		clear_bit(CFQ_WAIT_NORM, &cfqd->flags);

		BUG_ON(list_empty(cfqd->dispatch));
		rq = list_entry_rq(cfqd->dispatch->next);

		BUG_ON(q->last_merge == rq);
		crq = RQ_ELV_DATA(rq);
		if (crq) {
			BUG_ON(!hlist_unhashed(&crq->hash));
			list_del_init(&crq->prio_list);
		}

		return rq;
	}

	/*
	 * if we moved requests to the dispatch list, go back and serve one
	 */
	if (cfq_select_requests(q, cfqd))
		goto dispatch;

	return NULL;
}
static inline struct cfq_queue *
__cfq_find_cfq_hash(struct cfq_data *cfqd, int hashkey, const int hashval)
{
	struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
	struct hlist_node *entry;

	hlist_for_each(entry, hash_list) {
		struct cfq_queue *__cfqq = list_entry_qhash(entry);

		if (__cfqq->hash_key == hashkey)
			return __cfqq;
	}

	return NULL;
}

static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int hashkey)
{
	const int hashval = hash_long(hashkey, CFQ_QHASH_SHIFT);

	return __cfq_find_cfq_hash(cfqd, hashkey, hashval);
}
static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	cfqd->busy_queues--;
	WARN_ON(cfqd->busy_queues < 0);

	cfqd->cid[cfqq->ioprio].busy_queues--;
	WARN_ON(cfqd->cid[cfqq->ioprio].busy_queues < 0);
	atomic_inc(&(cfqd->cid[cfqq->ioprio].cum_queues_out));

	list_del(&cfqq->cfq_list);
	hlist_del(&cfqq->cfq_hash);
	mempool_free(cfqq, cfq_mpool);
}
static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, int hashkey)
{
	const int hashval = hash_long(hashkey, CFQ_QHASH_SHIFT);
	struct cfq_queue *cfqq, *new_cfqq = NULL;

retry:
	cfqq = __cfq_find_cfq_hash(cfqd, hashkey, hashval);

	if (!cfqq) {
		if (new_cfqq) {
			cfqq = new_cfqq;
			new_cfqq = NULL;
		} else {
			new_cfqq = mempool_alloc(cfq_mpool, GFP_ATOMIC);
			/* MEF: I think cfq-iosched.c needs further fixing
			 * to avoid the BUG_ON. Shailabh will be sending
			 * a new patch for this soon.
			 */
			BUG_ON(new_cfqq == NULL);
			goto retry;
		}

		memset(cfqq, 0, sizeof(*cfqq));
		INIT_HLIST_NODE(&cfqq->cfq_hash);
		INIT_LIST_HEAD(&cfqq->cfq_list);

		cfqq->hash_key = cfq_hash_key(current);
		cfqq->ioprio = cfq_ioprio(current);
		hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
	} else if (new_cfqq)
		mempool_free(new_cfqq, cfq_mpool);

	return cfqq;
}
static void
__cfq_enqueue(request_queue_t *q, struct cfq_data *cfqd, struct cfq_rq *crq)
{
	const int prio = crq->ioprio;
	struct cfq_queue *cfqq;

	cfqq = cfq_get_queue(cfqd, cfq_hash_key(current));

	/*
	 * if the priority rose (or two tasks collided on the hash key),
	 * migrate the queue to the new priority level
	 */
	if (prio > cfqq->ioprio) {
		printk("prio hash collision %d %d\n", prio, cfqq->ioprio);
		if (!list_empty(&cfqq->cfq_list)) {
			cfqd->cid[cfqq->ioprio].busy_queues--;
			WARN_ON(cfqd->cid[cfqq->ioprio].busy_queues < 0);
			atomic_inc(&(cfqd->cid[cfqq->ioprio].cum_queues_out));
			cfqd->cid[prio].busy_queues++;
			atomic_inc(&(cfqd->cid[prio].cum_queues_in));
			list_move_tail(&cfqq->cfq_list, &cfqd->cid[prio].rr_list);
		}
		cfqq->ioprio = prio;
	}

	cfq_add_crq_rb(cfqd, cfqq, crq);

	if (list_empty(&cfqq->cfq_list)) {
		list_add_tail(&cfqq->cfq_list, &cfqd->cid[prio].rr_list);
		cfqd->cid[prio].busy_queues++;
		atomic_inc(&(cfqd->cid[prio].cum_queues_in));
		cfqd->busy_queues++;
	}

	if (rq_mergeable(crq->request)) {
		cfq_add_crq_hash(cfqd, crq);

		if (!q->last_merge)
			q->last_merge = crq->request;
	}
}
/*
 * pull dispatched but not yet serviced requests of the given prio level
 * back into the scheduler core
 */
static void cfq_reenqueue(request_queue_t *q, struct cfq_data *cfqd, int prio)
{
	struct list_head *prio_list = &cfqd->cid[prio].prio_list;
	struct list_head *entry, *tmp;

	list_for_each_safe(entry, tmp, prio_list) {
		struct cfq_rq *crq = list_entry_prio(entry);

		list_del_init(entry);
		list_del_init(&crq->request->queuelist);
		__cfq_enqueue(q, cfqd, crq);
	}
}
static void
cfq_enqueue(request_queue_t *q, struct cfq_data *cfqd, struct cfq_rq *crq)
{
	const int prio = cfq_ioprio(current);

	crq->ioprio = prio;
	crq->nr_sectors = crq->request->hard_nr_sectors;
	__cfq_enqueue(q, cfqd, crq);

	if (prio == IOPRIO_RT) {
		int i;

		/*
		 * realtime io gets priority, move all other io back
		 */
		for (i = IOPRIO_IDLE; i < IOPRIO_RT; i++)
			cfq_reenqueue(q, cfqd, i);
	} else if (prio != IOPRIO_IDLE) {
		/*
		 * check if we need to move idle io back into queue
		 */
		cfq_reenqueue(q, cfqd, IOPRIO_IDLE);
	}
}
static void
cfq_insert_request(request_queue_t *q, struct request *rq, int where)
{
	struct cfq_data *cfqd = q->elevator.elevator_data;
	struct cfq_rq *crq = RQ_ELV_DATA(rq);

	switch (where) {
		case ELEVATOR_INSERT_BACK:
			/*
			 * drain the scheduler core first, so the request
			 * really ends up at the back of the dispatch list
			 */
			while (cfq_select_requests(q, cfqd))
				;
			list_add_tail(&rq->queuelist, cfqd->dispatch);
			break;
		case ELEVATOR_INSERT_FRONT:
			list_add(&rq->queuelist, cfqd->dispatch);
			break;
		case ELEVATOR_INSERT_SORT:
			BUG_ON(!blk_fs_request(rq));
			cfq_enqueue(q, cfqd, crq);
			break;
		default:
			printk("%s: bad insert point %d\n", __FUNCTION__, where);
			return;
	}
}
static int cfq_queue_empty(request_queue_t *q)
{
	struct cfq_data *cfqd = q->elevator.elevator_data;

	if (list_empty(cfqd->dispatch) && !cfqd->busy_queues)
		return 1;

	return 0;
}
static struct request *
cfq_former_request(request_queue_t *q, struct request *rq)
{
	struct cfq_rq *crq = RQ_ELV_DATA(rq);
	struct rb_node *rbprev = rb_prev(&crq->rb_node);

	if (rbprev)
		return rb_entry_crq(rbprev)->request;

	return NULL;
}

static struct request *
cfq_latter_request(request_queue_t *q, struct request *rq)
{
	struct cfq_rq *crq = RQ_ELV_DATA(rq);
	struct rb_node *rbnext = rb_next(&crq->rb_node);

	if (rbnext)
		return rb_entry_crq(rbnext)->request;

	return NULL;
}
static void cfq_queue_congested(request_queue_t *q)
{
	struct cfq_data *cfqd = q->elevator.elevator_data;

	set_bit(cfq_ioprio(current), &cfqd->rq_starved_mask);
}
static int cfq_may_queue(request_queue_t *q, int rw)
{
	struct cfq_data *cfqd = q->elevator.elevator_data;
	struct cfq_queue *cfqq;
	const int prio = cfq_ioprio(current);
	int limit;

	if (!cfqd->busy_queues)
		return 1;

	cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(current));
	if (!cfqq)
		return 1;

	/*
	 * if higher or equal prio io is sleeping waiting for a request, don't
	 * allow this one to allocate one. as long as ll_rw_blk does fifo
	 * waitqueue wakeups this should work...
	 */
	if (cfqd->rq_starved_mask & ~((1 << prio) - 1))
		return 0;

	if (cfqq->queued[rw] < cfqd->cfq_queued || !cfqd->cid[prio].busy_queues)
		return 1;

	limit = q->nr_requests * (prio + 1) / IOPRIO_NR;
	limit /= cfqd->cid[prio].busy_queues;
	if (cfqq->queued[rw] > limit)
		return 0;

	return 1;
}
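/*
 * Worked example for the limit above (illustration only): with
 * q->nr_requests == 512, IOPRIO_NR == 21 and 4 busy queues at prio 10,
 * each of those queues may hold up to 512 * 11 / 21 / 4 = 67 requests
 * per direction before further allocations are refused.
 */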
static void cfq_put_request(request_queue_t *q, struct request *rq)
{
	struct cfq_data *cfqd = q->elevator.elevator_data;
	struct cfq_rq *crq = RQ_ELV_DATA(rq);

	if (crq) {
		BUG_ON(q->last_merge == rq);
		BUG_ON(!hlist_unhashed(&crq->hash));

		mempool_free(crq, cfqd->crq_pool);
		rq->elevator_private = NULL;
	}
}
static int cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
{
	struct cfq_data *cfqd = q->elevator.elevator_data;
	struct cfq_rq *crq = mempool_alloc(cfqd->crq_pool, gfp_mask);

	if (crq) {
		/*
		 * process now has one request
		 */
		clear_bit(cfq_ioprio(current), &cfqd->rq_starved_mask);

		memset(crq, 0, sizeof(*crq));
		crq->request = rq;
		INIT_HLIST_NODE(&crq->hash);
		INIT_LIST_HEAD(&crq->prio_list);
		rq->elevator_private = crq;
		return 0;
	}

	return 1;
}
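/*
 * rq_starved_mask closes the starvation loop: cfq_queue_congested()
 * sets the caller's priority bit when the queue runs out of requests,
 * cfq_may_queue() then refuses allocations at or below a starved level,
 * and the bit is cleared here once a process at that level does get a
 * request.
 */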
static void cfq_exit(request_queue_t *q, elevator_t *e)
{
	struct cfq_data *cfqd = e->elevator_data;

	e->elevator_data = NULL;
	mempool_destroy(cfqd->crq_pool);
	kfree(cfqd->crq_hash);
	kfree(cfqd->cfq_hash);
	kfree(cfqd);
}
/*
 * grace period timer expired - kick queueing back off, from process
 * context via kblockd
 */
static void cfq_timer(unsigned long data)
{
	struct cfq_data *cfqd = (struct cfq_data *) data;

	clear_bit(CFQ_WAIT_RT, &cfqd->flags);
	clear_bit(CFQ_WAIT_NORM, &cfqd->flags);
	kblockd_schedule_work(&cfqd->work);
}

static void cfq_work(void *data)
{
	request_queue_t *q = data;
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	if (cfq_next_request(q))
		q->request_fn(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}
static int cfq_init(request_queue_t *q, elevator_t *e)
{
	struct cfq_data *cfqd;
	int i;

	cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL);
	if (!cfqd)
		return -ENOMEM;

	memset(cfqd, 0, sizeof(*cfqd));

	init_timer(&cfqd->timer);
	cfqd->timer.function = cfq_timer;
	cfqd->timer.data = (unsigned long) cfqd;

	INIT_WORK(&cfqd->work, cfq_work, q);

	for (i = 0; i < IOPRIO_NR; i++) {
		struct io_prio_data *cid = &cfqd->cid[i];

		INIT_LIST_HEAD(&cid->rr_list);
		INIT_LIST_HEAD(&cid->prio_list);
		cid->last_rq = -1;
		cid->last_sectors = -1;

		atomic_set(&cid->cum_rq_in, 0);
		atomic_set(&cid->cum_rq_out, 0);
		atomic_set(&cid->cum_sectors_in, 0);
		atomic_set(&cid->cum_sectors_out, 0);
		atomic_set(&cid->cum_queues_in, 0);
		atomic_set(&cid->cum_queues_out, 0);
	}

	cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL);
	if (!cfqd->crq_hash)
		goto out_crqhash;

	cfqd->cfq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL);
	if (!cfqd->cfq_hash)
		goto out_cfqhash;

	cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, crq_pool);
	if (!cfqd->crq_pool)
		goto out_crqpool;

	for (i = 0; i < CFQ_MHASH_ENTRIES; i++)
		INIT_HLIST_HEAD(&cfqd->crq_hash[i]);
	for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
		INIT_HLIST_HEAD(&cfqd->cfq_hash[i]);

	cfqd->cfq_queued = cfq_queued;
	cfqd->cfq_quantum = cfq_quantum;
	cfqd->cfq_quantum_io = cfq_quantum_io;
	cfqd->cfq_idle_quantum = cfq_idle_quantum;
	cfqd->cfq_idle_quantum_io = cfq_idle_quantum_io;
	cfqd->cfq_grace_rt = cfq_grace_rt;
	cfqd->cfq_grace_idle = cfq_grace_idle;

	q->nr_requests <<= 2;

	cfqd->dispatch = &q->queue_head;
	e->elevator_data = cfqd;

	return 0;
out_crqpool:
	kfree(cfqd->cfq_hash);
out_cfqhash:
	kfree(cfqd->crq_hash);
out_crqhash:
	kfree(cfqd);
	return -ENOMEM;
}
static int __init cfq_slab_setup(void)
{
	crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0,
					NULL, NULL);
	if (!crq_pool)
		panic("cfq_iosched: can't init crq pool\n");

	cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0,
					NULL, NULL);
	if (!cfq_pool)
		panic("cfq_iosched: can't init cfq pool\n");

	cfq_mpool = mempool_create(64, mempool_alloc_slab, mempool_free_slab, cfq_pool);
	if (!cfq_mpool)
		panic("cfq_iosched: can't init cfq mpool\n");

	return 0;
}
subsys_initcall(cfq_slab_setup);
/*
 * sysfs parts below -->
 */
struct cfq_fs_entry {
	struct attribute attr;
	ssize_t (*show)(struct cfq_data *, char *);
	ssize_t (*store)(struct cfq_data *, const char *, size_t);
};

static ssize_t
cfq_var_show(unsigned int var, char *page)
{
	return sprintf(page, "%d\n", var);
}

static ssize_t
cfq_var_store(unsigned int *var, const char *page, size_t count)
{
	char *p = (char *) page;

	*var = simple_strtoul(p, &p, 10);
	return count;
}
#define SHOW_FUNCTION(__FUNC, __VAR)					\
static ssize_t __FUNC(struct cfq_data *cfqd, char *page)		\
{									\
	return cfq_var_show(__VAR, (page));				\
}
SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum);
SHOW_FUNCTION(cfq_quantum_io_show, cfqd->cfq_quantum_io);
SHOW_FUNCTION(cfq_idle_quantum_show, cfqd->cfq_idle_quantum);
SHOW_FUNCTION(cfq_idle_quantum_io_show, cfqd->cfq_idle_quantum_io);
SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued);
SHOW_FUNCTION(cfq_grace_rt_show, cfqd->cfq_grace_rt);
SHOW_FUNCTION(cfq_grace_idle_show, cfqd->cfq_grace_idle);
#undef SHOW_FUNCTION
#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)				\
static ssize_t __FUNC(struct cfq_data *cfqd, const char *page, size_t count) \
{									\
	int ret = cfq_var_store(__PTR, (page), count);			\
	if (*(__PTR) < (MIN))						\
		*(__PTR) = (MIN);					\
	else if (*(__PTR) > (MAX))					\
		*(__PTR) = (MAX);					\
	return ret;							\
}
STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, INT_MAX);
STORE_FUNCTION(cfq_quantum_io_store, &cfqd->cfq_quantum_io, 4, INT_MAX);
STORE_FUNCTION(cfq_idle_quantum_store, &cfqd->cfq_idle_quantum, 1, INT_MAX);
STORE_FUNCTION(cfq_idle_quantum_io_store, &cfqd->cfq_idle_quantum_io, 4, INT_MAX);
STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, INT_MAX);
STORE_FUNCTION(cfq_grace_rt_store, &cfqd->cfq_grace_rt, 0, INT_MAX);
STORE_FUNCTION(cfq_grace_idle_store, &cfqd->cfq_grace_idle, 0, INT_MAX);
#undef STORE_FUNCTION
/* Additional entries to get priority level data */
static ssize_t
cfq_prio_show(struct cfq_data *cfqd, char *page, unsigned int priolvl)
{
	int r1, r2, s1, s2, q1, q2;

	if (!(priolvl >= IOPRIO_IDLE && priolvl <= IOPRIO_RT))
		return 0;

	r1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_rq_in));
	r2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_rq_out));
	s1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_sectors_in));
	s2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_sectors_out));
	q1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_queues_in));
	q2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_queues_out));

	return sprintf(page, "rq %d (%d,%d) sec %d (%d,%d) q %d (%d,%d)\n",
		       r1 - r2, r1, r2,
		       s1 - s2, s1, s2,
		       q1 - q2, q1, q2);
#if 0
	return sprintf(page, "rq (%d,%d) sec (%d,%d) q (%d,%d)\n",
		       r1, r2, s1, s2, q1, q2);
#endif
}
#define SHOW_PRIO_DATA(__PRIOLVL)					\
static ssize_t cfq_prio_##__PRIOLVL##_show(struct cfq_data *cfqd, char *page) \
{									\
	return cfq_prio_show(cfqd, page, __PRIOLVL);			\
}
SHOW_PRIO_DATA(0);
SHOW_PRIO_DATA(1);
SHOW_PRIO_DATA(2);
SHOW_PRIO_DATA(3);
SHOW_PRIO_DATA(4);
SHOW_PRIO_DATA(5);
SHOW_PRIO_DATA(6);
SHOW_PRIO_DATA(7);
SHOW_PRIO_DATA(8);
SHOW_PRIO_DATA(9);
SHOW_PRIO_DATA(10);
SHOW_PRIO_DATA(11);
SHOW_PRIO_DATA(12);
SHOW_PRIO_DATA(13);
SHOW_PRIO_DATA(14);
SHOW_PRIO_DATA(15);
SHOW_PRIO_DATA(16);
SHOW_PRIO_DATA(17);
SHOW_PRIO_DATA(18);
SHOW_PRIO_DATA(19);
SHOW_PRIO_DATA(20);
#undef SHOW_PRIO_DATA
/* a store of any value resets the statistics for that priority level */
static ssize_t cfq_prio_store(struct cfq_data *cfqd, const char *page, size_t count, int priolvl)
{
	atomic_set(&(cfqd->cid[priolvl].cum_rq_in), 0);
	atomic_set(&(cfqd->cid[priolvl].cum_rq_out), 0);
	atomic_set(&(cfqd->cid[priolvl].cum_sectors_in), 0);
	atomic_set(&(cfqd->cid[priolvl].cum_sectors_out), 0);
	atomic_set(&(cfqd->cid[priolvl].cum_queues_in), 0);
	atomic_set(&(cfqd->cid[priolvl].cum_queues_out), 0);

	return count;
}

#define STORE_PRIO_DATA(__PRIOLVL)					\
static ssize_t cfq_prio_##__PRIOLVL##_store(struct cfq_data *cfqd, const char *page, size_t count) \
{									\
	return cfq_prio_store(cfqd, page, count, __PRIOLVL);		\
}
STORE_PRIO_DATA(0);
STORE_PRIO_DATA(1);
STORE_PRIO_DATA(2);
STORE_PRIO_DATA(3);
STORE_PRIO_DATA(4);
STORE_PRIO_DATA(5);
STORE_PRIO_DATA(6);
STORE_PRIO_DATA(7);
STORE_PRIO_DATA(8);
STORE_PRIO_DATA(9);
STORE_PRIO_DATA(10);
STORE_PRIO_DATA(11);
STORE_PRIO_DATA(12);
STORE_PRIO_DATA(13);
STORE_PRIO_DATA(14);
STORE_PRIO_DATA(15);
STORE_PRIO_DATA(16);
STORE_PRIO_DATA(17);
STORE_PRIO_DATA(18);
STORE_PRIO_DATA(19);
STORE_PRIO_DATA(20);
#undef STORE_PRIO_DATA
static struct cfq_fs_entry cfq_quantum_entry = {
	.attr = {.name = "quantum", .mode = S_IRUGO | S_IWUSR },
	.show = cfq_quantum_show,
	.store = cfq_quantum_store,
};
static struct cfq_fs_entry cfq_quantum_io_entry = {
	.attr = {.name = "quantum_io", .mode = S_IRUGO | S_IWUSR },
	.show = cfq_quantum_io_show,
	.store = cfq_quantum_io_store,
};
static struct cfq_fs_entry cfq_idle_quantum_entry = {
	.attr = {.name = "idle_quantum", .mode = S_IRUGO | S_IWUSR },
	.show = cfq_idle_quantum_show,
	.store = cfq_idle_quantum_store,
};
static struct cfq_fs_entry cfq_idle_quantum_io_entry = {
	.attr = {.name = "idle_quantum_io", .mode = S_IRUGO | S_IWUSR },
	.show = cfq_idle_quantum_io_show,
	.store = cfq_idle_quantum_io_store,
};
static struct cfq_fs_entry cfq_queued_entry = {
	.attr = {.name = "queued", .mode = S_IRUGO | S_IWUSR },
	.show = cfq_queued_show,
	.store = cfq_queued_store,
};
static struct cfq_fs_entry cfq_grace_rt_entry = {
	.attr = {.name = "grace_rt", .mode = S_IRUGO | S_IWUSR },
	.show = cfq_grace_rt_show,
	.store = cfq_grace_rt_store,
};
static struct cfq_fs_entry cfq_grace_idle_entry = {
	.attr = {.name = "grace_idle", .mode = S_IRUGO | S_IWUSR },
	.show = cfq_grace_idle_show,
	.store = cfq_grace_idle_store,
};
#define P_0_STR		"p0"
#define P_1_STR		"p1"
#define P_2_STR		"p2"
#define P_3_STR		"p3"
#define P_4_STR		"p4"
#define P_5_STR		"p5"
#define P_6_STR		"p6"
#define P_7_STR		"p7"
#define P_8_STR		"p8"
#define P_9_STR		"p9"
#define P_10_STR	"p10"
#define P_11_STR	"p11"
#define P_12_STR	"p12"
#define P_13_STR	"p13"
#define P_14_STR	"p14"
#define P_15_STR	"p15"
#define P_16_STR	"p16"
#define P_17_STR	"p17"
#define P_18_STR	"p18"
#define P_19_STR	"p19"
#define P_20_STR	"p20"
#define CFQ_PRIO_SYSFS_ENTRY(__PRIOLVL)					\
static struct cfq_fs_entry cfq_prio_##__PRIOLVL##_entry = {		\
	.attr = {.name = P_##__PRIOLVL##_STR, .mode = S_IRUGO | S_IWUSR }, \
	.show = cfq_prio_##__PRIOLVL##_show,				\
	.store = cfq_prio_##__PRIOLVL##_store,				\
};
CFQ_PRIO_SYSFS_ENTRY(0);
CFQ_PRIO_SYSFS_ENTRY(1);
CFQ_PRIO_SYSFS_ENTRY(2);
CFQ_PRIO_SYSFS_ENTRY(3);
CFQ_PRIO_SYSFS_ENTRY(4);
CFQ_PRIO_SYSFS_ENTRY(5);
CFQ_PRIO_SYSFS_ENTRY(6);
CFQ_PRIO_SYSFS_ENTRY(7);
CFQ_PRIO_SYSFS_ENTRY(8);
CFQ_PRIO_SYSFS_ENTRY(9);
CFQ_PRIO_SYSFS_ENTRY(10);
CFQ_PRIO_SYSFS_ENTRY(11);
CFQ_PRIO_SYSFS_ENTRY(12);
CFQ_PRIO_SYSFS_ENTRY(13);
CFQ_PRIO_SYSFS_ENTRY(14);
CFQ_PRIO_SYSFS_ENTRY(15);
CFQ_PRIO_SYSFS_ENTRY(16);
CFQ_PRIO_SYSFS_ENTRY(17);
CFQ_PRIO_SYSFS_ENTRY(18);
CFQ_PRIO_SYSFS_ENTRY(19);
CFQ_PRIO_SYSFS_ENTRY(20);
#undef CFQ_PRIO_SYSFS_ENTRY
static struct attribute *default_attrs[] = {
	&cfq_quantum_entry.attr,
	&cfq_quantum_io_entry.attr,
	&cfq_idle_quantum_entry.attr,
	&cfq_idle_quantum_io_entry.attr,
	&cfq_queued_entry.attr,
	&cfq_grace_rt_entry.attr,
	&cfq_grace_idle_entry.attr,
	&cfq_prio_0_entry.attr,
	&cfq_prio_1_entry.attr,
	&cfq_prio_2_entry.attr,
	&cfq_prio_3_entry.attr,
	&cfq_prio_4_entry.attr,
	&cfq_prio_5_entry.attr,
	&cfq_prio_6_entry.attr,
	&cfq_prio_7_entry.attr,
	&cfq_prio_8_entry.attr,
	&cfq_prio_9_entry.attr,
	&cfq_prio_10_entry.attr,
	&cfq_prio_11_entry.attr,
	&cfq_prio_12_entry.attr,
	&cfq_prio_13_entry.attr,
	&cfq_prio_14_entry.attr,
	&cfq_prio_15_entry.attr,
	&cfq_prio_16_entry.attr,
	&cfq_prio_17_entry.attr,
	&cfq_prio_18_entry.attr,
	&cfq_prio_19_entry.attr,
	&cfq_prio_20_entry.attr,
	NULL,
};
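/*
 * Usage sketch (illustrative; the exact sysfs location depends on the
 * kernel): the tunables and per-level statistics appear under the
 * elevator directory of each disk, e.g.
 *
 *	# cat /sys/block/hda/queue/iosched/quantum
 *	6
 *	# echo 8 > /sys/block/hda/queue/iosched/quantum
 *	# cat /sys/block/hda/queue/iosched/p10
 *	rq 2 (10,8) sec 128 (1024,896) q 1 (3,2)
 *
 * Writing any value to a p<n> file resets that level's counters.
 */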
#define to_cfq(atr) container_of((atr), struct cfq_fs_entry, attr)

static ssize_t
cfq_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
	elevator_t *e = container_of(kobj, elevator_t, kobj);
	struct cfq_fs_entry *entry = to_cfq(attr);

	if (!entry->show)
		return 0;

	return entry->show(e->elevator_data, page);
}

static ssize_t
cfq_attr_store(struct kobject *kobj, struct attribute *attr,
	       const char *page, size_t length)
{
	elevator_t *e = container_of(kobj, elevator_t, kobj);
	struct cfq_fs_entry *entry = to_cfq(attr);

	if (!entry->store)
		return -EINVAL;

	return entry->store(e->elevator_data, page, length);
}
static struct sysfs_ops cfq_sysfs_ops = {
	.show	= cfq_attr_show,
	.store	= cfq_attr_store,
};

struct kobj_type cfq_ktype = {
	.sysfs_ops	= &cfq_sysfs_ops,
	.default_attrs	= default_attrs,
};
elevator_t iosched_cfq = {
	.elevator_name =		"cfq",
	.elevator_ktype =		&cfq_ktype,
	.elevator_merge_fn =		cfq_merge,
	.elevator_merged_fn =		cfq_merged_request,
	.elevator_merge_req_fn =	cfq_merged_requests,
	.elevator_next_req_fn =		cfq_next_request,
	.elevator_add_req_fn =		cfq_insert_request,
	.elevator_remove_req_fn =	cfq_remove_request,
	.elevator_queue_empty_fn =	cfq_queue_empty,
	.elevator_former_req_fn =	cfq_former_request,
	.elevator_latter_req_fn =	cfq_latter_request,
	.elevator_set_req_fn =		cfq_set_request,
	.elevator_put_req_fn =		cfq_put_request,
	.elevator_may_queue_fn =	cfq_may_queue,
	.elevator_set_congested_fn =	cfq_queue_congested,
	.elevator_init_fn =		cfq_init,
	.elevator_exit_fn =		cfq_exit,
};

EXPORT_SYMBOL(iosched_cfq);