6c24f236f7f6de5af02e200ef770dabc19b9c6c1
[linux-2.6.git] / drivers / block / cfq-iosched.c
1 /*
2  *  linux/drivers/block/cfq-iosched.c
3  *
4  *  CFQ, or complete fairness queueing, disk scheduler.
5  *
6  *  Based on ideas from a previously unfinished io
7  *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
8  *
9  *  Copyright (C) 2003 Jens Axboe <axboe@suse.de>
10  */
11 #include <linux/kernel.h>
12 #include <linux/fs.h>
13 #include <linux/blkdev.h>
14 #include <linux/elevator.h>
15 #include <linux/bio.h>
16 #include <linux/config.h>
17 #include <linux/module.h>
18 #include <linux/slab.h>
19 #include <linux/init.h>
20 #include <linux/compiler.h>
21 #include <linux/hash.h>
22 #include <linux/rbtree.h>
23 #include <linux/mempool.h>
24
25 static unsigned long max_elapsed_crq;
26 static unsigned long max_elapsed_dispatch;
27
28 /*
29  * tunables
30  */
31 static int cfq_quantum = 4;             /* max queue in one round of service */
32 static int cfq_queued = 8;              /* minimum rq allocate limit per-queue*/
33 static int cfq_service = HZ;            /* period over which service is avg */
34 static int cfq_fifo_expire_r = HZ / 2;  /* fifo timeout for sync requests */
35 static int cfq_fifo_expire_w = 5 * HZ;  /* fifo timeout for async requests */
36 static int cfq_fifo_rate = HZ / 8;      /* fifo expiry rate */
37 static int cfq_back_max = 16 * 1024;    /* maximum backwards seek, in KiB */
38 static int cfq_back_penalty = 2;        /* penalty of a backwards seek */
39
40 /*
41  * for the hash of cfqq inside the cfqd
42  */
43 #define CFQ_QHASH_SHIFT         6
44 #define CFQ_QHASH_ENTRIES       (1 << CFQ_QHASH_SHIFT)
45 #define list_entry_qhash(entry) hlist_entry((entry), struct cfq_queue, cfq_hash)
46
47 /*
48  * for the hash of crq inside the cfqq
49  */
50 #define CFQ_MHASH_SHIFT         6
51 #define CFQ_MHASH_BLOCK(sec)    ((sec) >> 3)
52 #define CFQ_MHASH_ENTRIES       (1 << CFQ_MHASH_SHIFT)
53 #define CFQ_MHASH_FN(sec)       hash_long(CFQ_MHASH_BLOCK(sec), CFQ_MHASH_SHIFT)
54 #define rq_hash_key(rq)         ((rq)->sector + (rq)->nr_sectors)
55 #define list_entry_hash(ptr)    hlist_entry((ptr), struct cfq_rq, hash)
56
57 #define list_entry_cfqq(ptr)    list_entry((ptr), struct cfq_queue, cfq_list)
58
59 #define RQ_DATA(rq)             (rq)->elevator_private
60
61 /*
62  * rb-tree defines
63  */
64 #define RB_NONE                 (2)
65 #define RB_EMPTY(node)          ((node)->rb_node == NULL)
66 #define RB_CLEAR_COLOR(node)    (node)->rb_color = RB_NONE
67 #define RB_CLEAR(node)          do {    \
68         (node)->rb_parent = NULL;       \
69         RB_CLEAR_COLOR((node));         \
70         (node)->rb_right = NULL;        \
71         (node)->rb_left = NULL;         \
72 } while (0)
73 #define RB_CLEAR_ROOT(root)     ((root)->rb_node = NULL)
74 #define ON_RB(node)             ((node)->rb_color != RB_NONE)
75 #define rb_entry_crq(node)      rb_entry((node), struct cfq_rq, rb_node)
76 #define rq_rb_key(rq)           (rq)->sector
77
78 /*
79  * threshold for switching off non-tag accounting
80  */
81 #define CFQ_MAX_TAG             (4)
82
83 /*
84  * sort key types and names
85  */
86 enum {
87         CFQ_KEY_PGID,
88         CFQ_KEY_TGID,
89         CFQ_KEY_UID,
90         CFQ_KEY_GID,
91         CFQ_KEY_LAST,
92 };
93
94 static char *cfq_key_types[] = { "pgid", "tgid", "uid", "gid", NULL };
95
96 static kmem_cache_t *crq_pool;
97 static kmem_cache_t *cfq_pool;
98 static kmem_cache_t *cfq_ioc_pool;
99
100 struct cfq_data {
101         struct list_head rr_list;
102         struct list_head empty_list;
103
104         struct hlist_head *cfq_hash;
105         struct hlist_head *crq_hash;
106
107         /* queues on rr_list (ie they have pending requests */
108         unsigned int busy_queues;
109
110         unsigned int max_queued;
111
112         atomic_t ref;
113
114         int key_type;
115
116         mempool_t *crq_pool;
117
118         request_queue_t *queue;
119
120         sector_t last_sector;
121
122         int rq_in_driver;
123
124         /*
125          * tunables, see top of file
126          */
127         unsigned int cfq_quantum;
128         unsigned int cfq_queued;
129         unsigned int cfq_fifo_expire_r;
130         unsigned int cfq_fifo_expire_w;
131         unsigned int cfq_fifo_batch_expire;
132         unsigned int cfq_back_penalty;
133         unsigned int cfq_back_max;
134         unsigned int find_best_crq;
135
136         unsigned int cfq_tagged;
137 };
138
139 struct cfq_queue {
140         /* reference count */
141         atomic_t ref;
142         /* parent cfq_data */
143         struct cfq_data *cfqd;
144         /* hash of mergeable requests */
145         struct hlist_node cfq_hash;
146         /* hash key */
147         unsigned long key;
148         /* whether queue is on rr (or empty) list */
149         int on_rr;
150         /* on either rr or empty list of cfqd */
151         struct list_head cfq_list;
152         /* sorted list of pending requests */
153         struct rb_root sort_list;
154         /* if fifo isn't expired, next request to serve */
155         struct cfq_rq *next_crq;
156         /* requests queued in sort_list */
157         int queued[2];
158         /* currently allocated requests */
159         int allocated[2];
160         /* fifo list of requests in sort_list */
161         struct list_head fifo[2];
162         /* last time fifo expired */
163         unsigned long last_fifo_expire;
164
165         int key_type;
166
167         unsigned long service_start;
168         unsigned long service_used;
169
170         unsigned int max_rate;
171
172         /* number of requests that have been handed to the driver */
173         int in_flight;
174         /* number of currently allocated requests */
175         int alloc_limit[2];
176 };
177
178 struct cfq_rq {
179         struct rb_node rb_node;
180         sector_t rb_key;
181         struct request *request;
182         struct hlist_node hash;
183
184         struct cfq_queue *cfq_queue;
185         struct cfq_io_context *io_context;
186
187         unsigned long service_start;
188         unsigned long queue_start;
189
190         unsigned int in_flight : 1;
191         unsigned int accounted : 1;
192         unsigned int is_sync   : 1;
193         unsigned int is_write  : 1;
194 };
195
196 static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned long);
197 static void cfq_dispatch_sort(request_queue_t *, struct cfq_rq *);
198 static void cfq_update_next_crq(struct cfq_rq *);
199 static void cfq_put_cfqd(struct cfq_data *cfqd);
200
201 /*
202  * what the fairness is based on (ie how processes are grouped and
203  * differentiated)
204  */
205 static inline unsigned long
206 cfq_hash_key(struct cfq_data *cfqd, struct task_struct *tsk)
207 {
208         /*
209          * optimize this so that ->key_type is the offset into the struct
210          */
211         switch (cfqd->key_type) {
212                 case CFQ_KEY_PGID:
213                         return process_group(tsk);
214                 default:
215                 case CFQ_KEY_TGID:
216                         return tsk->tgid;
217                 case CFQ_KEY_UID:
218                         return tsk->uid;
219                 case CFQ_KEY_GID:
220                         return tsk->gid;
221         }
222 }
223
224 /*
225  * lots of deadline iosched dupes, can be abstracted later...
226  */
227 static inline void cfq_del_crq_hash(struct cfq_rq *crq)
228 {
229         hlist_del_init(&crq->hash);
230 }
231
232 static void cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq)
233 {
234         cfq_del_crq_hash(crq);
235
236         if (q->last_merge == crq->request)
237                 q->last_merge = NULL;
238
239         cfq_update_next_crq(crq);
240 }
241
242 static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq)
243 {
244         const int hash_idx = CFQ_MHASH_FN(rq_hash_key(crq->request));
245
246         BUG_ON(!hlist_unhashed(&crq->hash));
247
248         hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]);
249 }
250
251 static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset)
252 {
253         struct hlist_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)];
254         struct hlist_node *entry, *next;
255
256         hlist_for_each_safe(entry, next, hash_list) {
257                 struct cfq_rq *crq = list_entry_hash(entry);
258                 struct request *__rq = crq->request;
259
260                 BUG_ON(hlist_unhashed(&crq->hash));
261
262                 if (!rq_mergeable(__rq)) {
263                         cfq_del_crq_hash(crq);
264                         continue;
265                 }
266
267                 if (rq_hash_key(__rq) == offset)
268                         return __rq;
269         }
270
271         return NULL;
272 }
273
274 /*
275  * Lifted from AS - choose which of crq1 and crq2 that is best served now.
276  * We choose the request that is closest to the head right now. Distance
277  * behind the head are penalized and only allowed to a certain extent.
278  */
279 static struct cfq_rq *
280 cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2)
281 {
282         sector_t last, s1, s2, d1 = 0, d2 = 0;
283         int r1_wrap = 0, r2_wrap = 0;   /* requests are behind the disk head */
284         unsigned long back_max;
285
286         if (crq1 == NULL || crq1 == crq2)
287                 return crq2;
288         if (crq2 == NULL)
289                 return crq1;
290
291         s1 = crq1->request->sector;
292         s2 = crq2->request->sector;
293
294         last = cfqd->last_sector;
295
296 #if 0
297         if (!list_empty(&cfqd->queue->queue_head)) {
298                 struct list_head *entry = &cfqd->queue->queue_head;
299                 unsigned long distance = ~0UL;
300                 struct request *rq;
301
302                 while ((entry = entry->prev) != &cfqd->queue->queue_head) {
303                         rq = list_entry_rq(entry);
304
305                         if (blk_barrier_rq(rq))
306                                 break;
307
308                         if (distance < abs(s1 - rq->sector + rq->nr_sectors)) {
309                                 distance = abs(s1 - rq->sector +rq->nr_sectors);
310                                 last = rq->sector + rq->nr_sectors;
311                         }
312                         if (distance < abs(s2 - rq->sector + rq->nr_sectors)) {
313                                 distance = abs(s2 - rq->sector +rq->nr_sectors);
314                                 last = rq->sector + rq->nr_sectors;
315                         }
316                 }
317         }
318 #endif
319
320         /*
321          * by definition, 1KiB is 2 sectors
322          */
323         back_max = cfqd->cfq_back_max * 2;
324
325         /*
326          * Strict one way elevator _except_ in the case where we allow
327          * short backward seeks which are biased as twice the cost of a
328          * similar forward seek.
329          */
330         if (s1 >= last)
331                 d1 = s1 - last;
332         else if (s1 + back_max >= last)
333                 d1 = (last - s1) * cfqd->cfq_back_penalty;
334         else
335                 r1_wrap = 1;
336
337         if (s2 >= last)
338                 d2 = s2 - last;
339         else if (s2 + back_max >= last)
340                 d2 = (last - s2) * cfqd->cfq_back_penalty;
341         else
342                 r2_wrap = 1;
343
344         /* Found required data */
345         if (!r1_wrap && r2_wrap)
346                 return crq1;
347         else if (!r2_wrap && r1_wrap)
348                 return crq2;
349         else if (r1_wrap && r2_wrap) {
350                 /* both behind the head */
351                 if (s1 <= s2)
352                         return crq1;
353                 else
354                         return crq2;
355         }
356
357         /* Both requests in front of the head */
358         if (d1 < d2)
359                 return crq1;
360         else if (d2 < d1)
361                 return crq2;
362         else {
363                 if (s1 >= s2)
364                         return crq1;
365                 else
366                         return crq2;
367         }
368 }
369
370 /*
371  * would be nice to take fifo expire time into account as well
372  */
373 static struct cfq_rq *
374 cfq_find_next_crq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
375                   struct cfq_rq *last)
376 {
377         struct cfq_rq *crq_next = NULL, *crq_prev = NULL;
378         struct rb_node *rbnext, *rbprev;
379
380         if (!ON_RB(&last->rb_node))
381                 return NULL;
382
383         if ((rbnext = rb_next(&last->rb_node)) == NULL)
384                 rbnext = rb_first(&cfqq->sort_list);
385
386         rbprev = rb_prev(&last->rb_node);
387
388         if (rbprev)
389                 crq_prev = rb_entry_crq(rbprev);
390         if (rbnext)
391                 crq_next = rb_entry_crq(rbnext);
392
393         return cfq_choose_req(cfqd, crq_next, crq_prev);
394 }
395
396 static void cfq_update_next_crq(struct cfq_rq *crq)
397 {
398         struct cfq_queue *cfqq = crq->cfq_queue;
399
400         if (cfqq->next_crq == crq)
401                 cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq);
402 }
403
404 static int cfq_check_sort_rr_list(struct cfq_queue *cfqq)
405 {
406         struct list_head *head = &cfqq->cfqd->rr_list;
407         struct list_head *next, *prev;
408
409         /*
410          * list might still be ordered
411          */
412         next = cfqq->cfq_list.next;
413         if (next != head) {
414                 struct cfq_queue *cnext = list_entry_cfqq(next);
415
416                 if (cfqq->service_used > cnext->service_used)
417                         return 1;
418         }
419
420         prev = cfqq->cfq_list.prev;
421         if (prev != head) {
422                 struct cfq_queue *cprev = list_entry_cfqq(prev);
423
424                 if (cfqq->service_used < cprev->service_used)
425                         return 1;
426         }
427
428         return 0;
429 }
430
431 static void cfq_sort_rr_list(struct cfq_queue *cfqq, int new_queue)
432 {
433         struct list_head *entry = &cfqq->cfqd->rr_list;
434
435         if (!cfqq->on_rr)
436                 return;
437         if (!new_queue && !cfq_check_sort_rr_list(cfqq))
438                 return;
439
440         list_del(&cfqq->cfq_list);
441
442         /*
443          * sort by our mean service_used, sub-sort by in-flight requests
444          */
445         while ((entry = entry->prev) != &cfqq->cfqd->rr_list) {
446                 struct cfq_queue *__cfqq = list_entry_cfqq(entry);
447
448                 if (cfqq->service_used > __cfqq->service_used)
449                         break;
450                 else if (cfqq->service_used == __cfqq->service_used) {
451                         struct list_head *prv;
452
453                         while ((prv = entry->prev) != &cfqq->cfqd->rr_list) {
454                                 __cfqq = list_entry_cfqq(prv);
455
456                                 WARN_ON(__cfqq->service_used > cfqq->service_used);
457                                 if (cfqq->service_used != __cfqq->service_used)
458                                         break;
459                                 if (cfqq->in_flight > __cfqq->in_flight)
460                                         break;
461
462                                 entry = prv;
463                         }
464                 }
465         }
466
467         list_add(&cfqq->cfq_list, entry);
468 }
469
470 /*
471  * add to busy list of queues for service, trying to be fair in ordering
472  * the pending list according to requests serviced
473  */
474 static inline void
475 cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
476 {
477         /*
478          * it's currently on the empty list
479          */
480         cfqq->on_rr = 1;
481         cfqd->busy_queues++;
482
483         if (time_after(jiffies, cfqq->service_start + cfq_service))
484                 cfqq->service_used >>= 3;
485
486         cfq_sort_rr_list(cfqq, 1);
487 }
488
489 static inline void
490 cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
491 {
492         list_move(&cfqq->cfq_list, &cfqd->empty_list);
493         cfqq->on_rr = 0;
494
495         BUG_ON(!cfqd->busy_queues);
496         cfqd->busy_queues--;
497 }
498
499 /*
500  * rb tree support functions
501  */
502 static inline void cfq_del_crq_rb(struct cfq_rq *crq)
503 {
504         struct cfq_queue *cfqq = crq->cfq_queue;
505
506         if (ON_RB(&crq->rb_node)) {
507                 struct cfq_data *cfqd = cfqq->cfqd;
508
509                 BUG_ON(!cfqq->queued[crq->is_sync]);
510
511                 cfq_update_next_crq(crq);
512
513                 cfqq->queued[crq->is_sync]--;
514                 rb_erase(&crq->rb_node, &cfqq->sort_list);
515                 RB_CLEAR_COLOR(&crq->rb_node);
516
517                 if (RB_EMPTY(&cfqq->sort_list) && cfqq->on_rr)
518                         cfq_del_cfqq_rr(cfqd, cfqq);
519         }
520 }
521
522 static struct cfq_rq *
523 __cfq_add_crq_rb(struct cfq_rq *crq)
524 {
525         struct rb_node **p = &crq->cfq_queue->sort_list.rb_node;
526         struct rb_node *parent = NULL;
527         struct cfq_rq *__crq;
528
529         while (*p) {
530                 parent = *p;
531                 __crq = rb_entry_crq(parent);
532
533                 if (crq->rb_key < __crq->rb_key)
534                         p = &(*p)->rb_left;
535                 else if (crq->rb_key > __crq->rb_key)
536                         p = &(*p)->rb_right;
537                 else
538                         return __crq;
539         }
540
541         rb_link_node(&crq->rb_node, parent, p);
542         return NULL;
543 }
544
545 static void cfq_add_crq_rb(struct cfq_rq *crq)
546 {
547         struct cfq_queue *cfqq = crq->cfq_queue;
548         struct cfq_data *cfqd = cfqq->cfqd;
549         struct request *rq = crq->request;
550         struct cfq_rq *__alias;
551
552         crq->rb_key = rq_rb_key(rq);
553         cfqq->queued[crq->is_sync]++;
554
555         /*
556          * looks a little odd, but the first insert might return an alias.
557          * if that happens, put the alias on the dispatch list
558          */
559         while ((__alias = __cfq_add_crq_rb(crq)) != NULL)
560                 cfq_dispatch_sort(cfqd->queue, __alias);
561
562         rb_insert_color(&crq->rb_node, &cfqq->sort_list);
563
564         if (!cfqq->on_rr)
565                 cfq_add_cfqq_rr(cfqd, cfqq);
566
567         /*
568          * check if this request is a better next-serve candidate
569          */
570         cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq);
571 }
572
573 static inline void
574 cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
575 {
576         if (ON_RB(&crq->rb_node)) {
577                 rb_erase(&crq->rb_node, &cfqq->sort_list);
578                 cfqq->queued[crq->is_sync]--;
579         }
580
581         cfq_add_crq_rb(crq);
582 }
583
584 static struct request *
585 cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector)
586 {
587         const unsigned long key = cfq_hash_key(cfqd, current);
588         struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, key);
589         struct rb_node *n;
590
591         if (!cfqq)
592                 goto out;
593
594         n = cfqq->sort_list.rb_node;
595         while (n) {
596                 struct cfq_rq *crq = rb_entry_crq(n);
597
598                 if (sector < crq->rb_key)
599                         n = n->rb_left;
600                 else if (sector > crq->rb_key)
601                         n = n->rb_right;
602                 else
603                         return crq->request;
604         }
605
606 out:
607         return NULL;
608 }
609
610 /*
611  * make sure the service time gets corrected on reissue of this request
612  */
613 static void cfq_requeue_request(request_queue_t *q, struct request *rq)
614 {
615         struct cfq_rq *crq = RQ_DATA(rq);
616
617         if (crq) {
618                 struct cfq_queue *cfqq = crq->cfq_queue;
619
620                 if (cfqq->cfqd->cfq_tagged) {
621                         cfqq->service_used--;
622                         cfq_sort_rr_list(cfqq, 0);
623                 }
624
625                 crq->accounted = 0;
626                 cfqq->cfqd->rq_in_driver--;
627         }
628         list_add(&rq->queuelist, &q->queue_head);
629 }
630
631 static void cfq_remove_request(request_queue_t *q, struct request *rq)
632 {
633         struct cfq_rq *crq = RQ_DATA(rq);
634
635         if (crq) {
636                 cfq_remove_merge_hints(q, crq);
637                 list_del_init(&rq->queuelist);
638
639                 if (crq->cfq_queue)
640                         cfq_del_crq_rb(crq);
641         }
642 }
643
644 static int
645 cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
646 {
647         struct cfq_data *cfqd = q->elevator->elevator_data;
648         struct request *__rq;
649         int ret;
650
651         ret = elv_try_last_merge(q, bio);
652         if (ret != ELEVATOR_NO_MERGE) {
653                 __rq = q->last_merge;
654                 goto out_insert;
655         }
656
657         __rq = cfq_find_rq_hash(cfqd, bio->bi_sector);
658         if (__rq) {
659                 BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector);
660
661                 if (elv_rq_merge_ok(__rq, bio)) {
662                         ret = ELEVATOR_BACK_MERGE;
663                         goto out;
664                 }
665         }
666
667         __rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio));
668         if (__rq) {
669                 if (elv_rq_merge_ok(__rq, bio)) {
670                         ret = ELEVATOR_FRONT_MERGE;
671                         goto out;
672                 }
673         }
674
675         return ELEVATOR_NO_MERGE;
676 out:
677         q->last_merge = __rq;
678 out_insert:
679         *req = __rq;
680         return ret;
681 }
682
683 static void cfq_merged_request(request_queue_t *q, struct request *req)
684 {
685         struct cfq_data *cfqd = q->elevator->elevator_data;
686         struct cfq_rq *crq = RQ_DATA(req);
687
688         cfq_del_crq_hash(crq);
689         cfq_add_crq_hash(cfqd, crq);
690
691         if (ON_RB(&crq->rb_node) && (rq_rb_key(req) != crq->rb_key)) {
692                 struct cfq_queue *cfqq = crq->cfq_queue;
693
694                 cfq_update_next_crq(crq);
695                 cfq_reposition_crq_rb(cfqq, crq);
696         }
697
698         q->last_merge = req;
699 }
700
701 static void
702 cfq_merged_requests(request_queue_t *q, struct request *rq,
703                     struct request *next)
704 {
705         struct cfq_rq *crq = RQ_DATA(rq);
706         struct cfq_rq *cnext = RQ_DATA(next);
707
708         cfq_merged_request(q, rq);
709
710         if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist)) {
711                 if (time_before(cnext->queue_start, crq->queue_start)) {
712                         list_move(&rq->queuelist, &next->queuelist);
713                         crq->queue_start = cnext->queue_start;
714                 }
715         }
716
717         cfq_update_next_crq(cnext);
718         cfq_remove_request(q, next);
719 }
720
721 /*
722  * we dispatch cfqd->cfq_quantum requests in total from the rr_list queues,
723  * this function sector sorts the selected request to minimize seeks. we start
724  * at cfqd->last_sector, not 0.
725  */
726 static void cfq_dispatch_sort(request_queue_t *q, struct cfq_rq *crq)
727 {
728         struct cfq_data *cfqd = q->elevator->elevator_data;
729         struct cfq_queue *cfqq = crq->cfq_queue;
730         struct list_head *head = &q->queue_head, *entry = head;
731         struct request *__rq;
732         sector_t last;
733
734         cfq_del_crq_rb(crq);
735         cfq_remove_merge_hints(q, crq);
736         list_del(&crq->request->queuelist);
737
738         last = cfqd->last_sector;
739         while ((entry = entry->prev) != head) {
740                 __rq = list_entry_rq(entry);
741
742                 if (blk_barrier_rq(crq->request))
743                         break;
744                 if (!blk_fs_request(crq->request))
745                         break;
746
747                 if (crq->request->sector > __rq->sector)
748                         break;
749                 if (__rq->sector > last && crq->request->sector < last) {
750                         last = crq->request->sector;
751                         break;
752                 }
753         }
754
755         cfqd->last_sector = last;
756         crq->in_flight = 1;
757         cfqq->in_flight++;
758         list_add(&crq->request->queuelist, entry);
759 }
760
761 /*
762  * return expired entry, or NULL to just start from scratch in rbtree
763  */
764 static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq)
765 {
766         struct cfq_data *cfqd = cfqq->cfqd;
767         const int reads = !list_empty(&cfqq->fifo[0]);
768         const int writes = !list_empty(&cfqq->fifo[1]);
769         unsigned long now = jiffies;
770         struct cfq_rq *crq;
771
772         if (time_before(now, cfqq->last_fifo_expire + cfqd->cfq_fifo_batch_expire))
773                 return NULL;
774
775         crq = RQ_DATA(list_entry(cfqq->fifo[0].next, struct request, queuelist));
776         if (reads && time_after(now, crq->queue_start + cfqd->cfq_fifo_expire_r)) {
777                 cfqq->last_fifo_expire = now;
778                 return crq;
779         }
780
781         crq = RQ_DATA(list_entry(cfqq->fifo[1].next, struct request, queuelist));
782         if (writes && time_after(now, crq->queue_start + cfqd->cfq_fifo_expire_w)) {
783                 cfqq->last_fifo_expire = now;
784                 return crq;
785         }
786
787         return NULL;
788 }
789
790 /*
791  * dispatch a single request from given queue
792  */
793 static inline void
794 cfq_dispatch_request(request_queue_t *q, struct cfq_data *cfqd,
795                      struct cfq_queue *cfqq)
796 {
797         struct cfq_rq *crq;
798
799         /*
800          * follow expired path, else get first next available
801          */
802         if ((crq = cfq_check_fifo(cfqq)) == NULL) {
803                 if (cfqd->find_best_crq)
804                         crq = cfqq->next_crq;
805                 else
806                         crq = rb_entry_crq(rb_first(&cfqq->sort_list));
807         }
808
809         cfqd->last_sector = crq->request->sector + crq->request->nr_sectors;
810
811         /*
812          * finally, insert request into driver list
813          */
814         cfq_dispatch_sort(q, crq);
815 }
816
817 static int cfq_dispatch_requests(request_queue_t *q, int max_dispatch)
818 {
819         struct cfq_data *cfqd = q->elevator->elevator_data;
820         struct cfq_queue *cfqq;
821         struct list_head *entry, *tmp;
822         int queued, busy_queues, first_round;
823
824         if (list_empty(&cfqd->rr_list))
825                 return 0;
826
827         queued = 0;
828         first_round = 1;
829 restart:
830         busy_queues = 0;
831         list_for_each_safe(entry, tmp, &cfqd->rr_list) {
832                 cfqq = list_entry_cfqq(entry);
833
834                 BUG_ON(RB_EMPTY(&cfqq->sort_list));
835
836                 /*
837                  * first round of queueing, only select from queues that
838                  * don't already have io in-flight
839                  */
840                 if (first_round && cfqq->in_flight)
841                         continue;
842
843                 cfq_dispatch_request(q, cfqd, cfqq);
844
845                 if (!RB_EMPTY(&cfqq->sort_list))
846                         busy_queues++;
847
848                 queued++;
849         }
850
851         if ((queued < max_dispatch) && (busy_queues || first_round)) {
852                 first_round = 0;
853                 goto restart;
854         }
855
856         return queued;
857 }
858
859 static inline void cfq_account_dispatch(struct cfq_rq *crq)
860 {
861         struct cfq_queue *cfqq = crq->cfq_queue;
862         struct cfq_data *cfqd = cfqq->cfqd;
863         unsigned long now, elapsed;
864
865         if (!blk_fs_request(crq->request))
866                 return;
867
868         /*
869          * accounted bit is necessary since some drivers will call
870          * elv_next_request() many times for the same request (eg ide)
871          */
872         if (crq->accounted)
873                 return;
874
875         now = jiffies;
876         if (cfqq->service_start == ~0UL)
877                 cfqq->service_start = now;
878
879         /*
880          * on drives with tagged command queueing, command turn-around time
881          * doesn't necessarily reflect the time spent processing this very
882          * command inside the drive. so do the accounting differently there,
883          * by just sorting on the number of requests
884          */
885         if (cfqd->cfq_tagged) {
886                 if (time_after(now, cfqq->service_start + cfq_service)) {
887                         cfqq->service_start = now;
888                         cfqq->service_used /= 10;
889                 }
890
891                 cfqq->service_used++;
892                 cfq_sort_rr_list(cfqq, 0);
893         }
894
895         elapsed = now - crq->queue_start;
896         if (elapsed > max_elapsed_dispatch)
897                 max_elapsed_dispatch = elapsed;
898
899         crq->accounted = 1;
900         crq->service_start = now;
901
902         if (++cfqd->rq_in_driver >= CFQ_MAX_TAG && !cfqd->cfq_tagged) {
903                 cfqq->cfqd->cfq_tagged = 1;
904                 printk("cfq: depth %d reached, tagging now on\n", CFQ_MAX_TAG);
905         }
906 }
907
908 static inline void
909 cfq_account_completion(struct cfq_queue *cfqq, struct cfq_rq *crq)
910 {
911         struct cfq_data *cfqd = cfqq->cfqd;
912
913         if (!crq->accounted)
914                 return;
915
916         WARN_ON(!cfqd->rq_in_driver);
917         cfqd->rq_in_driver--;
918
919         if (!cfqd->cfq_tagged) {
920                 unsigned long now = jiffies;
921                 unsigned long duration = now - crq->service_start;
922
923                 if (time_after(now, cfqq->service_start + cfq_service)) {
924                         cfqq->service_start = now;
925                         cfqq->service_used >>= 3;
926                 }
927
928                 cfqq->service_used += duration;
929                 cfq_sort_rr_list(cfqq, 0);
930
931                 if (duration > max_elapsed_crq)
932                         max_elapsed_crq = duration;
933         }
934 }
935
936 static struct request *cfq_next_request(request_queue_t *q)
937 {
938         struct cfq_data *cfqd = q->elevator->elevator_data;
939         struct request *rq;
940
941         if (!list_empty(&q->queue_head)) {
942                 struct cfq_rq *crq;
943 dispatch:
944                 rq = list_entry_rq(q->queue_head.next);
945
946                 if ((crq = RQ_DATA(rq)) != NULL) {
947                         cfq_remove_merge_hints(q, crq);
948                         cfq_account_dispatch(crq);
949                 }
950
951                 return rq;
952         }
953
954         if (cfq_dispatch_requests(q, cfqd->cfq_quantum))
955                 goto dispatch;
956
957         return NULL;
958 }
959
960 /*
961  * task holds one reference to the queue, dropped when task exits. each crq
962  * in-flight on this queue also holds a reference, dropped when crq is freed.
963  *
964  * queue lock must be held here.
965  */
966 static void cfq_put_queue(struct cfq_queue *cfqq)
967 {
968         BUG_ON(!atomic_read(&cfqq->ref));
969
970         if (!atomic_dec_and_test(&cfqq->ref))
971                 return;
972
973         BUG_ON(rb_first(&cfqq->sort_list));
974         BUG_ON(cfqq->on_rr);
975
976         cfq_put_cfqd(cfqq->cfqd);
977
978         /*
979          * it's on the empty list and still hashed
980          */
981         list_del(&cfqq->cfq_list);
982         hlist_del(&cfqq->cfq_hash);
983         kmem_cache_free(cfq_pool, cfqq);
984 }
985
986 static inline struct cfq_queue *
987 __cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key, const int hashval)
988 {
989         struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
990         struct hlist_node *entry, *next;
991
992         hlist_for_each_safe(entry, next, hash_list) {
993                 struct cfq_queue *__cfqq = list_entry_qhash(entry);
994
995                 if (__cfqq->key == key)
996                         return __cfqq;
997         }
998
999         return NULL;
1000 }
1001
1002 static struct cfq_queue *
1003 cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key)
1004 {
1005         return __cfq_find_cfq_hash(cfqd, key, hash_long(key, CFQ_QHASH_SHIFT));
1006 }
1007
1008 static inline void
1009 cfq_rehash_cfqq(struct cfq_data *cfqd, struct cfq_queue **cfqq,
1010                 struct cfq_io_context *cic)
1011 {
1012         unsigned long hashkey = cfq_hash_key(cfqd, current);
1013         unsigned long hashval = hash_long(hashkey, CFQ_QHASH_SHIFT);
1014         struct cfq_queue *__cfqq;
1015         unsigned long flags;
1016
1017         spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1018
1019         hlist_del(&(*cfqq)->cfq_hash);
1020
1021         __cfqq = __cfq_find_cfq_hash(cfqd, hashkey, hashval);
1022         if (!__cfqq || __cfqq == *cfqq) {
1023                 __cfqq = *cfqq;
1024                 hlist_add_head(&__cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
1025                 __cfqq->key_type = cfqd->key_type;
1026         } else {
1027                 atomic_inc(&__cfqq->ref);
1028                 cic->cfqq = __cfqq;
1029                 cfq_put_queue(*cfqq);
1030                 *cfqq = __cfqq;
1031         }
1032
1033         cic->cfqq = __cfqq;
1034         spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
1035 }
1036
1037 static void cfq_free_io_context(struct cfq_io_context *cic)
1038 {
1039         kmem_cache_free(cfq_ioc_pool, cic);
1040 }
1041
1042 /*
1043  * locking hierarchy is: io_context lock -> queue locks
1044  */
1045 static void cfq_exit_io_context(struct cfq_io_context *cic)
1046 {
1047         struct cfq_queue *cfqq = cic->cfqq;
1048         struct list_head *entry = &cic->list;
1049         request_queue_t *q;
1050         unsigned long flags;
1051
1052         /*
1053          * put the reference this task is holding to the various queues
1054          */
1055         spin_lock_irqsave(&cic->ioc->lock, flags);
1056         while ((entry = cic->list.next) != &cic->list) {
1057                 struct cfq_io_context *__cic;
1058
1059                 __cic = list_entry(entry, struct cfq_io_context, list);
1060                 list_del(entry);
1061
1062                 q = __cic->cfqq->cfqd->queue;
1063                 spin_lock(q->queue_lock);
1064                 cfq_put_queue(__cic->cfqq);
1065                 spin_unlock(q->queue_lock);
1066         }
1067
1068         q = cfqq->cfqd->queue;
1069         spin_lock(q->queue_lock);
1070         cfq_put_queue(cfqq);
1071         spin_unlock(q->queue_lock);
1072
1073         cic->cfqq = NULL;
1074         spin_unlock_irqrestore(&cic->ioc->lock, flags);
1075 }
1076
1077 static struct cfq_io_context *cfq_alloc_io_context(int gfp_flags)
1078 {
1079         struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_flags);
1080
1081         if (cic) {
1082                 cic->dtor = cfq_free_io_context;
1083                 cic->exit = cfq_exit_io_context;
1084                 INIT_LIST_HEAD(&cic->list);
1085                 cic->cfqq = NULL;
1086         }
1087
1088         return cic;
1089 }
1090
1091 /*
1092  * Setup general io context and cfq io context. There can be several cfq
1093  * io contexts per general io context, if this process is doing io to more
1094  * than one device managed by cfq. Note that caller is holding a reference to
1095  * cfqq, so we don't need to worry about it disappearing
1096  */
1097 static struct cfq_io_context *
1098 cfq_get_io_context(struct cfq_queue **cfqq, int gfp_flags)
1099 {
1100         struct cfq_data *cfqd = (*cfqq)->cfqd;
1101         struct cfq_queue *__cfqq = *cfqq;
1102         struct cfq_io_context *cic;
1103         struct io_context *ioc;
1104
1105         might_sleep_if(gfp_flags & __GFP_WAIT);
1106
1107         ioc = get_io_context(gfp_flags);
1108         if (!ioc)
1109                 return NULL;
1110
1111         if ((cic = ioc->cic) == NULL) {
1112                 cic = cfq_alloc_io_context(gfp_flags);
1113
1114                 if (cic == NULL)
1115                         goto err;
1116
1117                 ioc->cic = cic;
1118                 cic->ioc = ioc;
1119                 cic->cfqq = __cfqq;
1120                 atomic_inc(&__cfqq->ref);
1121         } else {
1122                 struct cfq_io_context *__cic;
1123                 unsigned long flags;
1124
1125                 /*
1126                  * since the first cic on the list is actually the head
1127                  * itself, need to check this here or we'll duplicate an
1128                  * cic per ioc for no reason
1129                  */
1130                 if (cic->cfqq == __cfqq)
1131                         goto out;
1132
1133                 /*
1134                  * cic exists, check if we already are there. linear search
1135                  * should be ok here, the list will usually not be more than
1136                  * 1 or a few entries long
1137                  */
1138                 spin_lock_irqsave(&ioc->lock, flags);
1139                 list_for_each_entry(__cic, &cic->list, list) {
1140                         /*
1141                          * this process is already holding a reference to
1142                          * this queue, so no need to get one more
1143                          */
1144                         if (__cic->cfqq == __cfqq) {
1145                                 cic = __cic;
1146                                 spin_unlock_irqrestore(&ioc->lock, flags);
1147                                 goto out;
1148                         }
1149                 }
1150                 spin_unlock_irqrestore(&ioc->lock, flags);
1151
1152                 /*
1153                  * nope, process doesn't have a cic assoicated with this
1154                  * cfqq yet. get a new one and add to list
1155                  */
1156                 __cic = cfq_alloc_io_context(gfp_flags);
1157                 if (__cic == NULL)
1158                         goto err;
1159
1160                 __cic->ioc = ioc;
1161                 __cic->cfqq = __cfqq;
1162                 atomic_inc(&__cfqq->ref);
1163                 spin_lock_irqsave(&ioc->lock, flags);
1164                 list_add(&__cic->list, &cic->list);
1165                 spin_unlock_irqrestore(&ioc->lock, flags);
1166
1167                 cic = __cic;
1168                 *cfqq = __cfqq;
1169         }
1170
1171 out:
1172         /*
1173          * if key_type has been changed on the fly, we lazily rehash
1174          * each queue at lookup time
1175          */
1176         if ((*cfqq)->key_type != cfqd->key_type)
1177                 cfq_rehash_cfqq(cfqd, cfqq, cic);
1178
1179         return cic;
1180 err:
1181         put_io_context(ioc);
1182         return NULL;
1183 }
1184
1185 static struct cfq_queue *
1186 __cfq_get_queue(struct cfq_data *cfqd, unsigned long key, int gfp_mask)
1187 {
1188         const int hashval = hash_long(key, CFQ_QHASH_SHIFT);
1189         struct cfq_queue *cfqq, *new_cfqq = NULL;
1190
1191 retry:
1192         cfqq = __cfq_find_cfq_hash(cfqd, key, hashval);
1193
1194         if (!cfqq) {
1195                 if (new_cfqq) {
1196                         cfqq = new_cfqq;
1197                         new_cfqq = NULL;
1198                 } else if (gfp_mask & __GFP_WAIT) {
1199                         spin_unlock_irq(cfqd->queue->queue_lock);
1200                         new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask);
1201                         spin_lock_irq(cfqd->queue->queue_lock);
1202                         goto retry;
1203                 } else
1204                         goto out;
1205
1206                 memset(cfqq, 0, sizeof(*cfqq));
1207
1208                 INIT_HLIST_NODE(&cfqq->cfq_hash);
1209                 INIT_LIST_HEAD(&cfqq->cfq_list);
1210                 RB_CLEAR_ROOT(&cfqq->sort_list);
1211                 INIT_LIST_HEAD(&cfqq->fifo[0]);
1212                 INIT_LIST_HEAD(&cfqq->fifo[1]);
1213
1214                 cfqq->key = key;
1215                 hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
1216                 atomic_set(&cfqq->ref, 0);
1217                 cfqq->cfqd = cfqd;
1218                 atomic_inc(&cfqd->ref);
1219                 cfqq->key_type = cfqd->key_type;
1220                 cfqq->service_start = ~0UL;
1221         }
1222
1223         if (new_cfqq)
1224                 kmem_cache_free(cfq_pool, new_cfqq);
1225
1226         atomic_inc(&cfqq->ref);
1227 out:
1228         WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq);
1229         return cfqq;
1230 }
1231
1232 static void cfq_enqueue(struct cfq_data *cfqd, struct cfq_rq *crq)
1233 {
1234         crq->is_sync = 0;
1235         if (rq_data_dir(crq->request) == READ || current->flags & PF_SYNCWRITE)
1236                 crq->is_sync = 1;
1237
1238         cfq_add_crq_rb(crq);
1239         crq->queue_start = jiffies;
1240
1241         list_add_tail(&crq->request->queuelist, &crq->cfq_queue->fifo[crq->is_sync]);
1242 }
1243
1244 static void
1245 cfq_insert_request(request_queue_t *q, struct request *rq, int where)
1246 {
1247         struct cfq_data *cfqd = q->elevator->elevator_data;
1248         struct cfq_rq *crq = RQ_DATA(rq);
1249
1250         switch (where) {
1251                 case ELEVATOR_INSERT_BACK:
1252                         while (cfq_dispatch_requests(q, cfqd->cfq_quantum))
1253                                 ;
1254                         list_add_tail(&rq->queuelist, &q->queue_head);
1255                         break;
1256                 case ELEVATOR_INSERT_FRONT:
1257                         list_add(&rq->queuelist, &q->queue_head);
1258                         break;
1259                 case ELEVATOR_INSERT_SORT:
1260                         BUG_ON(!blk_fs_request(rq));
1261                         cfq_enqueue(cfqd, crq);
1262                         break;
1263                 default:
1264                         printk("%s: bad insert point %d\n", __FUNCTION__,where);
1265                         return;
1266         }
1267
1268         if (rq_mergeable(rq)) {
1269                 cfq_add_crq_hash(cfqd, crq);
1270
1271                 if (!q->last_merge)
1272                         q->last_merge = rq;
1273         }
1274 }
1275
1276 static int cfq_queue_empty(request_queue_t *q)
1277 {
1278         struct cfq_data *cfqd = q->elevator->elevator_data;
1279
1280         return list_empty(&q->queue_head) && list_empty(&cfqd->rr_list);
1281 }
1282
1283 static void cfq_completed_request(request_queue_t *q, struct request *rq)
1284 {
1285         struct cfq_rq *crq = RQ_DATA(rq);
1286
1287         if (unlikely(!blk_fs_request(rq)))
1288                 return;
1289
1290         if (crq->in_flight) {
1291                 struct cfq_queue *cfqq = crq->cfq_queue;
1292
1293                 WARN_ON(!cfqq->in_flight);
1294                 cfqq->in_flight--;
1295
1296                 cfq_account_completion(cfqq, crq);
1297         }
1298
1299 }
1300
1301 static struct request *
1302 cfq_former_request(request_queue_t *q, struct request *rq)
1303 {
1304         struct cfq_rq *crq = RQ_DATA(rq);
1305         struct rb_node *rbprev = rb_prev(&crq->rb_node);
1306
1307         if (rbprev)
1308                 return rb_entry_crq(rbprev)->request;
1309
1310         return NULL;
1311 }
1312
1313 static struct request *
1314 cfq_latter_request(request_queue_t *q, struct request *rq)
1315 {
1316         struct cfq_rq *crq = RQ_DATA(rq);
1317         struct rb_node *rbnext = rb_next(&crq->rb_node);
1318
1319         if (rbnext)
1320                 return rb_entry_crq(rbnext)->request;
1321
1322         return NULL;
1323 }
1324
1325 static int cfq_may_queue(request_queue_t *q, int rw)
1326 {
1327         struct cfq_data *cfqd = q->elevator->elevator_data;
1328         struct cfq_queue *cfqq;
1329         int ret = ELV_MQUEUE_MAY;
1330
1331         if (current->flags & PF_MEMALLOC)
1332                 return ELV_MQUEUE_MAY;
1333
1334         cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(cfqd, current));
1335         if (cfqq) {
1336                 int limit = cfqd->max_queued;
1337
1338                 if (cfqq->allocated[rw] < cfqd->cfq_queued)
1339                         return ELV_MQUEUE_MUST;
1340
1341                 if (cfqd->busy_queues)
1342                         limit = q->nr_requests / cfqd->busy_queues;
1343
1344                 if (limit < cfqd->cfq_queued)
1345                         limit = cfqd->cfq_queued;
1346                 else if (limit > cfqd->max_queued)
1347                         limit = cfqd->max_queued;
1348
1349                 if (cfqq->allocated[rw] >= limit) {
1350                         if (limit > cfqq->alloc_limit[rw])
1351                                 cfqq->alloc_limit[rw] = limit;
1352
1353                         ret = ELV_MQUEUE_NO;
1354                 }
1355         }
1356
1357         return ret;
1358 }
1359
1360 static void cfq_check_waiters(request_queue_t *q, struct cfq_queue *cfqq)
1361 {
1362         struct request_list *rl = &q->rq;
1363         const int write = waitqueue_active(&rl->wait[WRITE]);
1364         const int read = waitqueue_active(&rl->wait[READ]);
1365
1366         if (read && cfqq->allocated[READ] < cfqq->alloc_limit[READ])
1367                 wake_up(&rl->wait[READ]);
1368         if (write && cfqq->allocated[WRITE] < cfqq->alloc_limit[WRITE])
1369                 wake_up(&rl->wait[WRITE]);
1370 }
1371
1372 /*
1373  * queue lock held here
1374  */
1375 static void cfq_put_request(request_queue_t *q, struct request *rq)
1376 {
1377         struct cfq_data *cfqd = q->elevator->elevator_data;
1378         struct cfq_rq *crq = RQ_DATA(rq);
1379
1380         if (crq) {
1381                 struct cfq_queue *cfqq = crq->cfq_queue;
1382
1383                 BUG_ON(q->last_merge == rq);
1384                 BUG_ON(!hlist_unhashed(&crq->hash));
1385
1386                 if (crq->io_context)
1387                         put_io_context(crq->io_context->ioc);
1388
1389                 BUG_ON(!cfqq->allocated[crq->is_write]);
1390                 cfqq->allocated[crq->is_write]--;
1391
1392                 mempool_free(crq, cfqd->crq_pool);
1393                 rq->elevator_private = NULL;
1394
1395                 smp_mb();
1396                 cfq_check_waiters(q, cfqq);
1397                 cfq_put_queue(cfqq);
1398         }
1399 }
1400
1401 /*
1402  * Allocate cfq data structures associated with this request. A queue and
1403  */
1404 static int cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
1405 {
1406         struct cfq_data *cfqd = q->elevator->elevator_data;
1407         struct cfq_io_context *cic;
1408         const int rw = rq_data_dir(rq);
1409         struct cfq_queue *cfqq, *saved_cfqq;
1410         struct cfq_rq *crq;
1411         unsigned long flags;
1412
1413         might_sleep_if(gfp_mask & __GFP_WAIT);
1414
1415         spin_lock_irqsave(q->queue_lock, flags);
1416
1417         cfqq = __cfq_get_queue(cfqd, cfq_hash_key(cfqd, current), gfp_mask);
1418         if (!cfqq)
1419                 goto out_lock;
1420
1421 repeat:
1422         if (cfqq->allocated[rw] >= cfqd->max_queued)
1423                 goto out_lock;
1424
1425         cfqq->allocated[rw]++;
1426         spin_unlock_irqrestore(q->queue_lock, flags);
1427
1428         /*
1429          * if hashing type has changed, the cfq_queue might change here.
1430          */
1431         saved_cfqq = cfqq;
1432         cic = cfq_get_io_context(&cfqq, gfp_mask);
1433         if (!cic)
1434                 goto err;
1435
1436         /*
1437          * repeat allocation checks on queue change
1438          */
1439         if (unlikely(saved_cfqq != cfqq)) {
1440                 spin_lock_irqsave(q->queue_lock, flags);
1441                 saved_cfqq->allocated[rw]--;
1442                 goto repeat;
1443         }
1444
1445         crq = mempool_alloc(cfqd->crq_pool, gfp_mask);
1446         if (crq) {
1447                 RB_CLEAR(&crq->rb_node);
1448                 crq->rb_key = 0;
1449                 crq->request = rq;
1450                 INIT_HLIST_NODE(&crq->hash);
1451                 crq->cfq_queue = cfqq;
1452                 crq->io_context = cic;
1453                 crq->service_start = crq->queue_start = 0;
1454                 crq->in_flight = crq->accounted = crq->is_sync = 0;
1455                 crq->is_write = rw;
1456                 rq->elevator_private = crq;
1457                 cfqq->alloc_limit[rw] = 0;
1458                 return 0;
1459         }
1460
1461         put_io_context(cic->ioc);
1462 err:
1463         spin_lock_irqsave(q->queue_lock, flags);
1464         cfqq->allocated[rw]--;
1465         cfq_put_queue(cfqq);
1466 out_lock:
1467         spin_unlock_irqrestore(q->queue_lock, flags);
1468         return 1;
1469 }
1470
1471 static void cfq_put_cfqd(struct cfq_data *cfqd)
1472 {
1473         request_queue_t *q = cfqd->queue;
1474
1475         if (!atomic_dec_and_test(&cfqd->ref))
1476                 return;
1477
1478         blk_put_queue(q);
1479
1480         mempool_destroy(cfqd->crq_pool);
1481         kfree(cfqd->crq_hash);
1482         kfree(cfqd->cfq_hash);
1483         kfree(cfqd);
1484 }
1485
1486 static void cfq_exit_queue(elevator_t *e)
1487 {
1488         cfq_put_cfqd(e->elevator_data);
1489 }
1490
1491 static int cfq_init_queue(request_queue_t *q, elevator_t *e)
1492 {
1493         struct cfq_data *cfqd;
1494         int i;
1495
1496         cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL);
1497         if (!cfqd)
1498                 return -ENOMEM;
1499
1500         memset(cfqd, 0, sizeof(*cfqd));
1501         INIT_LIST_HEAD(&cfqd->rr_list);
1502         INIT_LIST_HEAD(&cfqd->empty_list);
1503
1504         cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL);
1505         if (!cfqd->crq_hash)
1506                 goto out_crqhash;
1507
1508         cfqd->cfq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL);
1509         if (!cfqd->cfq_hash)
1510                 goto out_cfqhash;
1511
1512         cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, crq_pool);
1513         if (!cfqd->crq_pool)
1514                 goto out_crqpool;
1515
1516         for (i = 0; i < CFQ_MHASH_ENTRIES; i++)
1517                 INIT_HLIST_HEAD(&cfqd->crq_hash[i]);
1518         for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
1519                 INIT_HLIST_HEAD(&cfqd->cfq_hash[i]);
1520
1521         e->elevator_data = cfqd;
1522
1523         cfqd->queue = q;
1524         atomic_inc(&q->refcnt);
1525
1526         /*
1527          * just set it to some high value, we want anyone to be able to queue
1528          * some requests. fairness is handled differently
1529          */
1530         q->nr_requests = 1024;
1531         cfqd->max_queued = q->nr_requests / 16;
1532         q->nr_batching = cfq_queued;
1533         cfqd->key_type = CFQ_KEY_TGID;
1534         cfqd->find_best_crq = 1;
1535         atomic_set(&cfqd->ref, 1);
1536
1537         cfqd->cfq_queued = cfq_queued;
1538         cfqd->cfq_quantum = cfq_quantum;
1539         cfqd->cfq_fifo_expire_r = cfq_fifo_expire_r;
1540         cfqd->cfq_fifo_expire_w = cfq_fifo_expire_w;
1541         cfqd->cfq_fifo_batch_expire = cfq_fifo_rate;
1542         cfqd->cfq_back_max = cfq_back_max;
1543         cfqd->cfq_back_penalty = cfq_back_penalty;
1544
1545         return 0;
1546 out_crqpool:
1547         kfree(cfqd->cfq_hash);
1548 out_cfqhash:
1549         kfree(cfqd->crq_hash);
1550 out_crqhash:
1551         kfree(cfqd);
1552         return -ENOMEM;
1553 }
1554
1555 static void cfq_slab_kill(void)
1556 {
1557         if (crq_pool)
1558                 kmem_cache_destroy(crq_pool);
1559         if (cfq_pool)
1560                 kmem_cache_destroy(cfq_pool);
1561         if (cfq_ioc_pool)
1562                 kmem_cache_destroy(cfq_ioc_pool);
1563 }
1564
1565 static int __init cfq_slab_setup(void)
1566 {
1567         crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0,
1568                                         NULL, NULL);
1569         if (!crq_pool)
1570                 goto fail;
1571
1572         cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0,
1573                                         NULL, NULL);
1574         if (!cfq_pool)
1575                 goto fail;
1576
1577         cfq_ioc_pool = kmem_cache_create("cfq_ioc_pool",
1578                         sizeof(struct cfq_io_context), 0, 0, NULL, NULL);
1579         if (!cfq_ioc_pool)
1580                 goto fail;
1581
1582         return 0;
1583 fail:
1584         cfq_slab_kill();
1585         return -ENOMEM;
1586 }
1587
1588
1589 /*
1590  * sysfs parts below -->
1591  */
1592 struct cfq_fs_entry {
1593         struct attribute attr;
1594         ssize_t (*show)(struct cfq_data *, char *);
1595         ssize_t (*store)(struct cfq_data *, const char *, size_t);
1596 };
1597
1598 static ssize_t
1599 cfq_var_show(unsigned int var, char *page)
1600 {
1601         return sprintf(page, "%d\n", var);
1602 }
1603
1604 static ssize_t
1605 cfq_var_store(unsigned int *var, const char *page, size_t count)
1606 {
1607         char *p = (char *) page;
1608
1609         *var = simple_strtoul(p, &p, 10);
1610         return count;
1611 }
1612
1613 static ssize_t
1614 cfq_clear_elapsed(struct cfq_data *cfqd, const char *page, size_t count)
1615 {
1616         max_elapsed_dispatch = max_elapsed_crq = 0;
1617         return count;
1618 }
1619
1620 static ssize_t
1621 cfq_set_key_type(struct cfq_data *cfqd, const char *page, size_t count)
1622 {
1623         spin_lock_irq(cfqd->queue->queue_lock);
1624         if (!strncmp(page, "pgid", 4))
1625                 cfqd->key_type = CFQ_KEY_PGID;
1626         else if (!strncmp(page, "tgid", 4))
1627                 cfqd->key_type = CFQ_KEY_TGID;
1628         else if (!strncmp(page, "uid", 3))
1629                 cfqd->key_type = CFQ_KEY_UID;
1630         else if (!strncmp(page, "gid", 3))
1631                 cfqd->key_type = CFQ_KEY_GID;
1632         spin_unlock_irq(cfqd->queue->queue_lock);
1633         return count;
1634 }
1635
1636 static ssize_t
1637 cfq_read_key_type(struct cfq_data *cfqd, char *page)
1638 {
1639         ssize_t len = 0;
1640         int i;
1641
1642         for (i = CFQ_KEY_PGID; i < CFQ_KEY_LAST; i++) {
1643                 if (cfqd->key_type == i)
1644                         len += sprintf(page+len, "[%s] ", cfq_key_types[i]);
1645                 else
1646                         len += sprintf(page+len, "%s ", cfq_key_types[i]);
1647         }
1648         len += sprintf(page+len, "\n");
1649         return len;
1650 }
1651
1652 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                            \
1653 static ssize_t __FUNC(struct cfq_data *cfqd, char *page)                \
1654 {                                                                       \
1655         unsigned int __data = __VAR;                                    \
1656         if (__CONV)                                                     \
1657                 __data = jiffies_to_msecs(__data);                      \
1658         return cfq_var_show(__data, (page));                            \
1659 }
1660 SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
1661 SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0);
1662 SHOW_FUNCTION(cfq_fifo_expire_r_show, cfqd->cfq_fifo_expire_r, 1);
1663 SHOW_FUNCTION(cfq_fifo_expire_w_show, cfqd->cfq_fifo_expire_w, 1);
1664 SHOW_FUNCTION(cfq_fifo_batch_expire_show, cfqd->cfq_fifo_batch_expire, 1);
1665 SHOW_FUNCTION(cfq_find_best_show, cfqd->find_best_crq, 0);
1666 SHOW_FUNCTION(cfq_back_max_show, cfqd->cfq_back_max, 0);
1667 SHOW_FUNCTION(cfq_back_penalty_show, cfqd->cfq_back_penalty, 0);
1668 #undef SHOW_FUNCTION
1669
1670 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                 \
1671 static ssize_t __FUNC(struct cfq_data *cfqd, const char *page, size_t count)    \
1672 {                                                                       \
1673         unsigned int __data;                                            \
1674         int ret = cfq_var_store(&__data, (page), count);                \
1675         if (__data < (MIN))                                             \
1676                 __data = (MIN);                                         \
1677         else if (__data > (MAX))                                        \
1678                 __data = (MAX);                                         \
1679         if (__CONV)                                                     \
1680                 *(__PTR) = msecs_to_jiffies(__data);                    \
1681         else                                                            \
1682                 *(__PTR) = __data;                                      \
1683         return ret;                                                     \
1684 }
1685 STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
1686 STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0);
1687 STORE_FUNCTION(cfq_fifo_expire_r_store, &cfqd->cfq_fifo_expire_r, 1, UINT_MAX, 1);
1688 STORE_FUNCTION(cfq_fifo_expire_w_store, &cfqd->cfq_fifo_expire_w, 1, UINT_MAX, 1);
1689 STORE_FUNCTION(cfq_fifo_batch_expire_store, &cfqd->cfq_fifo_batch_expire, 0, UINT_MAX, 1);
1690 STORE_FUNCTION(cfq_find_best_store, &cfqd->find_best_crq, 0, 1, 0);
1691 STORE_FUNCTION(cfq_back_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
1692 STORE_FUNCTION(cfq_back_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0);
1693 #undef STORE_FUNCTION
1694
1695 static struct cfq_fs_entry cfq_quantum_entry = {
1696         .attr = {.name = "quantum", .mode = S_IRUGO | S_IWUSR },
1697         .show = cfq_quantum_show,
1698         .store = cfq_quantum_store,
1699 };
1700 static struct cfq_fs_entry cfq_queued_entry = {
1701         .attr = {.name = "queued", .mode = S_IRUGO | S_IWUSR },
1702         .show = cfq_queued_show,
1703         .store = cfq_queued_store,
1704 };
1705 static struct cfq_fs_entry cfq_fifo_expire_r_entry = {
1706         .attr = {.name = "fifo_expire_sync", .mode = S_IRUGO | S_IWUSR },
1707         .show = cfq_fifo_expire_r_show,
1708         .store = cfq_fifo_expire_r_store,
1709 };
1710 static struct cfq_fs_entry cfq_fifo_expire_w_entry = {
1711         .attr = {.name = "fifo_expire_async", .mode = S_IRUGO | S_IWUSR },
1712         .show = cfq_fifo_expire_w_show,
1713         .store = cfq_fifo_expire_w_store,
1714 };
1715 static struct cfq_fs_entry cfq_fifo_batch_expire_entry = {
1716         .attr = {.name = "fifo_batch_expire", .mode = S_IRUGO | S_IWUSR },
1717         .show = cfq_fifo_batch_expire_show,
1718         .store = cfq_fifo_batch_expire_store,
1719 };
1720 static struct cfq_fs_entry cfq_find_best_entry = {
1721         .attr = {.name = "find_best_crq", .mode = S_IRUGO | S_IWUSR },
1722         .show = cfq_find_best_show,
1723         .store = cfq_find_best_store,
1724 };
1725 static struct cfq_fs_entry cfq_back_max_entry = {
1726         .attr = {.name = "back_seek_max", .mode = S_IRUGO | S_IWUSR },
1727         .show = cfq_back_max_show,
1728         .store = cfq_back_max_store,
1729 };
1730 static struct cfq_fs_entry cfq_back_penalty_entry = {
1731         .attr = {.name = "back_seek_penalty", .mode = S_IRUGO | S_IWUSR },
1732         .show = cfq_back_penalty_show,
1733         .store = cfq_back_penalty_store,
1734 };
1735 static struct cfq_fs_entry cfq_clear_elapsed_entry = {
1736         .attr = {.name = "clear_elapsed", .mode = S_IWUSR },
1737         .store = cfq_clear_elapsed,
1738 };
1739 static struct cfq_fs_entry cfq_key_type_entry = {
1740         .attr = {.name = "key_type", .mode = S_IRUGO | S_IWUSR },
1741         .show = cfq_read_key_type,
1742         .store = cfq_set_key_type,
1743 };
1744
1745 static struct attribute *default_attrs[] = {
1746         &cfq_quantum_entry.attr,
1747         &cfq_queued_entry.attr,
1748         &cfq_fifo_expire_r_entry.attr,
1749         &cfq_fifo_expire_w_entry.attr,
1750         &cfq_fifo_batch_expire_entry.attr,
1751         &cfq_key_type_entry.attr,
1752         &cfq_find_best_entry.attr,
1753         &cfq_back_max_entry.attr,
1754         &cfq_back_penalty_entry.attr,
1755         &cfq_clear_elapsed_entry.attr,
1756         NULL,
1757 };
1758
1759 #define to_cfq(atr) container_of((atr), struct cfq_fs_entry, attr)
1760
1761 static ssize_t
1762 cfq_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
1763 {
1764         elevator_t *e = container_of(kobj, elevator_t, kobj);
1765         struct cfq_fs_entry *entry = to_cfq(attr);
1766
1767         if (!entry->show)
1768                 return 0;
1769
1770         return entry->show(e->elevator_data, page);
1771 }
1772
1773 static ssize_t
1774 cfq_attr_store(struct kobject *kobj, struct attribute *attr,
1775                const char *page, size_t length)
1776 {
1777         elevator_t *e = container_of(kobj, elevator_t, kobj);
1778         struct cfq_fs_entry *entry = to_cfq(attr);
1779
1780         if (!entry->store)
1781                 return -EINVAL;
1782
1783         return entry->store(e->elevator_data, page, length);
1784 }
1785
1786 static struct sysfs_ops cfq_sysfs_ops = {
1787         .show   = cfq_attr_show,
1788         .store  = cfq_attr_store,
1789 };
1790
1791 struct kobj_type cfq_ktype = {
1792         .sysfs_ops      = &cfq_sysfs_ops,
1793         .default_attrs  = default_attrs,
1794 };
1795
1796 static struct elevator_type iosched_cfq = {
1797         .ops = {
1798                 .elevator_merge_fn =            cfq_merge,
1799                 .elevator_merged_fn =           cfq_merged_request,
1800                 .elevator_merge_req_fn =        cfq_merged_requests,
1801                 .elevator_next_req_fn =         cfq_next_request,
1802                 .elevator_add_req_fn =          cfq_insert_request,
1803                 .elevator_remove_req_fn =       cfq_remove_request,
1804                 .elevator_requeue_req_fn =      cfq_requeue_request,
1805                 .elevator_queue_empty_fn =      cfq_queue_empty,
1806                 .elevator_completed_req_fn =    cfq_completed_request,
1807                 .elevator_former_req_fn =       cfq_former_request,
1808                 .elevator_latter_req_fn =       cfq_latter_request,
1809                 .elevator_set_req_fn =          cfq_set_request,
1810                 .elevator_put_req_fn =          cfq_put_request,
1811                 .elevator_may_queue_fn =        cfq_may_queue,
1812                 .elevator_init_fn =             cfq_init_queue,
1813                 .elevator_exit_fn =             cfq_exit_queue,
1814         },
1815         .elevator_ktype =       &cfq_ktype,
1816         .elevator_name =        "cfq",
1817         .elevator_owner =       THIS_MODULE,
1818 };
1819
1820 int cfq_init(void)
1821 {
1822         int ret;
1823
1824         if (cfq_slab_setup())
1825                 return -ENOMEM;
1826
1827         ret = elv_register(&iosched_cfq);
1828         if (!ret) {
1829                 __module_get(THIS_MODULE);
1830                 return 0;
1831         }
1832
1833         cfq_slab_kill();
1834         return ret;
1835 }
1836
1837 static void __exit cfq_exit(void)
1838 {
1839         cfq_slab_kill();
1840         elv_unregister(&iosched_cfq);
1841 }
1842
1843 module_init(cfq_init);
1844 module_exit(cfq_exit);
1845
1846 MODULE_AUTHOR("Jens Axboe");
1847 MODULE_LICENSE("GPL");
1848 MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");