vserver 2.0 rc7
[linux-2.6.git] / drivers / block / cfq-iosched.c
1 /*
2  *  linux/drivers/block/cfq-iosched.c
3  *
4  *  CFQ, or complete fairness queueing, disk scheduler.
5  *
6  *  Based on ideas from a previously unfinished io
7  *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
8  *
9  *  Copyright (C) 2003 Jens Axboe <axboe@suse.de>
10  */
11 #include <linux/kernel.h>
12 #include <linux/fs.h>
13 #include <linux/blkdev.h>
14 #include <linux/elevator.h>
15 #include <linux/bio.h>
16 #include <linux/config.h>
17 #include <linux/module.h>
18 #include <linux/slab.h>
19 #include <linux/init.h>
20 #include <linux/compiler.h>
21 #include <linux/hash.h>
22 #include <linux/rbtree.h>
23 #include <linux/mempool.h>
24
25 static unsigned long max_elapsed_crq;
26 static unsigned long max_elapsed_dispatch;
27
28 /*
29  * tunables
30  */
31 static int cfq_quantum = 4;             /* max queue in one round of service */
32 static int cfq_queued = 8;              /* minimum rq allocate limit per-queue*/
33 static int cfq_service = HZ;            /* period over which service is avg */
34 static int cfq_fifo_expire_r = HZ / 2;  /* fifo timeout for sync requests */
35 static int cfq_fifo_expire_w = 5 * HZ;  /* fifo timeout for async requests */
36 static int cfq_fifo_rate = HZ / 8;      /* fifo expiry rate */
37 static int cfq_back_max = 16 * 1024;    /* maximum backwards seek, in KiB */
38 static int cfq_back_penalty = 2;        /* penalty of a backwards seek */
39
40 /*
41  * for the hash of cfqq inside the cfqd
42  */
43 #define CFQ_QHASH_SHIFT         6
44 #define CFQ_QHASH_ENTRIES       (1 << CFQ_QHASH_SHIFT)
45 #define list_entry_qhash(entry) hlist_entry((entry), struct cfq_queue, cfq_hash)
46
47 /*
48  * for the hash of crq inside the cfqq
49  */
50 #define CFQ_MHASH_SHIFT         6
51 #define CFQ_MHASH_BLOCK(sec)    ((sec) >> 3)
52 #define CFQ_MHASH_ENTRIES       (1 << CFQ_MHASH_SHIFT)
53 #define CFQ_MHASH_FN(sec)       hash_long(CFQ_MHASH_BLOCK(sec), CFQ_MHASH_SHIFT)
54 #define rq_hash_key(rq)         ((rq)->sector + (rq)->nr_sectors)
55 #define list_entry_hash(ptr)    hlist_entry((ptr), struct cfq_rq, hash)
56
57 #define list_entry_cfqq(ptr)    list_entry((ptr), struct cfq_queue, cfq_list)
58
59 #define RQ_DATA(rq)             (rq)->elevator_private
60
61 /*
62  * rb-tree defines
63  */
64 #define RB_NONE                 (2)
65 #define RB_EMPTY(node)          ((node)->rb_node == NULL)
66 #define RB_CLEAR_COLOR(node)    (node)->rb_color = RB_NONE
67 #define RB_CLEAR(node)          do {    \
68         (node)->rb_parent = NULL;       \
69         RB_CLEAR_COLOR((node));         \
70         (node)->rb_right = NULL;        \
71         (node)->rb_left = NULL;         \
72 } while (0)
73 #define RB_CLEAR_ROOT(root)     ((root)->rb_node = NULL)
74 #define ON_RB(node)             ((node)->rb_color != RB_NONE)
75 #define rb_entry_crq(node)      rb_entry((node), struct cfq_rq, rb_node)
76 #define rq_rb_key(rq)           (rq)->sector
77
78 /*
79  * threshold for switching off non-tag accounting
80  */
81 #define CFQ_MAX_TAG             (4)
82
83 /*
84  * sort key types and names
85  */
86 enum {
87         CFQ_KEY_PGID,
88         CFQ_KEY_TGID,
89         CFQ_KEY_UID,
90         CFQ_KEY_GID,
91         CFQ_KEY_LAST,
92 };
93
94 static char *cfq_key_types[] = { "pgid", "tgid", "uid", "gid", NULL };
95
96 static kmem_cache_t *crq_pool;
97 static kmem_cache_t *cfq_pool;
98 static kmem_cache_t *cfq_ioc_pool;
99
100 struct cfq_data {
101         struct list_head rr_list;
102         struct list_head empty_list;
103
104         struct hlist_head *cfq_hash;
105         struct hlist_head *crq_hash;
106
107         /* queues on rr_list (ie they have pending requests */
108         unsigned int busy_queues;
109
110         unsigned int max_queued;
111
112         atomic_t ref;
113
114         int key_type;
115
116         mempool_t *crq_pool;
117
118         request_queue_t *queue;
119
120         sector_t last_sector;
121
122         int rq_in_driver;
123
124         /*
125          * tunables, see top of file
126          */
127         unsigned int cfq_quantum;
128         unsigned int cfq_queued;
129         unsigned int cfq_fifo_expire_r;
130         unsigned int cfq_fifo_expire_w;
131         unsigned int cfq_fifo_batch_expire;
132         unsigned int cfq_back_penalty;
133         unsigned int cfq_back_max;
134         unsigned int find_best_crq;
135
136         unsigned int cfq_tagged;
137 };
138
139 struct cfq_queue {
140         /* reference count */
141         atomic_t ref;
142         /* parent cfq_data */
143         struct cfq_data *cfqd;
144         /* hash of mergeable requests */
145         struct hlist_node cfq_hash;
146         /* hash key */
147         unsigned long key;
148         /* whether queue is on rr (or empty) list */
149         int on_rr;
150         /* on either rr or empty list of cfqd */
151         struct list_head cfq_list;
152         /* sorted list of pending requests */
153         struct rb_root sort_list;
154         /* if fifo isn't expired, next request to serve */
155         struct cfq_rq *next_crq;
156         /* requests queued in sort_list */
157         int queued[2];
158         /* currently allocated requests */
159         int allocated[2];
160         /* fifo list of requests in sort_list */
161         struct list_head fifo[2];
162         /* last time fifo expired */
163         unsigned long last_fifo_expire;
164
165         int key_type;
166
167         unsigned long service_start;
168         unsigned long service_used;
169
170         unsigned int max_rate;
171
172         /* number of requests that have been handed to the driver */
173         int in_flight;
174         /* number of currently allocated requests */
175         int alloc_limit[2];
176 };
177
178 struct cfq_rq {
179         struct rb_node rb_node;
180         sector_t rb_key;
181         struct request *request;
182         struct hlist_node hash;
183
184         struct cfq_queue *cfq_queue;
185         struct cfq_io_context *io_context;
186
187         unsigned long service_start;
188         unsigned long queue_start;
189
190         unsigned int in_flight : 1;
191         unsigned int accounted : 1;
192         unsigned int is_sync   : 1;
193         unsigned int is_write  : 1;
194 };
195
196 static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned long);
197 static void cfq_dispatch_sort(request_queue_t *, struct cfq_rq *);
198 static void cfq_update_next_crq(struct cfq_rq *);
199 static void cfq_put_cfqd(struct cfq_data *cfqd);
200
201 /*
202  * what the fairness is based on (ie how processes are grouped and
203  * differentiated)
204  */
205 static inline unsigned long
206 cfq_hash_key(struct cfq_data *cfqd, struct task_struct *tsk)
207 {
208         /*
209          * optimize this so that ->key_type is the offset into the struct
210          */
211         switch (cfqd->key_type) {
212                 case CFQ_KEY_PGID:
213                         return process_group(tsk);
214                 default:
215                 case CFQ_KEY_TGID:
216                         return tsk->tgid;
217                 case CFQ_KEY_UID:
218                         return tsk->uid;
219                 case CFQ_KEY_GID:
220                         return tsk->gid;
221         }
222 }
223
224 /*
225  * lots of deadline iosched dupes, can be abstracted later...
226  */
227 static inline void cfq_del_crq_hash(struct cfq_rq *crq)
228 {
229         hlist_del_init(&crq->hash);
230 }
231
232 static void cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq)
233 {
234         cfq_del_crq_hash(crq);
235
236         if (q->last_merge == crq->request)
237                 q->last_merge = NULL;
238
239         cfq_update_next_crq(crq);
240 }
241
242 static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq)
243 {
244         const int hash_idx = CFQ_MHASH_FN(rq_hash_key(crq->request));
245
246         BUG_ON(!hlist_unhashed(&crq->hash));
247
248         hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]);
249 }
250
251 static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset)
252 {
253         struct hlist_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)];
254         struct hlist_node *entry, *next;
255
256         hlist_for_each_safe(entry, next, hash_list) {
257                 struct cfq_rq *crq = list_entry_hash(entry);
258                 struct request *__rq = crq->request;
259
260                 BUG_ON(hlist_unhashed(&crq->hash));
261
262                 if (!rq_mergeable(__rq)) {
263                         cfq_del_crq_hash(crq);
264                         continue;
265                 }
266
267                 if (rq_hash_key(__rq) == offset)
268                         return __rq;
269         }
270
271         return NULL;
272 }
273
274 /*
275  * Lifted from AS - choose which of crq1 and crq2 that is best served now.
276  * We choose the request that is closest to the head right now. Distance
277  * behind the head are penalized and only allowed to a certain extent.
278  */
279 static struct cfq_rq *
280 cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2)
281 {
282         sector_t last, s1, s2, d1 = 0, d2 = 0;
283         int r1_wrap = 0, r2_wrap = 0;   /* requests are behind the disk head */
284         unsigned long back_max;
285
286         if (crq1 == NULL || crq1 == crq2)
287                 return crq2;
288         if (crq2 == NULL)
289                 return crq1;
290
291         s1 = crq1->request->sector;
292         s2 = crq2->request->sector;
293
294         last = cfqd->last_sector;
295
296 #if 0
297         if (!list_empty(&cfqd->queue->queue_head)) {
298                 struct list_head *entry = &cfqd->queue->queue_head;
299                 unsigned long distance = ~0UL;
300                 struct request *rq;
301
302                 while ((entry = entry->prev) != &cfqd->queue->queue_head) {
303                         rq = list_entry_rq(entry);
304
305                         if (blk_barrier_rq(rq))
306                                 break;
307
308                         if (distance < abs(s1 - rq->sector + rq->nr_sectors)) {
309                                 distance = abs(s1 - rq->sector +rq->nr_sectors);
310                                 last = rq->sector + rq->nr_sectors;
311                         }
312                         if (distance < abs(s2 - rq->sector + rq->nr_sectors)) {
313                                 distance = abs(s2 - rq->sector +rq->nr_sectors);
314                                 last = rq->sector + rq->nr_sectors;
315                         }
316                 }
317         }
318 #endif
319
320         /*
321          * by definition, 1KiB is 2 sectors
322          */
323         back_max = cfqd->cfq_back_max * 2;
324
325         /*
326          * Strict one way elevator _except_ in the case where we allow
327          * short backward seeks which are biased as twice the cost of a
328          * similar forward seek.
329          */
330         if (s1 >= last)
331                 d1 = s1 - last;
332         else if (s1 + back_max >= last)
333                 d1 = (last - s1) * cfqd->cfq_back_penalty;
334         else
335                 r1_wrap = 1;
336
337         if (s2 >= last)
338                 d2 = s2 - last;
339         else if (s2 + back_max >= last)
340                 d2 = (last - s2) * cfqd->cfq_back_penalty;
341         else
342                 r2_wrap = 1;
343
344         /* Found required data */
345         if (!r1_wrap && r2_wrap)
346                 return crq1;
347         else if (!r2_wrap && r1_wrap)
348                 return crq2;
349         else if (r1_wrap && r2_wrap) {
350                 /* both behind the head */
351                 if (s1 <= s2)
352                         return crq1;
353                 else
354                         return crq2;
355         }
356
357         /* Both requests in front of the head */
358         if (d1 < d2)
359                 return crq1;
360         else if (d2 < d1)
361                 return crq2;
362         else {
363                 if (s1 >= s2)
364                         return crq1;
365                 else
366                         return crq2;
367         }
368 }
369
370 /*
371  * would be nice to take fifo expire time into account as well
372  */
373 static struct cfq_rq *
374 cfq_find_next_crq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
375                   struct cfq_rq *last)
376 {
377         struct cfq_rq *crq_next = NULL, *crq_prev = NULL;
378         struct rb_node *rbnext, *rbprev;
379
380         if (!ON_RB(&last->rb_node))
381                 return NULL;
382
383         if ((rbnext = rb_next(&last->rb_node)) == NULL)
384                 rbnext = rb_first(&cfqq->sort_list);
385
386         rbprev = rb_prev(&last->rb_node);
387
388         if (rbprev)
389                 crq_prev = rb_entry_crq(rbprev);
390         if (rbnext)
391                 crq_next = rb_entry_crq(rbnext);
392
393         return cfq_choose_req(cfqd, crq_next, crq_prev);
394 }
395
396 static void cfq_update_next_crq(struct cfq_rq *crq)
397 {
398         struct cfq_queue *cfqq = crq->cfq_queue;
399
400         if (cfqq->next_crq == crq)
401                 cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq);
402 }
403
404 static int cfq_check_sort_rr_list(struct cfq_queue *cfqq)
405 {
406         struct list_head *head = &cfqq->cfqd->rr_list;
407         struct list_head *next, *prev;
408
409         /*
410          * list might still be ordered
411          */
412         next = cfqq->cfq_list.next;
413         if (next != head) {
414                 struct cfq_queue *cnext = list_entry_cfqq(next);
415
416                 if (cfqq->service_used > cnext->service_used)
417                         return 1;
418         }
419
420         prev = cfqq->cfq_list.prev;
421         if (prev != head) {
422                 struct cfq_queue *cprev = list_entry_cfqq(prev);
423
424                 if (cfqq->service_used < cprev->service_used)
425                         return 1;
426         }
427
428         return 0;
429 }
430
431 static void cfq_sort_rr_list(struct cfq_queue *cfqq, int new_queue)
432 {
433         struct list_head *entry = &cfqq->cfqd->rr_list;
434
435         if (!cfqq->on_rr)
436                 return;
437         if (!new_queue && !cfq_check_sort_rr_list(cfqq))
438                 return;
439
440         list_del(&cfqq->cfq_list);
441
442         /*
443          * sort by our mean service_used, sub-sort by in-flight requests
444          */
445         while ((entry = entry->prev) != &cfqq->cfqd->rr_list) {
446                 struct cfq_queue *__cfqq = list_entry_cfqq(entry);
447
448                 if (cfqq->service_used > __cfqq->service_used)
449                         break;
450                 else if (cfqq->service_used == __cfqq->service_used) {
451                         struct list_head *prv;
452
453                         while ((prv = entry->prev) != &cfqq->cfqd->rr_list) {
454                                 __cfqq = list_entry_cfqq(prv);
455
456                                 WARN_ON(__cfqq->service_used > cfqq->service_used);
457                                 if (cfqq->service_used != __cfqq->service_used)
458                                         break;
459                                 if (cfqq->in_flight > __cfqq->in_flight)
460                                         break;
461
462                                 entry = prv;
463                         }
464                 }
465         }
466
467         list_add(&cfqq->cfq_list, entry);
468 }
469
470 /*
471  * add to busy list of queues for service, trying to be fair in ordering
472  * the pending list according to requests serviced
473  */
474 static inline void
475 cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
476 {
477         /*
478          * it's currently on the empty list
479          */
480         cfqq->on_rr = 1;
481         cfqd->busy_queues++;
482
483         if (time_after(jiffies, cfqq->service_start + cfq_service))
484                 cfqq->service_used >>= 3;
485
486         cfq_sort_rr_list(cfqq, 1);
487 }
488
489 static inline void
490 cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
491 {
492         list_move(&cfqq->cfq_list, &cfqd->empty_list);
493         cfqq->on_rr = 0;
494
495         BUG_ON(!cfqd->busy_queues);
496         cfqd->busy_queues--;
497 }
498
499 /*
500  * rb tree support functions
501  */
502 static inline void cfq_del_crq_rb(struct cfq_rq *crq)
503 {
504         struct cfq_queue *cfqq = crq->cfq_queue;
505
506         if (ON_RB(&crq->rb_node)) {
507                 struct cfq_data *cfqd = cfqq->cfqd;
508
509                 BUG_ON(!cfqq->queued[crq->is_sync]);
510
511                 cfq_update_next_crq(crq);
512
513                 cfqq->queued[crq->is_sync]--;
514                 rb_erase(&crq->rb_node, &cfqq->sort_list);
515                 RB_CLEAR_COLOR(&crq->rb_node);
516
517                 if (RB_EMPTY(&cfqq->sort_list) && cfqq->on_rr)
518                         cfq_del_cfqq_rr(cfqd, cfqq);
519         }
520 }
521
522 static struct cfq_rq *
523 __cfq_add_crq_rb(struct cfq_rq *crq)
524 {
525         struct rb_node **p = &crq->cfq_queue->sort_list.rb_node;
526         struct rb_node *parent = NULL;
527         struct cfq_rq *__crq;
528
529         while (*p) {
530                 parent = *p;
531                 __crq = rb_entry_crq(parent);
532
533                 if (crq->rb_key < __crq->rb_key)
534                         p = &(*p)->rb_left;
535                 else if (crq->rb_key > __crq->rb_key)
536                         p = &(*p)->rb_right;
537                 else
538                         return __crq;
539         }
540
541         rb_link_node(&crq->rb_node, parent, p);
542         return NULL;
543 }
544
545 static void cfq_add_crq_rb(struct cfq_rq *crq)
546 {
547         struct cfq_queue *cfqq = crq->cfq_queue;
548         struct cfq_data *cfqd = cfqq->cfqd;
549         struct request *rq = crq->request;
550         struct cfq_rq *__alias;
551
552         crq->rb_key = rq_rb_key(rq);
553         cfqq->queued[crq->is_sync]++;
554
555         /*
556          * looks a little odd, but the first insert might return an alias.
557          * if that happens, put the alias on the dispatch list
558          */
559         while ((__alias = __cfq_add_crq_rb(crq)) != NULL)
560                 cfq_dispatch_sort(cfqd->queue, __alias);
561
562         rb_insert_color(&crq->rb_node, &cfqq->sort_list);
563
564         if (!cfqq->on_rr)
565                 cfq_add_cfqq_rr(cfqd, cfqq);
566
567         /*
568          * check if this request is a better next-serve candidate
569          */
570         cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq);
571 }
572
573 static inline void
574 cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
575 {
576         if (ON_RB(&crq->rb_node)) {
577                 rb_erase(&crq->rb_node, &cfqq->sort_list);
578                 cfqq->queued[crq->is_sync]--;
579         }
580
581         cfq_add_crq_rb(crq);
582 }
583
584 static struct request *
585 cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector)
586 {
587         const unsigned long key = cfq_hash_key(cfqd, current);
588         struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, key);
589         struct rb_node *n;
590
591         if (!cfqq)
592                 goto out;
593
594         n = cfqq->sort_list.rb_node;
595         while (n) {
596                 struct cfq_rq *crq = rb_entry_crq(n);
597
598                 if (sector < crq->rb_key)
599                         n = n->rb_left;
600                 else if (sector > crq->rb_key)
601                         n = n->rb_right;
602                 else
603                         return crq->request;
604         }
605
606 out:
607         return NULL;
608 }
609
610 static void cfq_deactivate_request(request_queue_t *q, struct request *rq)
611 {
612         struct cfq_rq *crq = RQ_DATA(rq);
613
614         if (crq) {
615                 struct cfq_queue *cfqq = crq->cfq_queue;
616
617                 if (cfqq->cfqd->cfq_tagged) {
618                         cfqq->service_used--;
619                         cfq_sort_rr_list(cfqq, 0);
620                 }
621
622                 if (crq->accounted) {
623                         crq->accounted = 0;
624                         cfqq->cfqd->rq_in_driver--;
625                 }
626         }
627 }
628
629 /*
630  * make sure the service time gets corrected on reissue of this request
631  */
632 static void cfq_requeue_request(request_queue_t *q, struct request *rq)
633 {
634         cfq_deactivate_request(q, rq);
635         list_add(&rq->queuelist, &q->queue_head);
636 }
637
638 static void cfq_remove_request(request_queue_t *q, struct request *rq)
639 {
640         struct cfq_rq *crq = RQ_DATA(rq);
641
642         if (crq) {
643                 cfq_remove_merge_hints(q, crq);
644                 list_del_init(&rq->queuelist);
645
646                 if (crq->cfq_queue)
647                         cfq_del_crq_rb(crq);
648         }
649 }
650
651 static int
652 cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
653 {
654         struct cfq_data *cfqd = q->elevator->elevator_data;
655         struct request *__rq;
656         int ret;
657
658         ret = elv_try_last_merge(q, bio);
659         if (ret != ELEVATOR_NO_MERGE) {
660                 __rq = q->last_merge;
661                 goto out_insert;
662         }
663
664         __rq = cfq_find_rq_hash(cfqd, bio->bi_sector);
665         if (__rq) {
666                 BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector);
667
668                 if (elv_rq_merge_ok(__rq, bio)) {
669                         ret = ELEVATOR_BACK_MERGE;
670                         goto out;
671                 }
672         }
673
674         __rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio));
675         if (__rq) {
676                 if (elv_rq_merge_ok(__rq, bio)) {
677                         ret = ELEVATOR_FRONT_MERGE;
678                         goto out;
679                 }
680         }
681
682         return ELEVATOR_NO_MERGE;
683 out:
684         q->last_merge = __rq;
685 out_insert:
686         *req = __rq;
687         return ret;
688 }
689
690 static void cfq_merged_request(request_queue_t *q, struct request *req)
691 {
692         struct cfq_data *cfqd = q->elevator->elevator_data;
693         struct cfq_rq *crq = RQ_DATA(req);
694
695         cfq_del_crq_hash(crq);
696         cfq_add_crq_hash(cfqd, crq);
697
698         if (ON_RB(&crq->rb_node) && (rq_rb_key(req) != crq->rb_key)) {
699                 struct cfq_queue *cfqq = crq->cfq_queue;
700
701                 cfq_update_next_crq(crq);
702                 cfq_reposition_crq_rb(cfqq, crq);
703         }
704
705         q->last_merge = req;
706 }
707
708 static void
709 cfq_merged_requests(request_queue_t *q, struct request *rq,
710                     struct request *next)
711 {
712         struct cfq_rq *crq = RQ_DATA(rq);
713         struct cfq_rq *cnext = RQ_DATA(next);
714
715         cfq_merged_request(q, rq);
716
717         if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist)) {
718                 if (time_before(cnext->queue_start, crq->queue_start)) {
719                         list_move(&rq->queuelist, &next->queuelist);
720                         crq->queue_start = cnext->queue_start;
721                 }
722         }
723
724         cfq_update_next_crq(cnext);
725         cfq_remove_request(q, next);
726 }
727
728 /*
729  * we dispatch cfqd->cfq_quantum requests in total from the rr_list queues,
730  * this function sector sorts the selected request to minimize seeks. we start
731  * at cfqd->last_sector, not 0.
732  */
733 static void cfq_dispatch_sort(request_queue_t *q, struct cfq_rq *crq)
734 {
735         struct cfq_data *cfqd = q->elevator->elevator_data;
736         struct cfq_queue *cfqq = crq->cfq_queue;
737         struct list_head *head = &q->queue_head, *entry = head;
738         struct request *__rq;
739         sector_t last;
740
741         cfq_del_crq_rb(crq);
742         cfq_remove_merge_hints(q, crq);
743         list_del(&crq->request->queuelist);
744
745         last = cfqd->last_sector;
746         while ((entry = entry->prev) != head) {
747                 __rq = list_entry_rq(entry);
748
749                 if (blk_barrier_rq(crq->request))
750                         break;
751                 if (!blk_fs_request(crq->request))
752                         break;
753
754                 if (crq->request->sector > __rq->sector)
755                         break;
756                 if (__rq->sector > last && crq->request->sector < last) {
757                         last = crq->request->sector;
758                         break;
759                 }
760         }
761
762         cfqd->last_sector = last;
763         crq->in_flight = 1;
764         cfqq->in_flight++;
765         list_add(&crq->request->queuelist, entry);
766 }
767
768 /*
769  * return expired entry, or NULL to just start from scratch in rbtree
770  */
771 static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq)
772 {
773         struct cfq_data *cfqd = cfqq->cfqd;
774         const int reads = !list_empty(&cfqq->fifo[0]);
775         const int writes = !list_empty(&cfqq->fifo[1]);
776         unsigned long now = jiffies;
777         struct cfq_rq *crq;
778
779         if (time_before(now, cfqq->last_fifo_expire + cfqd->cfq_fifo_batch_expire))
780                 return NULL;
781
782         crq = RQ_DATA(list_entry(cfqq->fifo[0].next, struct request, queuelist));
783         if (reads && time_after(now, crq->queue_start + cfqd->cfq_fifo_expire_r)) {
784                 cfqq->last_fifo_expire = now;
785                 return crq;
786         }
787
788         crq = RQ_DATA(list_entry(cfqq->fifo[1].next, struct request, queuelist));
789         if (writes && time_after(now, crq->queue_start + cfqd->cfq_fifo_expire_w)) {
790                 cfqq->last_fifo_expire = now;
791                 return crq;
792         }
793
794         return NULL;
795 }
796
797 /*
798  * dispatch a single request from given queue
799  */
800 static inline void
801 cfq_dispatch_request(request_queue_t *q, struct cfq_data *cfqd,
802                      struct cfq_queue *cfqq)
803 {
804         struct cfq_rq *crq;
805
806         /*
807          * follow expired path, else get first next available
808          */
809         if ((crq = cfq_check_fifo(cfqq)) == NULL) {
810                 if (cfqd->find_best_crq)
811                         crq = cfqq->next_crq;
812                 else
813                         crq = rb_entry_crq(rb_first(&cfqq->sort_list));
814         }
815
816         cfqd->last_sector = crq->request->sector + crq->request->nr_sectors;
817
818         /*
819          * finally, insert request into driver list
820          */
821         cfq_dispatch_sort(q, crq);
822 }
823
824 static int cfq_dispatch_requests(request_queue_t *q, int max_dispatch)
825 {
826         struct cfq_data *cfqd = q->elevator->elevator_data;
827         struct cfq_queue *cfqq;
828         struct list_head *entry, *tmp;
829         int queued, busy_queues, first_round;
830
831         if (list_empty(&cfqd->rr_list))
832                 return 0;
833
834         queued = 0;
835         first_round = 1;
836 restart:
837         busy_queues = 0;
838         list_for_each_safe(entry, tmp, &cfqd->rr_list) {
839                 cfqq = list_entry_cfqq(entry);
840
841                 BUG_ON(RB_EMPTY(&cfqq->sort_list));
842
843                 /*
844                  * first round of queueing, only select from queues that
845                  * don't already have io in-flight
846                  */
847                 if (first_round && cfqq->in_flight)
848                         continue;
849
850                 cfq_dispatch_request(q, cfqd, cfqq);
851
852                 if (!RB_EMPTY(&cfqq->sort_list))
853                         busy_queues++;
854
855                 queued++;
856         }
857
858         if ((queued < max_dispatch) && (busy_queues || first_round)) {
859                 first_round = 0;
860                 goto restart;
861         }
862
863         return queued;
864 }
865
866 static inline void cfq_account_dispatch(struct cfq_rq *crq)
867 {
868         struct cfq_queue *cfqq = crq->cfq_queue;
869         struct cfq_data *cfqd = cfqq->cfqd;
870         unsigned long now, elapsed;
871
872         if (!blk_fs_request(crq->request))
873                 return;
874
875         /*
876          * accounted bit is necessary since some drivers will call
877          * elv_next_request() many times for the same request (eg ide)
878          */
879         if (crq->accounted)
880                 return;
881
882         now = jiffies;
883         if (cfqq->service_start == ~0UL)
884                 cfqq->service_start = now;
885
886         /*
887          * on drives with tagged command queueing, command turn-around time
888          * doesn't necessarily reflect the time spent processing this very
889          * command inside the drive. so do the accounting differently there,
890          * by just sorting on the number of requests
891          */
892         if (cfqd->cfq_tagged) {
893                 if (time_after(now, cfqq->service_start + cfq_service)) {
894                         cfqq->service_start = now;
895                         cfqq->service_used /= 10;
896                 }
897
898                 cfqq->service_used++;
899                 cfq_sort_rr_list(cfqq, 0);
900         }
901
902         elapsed = now - crq->queue_start;
903         if (elapsed > max_elapsed_dispatch)
904                 max_elapsed_dispatch = elapsed;
905
906         crq->accounted = 1;
907         crq->service_start = now;
908
909         if (++cfqd->rq_in_driver >= CFQ_MAX_TAG && !cfqd->cfq_tagged) {
910                 cfqq->cfqd->cfq_tagged = 1;
911                 printk("cfq: depth %d reached, tagging now on\n", CFQ_MAX_TAG);
912         }
913 }
914
915 static inline void
916 cfq_account_completion(struct cfq_queue *cfqq, struct cfq_rq *crq)
917 {
918         struct cfq_data *cfqd = cfqq->cfqd;
919
920         if (!crq->accounted)
921                 return;
922
923         WARN_ON(!cfqd->rq_in_driver);
924         cfqd->rq_in_driver--;
925
926         if (!cfqd->cfq_tagged) {
927                 unsigned long now = jiffies;
928                 unsigned long duration = now - crq->service_start;
929
930                 if (time_after(now, cfqq->service_start + cfq_service)) {
931                         cfqq->service_start = now;
932                         cfqq->service_used >>= 3;
933                 }
934
935                 cfqq->service_used += duration;
936                 cfq_sort_rr_list(cfqq, 0);
937
938                 if (duration > max_elapsed_crq)
939                         max_elapsed_crq = duration;
940         }
941 }
942
943 static struct request *cfq_next_request(request_queue_t *q)
944 {
945         struct cfq_data *cfqd = q->elevator->elevator_data;
946         struct request *rq;
947
948         if (!list_empty(&q->queue_head)) {
949                 struct cfq_rq *crq;
950 dispatch:
951                 rq = list_entry_rq(q->queue_head.next);
952
953                 if ((crq = RQ_DATA(rq)) != NULL) {
954                         cfq_remove_merge_hints(q, crq);
955                         cfq_account_dispatch(crq);
956                 }
957
958                 return rq;
959         }
960
961         if (cfq_dispatch_requests(q, cfqd->cfq_quantum))
962                 goto dispatch;
963
964         return NULL;
965 }
966
967 /*
968  * task holds one reference to the queue, dropped when task exits. each crq
969  * in-flight on this queue also holds a reference, dropped when crq is freed.
970  *
971  * queue lock must be held here.
972  */
973 static void cfq_put_queue(struct cfq_queue *cfqq)
974 {
975         BUG_ON(!atomic_read(&cfqq->ref));
976
977         if (!atomic_dec_and_test(&cfqq->ref))
978                 return;
979
980         BUG_ON(rb_first(&cfqq->sort_list));
981         BUG_ON(cfqq->on_rr);
982
983         cfq_put_cfqd(cfqq->cfqd);
984
985         /*
986          * it's on the empty list and still hashed
987          */
988         list_del(&cfqq->cfq_list);
989         hlist_del(&cfqq->cfq_hash);
990         kmem_cache_free(cfq_pool, cfqq);
991 }
992
993 static inline struct cfq_queue *
994 __cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key, const int hashval)
995 {
996         struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
997         struct hlist_node *entry, *next;
998
999         hlist_for_each_safe(entry, next, hash_list) {
1000                 struct cfq_queue *__cfqq = list_entry_qhash(entry);
1001
1002                 if (__cfqq->key == key)
1003                         return __cfqq;
1004         }
1005
1006         return NULL;
1007 }
1008
1009 static struct cfq_queue *
1010 cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key)
1011 {
1012         return __cfq_find_cfq_hash(cfqd, key, hash_long(key, CFQ_QHASH_SHIFT));
1013 }
1014
1015 static inline void
1016 cfq_rehash_cfqq(struct cfq_data *cfqd, struct cfq_queue **cfqq,
1017                 struct cfq_io_context *cic)
1018 {
1019         unsigned long hashkey = cfq_hash_key(cfqd, current);
1020         unsigned long hashval = hash_long(hashkey, CFQ_QHASH_SHIFT);
1021         struct cfq_queue *__cfqq;
1022         unsigned long flags;
1023
1024         spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1025
1026         hlist_del(&(*cfqq)->cfq_hash);
1027
1028         __cfqq = __cfq_find_cfq_hash(cfqd, hashkey, hashval);
1029         if (!__cfqq || __cfqq == *cfqq) {
1030                 __cfqq = *cfqq;
1031                 hlist_add_head(&__cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
1032                 __cfqq->key_type = cfqd->key_type;
1033         } else {
1034                 atomic_inc(&__cfqq->ref);
1035                 cic->cfqq = __cfqq;
1036                 cfq_put_queue(*cfqq);
1037                 *cfqq = __cfqq;
1038         }
1039
1040         cic->cfqq = __cfqq;
1041         spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
1042 }
1043
1044 static void cfq_free_io_context(struct cfq_io_context *cic)
1045 {
1046         kmem_cache_free(cfq_ioc_pool, cic);
1047 }
1048
1049 /*
1050  * locking hierarchy is: io_context lock -> queue locks
1051  */
1052 static void cfq_exit_io_context(struct cfq_io_context *cic)
1053 {
1054         struct cfq_queue *cfqq = cic->cfqq;
1055         struct list_head *entry = &cic->list;
1056         request_queue_t *q;
1057         unsigned long flags;
1058
1059         /*
1060          * put the reference this task is holding to the various queues
1061          */
1062         spin_lock_irqsave(&cic->ioc->lock, flags);
1063         while ((entry = cic->list.next) != &cic->list) {
1064                 struct cfq_io_context *__cic;
1065
1066                 __cic = list_entry(entry, struct cfq_io_context, list);
1067                 list_del(entry);
1068
1069                 q = __cic->cfqq->cfqd->queue;
1070                 spin_lock(q->queue_lock);
1071                 cfq_put_queue(__cic->cfqq);
1072                 spin_unlock(q->queue_lock);
1073         }
1074
1075         q = cfqq->cfqd->queue;
1076         spin_lock(q->queue_lock);
1077         cfq_put_queue(cfqq);
1078         spin_unlock(q->queue_lock);
1079
1080         cic->cfqq = NULL;
1081         spin_unlock_irqrestore(&cic->ioc->lock, flags);
1082 }
1083
1084 static struct cfq_io_context *cfq_alloc_io_context(int gfp_flags)
1085 {
1086         struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_flags);
1087
1088         if (cic) {
1089                 cic->dtor = cfq_free_io_context;
1090                 cic->exit = cfq_exit_io_context;
1091                 INIT_LIST_HEAD(&cic->list);
1092                 cic->cfqq = NULL;
1093         }
1094
1095         return cic;
1096 }
1097
1098 /*
1099  * Setup general io context and cfq io context. There can be several cfq
1100  * io contexts per general io context, if this process is doing io to more
1101  * than one device managed by cfq. Note that caller is holding a reference to
1102  * cfqq, so we don't need to worry about it disappearing
1103  */
1104 static struct cfq_io_context *
1105 cfq_get_io_context(struct cfq_queue **cfqq, int gfp_flags)
1106 {
1107         struct cfq_data *cfqd = (*cfqq)->cfqd;
1108         struct cfq_queue *__cfqq = *cfqq;
1109         struct cfq_io_context *cic;
1110         struct io_context *ioc;
1111
1112         might_sleep_if(gfp_flags & __GFP_WAIT);
1113
1114         ioc = get_io_context(gfp_flags);
1115         if (!ioc)
1116                 return NULL;
1117
1118         if ((cic = ioc->cic) == NULL) {
1119                 cic = cfq_alloc_io_context(gfp_flags);
1120
1121                 if (cic == NULL)
1122                         goto err;
1123
1124                 ioc->cic = cic;
1125                 cic->ioc = ioc;
1126                 cic->cfqq = __cfqq;
1127                 atomic_inc(&__cfqq->ref);
1128         } else {
1129                 struct cfq_io_context *__cic;
1130                 unsigned long flags;
1131
1132                 /*
1133                  * since the first cic on the list is actually the head
1134                  * itself, need to check this here or we'll duplicate an
1135                  * cic per ioc for no reason
1136                  */
1137                 if (cic->cfqq == __cfqq)
1138                         goto out;
1139
1140                 /*
1141                  * cic exists, check if we already are there. linear search
1142                  * should be ok here, the list will usually not be more than
1143                  * 1 or a few entries long
1144                  */
1145                 spin_lock_irqsave(&ioc->lock, flags);
1146                 list_for_each_entry(__cic, &cic->list, list) {
1147                         /*
1148                          * this process is already holding a reference to
1149                          * this queue, so no need to get one more
1150                          */
1151                         if (__cic->cfqq == __cfqq) {
1152                                 cic = __cic;
1153                                 spin_unlock_irqrestore(&ioc->lock, flags);
1154                                 goto out;
1155                         }
1156                 }
1157                 spin_unlock_irqrestore(&ioc->lock, flags);
1158
1159                 /*
1160                  * nope, process doesn't have a cic assoicated with this
1161                  * cfqq yet. get a new one and add to list
1162                  */
1163                 __cic = cfq_alloc_io_context(gfp_flags);
1164                 if (__cic == NULL)
1165                         goto err;
1166
1167                 __cic->ioc = ioc;
1168                 __cic->cfqq = __cfqq;
1169                 atomic_inc(&__cfqq->ref);
1170                 spin_lock_irqsave(&ioc->lock, flags);
1171                 list_add(&__cic->list, &cic->list);
1172                 spin_unlock_irqrestore(&ioc->lock, flags);
1173
1174                 cic = __cic;
1175                 *cfqq = __cfqq;
1176         }
1177
1178 out:
1179         /*
1180          * if key_type has been changed on the fly, we lazily rehash
1181          * each queue at lookup time
1182          */
1183         if ((*cfqq)->key_type != cfqd->key_type)
1184                 cfq_rehash_cfqq(cfqd, cfqq, cic);
1185
1186         return cic;
1187 err:
1188         put_io_context(ioc);
1189         return NULL;
1190 }
1191
1192 static struct cfq_queue *
1193 __cfq_get_queue(struct cfq_data *cfqd, unsigned long key, int gfp_mask)
1194 {
1195         const int hashval = hash_long(key, CFQ_QHASH_SHIFT);
1196         struct cfq_queue *cfqq, *new_cfqq = NULL;
1197
1198 retry:
1199         cfqq = __cfq_find_cfq_hash(cfqd, key, hashval);
1200
1201         if (!cfqq) {
1202                 if (new_cfqq) {
1203                         cfqq = new_cfqq;
1204                         new_cfqq = NULL;
1205                 } else {
1206                         spin_unlock_irq(cfqd->queue->queue_lock);
1207                         new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask);
1208                         spin_lock_irq(cfqd->queue->queue_lock);
1209
1210                         if (!new_cfqq && !(gfp_mask & __GFP_WAIT))
1211                                 goto out;
1212
1213                         goto retry;
1214                 }
1215
1216                 memset(cfqq, 0, sizeof(*cfqq));
1217
1218                 INIT_HLIST_NODE(&cfqq->cfq_hash);
1219                 INIT_LIST_HEAD(&cfqq->cfq_list);
1220                 RB_CLEAR_ROOT(&cfqq->sort_list);
1221                 INIT_LIST_HEAD(&cfqq->fifo[0]);
1222                 INIT_LIST_HEAD(&cfqq->fifo[1]);
1223
1224                 cfqq->key = key;
1225                 hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
1226                 atomic_set(&cfqq->ref, 0);
1227                 cfqq->cfqd = cfqd;
1228                 atomic_inc(&cfqd->ref);
1229                 cfqq->key_type = cfqd->key_type;
1230                 cfqq->service_start = ~0UL;
1231         }
1232
1233         if (new_cfqq)
1234                 kmem_cache_free(cfq_pool, new_cfqq);
1235
1236         atomic_inc(&cfqq->ref);
1237 out:
1238         WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq);
1239         return cfqq;
1240 }
1241
1242 static void cfq_enqueue(struct cfq_data *cfqd, struct cfq_rq *crq)
1243 {
1244         crq->is_sync = 0;
1245         if (rq_data_dir(crq->request) == READ || current->flags & PF_SYNCWRITE)
1246                 crq->is_sync = 1;
1247
1248         cfq_add_crq_rb(crq);
1249         crq->queue_start = jiffies;
1250
1251         list_add_tail(&crq->request->queuelist, &crq->cfq_queue->fifo[crq->is_sync]);
1252 }
1253
1254 static void
1255 cfq_insert_request(request_queue_t *q, struct request *rq, int where)
1256 {
1257         struct cfq_data *cfqd = q->elevator->elevator_data;
1258         struct cfq_rq *crq = RQ_DATA(rq);
1259
1260         switch (where) {
1261                 case ELEVATOR_INSERT_BACK:
1262                         while (cfq_dispatch_requests(q, cfqd->cfq_quantum))
1263                                 ;
1264                         list_add_tail(&rq->queuelist, &q->queue_head);
1265                         break;
1266                 case ELEVATOR_INSERT_FRONT:
1267                         list_add(&rq->queuelist, &q->queue_head);
1268                         break;
1269                 case ELEVATOR_INSERT_SORT:
1270                         BUG_ON(!blk_fs_request(rq));
1271                         cfq_enqueue(cfqd, crq);
1272                         break;
1273                 default:
1274                         printk("%s: bad insert point %d\n", __FUNCTION__,where);
1275                         return;
1276         }
1277
1278         if (rq_mergeable(rq)) {
1279                 cfq_add_crq_hash(cfqd, crq);
1280
1281                 if (!q->last_merge)
1282                         q->last_merge = rq;
1283         }
1284 }
1285
1286 static int cfq_queue_empty(request_queue_t *q)
1287 {
1288         struct cfq_data *cfqd = q->elevator->elevator_data;
1289
1290         return list_empty(&q->queue_head) && list_empty(&cfqd->rr_list);
1291 }
1292
1293 static void cfq_completed_request(request_queue_t *q, struct request *rq)
1294 {
1295         struct cfq_rq *crq = RQ_DATA(rq);
1296         struct cfq_queue *cfqq;
1297
1298         if (unlikely(!blk_fs_request(rq)))
1299                 return;
1300
1301         cfqq = crq->cfq_queue;
1302
1303         if (crq->in_flight) {
1304                 WARN_ON(!cfqq->in_flight);
1305                 cfqq->in_flight--;
1306         }
1307
1308         cfq_account_completion(cfqq, crq);
1309 }
1310
1311 static struct request *
1312 cfq_former_request(request_queue_t *q, struct request *rq)
1313 {
1314         struct cfq_rq *crq = RQ_DATA(rq);
1315         struct rb_node *rbprev = rb_prev(&crq->rb_node);
1316
1317         if (rbprev)
1318                 return rb_entry_crq(rbprev)->request;
1319
1320         return NULL;
1321 }
1322
1323 static struct request *
1324 cfq_latter_request(request_queue_t *q, struct request *rq)
1325 {
1326         struct cfq_rq *crq = RQ_DATA(rq);
1327         struct rb_node *rbnext = rb_next(&crq->rb_node);
1328
1329         if (rbnext)
1330                 return rb_entry_crq(rbnext)->request;
1331
1332         return NULL;
1333 }
1334
1335 static int cfq_may_queue(request_queue_t *q, int rw)
1336 {
1337         struct cfq_data *cfqd = q->elevator->elevator_data;
1338         struct cfq_queue *cfqq;
1339         int ret = ELV_MQUEUE_MAY;
1340
1341         if (current->flags & PF_MEMALLOC)
1342                 return ELV_MQUEUE_MAY;
1343
1344         cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(cfqd, current));
1345         if (cfqq) {
1346                 int limit = cfqd->max_queued;
1347
1348                 if (cfqq->allocated[rw] < cfqd->cfq_queued)
1349                         return ELV_MQUEUE_MUST;
1350
1351                 if (cfqd->busy_queues)
1352                         limit = q->nr_requests / cfqd->busy_queues;
1353
1354                 if (limit < cfqd->cfq_queued)
1355                         limit = cfqd->cfq_queued;
1356                 else if (limit > cfqd->max_queued)
1357                         limit = cfqd->max_queued;
1358
1359                 if (cfqq->allocated[rw] >= limit) {
1360                         if (limit > cfqq->alloc_limit[rw])
1361                                 cfqq->alloc_limit[rw] = limit;
1362
1363                         ret = ELV_MQUEUE_NO;
1364                 }
1365         }
1366
1367         return ret;
1368 }
1369
1370 static void cfq_check_waiters(request_queue_t *q, struct cfq_queue *cfqq)
1371 {
1372         struct request_list *rl = &q->rq;
1373         const int write = waitqueue_active(&rl->wait[WRITE]);
1374         const int read = waitqueue_active(&rl->wait[READ]);
1375
1376         if (read && cfqq->allocated[READ] < cfqq->alloc_limit[READ])
1377                 wake_up(&rl->wait[READ]);
1378         if (write && cfqq->allocated[WRITE] < cfqq->alloc_limit[WRITE])
1379                 wake_up(&rl->wait[WRITE]);
1380 }
1381
1382 /*
1383  * queue lock held here
1384  */
1385 static void cfq_put_request(request_queue_t *q, struct request *rq)
1386 {
1387         struct cfq_data *cfqd = q->elevator->elevator_data;
1388         struct cfq_rq *crq = RQ_DATA(rq);
1389
1390         if (crq) {
1391                 struct cfq_queue *cfqq = crq->cfq_queue;
1392
1393                 BUG_ON(q->last_merge == rq);
1394                 BUG_ON(!hlist_unhashed(&crq->hash));
1395
1396                 if (crq->io_context)
1397                         put_io_context(crq->io_context->ioc);
1398
1399                 BUG_ON(!cfqq->allocated[crq->is_write]);
1400                 cfqq->allocated[crq->is_write]--;
1401
1402                 mempool_free(crq, cfqd->crq_pool);
1403                 rq->elevator_private = NULL;
1404
1405                 smp_mb();
1406                 cfq_check_waiters(q, cfqq);
1407                 cfq_put_queue(cfqq);
1408         }
1409 }
1410
1411 /*
1412  * Allocate cfq data structures associated with this request. A queue and
1413  */
1414 static int cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
1415 {
1416         struct cfq_data *cfqd = q->elevator->elevator_data;
1417         struct cfq_io_context *cic;
1418         const int rw = rq_data_dir(rq);
1419         struct cfq_queue *cfqq, *saved_cfqq;
1420         struct cfq_rq *crq;
1421         unsigned long flags;
1422
1423         might_sleep_if(gfp_mask & __GFP_WAIT);
1424
1425         spin_lock_irqsave(q->queue_lock, flags);
1426
1427         cfqq = __cfq_get_queue(cfqd, cfq_hash_key(cfqd, current), gfp_mask);
1428         if (!cfqq)
1429                 goto out_lock;
1430
1431 repeat:
1432         if (cfqq->allocated[rw] >= cfqd->max_queued)
1433                 goto out_lock;
1434
1435         cfqq->allocated[rw]++;
1436         spin_unlock_irqrestore(q->queue_lock, flags);
1437
1438         /*
1439          * if hashing type has changed, the cfq_queue might change here.
1440          */
1441         saved_cfqq = cfqq;
1442         cic = cfq_get_io_context(&cfqq, gfp_mask);
1443         if (!cic)
1444                 goto err;
1445
1446         /*
1447          * repeat allocation checks on queue change
1448          */
1449         if (unlikely(saved_cfqq != cfqq)) {
1450                 spin_lock_irqsave(q->queue_lock, flags);
1451                 saved_cfqq->allocated[rw]--;
1452                 goto repeat;
1453         }
1454
1455         crq = mempool_alloc(cfqd->crq_pool, gfp_mask);
1456         if (crq) {
1457                 RB_CLEAR(&crq->rb_node);
1458                 crq->rb_key = 0;
1459                 crq->request = rq;
1460                 INIT_HLIST_NODE(&crq->hash);
1461                 crq->cfq_queue = cfqq;
1462                 crq->io_context = cic;
1463                 crq->service_start = crq->queue_start = 0;
1464                 crq->in_flight = crq->accounted = crq->is_sync = 0;
1465                 crq->is_write = rw;
1466                 rq->elevator_private = crq;
1467                 cfqq->alloc_limit[rw] = 0;
1468                 return 0;
1469         }
1470
1471         put_io_context(cic->ioc);
1472 err:
1473         spin_lock_irqsave(q->queue_lock, flags);
1474         cfqq->allocated[rw]--;
1475         cfq_put_queue(cfqq);
1476 out_lock:
1477         spin_unlock_irqrestore(q->queue_lock, flags);
1478         return 1;
1479 }
1480
1481 static void cfq_put_cfqd(struct cfq_data *cfqd)
1482 {
1483         request_queue_t *q = cfqd->queue;
1484
1485         if (!atomic_dec_and_test(&cfqd->ref))
1486                 return;
1487
1488         blk_put_queue(q);
1489
1490         mempool_destroy(cfqd->crq_pool);
1491         kfree(cfqd->crq_hash);
1492         kfree(cfqd->cfq_hash);
1493         kfree(cfqd);
1494 }
1495
1496 static void cfq_exit_queue(elevator_t *e)
1497 {
1498         cfq_put_cfqd(e->elevator_data);
1499 }
1500
1501 static int cfq_init_queue(request_queue_t *q, elevator_t *e)
1502 {
1503         struct cfq_data *cfqd;
1504         int i;
1505
1506         cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL);
1507         if (!cfqd)
1508                 return -ENOMEM;
1509
1510         memset(cfqd, 0, sizeof(*cfqd));
1511         INIT_LIST_HEAD(&cfqd->rr_list);
1512         INIT_LIST_HEAD(&cfqd->empty_list);
1513
1514         cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL);
1515         if (!cfqd->crq_hash)
1516                 goto out_crqhash;
1517
1518         cfqd->cfq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL);
1519         if (!cfqd->cfq_hash)
1520                 goto out_cfqhash;
1521
1522         cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, crq_pool);
1523         if (!cfqd->crq_pool)
1524                 goto out_crqpool;
1525
1526         for (i = 0; i < CFQ_MHASH_ENTRIES; i++)
1527                 INIT_HLIST_HEAD(&cfqd->crq_hash[i]);
1528         for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
1529                 INIT_HLIST_HEAD(&cfqd->cfq_hash[i]);
1530
1531         e->elevator_data = cfqd;
1532
1533         cfqd->queue = q;
1534         atomic_inc(&q->refcnt);
1535
1536         /*
1537          * just set it to some high value, we want anyone to be able to queue
1538          * some requests. fairness is handled differently
1539          */
1540         q->nr_requests = 1024;
1541         cfqd->max_queued = q->nr_requests / 16;
1542         q->nr_batching = cfq_queued;
1543         cfqd->key_type = CFQ_KEY_TGID;
1544         cfqd->find_best_crq = 1;
1545         atomic_set(&cfqd->ref, 1);
1546
1547         cfqd->cfq_queued = cfq_queued;
1548         cfqd->cfq_quantum = cfq_quantum;
1549         cfqd->cfq_fifo_expire_r = cfq_fifo_expire_r;
1550         cfqd->cfq_fifo_expire_w = cfq_fifo_expire_w;
1551         cfqd->cfq_fifo_batch_expire = cfq_fifo_rate;
1552         cfqd->cfq_back_max = cfq_back_max;
1553         cfqd->cfq_back_penalty = cfq_back_penalty;
1554
1555         return 0;
1556 out_crqpool:
1557         kfree(cfqd->cfq_hash);
1558 out_cfqhash:
1559         kfree(cfqd->crq_hash);
1560 out_crqhash:
1561         kfree(cfqd);
1562         return -ENOMEM;
1563 }
1564
1565 static void cfq_slab_kill(void)
1566 {
1567         if (crq_pool)
1568                 kmem_cache_destroy(crq_pool);
1569         if (cfq_pool)
1570                 kmem_cache_destroy(cfq_pool);
1571         if (cfq_ioc_pool)
1572                 kmem_cache_destroy(cfq_ioc_pool);
1573 }
1574
1575 static int __init cfq_slab_setup(void)
1576 {
1577         crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0,
1578                                         NULL, NULL);
1579         if (!crq_pool)
1580                 goto fail;
1581
1582         cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0,
1583                                         NULL, NULL);
1584         if (!cfq_pool)
1585                 goto fail;
1586
1587         cfq_ioc_pool = kmem_cache_create("cfq_ioc_pool",
1588                         sizeof(struct cfq_io_context), 0, 0, NULL, NULL);
1589         if (!cfq_ioc_pool)
1590                 goto fail;
1591
1592         return 0;
1593 fail:
1594         cfq_slab_kill();
1595         return -ENOMEM;
1596 }
1597
1598
1599 /*
1600  * sysfs parts below -->
1601  */
1602 struct cfq_fs_entry {
1603         struct attribute attr;
1604         ssize_t (*show)(struct cfq_data *, char *);
1605         ssize_t (*store)(struct cfq_data *, const char *, size_t);
1606 };
1607
1608 static ssize_t
1609 cfq_var_show(unsigned int var, char *page)
1610 {
1611         return sprintf(page, "%d\n", var);
1612 }
1613
1614 static ssize_t
1615 cfq_var_store(unsigned int *var, const char *page, size_t count)
1616 {
1617         char *p = (char *) page;
1618
1619         *var = simple_strtoul(p, &p, 10);
1620         return count;
1621 }
1622
1623 static ssize_t
1624 cfq_clear_elapsed(struct cfq_data *cfqd, const char *page, size_t count)
1625 {
1626         max_elapsed_dispatch = max_elapsed_crq = 0;
1627         return count;
1628 }
1629
1630 static ssize_t
1631 cfq_set_key_type(struct cfq_data *cfqd, const char *page, size_t count)
1632 {
1633         spin_lock_irq(cfqd->queue->queue_lock);
1634         if (!strncmp(page, "pgid", 4))
1635                 cfqd->key_type = CFQ_KEY_PGID;
1636         else if (!strncmp(page, "tgid", 4))
1637                 cfqd->key_type = CFQ_KEY_TGID;
1638         else if (!strncmp(page, "uid", 3))
1639                 cfqd->key_type = CFQ_KEY_UID;
1640         else if (!strncmp(page, "gid", 3))
1641                 cfqd->key_type = CFQ_KEY_GID;
1642         spin_unlock_irq(cfqd->queue->queue_lock);
1643         return count;
1644 }
1645
1646 static ssize_t
1647 cfq_read_key_type(struct cfq_data *cfqd, char *page)
1648 {
1649         ssize_t len = 0;
1650         int i;
1651
1652         for (i = CFQ_KEY_PGID; i < CFQ_KEY_LAST; i++) {
1653                 if (cfqd->key_type == i)
1654                         len += sprintf(page+len, "[%s] ", cfq_key_types[i]);
1655                 else
1656                         len += sprintf(page+len, "%s ", cfq_key_types[i]);
1657         }
1658         len += sprintf(page+len, "\n");
1659         return len;
1660 }
1661
1662 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                            \
1663 static ssize_t __FUNC(struct cfq_data *cfqd, char *page)                \
1664 {                                                                       \
1665         unsigned int __data = __VAR;                                    \
1666         if (__CONV)                                                     \
1667                 __data = jiffies_to_msecs(__data);                      \
1668         return cfq_var_show(__data, (page));                            \
1669 }
1670 SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
1671 SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0);
1672 SHOW_FUNCTION(cfq_fifo_expire_r_show, cfqd->cfq_fifo_expire_r, 1);
1673 SHOW_FUNCTION(cfq_fifo_expire_w_show, cfqd->cfq_fifo_expire_w, 1);
1674 SHOW_FUNCTION(cfq_fifo_batch_expire_show, cfqd->cfq_fifo_batch_expire, 1);
1675 SHOW_FUNCTION(cfq_find_best_show, cfqd->find_best_crq, 0);
1676 SHOW_FUNCTION(cfq_back_max_show, cfqd->cfq_back_max, 0);
1677 SHOW_FUNCTION(cfq_back_penalty_show, cfqd->cfq_back_penalty, 0);
1678 #undef SHOW_FUNCTION
1679
1680 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                 \
1681 static ssize_t __FUNC(struct cfq_data *cfqd, const char *page, size_t count)    \
1682 {                                                                       \
1683         unsigned int __data;                                            \
1684         int ret = cfq_var_store(&__data, (page), count);                \
1685         if (__data < (MIN))                                             \
1686                 __data = (MIN);                                         \
1687         else if (__data > (MAX))                                        \
1688                 __data = (MAX);                                         \
1689         if (__CONV)                                                     \
1690                 *(__PTR) = msecs_to_jiffies(__data);                    \
1691         else                                                            \
1692                 *(__PTR) = __data;                                      \
1693         return ret;                                                     \
1694 }
1695 STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
1696 STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0);
1697 STORE_FUNCTION(cfq_fifo_expire_r_store, &cfqd->cfq_fifo_expire_r, 1, UINT_MAX, 1);
1698 STORE_FUNCTION(cfq_fifo_expire_w_store, &cfqd->cfq_fifo_expire_w, 1, UINT_MAX, 1);
1699 STORE_FUNCTION(cfq_fifo_batch_expire_store, &cfqd->cfq_fifo_batch_expire, 0, UINT_MAX, 1);
1700 STORE_FUNCTION(cfq_find_best_store, &cfqd->find_best_crq, 0, 1, 0);
1701 STORE_FUNCTION(cfq_back_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
1702 STORE_FUNCTION(cfq_back_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0);
1703 #undef STORE_FUNCTION
1704
1705 static struct cfq_fs_entry cfq_quantum_entry = {
1706         .attr = {.name = "quantum", .mode = S_IRUGO | S_IWUSR },
1707         .show = cfq_quantum_show,
1708         .store = cfq_quantum_store,
1709 };
1710 static struct cfq_fs_entry cfq_queued_entry = {
1711         .attr = {.name = "queued", .mode = S_IRUGO | S_IWUSR },
1712         .show = cfq_queued_show,
1713         .store = cfq_queued_store,
1714 };
1715 static struct cfq_fs_entry cfq_fifo_expire_r_entry = {
1716         .attr = {.name = "fifo_expire_sync", .mode = S_IRUGO | S_IWUSR },
1717         .show = cfq_fifo_expire_r_show,
1718         .store = cfq_fifo_expire_r_store,
1719 };
1720 static struct cfq_fs_entry cfq_fifo_expire_w_entry = {
1721         .attr = {.name = "fifo_expire_async", .mode = S_IRUGO | S_IWUSR },
1722         .show = cfq_fifo_expire_w_show,
1723         .store = cfq_fifo_expire_w_store,
1724 };
1725 static struct cfq_fs_entry cfq_fifo_batch_expire_entry = {
1726         .attr = {.name = "fifo_batch_expire", .mode = S_IRUGO | S_IWUSR },
1727         .show = cfq_fifo_batch_expire_show,
1728         .store = cfq_fifo_batch_expire_store,
1729 };
1730 static struct cfq_fs_entry cfq_find_best_entry = {
1731         .attr = {.name = "find_best_crq", .mode = S_IRUGO | S_IWUSR },
1732         .show = cfq_find_best_show,
1733         .store = cfq_find_best_store,
1734 };
1735 static struct cfq_fs_entry cfq_back_max_entry = {
1736         .attr = {.name = "back_seek_max", .mode = S_IRUGO | S_IWUSR },
1737         .show = cfq_back_max_show,
1738         .store = cfq_back_max_store,
1739 };
1740 static struct cfq_fs_entry cfq_back_penalty_entry = {
1741         .attr = {.name = "back_seek_penalty", .mode = S_IRUGO | S_IWUSR },
1742         .show = cfq_back_penalty_show,
1743         .store = cfq_back_penalty_store,
1744 };
1745 static struct cfq_fs_entry cfq_clear_elapsed_entry = {
1746         .attr = {.name = "clear_elapsed", .mode = S_IWUSR },
1747         .store = cfq_clear_elapsed,
1748 };
1749 static struct cfq_fs_entry cfq_key_type_entry = {
1750         .attr = {.name = "key_type", .mode = S_IRUGO | S_IWUSR },
1751         .show = cfq_read_key_type,
1752         .store = cfq_set_key_type,
1753 };
1754
1755 static struct attribute *default_attrs[] = {
1756         &cfq_quantum_entry.attr,
1757         &cfq_queued_entry.attr,
1758         &cfq_fifo_expire_r_entry.attr,
1759         &cfq_fifo_expire_w_entry.attr,
1760         &cfq_fifo_batch_expire_entry.attr,
1761         &cfq_key_type_entry.attr,
1762         &cfq_find_best_entry.attr,
1763         &cfq_back_max_entry.attr,
1764         &cfq_back_penalty_entry.attr,
1765         &cfq_clear_elapsed_entry.attr,
1766         NULL,
1767 };
1768
1769 #define to_cfq(atr) container_of((atr), struct cfq_fs_entry, attr)
1770
1771 static ssize_t
1772 cfq_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
1773 {
1774         elevator_t *e = container_of(kobj, elevator_t, kobj);
1775         struct cfq_fs_entry *entry = to_cfq(attr);
1776
1777         if (!entry->show)
1778                 return 0;
1779
1780         return entry->show(e->elevator_data, page);
1781 }
1782
1783 static ssize_t
1784 cfq_attr_store(struct kobject *kobj, struct attribute *attr,
1785                const char *page, size_t length)
1786 {
1787         elevator_t *e = container_of(kobj, elevator_t, kobj);
1788         struct cfq_fs_entry *entry = to_cfq(attr);
1789
1790         if (!entry->store)
1791                 return -EINVAL;
1792
1793         return entry->store(e->elevator_data, page, length);
1794 }
1795
1796 static struct sysfs_ops cfq_sysfs_ops = {
1797         .show   = cfq_attr_show,
1798         .store  = cfq_attr_store,
1799 };
1800
1801 static struct kobj_type cfq_ktype = {
1802         .sysfs_ops      = &cfq_sysfs_ops,
1803         .default_attrs  = default_attrs,
1804 };
1805
1806 static struct elevator_type iosched_cfq = {
1807         .ops = {
1808                 .elevator_merge_fn =            cfq_merge,
1809                 .elevator_merged_fn =           cfq_merged_request,
1810                 .elevator_merge_req_fn =        cfq_merged_requests,
1811                 .elevator_next_req_fn =         cfq_next_request,
1812                 .elevator_add_req_fn =          cfq_insert_request,
1813                 .elevator_remove_req_fn =       cfq_remove_request,
1814                 .elevator_requeue_req_fn =      cfq_requeue_request,
1815                 .elevator_deactivate_req_fn =   cfq_deactivate_request,
1816                 .elevator_queue_empty_fn =      cfq_queue_empty,
1817                 .elevator_completed_req_fn =    cfq_completed_request,
1818                 .elevator_former_req_fn =       cfq_former_request,
1819                 .elevator_latter_req_fn =       cfq_latter_request,
1820                 .elevator_set_req_fn =          cfq_set_request,
1821                 .elevator_put_req_fn =          cfq_put_request,
1822                 .elevator_may_queue_fn =        cfq_may_queue,
1823                 .elevator_init_fn =             cfq_init_queue,
1824                 .elevator_exit_fn =             cfq_exit_queue,
1825         },
1826         .elevator_ktype =       &cfq_ktype,
1827         .elevator_name =        "cfq",
1828         .elevator_owner =       THIS_MODULE,
1829 };
1830
1831 static int __init cfq_init(void)
1832 {
1833         int ret;
1834
1835         if (cfq_slab_setup())
1836                 return -ENOMEM;
1837
1838         ret = elv_register(&iosched_cfq);
1839         if (!ret) {
1840                 __module_get(THIS_MODULE);
1841                 return 0;
1842         }
1843
1844         cfq_slab_kill();
1845         return ret;
1846 }
1847
1848 static void __exit cfq_exit(void)
1849 {
1850         cfq_slab_kill();
1851         elv_unregister(&iosched_cfq);
1852 }
1853
1854 module_init(cfq_init);
1855 module_exit(cfq_exit);
1856
1857 MODULE_AUTHOR("Jens Axboe");
1858 MODULE_LICENSE("GPL");
1859 MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");