vserver 1.9.5.x5
[linux-2.6.git] / drivers / block / cfq-iosched.c
1 /*
2  *  linux/drivers/block/cfq-iosched.c
3  *
4  *  CFQ, or complete fairness queueing, disk scheduler.
5  *
6  *  Based on ideas from a previously unfinished io
7  *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
8  *
9  *  Copyright (C) 2003 Jens Axboe <axboe@suse.de>
10  */
11 #include <linux/kernel.h>
12 #include <linux/fs.h>
13 #include <linux/blkdev.h>
14 #include <linux/elevator.h>
15 #include <linux/bio.h>
16 #include <linux/config.h>
17 #include <linux/module.h>
18 #include <linux/slab.h>
19 #include <linux/init.h>
20 #include <linux/compiler.h>
21 #include <linux/hash.h>
22 #include <linux/rbtree.h>
23 #include <linux/mempool.h>
24
25 static unsigned long max_elapsed_crq;
26 static unsigned long max_elapsed_dispatch;
27
28 /*
29  * tunables
30  */
31 static int cfq_quantum = 4;             /* max queue in one round of service */
32 static int cfq_queued = 8;              /* minimum rq allocate limit per-queue*/
33 static int cfq_service = HZ;            /* period over which service is avg */
34 static int cfq_fifo_expire_r = HZ / 2;  /* fifo timeout for sync requests */
35 static int cfq_fifo_expire_w = 5 * HZ;  /* fifo timeout for async requests */
36 static int cfq_fifo_rate = HZ / 8;      /* fifo expiry rate */
37 static int cfq_back_max = 16 * 1024;    /* maximum backwards seek, in KiB */
38 static int cfq_back_penalty = 2;        /* penalty of a backwards seek */
39
40 /*
41  * for the hash of cfqq inside the cfqd
42  */
43 #define CFQ_QHASH_SHIFT         6
44 #define CFQ_QHASH_ENTRIES       (1 << CFQ_QHASH_SHIFT)
45 #define list_entry_qhash(entry) hlist_entry((entry), struct cfq_queue, cfq_hash)
46
47 /*
48  * for the hash of crq inside the cfqq
49  */
50 #define CFQ_MHASH_SHIFT         6
51 #define CFQ_MHASH_BLOCK(sec)    ((sec) >> 3)
52 #define CFQ_MHASH_ENTRIES       (1 << CFQ_MHASH_SHIFT)
53 #define CFQ_MHASH_FN(sec)       hash_long(CFQ_MHASH_BLOCK(sec), CFQ_MHASH_SHIFT)
54 #define rq_hash_key(rq)         ((rq)->sector + (rq)->nr_sectors)
55 #define list_entry_hash(ptr)    hlist_entry((ptr), struct cfq_rq, hash)
56
57 #define list_entry_cfqq(ptr)    list_entry((ptr), struct cfq_queue, cfq_list)
58
59 #define RQ_DATA(rq)             (rq)->elevator_private
60
61 /*
62  * rb-tree defines
63  */
64 #define RB_NONE                 (2)
65 #define RB_EMPTY(node)          ((node)->rb_node == NULL)
66 #define RB_CLEAR_COLOR(node)    (node)->rb_color = RB_NONE
67 #define RB_CLEAR(node)          do {    \
68         (node)->rb_parent = NULL;       \
69         RB_CLEAR_COLOR((node));         \
70         (node)->rb_right = NULL;        \
71         (node)->rb_left = NULL;         \
72 } while (0)
73 #define RB_CLEAR_ROOT(root)     ((root)->rb_node = NULL)
74 #define ON_RB(node)             ((node)->rb_color != RB_NONE)
75 #define rb_entry_crq(node)      rb_entry((node), struct cfq_rq, rb_node)
76 #define rq_rb_key(rq)           (rq)->sector
77
78 /*
79  * threshold for switching off non-tag accounting
80  */
81 #define CFQ_MAX_TAG             (4)
82
83 /*
84  * sort key types and names
85  */
86 enum {
87         CFQ_KEY_PGID,
88         CFQ_KEY_TGID,
89         CFQ_KEY_UID,
90         CFQ_KEY_GID,
91         CFQ_KEY_LAST,
92 };
93
94 static char *cfq_key_types[] = { "pgid", "tgid", "uid", "gid", NULL };
95
96 static kmem_cache_t *crq_pool;
97 static kmem_cache_t *cfq_pool;
98 static kmem_cache_t *cfq_ioc_pool;
99
100 struct cfq_data {
101         struct list_head rr_list;
102         struct list_head empty_list;
103
104         struct hlist_head *cfq_hash;
105         struct hlist_head *crq_hash;
106
107         /* queues on rr_list (ie they have pending requests */
108         unsigned int busy_queues;
109
110         unsigned int max_queued;
111
112         atomic_t ref;
113
114         int key_type;
115
116         mempool_t *crq_pool;
117
118         request_queue_t *queue;
119
120         sector_t last_sector;
121
122         int rq_in_driver;
123
124         /*
125          * tunables, see top of file
126          */
127         unsigned int cfq_quantum;
128         unsigned int cfq_queued;
129         unsigned int cfq_fifo_expire_r;
130         unsigned int cfq_fifo_expire_w;
131         unsigned int cfq_fifo_batch_expire;
132         unsigned int cfq_back_penalty;
133         unsigned int cfq_back_max;
134         unsigned int find_best_crq;
135
136         unsigned int cfq_tagged;
137 };
138
139 struct cfq_queue {
140         /* reference count */
141         atomic_t ref;
142         /* parent cfq_data */
143         struct cfq_data *cfqd;
144         /* hash of mergeable requests */
145         struct hlist_node cfq_hash;
146         /* hash key */
147         unsigned long key;
148         /* whether queue is on rr (or empty) list */
149         int on_rr;
150         /* on either rr or empty list of cfqd */
151         struct list_head cfq_list;
152         /* sorted list of pending requests */
153         struct rb_root sort_list;
154         /* if fifo isn't expired, next request to serve */
155         struct cfq_rq *next_crq;
156         /* requests queued in sort_list */
157         int queued[2];
158         /* currently allocated requests */
159         int allocated[2];
160         /* fifo list of requests in sort_list */
161         struct list_head fifo[2];
162         /* last time fifo expired */
163         unsigned long last_fifo_expire;
164
165         int key_type;
166
167         unsigned long service_start;
168         unsigned long service_used;
169
170         unsigned int max_rate;
171
172         /* number of requests that have been handed to the driver */
173         int in_flight;
174         /* number of currently allocated requests */
175         int alloc_limit[2];
176 };
177
178 struct cfq_rq {
179         struct rb_node rb_node;
180         sector_t rb_key;
181         struct request *request;
182         struct hlist_node hash;
183
184         struct cfq_queue *cfq_queue;
185         struct cfq_io_context *io_context;
186
187         unsigned long service_start;
188         unsigned long queue_start;
189
190         unsigned int in_flight : 1;
191         unsigned int accounted : 1;
192         unsigned int is_sync   : 1;
193         unsigned int is_write  : 1;
194 };
195
196 static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned long);
197 static void cfq_dispatch_sort(request_queue_t *, struct cfq_rq *);
198 static void cfq_update_next_crq(struct cfq_rq *);
199 static void cfq_put_cfqd(struct cfq_data *cfqd);
200
201 /*
202  * what the fairness is based on (ie how processes are grouped and
203  * differentiated)
204  */
205 static inline unsigned long
206 cfq_hash_key(struct cfq_data *cfqd, struct task_struct *tsk)
207 {
208         /*
209          * optimize this so that ->key_type is the offset into the struct
210          */
211         switch (cfqd->key_type) {
212                 case CFQ_KEY_PGID:
213                         return process_group(tsk);
214                 default:
215                 case CFQ_KEY_TGID:
216                         return tsk->tgid;
217                 case CFQ_KEY_UID:
218                         return tsk->uid;
219                 case CFQ_KEY_GID:
220                         return tsk->gid;
221         }
222 }
223
224 /*
225  * lots of deadline iosched dupes, can be abstracted later...
226  */
227 static inline void cfq_del_crq_hash(struct cfq_rq *crq)
228 {
229         hlist_del_init(&crq->hash);
230 }
231
232 static void cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq)
233 {
234         cfq_del_crq_hash(crq);
235
236         if (q->last_merge == crq->request)
237                 q->last_merge = NULL;
238
239         cfq_update_next_crq(crq);
240 }
241
242 static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq)
243 {
244         const int hash_idx = CFQ_MHASH_FN(rq_hash_key(crq->request));
245
246         BUG_ON(!hlist_unhashed(&crq->hash));
247
248         hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]);
249 }
250
251 static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset)
252 {
253         struct hlist_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)];
254         struct hlist_node *entry, *next;
255
256         hlist_for_each_safe(entry, next, hash_list) {
257                 struct cfq_rq *crq = list_entry_hash(entry);
258                 struct request *__rq = crq->request;
259
260                 BUG_ON(hlist_unhashed(&crq->hash));
261
262                 if (!rq_mergeable(__rq)) {
263                         cfq_del_crq_hash(crq);
264                         continue;
265                 }
266
267                 if (rq_hash_key(__rq) == offset)
268                         return __rq;
269         }
270
271         return NULL;
272 }
273
274 /*
275  * Lifted from AS - choose which of crq1 and crq2 that is best served now.
276  * We choose the request that is closest to the head right now. Distance
277  * behind the head are penalized and only allowed to a certain extent.
278  */
279 static struct cfq_rq *
280 cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2)
281 {
282         sector_t last, s1, s2, d1 = 0, d2 = 0;
283         int r1_wrap = 0, r2_wrap = 0;   /* requests are behind the disk head */
284         unsigned long back_max;
285
286         if (crq1 == NULL || crq1 == crq2)
287                 return crq2;
288         if (crq2 == NULL)
289                 return crq1;
290
291         s1 = crq1->request->sector;
292         s2 = crq2->request->sector;
293
294         last = cfqd->last_sector;
295
296 #if 0
297         if (!list_empty(&cfqd->queue->queue_head)) {
298                 struct list_head *entry = &cfqd->queue->queue_head;
299                 unsigned long distance = ~0UL;
300                 struct request *rq;
301
302                 while ((entry = entry->prev) != &cfqd->queue->queue_head) {
303                         rq = list_entry_rq(entry);
304
305                         if (blk_barrier_rq(rq))
306                                 break;
307
308                         if (distance < abs(s1 - rq->sector + rq->nr_sectors)) {
309                                 distance = abs(s1 - rq->sector +rq->nr_sectors);
310                                 last = rq->sector + rq->nr_sectors;
311                         }
312                         if (distance < abs(s2 - rq->sector + rq->nr_sectors)) {
313                                 distance = abs(s2 - rq->sector +rq->nr_sectors);
314                                 last = rq->sector + rq->nr_sectors;
315                         }
316                 }
317         }
318 #endif
319
320         /*
321          * by definition, 1KiB is 2 sectors
322          */
323         back_max = cfqd->cfq_back_max * 2;
324
325         /*
326          * Strict one way elevator _except_ in the case where we allow
327          * short backward seeks which are biased as twice the cost of a
328          * similar forward seek.
329          */
330         if (s1 >= last)
331                 d1 = s1 - last;
332         else if (s1 + back_max >= last)
333                 d1 = (last - s1) * cfqd->cfq_back_penalty;
334         else
335                 r1_wrap = 1;
336
337         if (s2 >= last)
338                 d2 = s2 - last;
339         else if (s2 + back_max >= last)
340                 d2 = (last - s2) * cfqd->cfq_back_penalty;
341         else
342                 r2_wrap = 1;
343
344         /* Found required data */
345         if (!r1_wrap && r2_wrap)
346                 return crq1;
347         else if (!r2_wrap && r1_wrap)
348                 return crq2;
349         else if (r1_wrap && r2_wrap) {
350                 /* both behind the head */
351                 if (s1 <= s2)
352                         return crq1;
353                 else
354                         return crq2;
355         }
356
357         /* Both requests in front of the head */
358         if (d1 < d2)
359                 return crq1;
360         else if (d2 < d1)
361                 return crq2;
362         else {
363                 if (s1 >= s2)
364                         return crq1;
365                 else
366                         return crq2;
367         }
368 }
369
370 /*
371  * would be nice to take fifo expire time into account as well
372  */
373 static struct cfq_rq *
374 cfq_find_next_crq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
375                   struct cfq_rq *last)
376 {
377         struct cfq_rq *crq_next = NULL, *crq_prev = NULL;
378         struct rb_node *rbnext, *rbprev;
379
380         if (!ON_RB(&last->rb_node))
381                 return NULL;
382
383         if ((rbnext = rb_next(&last->rb_node)) == NULL)
384                 rbnext = rb_first(&cfqq->sort_list);
385
386         rbprev = rb_prev(&last->rb_node);
387
388         if (rbprev)
389                 crq_prev = rb_entry_crq(rbprev);
390         if (rbnext)
391                 crq_next = rb_entry_crq(rbnext);
392
393         return cfq_choose_req(cfqd, crq_next, crq_prev);
394 }
395
396 static void cfq_update_next_crq(struct cfq_rq *crq)
397 {
398         struct cfq_queue *cfqq = crq->cfq_queue;
399
400         if (cfqq->next_crq == crq)
401                 cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq);
402 }
403
404 static int cfq_check_sort_rr_list(struct cfq_queue *cfqq)
405 {
406         struct list_head *head = &cfqq->cfqd->rr_list;
407         struct list_head *next, *prev;
408
409         /*
410          * list might still be ordered
411          */
412         next = cfqq->cfq_list.next;
413         if (next != head) {
414                 struct cfq_queue *cnext = list_entry_cfqq(next);
415
416                 if (cfqq->service_used > cnext->service_used)
417                         return 1;
418         }
419
420         prev = cfqq->cfq_list.prev;
421         if (prev != head) {
422                 struct cfq_queue *cprev = list_entry_cfqq(prev);
423
424                 if (cfqq->service_used < cprev->service_used)
425                         return 1;
426         }
427
428         return 0;
429 }
430
431 static void cfq_sort_rr_list(struct cfq_queue *cfqq, int new_queue)
432 {
433         struct list_head *entry = &cfqq->cfqd->rr_list;
434
435         if (!cfqq->on_rr)
436                 return;
437         if (!new_queue && !cfq_check_sort_rr_list(cfqq))
438                 return;
439
440         list_del(&cfqq->cfq_list);
441
442         /*
443          * sort by our mean service_used, sub-sort by in-flight requests
444          */
445         while ((entry = entry->prev) != &cfqq->cfqd->rr_list) {
446                 struct cfq_queue *__cfqq = list_entry_cfqq(entry);
447
448                 if (cfqq->service_used > __cfqq->service_used)
449                         break;
450                 else if (cfqq->service_used == __cfqq->service_used) {
451                         struct list_head *prv;
452
453                         while ((prv = entry->prev) != &cfqq->cfqd->rr_list) {
454                                 __cfqq = list_entry_cfqq(prv);
455
456                                 WARN_ON(__cfqq->service_used > cfqq->service_used);
457                                 if (cfqq->service_used != __cfqq->service_used)
458                                         break;
459                                 if (cfqq->in_flight > __cfqq->in_flight)
460                                         break;
461
462                                 entry = prv;
463                         }
464                 }
465         }
466
467         list_add(&cfqq->cfq_list, entry);
468 }
469
470 /*
471  * add to busy list of queues for service, trying to be fair in ordering
472  * the pending list according to requests serviced
473  */
474 static inline void
475 cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
476 {
477         /*
478          * it's currently on the empty list
479          */
480         cfqq->on_rr = 1;
481         cfqd->busy_queues++;
482
483         if (time_after(jiffies, cfqq->service_start + cfq_service))
484                 cfqq->service_used >>= 3;
485
486         cfq_sort_rr_list(cfqq, 1);
487 }
488
489 static inline void
490 cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
491 {
492         list_move(&cfqq->cfq_list, &cfqd->empty_list);
493         cfqq->on_rr = 0;
494
495         BUG_ON(!cfqd->busy_queues);
496         cfqd->busy_queues--;
497 }
498
499 /*
500  * rb tree support functions
501  */
502 static inline void cfq_del_crq_rb(struct cfq_rq *crq)
503 {
504         struct cfq_queue *cfqq = crq->cfq_queue;
505
506         if (ON_RB(&crq->rb_node)) {
507                 struct cfq_data *cfqd = cfqq->cfqd;
508
509                 BUG_ON(!cfqq->queued[crq->is_sync]);
510
511                 cfq_update_next_crq(crq);
512
513                 cfqq->queued[crq->is_sync]--;
514                 rb_erase(&crq->rb_node, &cfqq->sort_list);
515                 RB_CLEAR_COLOR(&crq->rb_node);
516
517                 if (RB_EMPTY(&cfqq->sort_list) && cfqq->on_rr)
518                         cfq_del_cfqq_rr(cfqd, cfqq);
519         }
520 }
521
522 static struct cfq_rq *
523 __cfq_add_crq_rb(struct cfq_rq *crq)
524 {
525         struct rb_node **p = &crq->cfq_queue->sort_list.rb_node;
526         struct rb_node *parent = NULL;
527         struct cfq_rq *__crq;
528
529         while (*p) {
530                 parent = *p;
531                 __crq = rb_entry_crq(parent);
532
533                 if (crq->rb_key < __crq->rb_key)
534                         p = &(*p)->rb_left;
535                 else if (crq->rb_key > __crq->rb_key)
536                         p = &(*p)->rb_right;
537                 else
538                         return __crq;
539         }
540
541         rb_link_node(&crq->rb_node, parent, p);
542         return NULL;
543 }
544
545 static void cfq_add_crq_rb(struct cfq_rq *crq)
546 {
547         struct cfq_queue *cfqq = crq->cfq_queue;
548         struct cfq_data *cfqd = cfqq->cfqd;
549         struct request *rq = crq->request;
550         struct cfq_rq *__alias;
551
552         crq->rb_key = rq_rb_key(rq);
553         cfqq->queued[crq->is_sync]++;
554
555         /*
556          * looks a little odd, but the first insert might return an alias.
557          * if that happens, put the alias on the dispatch list
558          */
559         while ((__alias = __cfq_add_crq_rb(crq)) != NULL)
560                 cfq_dispatch_sort(cfqd->queue, __alias);
561
562         rb_insert_color(&crq->rb_node, &cfqq->sort_list);
563
564         if (!cfqq->on_rr)
565                 cfq_add_cfqq_rr(cfqd, cfqq);
566
567         /*
568          * check if this request is a better next-serve candidate
569          */
570         cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq);
571 }
572
573 static inline void
574 cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
575 {
576         if (ON_RB(&crq->rb_node)) {
577                 rb_erase(&crq->rb_node, &cfqq->sort_list);
578                 cfqq->queued[crq->is_sync]--;
579         }
580
581         cfq_add_crq_rb(crq);
582 }
583
584 static struct request *
585 cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector)
586 {
587         const unsigned long key = cfq_hash_key(cfqd, current);
588         struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, key);
589         struct rb_node *n;
590
591         if (!cfqq)
592                 goto out;
593
594         n = cfqq->sort_list.rb_node;
595         while (n) {
596                 struct cfq_rq *crq = rb_entry_crq(n);
597
598                 if (sector < crq->rb_key)
599                         n = n->rb_left;
600                 else if (sector > crq->rb_key)
601                         n = n->rb_right;
602                 else
603                         return crq->request;
604         }
605
606 out:
607         return NULL;
608 }
609
610 /*
611  * make sure the service time gets corrected on reissue of this request
612  */
613 static void cfq_requeue_request(request_queue_t *q, struct request *rq)
614 {
615         struct cfq_rq *crq = RQ_DATA(rq);
616
617         if (crq) {
618                 struct cfq_queue *cfqq = crq->cfq_queue;
619
620                 if (cfqq->cfqd->cfq_tagged) {
621                         cfqq->service_used--;
622                         cfq_sort_rr_list(cfqq, 0);
623                 }
624
625                 if (crq->accounted) {
626                         crq->accounted = 0;
627                         cfqq->cfqd->rq_in_driver--;
628                 }
629         }
630         list_add(&rq->queuelist, &q->queue_head);
631 }
632
633 static void cfq_remove_request(request_queue_t *q, struct request *rq)
634 {
635         struct cfq_rq *crq = RQ_DATA(rq);
636
637         if (crq) {
638                 cfq_remove_merge_hints(q, crq);
639                 list_del_init(&rq->queuelist);
640
641                 if (crq->cfq_queue)
642                         cfq_del_crq_rb(crq);
643         }
644 }
645
646 static int
647 cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
648 {
649         struct cfq_data *cfqd = q->elevator->elevator_data;
650         struct request *__rq;
651         int ret;
652
653         ret = elv_try_last_merge(q, bio);
654         if (ret != ELEVATOR_NO_MERGE) {
655                 __rq = q->last_merge;
656                 goto out_insert;
657         }
658
659         __rq = cfq_find_rq_hash(cfqd, bio->bi_sector);
660         if (__rq) {
661                 BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector);
662
663                 if (elv_rq_merge_ok(__rq, bio)) {
664                         ret = ELEVATOR_BACK_MERGE;
665                         goto out;
666                 }
667         }
668
669         __rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio));
670         if (__rq) {
671                 if (elv_rq_merge_ok(__rq, bio)) {
672                         ret = ELEVATOR_FRONT_MERGE;
673                         goto out;
674                 }
675         }
676
677         return ELEVATOR_NO_MERGE;
678 out:
679         q->last_merge = __rq;
680 out_insert:
681         *req = __rq;
682         return ret;
683 }
684
685 static void cfq_merged_request(request_queue_t *q, struct request *req)
686 {
687         struct cfq_data *cfqd = q->elevator->elevator_data;
688         struct cfq_rq *crq = RQ_DATA(req);
689
690         cfq_del_crq_hash(crq);
691         cfq_add_crq_hash(cfqd, crq);
692
693         if (ON_RB(&crq->rb_node) && (rq_rb_key(req) != crq->rb_key)) {
694                 struct cfq_queue *cfqq = crq->cfq_queue;
695
696                 cfq_update_next_crq(crq);
697                 cfq_reposition_crq_rb(cfqq, crq);
698         }
699
700         q->last_merge = req;
701 }
702
703 static void
704 cfq_merged_requests(request_queue_t *q, struct request *rq,
705                     struct request *next)
706 {
707         struct cfq_rq *crq = RQ_DATA(rq);
708         struct cfq_rq *cnext = RQ_DATA(next);
709
710         cfq_merged_request(q, rq);
711
712         if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist)) {
713                 if (time_before(cnext->queue_start, crq->queue_start)) {
714                         list_move(&rq->queuelist, &next->queuelist);
715                         crq->queue_start = cnext->queue_start;
716                 }
717         }
718
719         cfq_update_next_crq(cnext);
720         cfq_remove_request(q, next);
721 }
722
723 /*
724  * we dispatch cfqd->cfq_quantum requests in total from the rr_list queues,
725  * this function sector sorts the selected request to minimize seeks. we start
726  * at cfqd->last_sector, not 0.
727  */
728 static void cfq_dispatch_sort(request_queue_t *q, struct cfq_rq *crq)
729 {
730         struct cfq_data *cfqd = q->elevator->elevator_data;
731         struct cfq_queue *cfqq = crq->cfq_queue;
732         struct list_head *head = &q->queue_head, *entry = head;
733         struct request *__rq;
734         sector_t last;
735
736         cfq_del_crq_rb(crq);
737         cfq_remove_merge_hints(q, crq);
738         list_del(&crq->request->queuelist);
739
740         last = cfqd->last_sector;
741         while ((entry = entry->prev) != head) {
742                 __rq = list_entry_rq(entry);
743
744                 if (blk_barrier_rq(crq->request))
745                         break;
746                 if (!blk_fs_request(crq->request))
747                         break;
748
749                 if (crq->request->sector > __rq->sector)
750                         break;
751                 if (__rq->sector > last && crq->request->sector < last) {
752                         last = crq->request->sector;
753                         break;
754                 }
755         }
756
757         cfqd->last_sector = last;
758         crq->in_flight = 1;
759         cfqq->in_flight++;
760         list_add(&crq->request->queuelist, entry);
761 }
762
763 /*
764  * return expired entry, or NULL to just start from scratch in rbtree
765  */
766 static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq)
767 {
768         struct cfq_data *cfqd = cfqq->cfqd;
769         const int reads = !list_empty(&cfqq->fifo[0]);
770         const int writes = !list_empty(&cfqq->fifo[1]);
771         unsigned long now = jiffies;
772         struct cfq_rq *crq;
773
774         if (time_before(now, cfqq->last_fifo_expire + cfqd->cfq_fifo_batch_expire))
775                 return NULL;
776
777         crq = RQ_DATA(list_entry(cfqq->fifo[0].next, struct request, queuelist));
778         if (reads && time_after(now, crq->queue_start + cfqd->cfq_fifo_expire_r)) {
779                 cfqq->last_fifo_expire = now;
780                 return crq;
781         }
782
783         crq = RQ_DATA(list_entry(cfqq->fifo[1].next, struct request, queuelist));
784         if (writes && time_after(now, crq->queue_start + cfqd->cfq_fifo_expire_w)) {
785                 cfqq->last_fifo_expire = now;
786                 return crq;
787         }
788
789         return NULL;
790 }
791
792 /*
793  * dispatch a single request from given queue
794  */
795 static inline void
796 cfq_dispatch_request(request_queue_t *q, struct cfq_data *cfqd,
797                      struct cfq_queue *cfqq)
798 {
799         struct cfq_rq *crq;
800
801         /*
802          * follow expired path, else get first next available
803          */
804         if ((crq = cfq_check_fifo(cfqq)) == NULL) {
805                 if (cfqd->find_best_crq)
806                         crq = cfqq->next_crq;
807                 else
808                         crq = rb_entry_crq(rb_first(&cfqq->sort_list));
809         }
810
811         cfqd->last_sector = crq->request->sector + crq->request->nr_sectors;
812
813         /*
814          * finally, insert request into driver list
815          */
816         cfq_dispatch_sort(q, crq);
817 }
818
819 static int cfq_dispatch_requests(request_queue_t *q, int max_dispatch)
820 {
821         struct cfq_data *cfqd = q->elevator->elevator_data;
822         struct cfq_queue *cfqq;
823         struct list_head *entry, *tmp;
824         int queued, busy_queues, first_round;
825
826         if (list_empty(&cfqd->rr_list))
827                 return 0;
828
829         queued = 0;
830         first_round = 1;
831 restart:
832         busy_queues = 0;
833         list_for_each_safe(entry, tmp, &cfqd->rr_list) {
834                 cfqq = list_entry_cfqq(entry);
835
836                 BUG_ON(RB_EMPTY(&cfqq->sort_list));
837
838                 /*
839                  * first round of queueing, only select from queues that
840                  * don't already have io in-flight
841                  */
842                 if (first_round && cfqq->in_flight)
843                         continue;
844
845                 cfq_dispatch_request(q, cfqd, cfqq);
846
847                 if (!RB_EMPTY(&cfqq->sort_list))
848                         busy_queues++;
849
850                 queued++;
851         }
852
853         if ((queued < max_dispatch) && (busy_queues || first_round)) {
854                 first_round = 0;
855                 goto restart;
856         }
857
858         return queued;
859 }
860
861 static inline void cfq_account_dispatch(struct cfq_rq *crq)
862 {
863         struct cfq_queue *cfqq = crq->cfq_queue;
864         struct cfq_data *cfqd = cfqq->cfqd;
865         unsigned long now, elapsed;
866
867         if (!blk_fs_request(crq->request))
868                 return;
869
870         /*
871          * accounted bit is necessary since some drivers will call
872          * elv_next_request() many times for the same request (eg ide)
873          */
874         if (crq->accounted)
875                 return;
876
877         now = jiffies;
878         if (cfqq->service_start == ~0UL)
879                 cfqq->service_start = now;
880
881         /*
882          * on drives with tagged command queueing, command turn-around time
883          * doesn't necessarily reflect the time spent processing this very
884          * command inside the drive. so do the accounting differently there,
885          * by just sorting on the number of requests
886          */
887         if (cfqd->cfq_tagged) {
888                 if (time_after(now, cfqq->service_start + cfq_service)) {
889                         cfqq->service_start = now;
890                         cfqq->service_used /= 10;
891                 }
892
893                 cfqq->service_used++;
894                 cfq_sort_rr_list(cfqq, 0);
895         }
896
897         elapsed = now - crq->queue_start;
898         if (elapsed > max_elapsed_dispatch)
899                 max_elapsed_dispatch = elapsed;
900
901         crq->accounted = 1;
902         crq->service_start = now;
903
904         if (++cfqd->rq_in_driver >= CFQ_MAX_TAG && !cfqd->cfq_tagged) {
905                 cfqq->cfqd->cfq_tagged = 1;
906                 printk("cfq: depth %d reached, tagging now on\n", CFQ_MAX_TAG);
907         }
908 }
909
910 static inline void
911 cfq_account_completion(struct cfq_queue *cfqq, struct cfq_rq *crq)
912 {
913         struct cfq_data *cfqd = cfqq->cfqd;
914
915         if (!crq->accounted)
916                 return;
917
918         WARN_ON(!cfqd->rq_in_driver);
919         cfqd->rq_in_driver--;
920
921         if (!cfqd->cfq_tagged) {
922                 unsigned long now = jiffies;
923                 unsigned long duration = now - crq->service_start;
924
925                 if (time_after(now, cfqq->service_start + cfq_service)) {
926                         cfqq->service_start = now;
927                         cfqq->service_used >>= 3;
928                 }
929
930                 cfqq->service_used += duration;
931                 cfq_sort_rr_list(cfqq, 0);
932
933                 if (duration > max_elapsed_crq)
934                         max_elapsed_crq = duration;
935         }
936 }
937
938 static struct request *cfq_next_request(request_queue_t *q)
939 {
940         struct cfq_data *cfqd = q->elevator->elevator_data;
941         struct request *rq;
942
943         if (!list_empty(&q->queue_head)) {
944                 struct cfq_rq *crq;
945 dispatch:
946                 rq = list_entry_rq(q->queue_head.next);
947
948                 if ((crq = RQ_DATA(rq)) != NULL) {
949                         cfq_remove_merge_hints(q, crq);
950                         cfq_account_dispatch(crq);
951                 }
952
953                 return rq;
954         }
955
956         if (cfq_dispatch_requests(q, cfqd->cfq_quantum))
957                 goto dispatch;
958
959         return NULL;
960 }
961
962 /*
963  * task holds one reference to the queue, dropped when task exits. each crq
964  * in-flight on this queue also holds a reference, dropped when crq is freed.
965  *
966  * queue lock must be held here.
967  */
968 static void cfq_put_queue(struct cfq_queue *cfqq)
969 {
970         BUG_ON(!atomic_read(&cfqq->ref));
971
972         if (!atomic_dec_and_test(&cfqq->ref))
973                 return;
974
975         BUG_ON(rb_first(&cfqq->sort_list));
976         BUG_ON(cfqq->on_rr);
977
978         cfq_put_cfqd(cfqq->cfqd);
979
980         /*
981          * it's on the empty list and still hashed
982          */
983         list_del(&cfqq->cfq_list);
984         hlist_del(&cfqq->cfq_hash);
985         kmem_cache_free(cfq_pool, cfqq);
986 }
987
988 static inline struct cfq_queue *
989 __cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key, const int hashval)
990 {
991         struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
992         struct hlist_node *entry, *next;
993
994         hlist_for_each_safe(entry, next, hash_list) {
995                 struct cfq_queue *__cfqq = list_entry_qhash(entry);
996
997                 if (__cfqq->key == key)
998                         return __cfqq;
999         }
1000
1001         return NULL;
1002 }
1003
1004 static struct cfq_queue *
1005 cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key)
1006 {
1007         return __cfq_find_cfq_hash(cfqd, key, hash_long(key, CFQ_QHASH_SHIFT));
1008 }
1009
1010 static inline void
1011 cfq_rehash_cfqq(struct cfq_data *cfqd, struct cfq_queue **cfqq,
1012                 struct cfq_io_context *cic)
1013 {
1014         unsigned long hashkey = cfq_hash_key(cfqd, current);
1015         unsigned long hashval = hash_long(hashkey, CFQ_QHASH_SHIFT);
1016         struct cfq_queue *__cfqq;
1017         unsigned long flags;
1018
1019         spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1020
1021         hlist_del(&(*cfqq)->cfq_hash);
1022
1023         __cfqq = __cfq_find_cfq_hash(cfqd, hashkey, hashval);
1024         if (!__cfqq || __cfqq == *cfqq) {
1025                 __cfqq = *cfqq;
1026                 hlist_add_head(&__cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
1027                 __cfqq->key_type = cfqd->key_type;
1028         } else {
1029                 atomic_inc(&__cfqq->ref);
1030                 cic->cfqq = __cfqq;
1031                 cfq_put_queue(*cfqq);
1032                 *cfqq = __cfqq;
1033         }
1034
1035         cic->cfqq = __cfqq;
1036         spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
1037 }
1038
1039 static void cfq_free_io_context(struct cfq_io_context *cic)
1040 {
1041         kmem_cache_free(cfq_ioc_pool, cic);
1042 }
1043
1044 /*
1045  * locking hierarchy is: io_context lock -> queue locks
1046  */
1047 static void cfq_exit_io_context(struct cfq_io_context *cic)
1048 {
1049         struct cfq_queue *cfqq = cic->cfqq;
1050         struct list_head *entry = &cic->list;
1051         request_queue_t *q;
1052         unsigned long flags;
1053
1054         /*
1055          * put the reference this task is holding to the various queues
1056          */
1057         spin_lock_irqsave(&cic->ioc->lock, flags);
1058         while ((entry = cic->list.next) != &cic->list) {
1059                 struct cfq_io_context *__cic;
1060
1061                 __cic = list_entry(entry, struct cfq_io_context, list);
1062                 list_del(entry);
1063
1064                 q = __cic->cfqq->cfqd->queue;
1065                 spin_lock(q->queue_lock);
1066                 cfq_put_queue(__cic->cfqq);
1067                 spin_unlock(q->queue_lock);
1068         }
1069
1070         q = cfqq->cfqd->queue;
1071         spin_lock(q->queue_lock);
1072         cfq_put_queue(cfqq);
1073         spin_unlock(q->queue_lock);
1074
1075         cic->cfqq = NULL;
1076         spin_unlock_irqrestore(&cic->ioc->lock, flags);
1077 }
1078
1079 static struct cfq_io_context *cfq_alloc_io_context(int gfp_flags)
1080 {
1081         struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_flags);
1082
1083         if (cic) {
1084                 cic->dtor = cfq_free_io_context;
1085                 cic->exit = cfq_exit_io_context;
1086                 INIT_LIST_HEAD(&cic->list);
1087                 cic->cfqq = NULL;
1088         }
1089
1090         return cic;
1091 }
1092
1093 /*
1094  * Setup general io context and cfq io context. There can be several cfq
1095  * io contexts per general io context, if this process is doing io to more
1096  * than one device managed by cfq. Note that caller is holding a reference to
1097  * cfqq, so we don't need to worry about it disappearing
1098  */
1099 static struct cfq_io_context *
1100 cfq_get_io_context(struct cfq_queue **cfqq, int gfp_flags)
1101 {
1102         struct cfq_data *cfqd = (*cfqq)->cfqd;
1103         struct cfq_queue *__cfqq = *cfqq;
1104         struct cfq_io_context *cic;
1105         struct io_context *ioc;
1106
1107         might_sleep_if(gfp_flags & __GFP_WAIT);
1108
1109         ioc = get_io_context(gfp_flags);
1110         if (!ioc)
1111                 return NULL;
1112
1113         if ((cic = ioc->cic) == NULL) {
1114                 cic = cfq_alloc_io_context(gfp_flags);
1115
1116                 if (cic == NULL)
1117                         goto err;
1118
1119                 ioc->cic = cic;
1120                 cic->ioc = ioc;
1121                 cic->cfqq = __cfqq;
1122                 atomic_inc(&__cfqq->ref);
1123         } else {
1124                 struct cfq_io_context *__cic;
1125                 unsigned long flags;
1126
1127                 /*
1128                  * since the first cic on the list is actually the head
1129                  * itself, need to check this here or we'll duplicate an
1130                  * cic per ioc for no reason
1131                  */
1132                 if (cic->cfqq == __cfqq)
1133                         goto out;
1134
1135                 /*
1136                  * cic exists, check if we already are there. linear search
1137                  * should be ok here, the list will usually not be more than
1138                  * 1 or a few entries long
1139                  */
1140                 spin_lock_irqsave(&ioc->lock, flags);
1141                 list_for_each_entry(__cic, &cic->list, list) {
1142                         /*
1143                          * this process is already holding a reference to
1144                          * this queue, so no need to get one more
1145                          */
1146                         if (__cic->cfqq == __cfqq) {
1147                                 cic = __cic;
1148                                 spin_unlock_irqrestore(&ioc->lock, flags);
1149                                 goto out;
1150                         }
1151                 }
1152                 spin_unlock_irqrestore(&ioc->lock, flags);
1153
1154                 /*
1155                  * nope, process doesn't have a cic assoicated with this
1156                  * cfqq yet. get a new one and add to list
1157                  */
1158                 __cic = cfq_alloc_io_context(gfp_flags);
1159                 if (__cic == NULL)
1160                         goto err;
1161
1162                 __cic->ioc = ioc;
1163                 __cic->cfqq = __cfqq;
1164                 atomic_inc(&__cfqq->ref);
1165                 spin_lock_irqsave(&ioc->lock, flags);
1166                 list_add(&__cic->list, &cic->list);
1167                 spin_unlock_irqrestore(&ioc->lock, flags);
1168
1169                 cic = __cic;
1170                 *cfqq = __cfqq;
1171         }
1172
1173 out:
1174         /*
1175          * if key_type has been changed on the fly, we lazily rehash
1176          * each queue at lookup time
1177          */
1178         if ((*cfqq)->key_type != cfqd->key_type)
1179                 cfq_rehash_cfqq(cfqd, cfqq, cic);
1180
1181         return cic;
1182 err:
1183         put_io_context(ioc);
1184         return NULL;
1185 }
1186
1187 static struct cfq_queue *
1188 __cfq_get_queue(struct cfq_data *cfqd, unsigned long key, int gfp_mask)
1189 {
1190         const int hashval = hash_long(key, CFQ_QHASH_SHIFT);
1191         struct cfq_queue *cfqq, *new_cfqq = NULL;
1192
1193 retry:
1194         cfqq = __cfq_find_cfq_hash(cfqd, key, hashval);
1195
1196         if (!cfqq) {
1197                 if (new_cfqq) {
1198                         cfqq = new_cfqq;
1199                         new_cfqq = NULL;
1200                 } else if (gfp_mask & __GFP_WAIT) {
1201                         spin_unlock_irq(cfqd->queue->queue_lock);
1202                         new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask);
1203                         spin_lock_irq(cfqd->queue->queue_lock);
1204                         goto retry;
1205                 } else
1206                         goto out;
1207
1208                 memset(cfqq, 0, sizeof(*cfqq));
1209
1210                 INIT_HLIST_NODE(&cfqq->cfq_hash);
1211                 INIT_LIST_HEAD(&cfqq->cfq_list);
1212                 RB_CLEAR_ROOT(&cfqq->sort_list);
1213                 INIT_LIST_HEAD(&cfqq->fifo[0]);
1214                 INIT_LIST_HEAD(&cfqq->fifo[1]);
1215
1216                 cfqq->key = key;
1217                 hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
1218                 atomic_set(&cfqq->ref, 0);
1219                 cfqq->cfqd = cfqd;
1220                 atomic_inc(&cfqd->ref);
1221                 cfqq->key_type = cfqd->key_type;
1222                 cfqq->service_start = ~0UL;
1223         }
1224
1225         if (new_cfqq)
1226                 kmem_cache_free(cfq_pool, new_cfqq);
1227
1228         atomic_inc(&cfqq->ref);
1229 out:
1230         WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq);
1231         return cfqq;
1232 }
1233
1234 static void cfq_enqueue(struct cfq_data *cfqd, struct cfq_rq *crq)
1235 {
1236         crq->is_sync = 0;
1237         if (rq_data_dir(crq->request) == READ || current->flags & PF_SYNCWRITE)
1238                 crq->is_sync = 1;
1239
1240         cfq_add_crq_rb(crq);
1241         crq->queue_start = jiffies;
1242
1243         list_add_tail(&crq->request->queuelist, &crq->cfq_queue->fifo[crq->is_sync]);
1244 }
1245
1246 static void
1247 cfq_insert_request(request_queue_t *q, struct request *rq, int where)
1248 {
1249         struct cfq_data *cfqd = q->elevator->elevator_data;
1250         struct cfq_rq *crq = RQ_DATA(rq);
1251
1252         switch (where) {
1253                 case ELEVATOR_INSERT_BACK:
1254                         while (cfq_dispatch_requests(q, cfqd->cfq_quantum))
1255                                 ;
1256                         list_add_tail(&rq->queuelist, &q->queue_head);
1257                         break;
1258                 case ELEVATOR_INSERT_FRONT:
1259                         list_add(&rq->queuelist, &q->queue_head);
1260                         break;
1261                 case ELEVATOR_INSERT_SORT:
1262                         BUG_ON(!blk_fs_request(rq));
1263                         cfq_enqueue(cfqd, crq);
1264                         break;
1265                 default:
1266                         printk("%s: bad insert point %d\n", __FUNCTION__,where);
1267                         return;
1268         }
1269
1270         if (rq_mergeable(rq)) {
1271                 cfq_add_crq_hash(cfqd, crq);
1272
1273                 if (!q->last_merge)
1274                         q->last_merge = rq;
1275         }
1276 }
1277
1278 static int cfq_queue_empty(request_queue_t *q)
1279 {
1280         struct cfq_data *cfqd = q->elevator->elevator_data;
1281
1282         return list_empty(&q->queue_head) && list_empty(&cfqd->rr_list);
1283 }
1284
1285 static void cfq_completed_request(request_queue_t *q, struct request *rq)
1286 {
1287         struct cfq_rq *crq = RQ_DATA(rq);
1288         struct cfq_queue *cfqq;
1289
1290         if (unlikely(!blk_fs_request(rq)))
1291                 return;
1292
1293         cfqq = crq->cfq_queue;
1294
1295         if (crq->in_flight) {
1296                 WARN_ON(!cfqq->in_flight);
1297                 cfqq->in_flight--;
1298         }
1299
1300         cfq_account_completion(cfqq, crq);
1301 }
1302
1303 static struct request *
1304 cfq_former_request(request_queue_t *q, struct request *rq)
1305 {
1306         struct cfq_rq *crq = RQ_DATA(rq);
1307         struct rb_node *rbprev = rb_prev(&crq->rb_node);
1308
1309         if (rbprev)
1310                 return rb_entry_crq(rbprev)->request;
1311
1312         return NULL;
1313 }
1314
1315 static struct request *
1316 cfq_latter_request(request_queue_t *q, struct request *rq)
1317 {
1318         struct cfq_rq *crq = RQ_DATA(rq);
1319         struct rb_node *rbnext = rb_next(&crq->rb_node);
1320
1321         if (rbnext)
1322                 return rb_entry_crq(rbnext)->request;
1323
1324         return NULL;
1325 }
1326
1327 static int cfq_may_queue(request_queue_t *q, int rw)
1328 {
1329         struct cfq_data *cfqd = q->elevator->elevator_data;
1330         struct cfq_queue *cfqq;
1331         int ret = ELV_MQUEUE_MAY;
1332
1333         if (current->flags & PF_MEMALLOC)
1334                 return ELV_MQUEUE_MAY;
1335
1336         cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(cfqd, current));
1337         if (cfqq) {
1338                 int limit = cfqd->max_queued;
1339
1340                 if (cfqq->allocated[rw] < cfqd->cfq_queued)
1341                         return ELV_MQUEUE_MUST;
1342
1343                 if (cfqd->busy_queues)
1344                         limit = q->nr_requests / cfqd->busy_queues;
1345
1346                 if (limit < cfqd->cfq_queued)
1347                         limit = cfqd->cfq_queued;
1348                 else if (limit > cfqd->max_queued)
1349                         limit = cfqd->max_queued;
1350
1351                 if (cfqq->allocated[rw] >= limit) {
1352                         if (limit > cfqq->alloc_limit[rw])
1353                                 cfqq->alloc_limit[rw] = limit;
1354
1355                         ret = ELV_MQUEUE_NO;
1356                 }
1357         }
1358
1359         return ret;
1360 }
1361
1362 static void cfq_check_waiters(request_queue_t *q, struct cfq_queue *cfqq)
1363 {
1364         struct request_list *rl = &q->rq;
1365         const int write = waitqueue_active(&rl->wait[WRITE]);
1366         const int read = waitqueue_active(&rl->wait[READ]);
1367
1368         if (read && cfqq->allocated[READ] < cfqq->alloc_limit[READ])
1369                 wake_up(&rl->wait[READ]);
1370         if (write && cfqq->allocated[WRITE] < cfqq->alloc_limit[WRITE])
1371                 wake_up(&rl->wait[WRITE]);
1372 }
1373
1374 /*
1375  * queue lock held here
1376  */
1377 static void cfq_put_request(request_queue_t *q, struct request *rq)
1378 {
1379         struct cfq_data *cfqd = q->elevator->elevator_data;
1380         struct cfq_rq *crq = RQ_DATA(rq);
1381
1382         if (crq) {
1383                 struct cfq_queue *cfqq = crq->cfq_queue;
1384
1385                 BUG_ON(q->last_merge == rq);
1386                 BUG_ON(!hlist_unhashed(&crq->hash));
1387
1388                 if (crq->io_context)
1389                         put_io_context(crq->io_context->ioc);
1390
1391                 BUG_ON(!cfqq->allocated[crq->is_write]);
1392                 cfqq->allocated[crq->is_write]--;
1393
1394                 mempool_free(crq, cfqd->crq_pool);
1395                 rq->elevator_private = NULL;
1396
1397                 smp_mb();
1398                 cfq_check_waiters(q, cfqq);
1399                 cfq_put_queue(cfqq);
1400         }
1401 }
1402
1403 /*
1404  * Allocate cfq data structures associated with this request. A queue and
1405  */
1406 static int cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
1407 {
1408         struct cfq_data *cfqd = q->elevator->elevator_data;
1409         struct cfq_io_context *cic;
1410         const int rw = rq_data_dir(rq);
1411         struct cfq_queue *cfqq, *saved_cfqq;
1412         struct cfq_rq *crq;
1413         unsigned long flags;
1414
1415         might_sleep_if(gfp_mask & __GFP_WAIT);
1416
1417         spin_lock_irqsave(q->queue_lock, flags);
1418
1419         cfqq = __cfq_get_queue(cfqd, cfq_hash_key(cfqd, current), gfp_mask);
1420         if (!cfqq)
1421                 goto out_lock;
1422
1423 repeat:
1424         if (cfqq->allocated[rw] >= cfqd->max_queued)
1425                 goto out_lock;
1426
1427         cfqq->allocated[rw]++;
1428         spin_unlock_irqrestore(q->queue_lock, flags);
1429
1430         /*
1431          * if hashing type has changed, the cfq_queue might change here.
1432          */
1433         saved_cfqq = cfqq;
1434         cic = cfq_get_io_context(&cfqq, gfp_mask);
1435         if (!cic)
1436                 goto err;
1437
1438         /*
1439          * repeat allocation checks on queue change
1440          */
1441         if (unlikely(saved_cfqq != cfqq)) {
1442                 spin_lock_irqsave(q->queue_lock, flags);
1443                 saved_cfqq->allocated[rw]--;
1444                 goto repeat;
1445         }
1446
1447         crq = mempool_alloc(cfqd->crq_pool, gfp_mask);
1448         if (crq) {
1449                 RB_CLEAR(&crq->rb_node);
1450                 crq->rb_key = 0;
1451                 crq->request = rq;
1452                 INIT_HLIST_NODE(&crq->hash);
1453                 crq->cfq_queue = cfqq;
1454                 crq->io_context = cic;
1455                 crq->service_start = crq->queue_start = 0;
1456                 crq->in_flight = crq->accounted = crq->is_sync = 0;
1457                 crq->is_write = rw;
1458                 rq->elevator_private = crq;
1459                 cfqq->alloc_limit[rw] = 0;
1460                 return 0;
1461         }
1462
1463         put_io_context(cic->ioc);
1464 err:
1465         spin_lock_irqsave(q->queue_lock, flags);
1466         cfqq->allocated[rw]--;
1467         cfq_put_queue(cfqq);
1468 out_lock:
1469         spin_unlock_irqrestore(q->queue_lock, flags);
1470         return 1;
1471 }
1472
1473 static void cfq_put_cfqd(struct cfq_data *cfqd)
1474 {
1475         request_queue_t *q = cfqd->queue;
1476
1477         if (!atomic_dec_and_test(&cfqd->ref))
1478                 return;
1479
1480         blk_put_queue(q);
1481
1482         mempool_destroy(cfqd->crq_pool);
1483         kfree(cfqd->crq_hash);
1484         kfree(cfqd->cfq_hash);
1485         kfree(cfqd);
1486 }
1487
1488 static void cfq_exit_queue(elevator_t *e)
1489 {
1490         cfq_put_cfqd(e->elevator_data);
1491 }
1492
1493 static int cfq_init_queue(request_queue_t *q, elevator_t *e)
1494 {
1495         struct cfq_data *cfqd;
1496         int i;
1497
1498         cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL);
1499         if (!cfqd)
1500                 return -ENOMEM;
1501
1502         memset(cfqd, 0, sizeof(*cfqd));
1503         INIT_LIST_HEAD(&cfqd->rr_list);
1504         INIT_LIST_HEAD(&cfqd->empty_list);
1505
1506         cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL);
1507         if (!cfqd->crq_hash)
1508                 goto out_crqhash;
1509
1510         cfqd->cfq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL);
1511         if (!cfqd->cfq_hash)
1512                 goto out_cfqhash;
1513
1514         cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, crq_pool);
1515         if (!cfqd->crq_pool)
1516                 goto out_crqpool;
1517
1518         for (i = 0; i < CFQ_MHASH_ENTRIES; i++)
1519                 INIT_HLIST_HEAD(&cfqd->crq_hash[i]);
1520         for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
1521                 INIT_HLIST_HEAD(&cfqd->cfq_hash[i]);
1522
1523         e->elevator_data = cfqd;
1524
1525         cfqd->queue = q;
1526         atomic_inc(&q->refcnt);
1527
1528         /*
1529          * just set it to some high value, we want anyone to be able to queue
1530          * some requests. fairness is handled differently
1531          */
1532         q->nr_requests = 1024;
1533         cfqd->max_queued = q->nr_requests / 16;
1534         q->nr_batching = cfq_queued;
1535         cfqd->key_type = CFQ_KEY_TGID;
1536         cfqd->find_best_crq = 1;
1537         atomic_set(&cfqd->ref, 1);
1538
1539         cfqd->cfq_queued = cfq_queued;
1540         cfqd->cfq_quantum = cfq_quantum;
1541         cfqd->cfq_fifo_expire_r = cfq_fifo_expire_r;
1542         cfqd->cfq_fifo_expire_w = cfq_fifo_expire_w;
1543         cfqd->cfq_fifo_batch_expire = cfq_fifo_rate;
1544         cfqd->cfq_back_max = cfq_back_max;
1545         cfqd->cfq_back_penalty = cfq_back_penalty;
1546
1547         return 0;
1548 out_crqpool:
1549         kfree(cfqd->cfq_hash);
1550 out_cfqhash:
1551         kfree(cfqd->crq_hash);
1552 out_crqhash:
1553         kfree(cfqd);
1554         return -ENOMEM;
1555 }
1556
1557 static void cfq_slab_kill(void)
1558 {
1559         if (crq_pool)
1560                 kmem_cache_destroy(crq_pool);
1561         if (cfq_pool)
1562                 kmem_cache_destroy(cfq_pool);
1563         if (cfq_ioc_pool)
1564                 kmem_cache_destroy(cfq_ioc_pool);
1565 }
1566
1567 static int __init cfq_slab_setup(void)
1568 {
1569         crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0,
1570                                         NULL, NULL);
1571         if (!crq_pool)
1572                 goto fail;
1573
1574         cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0,
1575                                         NULL, NULL);
1576         if (!cfq_pool)
1577                 goto fail;
1578
1579         cfq_ioc_pool = kmem_cache_create("cfq_ioc_pool",
1580                         sizeof(struct cfq_io_context), 0, 0, NULL, NULL);
1581         if (!cfq_ioc_pool)
1582                 goto fail;
1583
1584         return 0;
1585 fail:
1586         cfq_slab_kill();
1587         return -ENOMEM;
1588 }
1589
1590
1591 /*
1592  * sysfs parts below -->
1593  */
1594 struct cfq_fs_entry {
1595         struct attribute attr;
1596         ssize_t (*show)(struct cfq_data *, char *);
1597         ssize_t (*store)(struct cfq_data *, const char *, size_t);
1598 };
1599
1600 static ssize_t
1601 cfq_var_show(unsigned int var, char *page)
1602 {
1603         return sprintf(page, "%d\n", var);
1604 }
1605
1606 static ssize_t
1607 cfq_var_store(unsigned int *var, const char *page, size_t count)
1608 {
1609         char *p = (char *) page;
1610
1611         *var = simple_strtoul(p, &p, 10);
1612         return count;
1613 }
1614
1615 static ssize_t
1616 cfq_clear_elapsed(struct cfq_data *cfqd, const char *page, size_t count)
1617 {
1618         max_elapsed_dispatch = max_elapsed_crq = 0;
1619         return count;
1620 }
1621
1622 static ssize_t
1623 cfq_set_key_type(struct cfq_data *cfqd, const char *page, size_t count)
1624 {
1625         spin_lock_irq(cfqd->queue->queue_lock);
1626         if (!strncmp(page, "pgid", 4))
1627                 cfqd->key_type = CFQ_KEY_PGID;
1628         else if (!strncmp(page, "tgid", 4))
1629                 cfqd->key_type = CFQ_KEY_TGID;
1630         else if (!strncmp(page, "uid", 3))
1631                 cfqd->key_type = CFQ_KEY_UID;
1632         else if (!strncmp(page, "gid", 3))
1633                 cfqd->key_type = CFQ_KEY_GID;
1634         spin_unlock_irq(cfqd->queue->queue_lock);
1635         return count;
1636 }
1637
1638 static ssize_t
1639 cfq_read_key_type(struct cfq_data *cfqd, char *page)
1640 {
1641         ssize_t len = 0;
1642         int i;
1643
1644         for (i = CFQ_KEY_PGID; i < CFQ_KEY_LAST; i++) {
1645                 if (cfqd->key_type == i)
1646                         len += sprintf(page+len, "[%s] ", cfq_key_types[i]);
1647                 else
1648                         len += sprintf(page+len, "%s ", cfq_key_types[i]);
1649         }
1650         len += sprintf(page+len, "\n");
1651         return len;
1652 }
1653
1654 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                            \
1655 static ssize_t __FUNC(struct cfq_data *cfqd, char *page)                \
1656 {                                                                       \
1657         unsigned int __data = __VAR;                                    \
1658         if (__CONV)                                                     \
1659                 __data = jiffies_to_msecs(__data);                      \
1660         return cfq_var_show(__data, (page));                            \
1661 }
1662 SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
1663 SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0);
1664 SHOW_FUNCTION(cfq_fifo_expire_r_show, cfqd->cfq_fifo_expire_r, 1);
1665 SHOW_FUNCTION(cfq_fifo_expire_w_show, cfqd->cfq_fifo_expire_w, 1);
1666 SHOW_FUNCTION(cfq_fifo_batch_expire_show, cfqd->cfq_fifo_batch_expire, 1);
1667 SHOW_FUNCTION(cfq_find_best_show, cfqd->find_best_crq, 0);
1668 SHOW_FUNCTION(cfq_back_max_show, cfqd->cfq_back_max, 0);
1669 SHOW_FUNCTION(cfq_back_penalty_show, cfqd->cfq_back_penalty, 0);
1670 #undef SHOW_FUNCTION
1671
1672 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                 \
1673 static ssize_t __FUNC(struct cfq_data *cfqd, const char *page, size_t count)    \
1674 {                                                                       \
1675         unsigned int __data;                                            \
1676         int ret = cfq_var_store(&__data, (page), count);                \
1677         if (__data < (MIN))                                             \
1678                 __data = (MIN);                                         \
1679         else if (__data > (MAX))                                        \
1680                 __data = (MAX);                                         \
1681         if (__CONV)                                                     \
1682                 *(__PTR) = msecs_to_jiffies(__data);                    \
1683         else                                                            \
1684                 *(__PTR) = __data;                                      \
1685         return ret;                                                     \
1686 }
1687 STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
1688 STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0);
1689 STORE_FUNCTION(cfq_fifo_expire_r_store, &cfqd->cfq_fifo_expire_r, 1, UINT_MAX, 1);
1690 STORE_FUNCTION(cfq_fifo_expire_w_store, &cfqd->cfq_fifo_expire_w, 1, UINT_MAX, 1);
1691 STORE_FUNCTION(cfq_fifo_batch_expire_store, &cfqd->cfq_fifo_batch_expire, 0, UINT_MAX, 1);
1692 STORE_FUNCTION(cfq_find_best_store, &cfqd->find_best_crq, 0, 1, 0);
1693 STORE_FUNCTION(cfq_back_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
1694 STORE_FUNCTION(cfq_back_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0);
1695 #undef STORE_FUNCTION
1696
1697 static struct cfq_fs_entry cfq_quantum_entry = {
1698         .attr = {.name = "quantum", .mode = S_IRUGO | S_IWUSR },
1699         .show = cfq_quantum_show,
1700         .store = cfq_quantum_store,
1701 };
1702 static struct cfq_fs_entry cfq_queued_entry = {
1703         .attr = {.name = "queued", .mode = S_IRUGO | S_IWUSR },
1704         .show = cfq_queued_show,
1705         .store = cfq_queued_store,
1706 };
1707 static struct cfq_fs_entry cfq_fifo_expire_r_entry = {
1708         .attr = {.name = "fifo_expire_sync", .mode = S_IRUGO | S_IWUSR },
1709         .show = cfq_fifo_expire_r_show,
1710         .store = cfq_fifo_expire_r_store,
1711 };
1712 static struct cfq_fs_entry cfq_fifo_expire_w_entry = {
1713         .attr = {.name = "fifo_expire_async", .mode = S_IRUGO | S_IWUSR },
1714         .show = cfq_fifo_expire_w_show,
1715         .store = cfq_fifo_expire_w_store,
1716 };
1717 static struct cfq_fs_entry cfq_fifo_batch_expire_entry = {
1718         .attr = {.name = "fifo_batch_expire", .mode = S_IRUGO | S_IWUSR },
1719         .show = cfq_fifo_batch_expire_show,
1720         .store = cfq_fifo_batch_expire_store,
1721 };
1722 static struct cfq_fs_entry cfq_find_best_entry = {
1723         .attr = {.name = "find_best_crq", .mode = S_IRUGO | S_IWUSR },
1724         .show = cfq_find_best_show,
1725         .store = cfq_find_best_store,
1726 };
1727 static struct cfq_fs_entry cfq_back_max_entry = {
1728         .attr = {.name = "back_seek_max", .mode = S_IRUGO | S_IWUSR },
1729         .show = cfq_back_max_show,
1730         .store = cfq_back_max_store,
1731 };
1732 static struct cfq_fs_entry cfq_back_penalty_entry = {
1733         .attr = {.name = "back_seek_penalty", .mode = S_IRUGO | S_IWUSR },
1734         .show = cfq_back_penalty_show,
1735         .store = cfq_back_penalty_store,
1736 };
1737 static struct cfq_fs_entry cfq_clear_elapsed_entry = {
1738         .attr = {.name = "clear_elapsed", .mode = S_IWUSR },
1739         .store = cfq_clear_elapsed,
1740 };
1741 static struct cfq_fs_entry cfq_key_type_entry = {
1742         .attr = {.name = "key_type", .mode = S_IRUGO | S_IWUSR },
1743         .show = cfq_read_key_type,
1744         .store = cfq_set_key_type,
1745 };
1746
1747 static struct attribute *default_attrs[] = {
1748         &cfq_quantum_entry.attr,
1749         &cfq_queued_entry.attr,
1750         &cfq_fifo_expire_r_entry.attr,
1751         &cfq_fifo_expire_w_entry.attr,
1752         &cfq_fifo_batch_expire_entry.attr,
1753         &cfq_key_type_entry.attr,
1754         &cfq_find_best_entry.attr,
1755         &cfq_back_max_entry.attr,
1756         &cfq_back_penalty_entry.attr,
1757         &cfq_clear_elapsed_entry.attr,
1758         NULL,
1759 };
1760
1761 #define to_cfq(atr) container_of((atr), struct cfq_fs_entry, attr)
1762
1763 static ssize_t
1764 cfq_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
1765 {
1766         elevator_t *e = container_of(kobj, elevator_t, kobj);
1767         struct cfq_fs_entry *entry = to_cfq(attr);
1768
1769         if (!entry->show)
1770                 return 0;
1771
1772         return entry->show(e->elevator_data, page);
1773 }
1774
1775 static ssize_t
1776 cfq_attr_store(struct kobject *kobj, struct attribute *attr,
1777                const char *page, size_t length)
1778 {
1779         elevator_t *e = container_of(kobj, elevator_t, kobj);
1780         struct cfq_fs_entry *entry = to_cfq(attr);
1781
1782         if (!entry->store)
1783                 return -EINVAL;
1784
1785         return entry->store(e->elevator_data, page, length);
1786 }
1787
1788 static struct sysfs_ops cfq_sysfs_ops = {
1789         .show   = cfq_attr_show,
1790         .store  = cfq_attr_store,
1791 };
1792
1793 struct kobj_type cfq_ktype = {
1794         .sysfs_ops      = &cfq_sysfs_ops,
1795         .default_attrs  = default_attrs,
1796 };
1797
1798 static struct elevator_type iosched_cfq = {
1799         .ops = {
1800                 .elevator_merge_fn =            cfq_merge,
1801                 .elevator_merged_fn =           cfq_merged_request,
1802                 .elevator_merge_req_fn =        cfq_merged_requests,
1803                 .elevator_next_req_fn =         cfq_next_request,
1804                 .elevator_add_req_fn =          cfq_insert_request,
1805                 .elevator_remove_req_fn =       cfq_remove_request,
1806                 .elevator_requeue_req_fn =      cfq_requeue_request,
1807                 .elevator_queue_empty_fn =      cfq_queue_empty,
1808                 .elevator_completed_req_fn =    cfq_completed_request,
1809                 .elevator_former_req_fn =       cfq_former_request,
1810                 .elevator_latter_req_fn =       cfq_latter_request,
1811                 .elevator_set_req_fn =          cfq_set_request,
1812                 .elevator_put_req_fn =          cfq_put_request,
1813                 .elevator_may_queue_fn =        cfq_may_queue,
1814                 .elevator_init_fn =             cfq_init_queue,
1815                 .elevator_exit_fn =             cfq_exit_queue,
1816         },
1817         .elevator_ktype =       &cfq_ktype,
1818         .elevator_name =        "cfq",
1819         .elevator_owner =       THIS_MODULE,
1820 };
1821
1822 static int __init cfq_init(void)
1823 {
1824         int ret;
1825
1826         if (cfq_slab_setup())
1827                 return -ENOMEM;
1828
1829         ret = elv_register(&iosched_cfq);
1830         if (!ret) {
1831                 __module_get(THIS_MODULE);
1832                 return 0;
1833         }
1834
1835         cfq_slab_kill();
1836         return ret;
1837 }
1838
1839 static void __exit cfq_exit(void)
1840 {
1841         cfq_slab_kill();
1842         elv_unregister(&iosched_cfq);
1843 }
1844
1845 module_init(cfq_init);
1846 module_exit(cfq_exit);
1847
1848 MODULE_AUTHOR("Jens Axboe");
1849 MODULE_LICENSE("GPL");
1850 MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");