block/cfq-iosched.c (linux-2.6.git)
Merge to Fedora kernel-2.6.18-1.2224_FC5 patched with stable patch-2.6.18.1-vs2.0...
1 /*
2  *  CFQ, or complete fairness queueing, disk scheduler.
3  *
4  *  Based on ideas from a previously unfinished io
5  *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
6  *
7  *  Copyright (C) 2003 Jens Axboe <axboe@suse.de>
8  */
9 #include <linux/module.h>
10 #include <linux/blkdev.h>
11 #include <linux/elevator.h>
12 #include <linux/hash.h>
13 #include <linux/rbtree.h>
14 #include <linux/ioprio.h>
15
16 /*
17  * tunables
18  */
19 static const int cfq_quantum = 4;               /* max queue in one round of service */
20 static const int cfq_queued = 8;                /* minimum rq allocation limit per queue */
21 static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
22 static const int cfq_back_max = 16 * 1024;      /* maximum backwards seek, in KiB */
23 static const int cfq_back_penalty = 2;          /* penalty of a backwards seek */
24
25 static const int cfq_slice_sync = HZ / 10;
26 static int cfq_slice_async = HZ / 25;
27 static const int cfq_slice_async_rq = 2;
28 static int cfq_slice_idle = HZ / 125;
29
30 #define CFQ_IDLE_GRACE          (HZ / 10)
31 #define CFQ_SLICE_SCALE         (5)
32
33 #define CFQ_KEY_ASYNC           (0)
34
35 static DEFINE_SPINLOCK(cfq_exit_lock);
36
37 /*
38  * for the hash of cfqq inside the cfqd
39  */
40 #define CFQ_QHASH_SHIFT         6
41 #define CFQ_QHASH_ENTRIES       (1 << CFQ_QHASH_SHIFT)
42 #define list_entry_qhash(entry) hlist_entry((entry), struct cfq_queue, cfq_hash)
43
44 /*
45  * for the per-device hash of crqs (kept in the cfqd), keyed by request end sector
46  */
47 #define CFQ_MHASH_SHIFT         6
48 #define CFQ_MHASH_BLOCK(sec)    ((sec) >> 3)
49 #define CFQ_MHASH_ENTRIES       (1 << CFQ_MHASH_SHIFT)
50 #define CFQ_MHASH_FN(sec)       hash_long(CFQ_MHASH_BLOCK(sec), CFQ_MHASH_SHIFT)
51 #define rq_hash_key(rq)         ((rq)->sector + (rq)->nr_sectors)
52 #define list_entry_hash(ptr)    hlist_entry((ptr), struct cfq_rq, hash)
53
54 #define list_entry_cfqq(ptr)    list_entry((ptr), struct cfq_queue, cfq_list)
55 #define list_entry_fifo(ptr)    list_entry((ptr), struct request, queuelist)
56
57 #define RQ_DATA(rq)             (rq)->elevator_private
58
59 /*
60  * rb-tree defines
61  */
62 #define rb_entry_crq(node)      rb_entry((node), struct cfq_rq, rb_node)
63 #define rq_rb_key(rq)           (rq)->sector
64
65 static kmem_cache_t *crq_pool;
66 static kmem_cache_t *cfq_pool;
67 static kmem_cache_t *cfq_ioc_pool;
68
69 static atomic_t ioc_count = ATOMIC_INIT(0);
70 static struct completion *ioc_gone;
71
72 #define CFQ_PRIO_LISTS          IOPRIO_BE_NR
73 #define cfq_class_idle(cfqq)    ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
74 #define cfq_class_be(cfqq)      ((cfqq)->ioprio_class == IOPRIO_CLASS_BE)
75 #define cfq_class_rt(cfqq)      ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
76
77 #define ASYNC                   (0)
78 #define SYNC                    (1)
79
80 #define cfq_cfqq_dispatched(cfqq)       \
81         ((cfqq)->on_dispatch[ASYNC] + (cfqq)->on_dispatch[SYNC])
82
83 #define cfq_cfqq_class_sync(cfqq)       ((cfqq)->key != CFQ_KEY_ASYNC)
84
85 #define cfq_cfqq_sync(cfqq)             \
86         (cfq_cfqq_class_sync(cfqq) || (cfqq)->on_dispatch[SYNC])
87
88 #define sample_valid(samples)   ((samples) > 80)
89
90 /*
91  * Per block device queue structure
92  */
93 struct cfq_data {
94         request_queue_t *queue;
95
96         /*
97          * rr list of queues with requests and the count of them
98          */
99         struct list_head rr_list[CFQ_PRIO_LISTS];
100         struct list_head busy_rr;
101         struct list_head cur_rr;
102         struct list_head idle_rr;
103         unsigned int busy_queues;
104
105         /*
106          * non-ordered list of empty cfqq's
107          */
108         struct list_head empty_list;
109
110         /*
111          * cfqq lookup hash
112          */
113         struct hlist_head *cfq_hash;
114
115         /*
116          * global crq hash for all queues
117          */
118         struct hlist_head *crq_hash;
119
120         mempool_t *crq_pool;
121
122         int rq_in_driver;
123         int hw_tag;
124
125         /*
126          * schedule slice state info
127          */
128         /*
129          * idle window management
130          */
131         struct timer_list idle_slice_timer;
132         struct work_struct unplug_work;
133
134         struct cfq_queue *active_queue;
135         struct cfq_io_context *active_cic;
136         int cur_prio, cur_end_prio;
137         unsigned int dispatch_slice;
138
139         struct timer_list idle_class_timer;
140
141         sector_t last_sector;
142         unsigned long last_end_request;
143
144         unsigned int rq_starved;
145
146         /*
147          * tunables, see top of file
148          */
149         unsigned int cfq_quantum;
150         unsigned int cfq_queued;
151         unsigned int cfq_fifo_expire[2];
152         unsigned int cfq_back_penalty;
153         unsigned int cfq_back_max;
154         unsigned int cfq_slice[2];
155         unsigned int cfq_slice_async_rq;
156         unsigned int cfq_slice_idle;
157
158         struct list_head cic_list;
159 };
160
161 /*
162  * Per process-grouping structure
163  */
164 struct cfq_queue {
165         /* reference count */
166         atomic_t ref;
167         /* parent cfq_data */
168         struct cfq_data *cfqd;
169         /* cfqq lookup hash */
170         struct hlist_node cfq_hash;
171         /* hash key */
172         unsigned int key;
173         /* on either rr or empty list of cfqd */
174         struct list_head cfq_list;
175         /* sorted list of pending requests */
176         struct rb_root sort_list;
177         /* if fifo isn't expired, next request to serve */
178         struct cfq_rq *next_crq;
179         /* requests queued in sort_list */
180         int queued[2];
181         /* currently allocated requests */
182         int allocated[2];
183         /* fifo list of requests in sort_list */
184         struct list_head fifo;
185
186         unsigned long slice_start;
187         unsigned long slice_end;
188         unsigned long slice_left;
189         unsigned long service_last;
190
191         /* number of requests that are on the dispatch list */
192         int on_dispatch[2];
193
194         /* io prio of this group */
195         unsigned short ioprio, org_ioprio;
196         unsigned short ioprio_class, org_ioprio_class;
197
198         /* various state flags, see below */
199         unsigned int flags;
200 };
201
202 struct cfq_rq {
203         struct rb_node rb_node;
204         sector_t rb_key;
205         struct request *request;
206         struct hlist_node hash;
207
208         struct cfq_queue *cfq_queue;
209         struct cfq_io_context *io_context;
210
211         unsigned int crq_flags;
212 };
213
214 enum cfqq_state_flags {
215         CFQ_CFQQ_FLAG_on_rr = 0,
216         CFQ_CFQQ_FLAG_wait_request,
217         CFQ_CFQQ_FLAG_must_alloc,
218         CFQ_CFQQ_FLAG_must_alloc_slice,
219         CFQ_CFQQ_FLAG_must_dispatch,
220         CFQ_CFQQ_FLAG_fifo_expire,
221         CFQ_CFQQ_FLAG_idle_window,
222         CFQ_CFQQ_FLAG_prio_changed,
223 };
224
225 #define CFQ_CFQQ_FNS(name)                                              \
226 static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)         \
227 {                                                                       \
228         cfqq->flags |= (1 << CFQ_CFQQ_FLAG_##name);                     \
229 }                                                                       \
230 static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)        \
231 {                                                                       \
232         cfqq->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);                    \
233 }                                                                       \
234 static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)         \
235 {                                                                       \
236         return (cfqq->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;        \
237 }
238
239 CFQ_CFQQ_FNS(on_rr);
240 CFQ_CFQQ_FNS(wait_request);
241 CFQ_CFQQ_FNS(must_alloc);
242 CFQ_CFQQ_FNS(must_alloc_slice);
243 CFQ_CFQQ_FNS(must_dispatch);
244 CFQ_CFQQ_FNS(fifo_expire);
245 CFQ_CFQQ_FNS(idle_window);
246 CFQ_CFQQ_FNS(prio_changed);
247 #undef CFQ_CFQQ_FNS
248
249 enum cfq_rq_state_flags {
250         CFQ_CRQ_FLAG_is_sync = 0,
251 };
252
253 #define CFQ_CRQ_FNS(name)                                               \
254 static inline void cfq_mark_crq_##name(struct cfq_rq *crq)              \
255 {                                                                       \
256         crq->crq_flags |= (1 << CFQ_CRQ_FLAG_##name);                   \
257 }                                                                       \
258 static inline void cfq_clear_crq_##name(struct cfq_rq *crq)             \
259 {                                                                       \
260         crq->crq_flags &= ~(1 << CFQ_CRQ_FLAG_##name);                  \
261 }                                                                       \
262 static inline int cfq_crq_##name(const struct cfq_rq *crq)              \
263 {                                                                       \
264         return (crq->crq_flags & (1 << CFQ_CRQ_FLAG_##name)) != 0;      \
265 }
266
267 CFQ_CRQ_FNS(is_sync);
268 #undef CFQ_CRQ_FNS
269
270 static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short);
271 static void cfq_dispatch_insert(request_queue_t *, struct cfq_rq *);
272 static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask);
273
274 /*
275  * lots of deadline iosched dupes, can be abstracted later...
276  */
277 static inline void cfq_del_crq_hash(struct cfq_rq *crq)
278 {
279         hlist_del_init(&crq->hash);
280 }
281
282 static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq)
283 {
284         const int hash_idx = CFQ_MHASH_FN(rq_hash_key(crq->request));
285
286         hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]);
287 }
288
289 static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset)
290 {
291         struct hlist_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)];
292         struct hlist_node *entry, *next;
293
294         hlist_for_each_safe(entry, next, hash_list) {
295                 struct cfq_rq *crq = list_entry_hash(entry);
296                 struct request *__rq = crq->request;
297
298                 if (!rq_mergeable(__rq)) {
299                         cfq_del_crq_hash(crq);
300                         continue;
301                 }
302
303                 if (rq_hash_key(__rq) == offset)
304                         return __rq;
305         }
306
307         return NULL;
308 }
309
310 /*
311  * schedule a run of the queue, if there are requests pending and nothing
312  * already in the driver will restart queueing
313  */
314 static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
315 {
316         if (cfqd->busy_queues)
317                 kblockd_schedule_work(&cfqd->unplug_work);
318 }
319
320 static int cfq_queue_empty(request_queue_t *q)
321 {
322         struct cfq_data *cfqd = q->elevator->elevator_data;
323
324         return !cfqd->busy_queues;
325 }
326
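/*
 * cfq queues are keyed per process (pid) for sync io and collapsed onto the
 * single CFQ_KEY_ASYNC queue for async io. The ->xid check below comes from
 * the vserver (vs2.0) patch: all tasks in a vserver context share one key,
 * offset by 1 << 16, presumably to keep it clear of ordinary pid values.
 */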
327 static inline pid_t cfq_queue_pid(struct task_struct *task, int rw)
328 {
329         if (task->xid)
330                 return task->xid + (1 << 16);
331         if (rw == READ || rw == WRITE_SYNC)
332                 return task->pid;
333
334         return CFQ_KEY_ASYNC;
335 }
336
337 /*
338  * Lifted from AS - choose which of crq1 and crq2 is best served now.
339  * We choose the request that is closest to the head right now. Distance
340  * behind the head is penalized and only allowed to a certain extent.
341  */
342 static struct cfq_rq *
343 cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2)
344 {
345         sector_t last, s1, s2, d1 = 0, d2 = 0;
346         unsigned long back_max;
347 #define CFQ_RQ1_WRAP    0x01 /* request 1 wraps */
348 #define CFQ_RQ2_WRAP    0x02 /* request 2 wraps */
349         unsigned wrap = 0; /* bit mask: requests behind the disk head? */
350
351         if (crq1 == NULL || crq1 == crq2)
352                 return crq2;
353         if (crq2 == NULL)
354                 return crq1;
355
356         if (cfq_crq_is_sync(crq1) && !cfq_crq_is_sync(crq2))
357                 return crq1;
358         else if (cfq_crq_is_sync(crq2) && !cfq_crq_is_sync(crq1))
359                 return crq2;
360
361         s1 = crq1->request->sector;
362         s2 = crq2->request->sector;
363
364         last = cfqd->last_sector;
365
366         /*
367          * by definition, 1KiB is 2 sectors
368          */
369         back_max = cfqd->cfq_back_max * 2;
370
371         /*
372          * Strict one way elevator _except_ in the case where we allow
373          * short backward seeks which are biased as twice the cost of a
374          * similar forward seek.
375          */
376         if (s1 >= last)
377                 d1 = s1 - last;
378         else if (s1 + back_max >= last)
379                 d1 = (last - s1) * cfqd->cfq_back_penalty;
380         else
381                 wrap |= CFQ_RQ1_WRAP;
382
383         if (s2 >= last)
384                 d2 = s2 - last;
385         else if (s2 + back_max >= last)
386                 d2 = (last - s2) * cfqd->cfq_back_penalty;
387         else
388                 wrap |= CFQ_RQ2_WRAP;
389
390         /* Found required data */
391
392         /*
393          * By doing switch() on the bit mask "wrap" we avoid having to
394          * check two variables for all permutations: --> faster!
395          */
396         switch (wrap) {
397         case 0: /* common case for CFQ: crq1 and crq2 not wrapped */
398                 if (d1 < d2)
399                         return crq1;
400                 else if (d2 < d1)
401                         return crq2;
402                 else {
403                         if (s1 >= s2)
404                                 return crq1;
405                         else
406                                 return crq2;
407                 }
408
409         case CFQ_RQ2_WRAP:
410                 return crq1;
411         case CFQ_RQ1_WRAP:
412                 return crq2;
413         case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both crqs wrapped */
414         default:
415                 /*
416                  * Since both rqs are wrapped,
417                  * start with the one that's further behind head
418                  * (--> only *one* back seek required),
419                  * since back seek takes more time than forward.
420                  */
421                 if (s1 <= s2)
422                         return crq1;
423                 else
424                         return crq2;
425         }
426 }
427
428 /*
429  * would be nice to take fifo expire time into account as well
430  */
431 static struct cfq_rq *
432 cfq_find_next_crq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
433                   struct cfq_rq *last)
434 {
435         struct cfq_rq *crq_next = NULL, *crq_prev = NULL;
436         struct rb_node *rbnext, *rbprev;
437
438         if (!(rbnext = rb_next(&last->rb_node))) {
439                 rbnext = rb_first(&cfqq->sort_list);
440                 if (rbnext == &last->rb_node)
441                         rbnext = NULL;
442         }
443
444         rbprev = rb_prev(&last->rb_node);
445
446         if (rbprev)
447                 crq_prev = rb_entry_crq(rbprev);
448         if (rbnext)
449                 crq_next = rb_entry_crq(rbnext);
450
451         return cfq_choose_req(cfqd, crq_next, crq_prev);
452 }
453
454 static void cfq_update_next_crq(struct cfq_rq *crq)
455 {
456         struct cfq_queue *cfqq = crq->cfq_queue;
457
458         if (cfqq->next_crq == crq)
459                 cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq);
460 }
461
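/*
 * (re)position cfqq on the appropriate rr list: RT queues go on cur_rr,
 * idle class queues on idle_rr, BE queues that still have io in flight on
 * busy_rr, and everything else on the rr_list for its priority, ordered by
 * the time the queue was last serviced.
 */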
462 static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
463 {
464         struct cfq_data *cfqd = cfqq->cfqd;
465         struct list_head *list, *entry;
466
467         BUG_ON(!cfq_cfqq_on_rr(cfqq));
468
469         list_del(&cfqq->cfq_list);
470
471         if (cfq_class_rt(cfqq))
472                 list = &cfqd->cur_rr;
473         else if (cfq_class_idle(cfqq))
474                 list = &cfqd->idle_rr;
475         else {
476                 /*
477                  * if cfqq has requests in flight, don't allow it to be
478                  * found in cfq_set_active_queue before it has finished them.
479                  * this is done to increase fairness between a process that
480                  * has lots of io pending vs one that only generates one
481                  * sporadically or synchronously
482                  */
483                 if (cfq_cfqq_dispatched(cfqq))
484                         list = &cfqd->busy_rr;
485                 else
486                         list = &cfqd->rr_list[cfqq->ioprio];
487         }
488
489         /*
490          * if queue was preempted, just add to front to be fair. busy_rr
491          * isn't sorted, but insert at the back for fairness.
492          */
493         if (preempted || list == &cfqd->busy_rr) {
494                 if (preempted)
495                         list = list->prev;
496
497                 list_add_tail(&cfqq->cfq_list, list);
498                 return;
499         }
500
501         /*
502          * sort by when queue was last serviced
503          */
504         entry = list;
505         while ((entry = entry->prev) != list) {
506                 struct cfq_queue *__cfqq = list_entry_cfqq(entry);
507
508                 if (!__cfqq->service_last)
509                         break;
510                 if (time_before(__cfqq->service_last, cfqq->service_last))
511                         break;
512         }
513
514         list_add(&cfqq->cfq_list, entry);
515 }
516
517 /*
518  * add to busy list of queues for service, trying to be fair in ordering
519  * the pending list according to last request service
520  */
521 static inline void
522 cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
523 {
524         BUG_ON(cfq_cfqq_on_rr(cfqq));
525         cfq_mark_cfqq_on_rr(cfqq);
526         cfqd->busy_queues++;
527
528         cfq_resort_rr_list(cfqq, 0);
529 }
530
531 static inline void
532 cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
533 {
534         BUG_ON(!cfq_cfqq_on_rr(cfqq));
535         cfq_clear_cfqq_on_rr(cfqq);
536         list_move(&cfqq->cfq_list, &cfqd->empty_list);
537
538         BUG_ON(!cfqd->busy_queues);
539         cfqd->busy_queues--;
540 }
541
542 /*
543  * rb tree support functions
544  */
545 static inline void cfq_del_crq_rb(struct cfq_rq *crq)
546 {
547         struct cfq_queue *cfqq = crq->cfq_queue;
548         struct cfq_data *cfqd = cfqq->cfqd;
549         const int sync = cfq_crq_is_sync(crq);
550
551         BUG_ON(!cfqq->queued[sync]);
552         cfqq->queued[sync]--;
553
554         cfq_update_next_crq(crq);
555
556         rb_erase(&crq->rb_node, &cfqq->sort_list);
557
558         if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
559                 cfq_del_cfqq_rr(cfqd, cfqq);
560 }
561
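/*
 * rb tree insert helper: walk the queue's sort_list, which is keyed by
 * request start sector. If an entry with the same key already exists it is
 * returned (an "alias") instead of linking the new node; the caller sends
 * the alias to the dispatch list and retries.
 */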
562 static struct cfq_rq *
563 __cfq_add_crq_rb(struct cfq_rq *crq)
564 {
565         struct rb_node **p = &crq->cfq_queue->sort_list.rb_node;
566         struct rb_node *parent = NULL;
567         struct cfq_rq *__crq;
568
569         while (*p) {
570                 parent = *p;
571                 __crq = rb_entry_crq(parent);
572
573                 if (crq->rb_key < __crq->rb_key)
574                         p = &(*p)->rb_left;
575                 else if (crq->rb_key > __crq->rb_key)
576                         p = &(*p)->rb_right;
577                 else
578                         return __crq;
579         }
580
581         rb_link_node(&crq->rb_node, parent, p);
582         return NULL;
583 }
584
585 static void cfq_add_crq_rb(struct cfq_rq *crq)
586 {
587         struct cfq_queue *cfqq = crq->cfq_queue;
588         struct cfq_data *cfqd = cfqq->cfqd;
589         struct request *rq = crq->request;
590         struct cfq_rq *__alias;
591
592         crq->rb_key = rq_rb_key(rq);
593         cfqq->queued[cfq_crq_is_sync(crq)]++;
594
595         /*
596          * looks a little odd, but the first insert might return an alias.
597          * if that happens, put the alias on the dispatch list
598          */
599         while ((__alias = __cfq_add_crq_rb(crq)) != NULL)
600                 cfq_dispatch_insert(cfqd->queue, __alias);
601
602         rb_insert_color(&crq->rb_node, &cfqq->sort_list);
603
604         if (!cfq_cfqq_on_rr(cfqq))
605                 cfq_add_cfqq_rr(cfqd, cfqq);
606
607         /*
608          * check if this request is a better next-serve candidate
609          */
610         cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq);
611 }
612
613 static inline void
614 cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
615 {
616         rb_erase(&crq->rb_node, &cfqq->sort_list);
617         cfqq->queued[cfq_crq_is_sync(crq)]--;
618
619         cfq_add_crq_rb(crq);
620 }
621
622 static struct request *
623 cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
624 {
625         struct task_struct *tsk = current;
626         pid_t key = cfq_queue_pid(tsk, bio_data_dir(bio));
627         struct cfq_queue *cfqq;
628         struct rb_node *n;
629         sector_t sector;
630
631         cfqq = cfq_find_cfq_hash(cfqd, key, tsk->ioprio);
632         if (!cfqq)
633                 goto out;
634
635         sector = bio->bi_sector + bio_sectors(bio);
636         n = cfqq->sort_list.rb_node;
637         while (n) {
638                 struct cfq_rq *crq = rb_entry_crq(n);
639
640                 if (sector < crq->rb_key)
641                         n = n->rb_left;
642                 else if (sector > crq->rb_key)
643                         n = n->rb_right;
644                 else
645                         return crq->request;
646         }
647
648 out:
649         return NULL;
650 }
651
652 static void cfq_activate_request(request_queue_t *q, struct request *rq)
653 {
654         struct cfq_data *cfqd = q->elevator->elevator_data;
655
656         cfqd->rq_in_driver++;
657
658         /*
659          * If the depth is larger than 1, it really could be queueing. But let's
660          * make the mark a little higher - idling could still be good for
661          * low queueing, and a low queueing number could also just indicate
662          * SCSI-midlayer-like behaviour where limit+1 is often seen.
663          */
664         if (!cfqd->hw_tag && cfqd->rq_in_driver > 4)
665                 cfqd->hw_tag = 1;
666 }
667
668 static void cfq_deactivate_request(request_queue_t *q, struct request *rq)
669 {
670         struct cfq_data *cfqd = q->elevator->elevator_data;
671
672         WARN_ON(!cfqd->rq_in_driver);
673         cfqd->rq_in_driver--;
674 }
675
676 static void cfq_remove_request(struct request *rq)
677 {
678         struct cfq_rq *crq = RQ_DATA(rq);
679
680         list_del_init(&rq->queuelist);
681         cfq_del_crq_rb(crq);
682         cfq_del_crq_hash(crq);
683 }
684
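/*
 * merge check: the crq hash finds a request that ends where the bio starts
 * (back merge candidate), the per-queue rbtree finds one that starts where
 * the bio ends (front merge candidate).
 */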
685 static int
686 cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
687 {
688         struct cfq_data *cfqd = q->elevator->elevator_data;
689         struct request *__rq;
690         int ret;
691
692         __rq = cfq_find_rq_hash(cfqd, bio->bi_sector);
693         if (__rq && elv_rq_merge_ok(__rq, bio)) {
694                 ret = ELEVATOR_BACK_MERGE;
695                 goto out;
696         }
697
698         __rq = cfq_find_rq_fmerge(cfqd, bio);
699         if (__rq && elv_rq_merge_ok(__rq, bio)) {
700                 ret = ELEVATOR_FRONT_MERGE;
701                 goto out;
702         }
703
704         return ELEVATOR_NO_MERGE;
705 out:
706         *req = __rq;
707         return ret;
708 }
709
710 static void cfq_merged_request(request_queue_t *q, struct request *req)
711 {
712         struct cfq_data *cfqd = q->elevator->elevator_data;
713         struct cfq_rq *crq = RQ_DATA(req);
714
715         cfq_del_crq_hash(crq);
716         cfq_add_crq_hash(cfqd, crq);
717
718         if (rq_rb_key(req) != crq->rb_key) {
719                 struct cfq_queue *cfqq = crq->cfq_queue;
720
721                 cfq_update_next_crq(crq);
722                 cfq_reposition_crq_rb(cfqq, crq);
723         }
724 }
725
726 static void
727 cfq_merged_requests(request_queue_t *q, struct request *rq,
728                     struct request *next)
729 {
730         cfq_merged_request(q, rq);
731
732         /*
733          * reposition in fifo if next is older than rq
734          */
735         if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
736             time_before(next->start_time, rq->start_time))
737                 list_move(&rq->queuelist, &next->queuelist);
738
739         cfq_remove_request(next);
740 }
741
742 static inline void
743 __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
744 {
745         if (cfqq) {
746                 /*
747                  * stop potential idle class queues waiting service
748                  */
749                 del_timer(&cfqd->idle_class_timer);
750
751                 cfqq->slice_start = jiffies;
752                 cfqq->slice_end = 0;
753                 cfqq->slice_left = 0;
754                 cfq_clear_cfqq_must_alloc_slice(cfqq);
755                 cfq_clear_cfqq_fifo_expire(cfqq);
756         }
757
758         cfqd->active_queue = cfqq;
759 }
760
761 /*
762  * current cfqq expired its slice (or was too idle), select new one
763  */
764 static void
765 __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
766                     int preempted)
767 {
768         unsigned long now = jiffies;
769
770         if (cfq_cfqq_wait_request(cfqq))
771                 del_timer(&cfqd->idle_slice_timer);
772
773         if (!preempted && !cfq_cfqq_dispatched(cfqq)) {
774                 cfqq->service_last = now;
775                 cfq_schedule_dispatch(cfqd);
776         }
777
778         cfq_clear_cfqq_must_dispatch(cfqq);
779         cfq_clear_cfqq_wait_request(cfqq);
780
781         /*
782          * store what was left of this slice, if the queue idled out
783          * or was preempted
784          */
785         if (time_after(cfqq->slice_end, now))
786                 cfqq->slice_left = cfqq->slice_end - now;
787         else
788                 cfqq->slice_left = 0;
789
790         if (cfq_cfqq_on_rr(cfqq))
791                 cfq_resort_rr_list(cfqq, preempted);
792
793         if (cfqq == cfqd->active_queue)
794                 cfqd->active_queue = NULL;
795
796         if (cfqd->active_cic) {
797                 put_io_context(cfqd->active_cic->ioc);
798                 cfqd->active_cic = NULL;
799         }
800
801         cfqd->dispatch_slice = 0;
802 }
803
804 static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted)
805 {
806         struct cfq_queue *cfqq = cfqd->active_queue;
807
808         if (cfqq)
809                 __cfq_slice_expired(cfqd, cfqq, preempted);
810 }
811
812 /* each round, the range of priority levels scanned widens by one, covering:
813  * 0
814  * 0,1
815  * 0,1,2
816  * 0,1,2,3
817  * 0,1,2,3,4
818  * 0,1,2,3,4,5
819  * 0,1,2,3,4,5,6
820  * 0,1,2,3,4,5,6,7
821  * so lower-numbered (higher) priorities are serviced more often */
822 static int cfq_get_next_prio_level(struct cfq_data *cfqd)
823 {
824         int prio, wrap;
825
826         prio = -1;
827         wrap = 0;
828         do {
829                 int p;
830
831                 for (p = cfqd->cur_prio; p <= cfqd->cur_end_prio; p++) {
832                         if (!list_empty(&cfqd->rr_list[p])) {
833                                 prio = p;
834                                 break;
835                         }
836                 }
837
838                 if (prio != -1)
839                         break;
840                 cfqd->cur_prio = 0;
841                 if (++cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
842                         cfqd->cur_end_prio = 0;
843                         if (wrap)
844                                 break;
845                         wrap = 1;
846                 }
847         } while (1);
848
849         if (unlikely(prio == -1))
850                 return -1;
851
852         BUG_ON(prio >= CFQ_PRIO_LISTS);
853
854         list_splice_init(&cfqd->rr_list[prio], &cfqd->cur_rr);
855
856         cfqd->cur_prio = prio + 1;
857         if (cfqd->cur_prio > cfqd->cur_end_prio) {
858                 cfqd->cur_end_prio = cfqd->cur_prio;
859                 cfqd->cur_prio = 0;
860         }
861         if (cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
862                 cfqd->cur_prio = 0;
863                 cfqd->cur_end_prio = 0;
864         }
865
866         return prio;
867 }
868
869 static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
870 {
871         struct cfq_queue *cfqq = NULL;
872
873         /*
874          * if the current list is non-empty, grab the first entry. if it is
875          * empty, get the next prio level and grab the first spliced entry, if any
876          */
877         if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1)
878                 cfqq = list_entry_cfqq(cfqd->cur_rr.next);
879
880         /*
881          * If no new queues are available, check if the busy list has some
882          * before falling back to idle io.
883          */
884         if (!cfqq && !list_empty(&cfqd->busy_rr))
885                 cfqq = list_entry_cfqq(cfqd->busy_rr.next);
886
887         /*
888          * if we have idle queues and no rt or be queues had pending
889          * requests, either allow immediate service if the grace period
890          * has passed or arm the idle grace timer
891          */
892         if (!cfqq && !list_empty(&cfqd->idle_rr)) {
893                 unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE;
894
895                 if (time_after_eq(jiffies, end))
896                         cfqq = list_entry_cfqq(cfqd->idle_rr.next);
897                 else
898                         mod_timer(&cfqd->idle_class_timer, end);
899         }
900
901         __cfq_set_active_queue(cfqd, cfqq);
902         return cfqq;
903 }
904
905 #define CIC_SEEKY(cic) ((cic)->seek_mean > (128 * 1024))
906
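/*
 * the active queue is empty but its slice hasn't expired: arm the idle
 * timer and wait a short while (up to cfq_slice_idle, or about 2ms for a
 * seeky process) for the task to submit another request before the slice
 * is given up.
 */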
907 static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
908
909 {
910         struct cfq_io_context *cic;
911         unsigned long sl;
912
913         WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
914         WARN_ON(cfqq != cfqd->active_queue);
915
916         /*
917          * idle is disabled, either manually or by past process history
918          */
919         if (!cfqd->cfq_slice_idle)
920                 return 0;
921         if (!cfq_cfqq_idle_window(cfqq))
922                 return 0;
923         /*
924          * task has exited, don't wait
925          */
926         cic = cfqd->active_cic;
927         if (!cic || !cic->ioc->task)
928                 return 0;
929
930         cfq_mark_cfqq_must_dispatch(cfqq);
931         cfq_mark_cfqq_wait_request(cfqq);
932
933         sl = min(cfqq->slice_end - 1, (unsigned long) cfqd->cfq_slice_idle);
934
935         /*
936          * we don't want to idle for seeks, but we do want to allow
937          * fair distribution of slice time for a process doing back-to-back
938          * seeks. so allow a little bit of time for it to submit a new rq
939          */
940         if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
941                 sl = min(sl, msecs_to_jiffies(2));
942
943         mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
944         return 1;
945 }
946
947 static void cfq_dispatch_insert(request_queue_t *q, struct cfq_rq *crq)
948 {
949         struct cfq_data *cfqd = q->elevator->elevator_data;
950         struct cfq_queue *cfqq = crq->cfq_queue;
951         struct request *rq;
952
953         cfqq->next_crq = cfq_find_next_crq(cfqd, cfqq, crq);
954         cfq_remove_request(crq->request);
955         cfqq->on_dispatch[cfq_crq_is_sync(crq)]++;
956         elv_dispatch_sort(q, crq->request);
957
958         rq = list_entry(q->queue_head.prev, struct request, queuelist);
959         cfqd->last_sector = rq->sector + rq->nr_sectors;
960 }
961
962 /*
963  * return expired entry, or NULL to just start from scratch in rbtree
964  */
965 static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq)
966 {
967         struct cfq_data *cfqd = cfqq->cfqd;
968         struct request *rq;
969         struct cfq_rq *crq;
970
971         if (cfq_cfqq_fifo_expire(cfqq))
972                 return NULL;
973
974         if (!list_empty(&cfqq->fifo)) {
975                 int fifo = cfq_cfqq_class_sync(cfqq);
976
977                 crq = RQ_DATA(list_entry_fifo(cfqq->fifo.next));
978                 rq = crq->request;
979                 if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) {
980                         cfq_mark_cfqq_fifo_expire(cfqq);
981                         return crq;
982                 }
983         }
984
985         return NULL;
986 }
987
988 /*
989  * Scale schedule slice based on io priority. Use the sync time slice only
990  * if a queue is marked sync and has sync io queued. A sync queue with async
991  * io only should not get the full sync slice length.
992  */
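/*
 * e.g. assuming HZ=1000 and the default tunables, the sync base slice is
 * 100ms and each ioprio step is worth base_slice/CFQ_SLICE_SCALE = 20ms:
 * prio 0 gets 180ms, the default prio 4 gets 100ms, prio 7 gets 40ms.
 */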
993 static inline int
994 cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
995 {
996         const int base_slice = cfqd->cfq_slice[cfq_cfqq_sync(cfqq)];
997
998         WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
999
1000         return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - cfqq->ioprio));
1001 }
1002
1003 static inline void
1004 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1005 {
1006         cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
1007 }
1008
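/*
 * e.g. with the default cfq_slice_async_rq of 2, an async queue at prio 7
 * may dispatch 4 requests per slice, prio 4 gets 16 and prio 0 gets 32.
 */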
1009 static inline int
1010 cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1011 {
1012         const int base_rq = cfqd->cfq_slice_async_rq;
1013
1014         WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
1015
1016         return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
1017 }
1018
1019 /*
1020  * get next queue for service
1021  */
1022 static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
1023 {
1024         unsigned long now = jiffies;
1025         struct cfq_queue *cfqq;
1026
1027         cfqq = cfqd->active_queue;
1028         if (!cfqq)
1029                 goto new_queue;
1030
1031         /*
1032          * slice has expired
1033          */
1034         if (!cfq_cfqq_must_dispatch(cfqq) && time_after(now, cfqq->slice_end))
1035                 goto expire;
1036
1037         /*
1038          * if queue has requests, dispatch one. if not, check if
1039          * enough slice is left to wait for one
1040          */
1041         if (!RB_EMPTY_ROOT(&cfqq->sort_list))
1042                 goto keep_queue;
1043         else if (cfq_cfqq_dispatched(cfqq)) {
1044                 cfqq = NULL;
1045                 goto keep_queue;
1046         } else if (cfq_cfqq_class_sync(cfqq)) {
1047                 if (cfq_arm_slice_timer(cfqd, cfqq))
1048                         return NULL;
1049         }
1050
1051 expire:
1052         cfq_slice_expired(cfqd, 0);
1053 new_queue:
1054         cfqq = cfq_set_active_queue(cfqd);
1055 keep_queue:
1056         return cfqq;
1057 }
1058
1059 static int
1060 __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1061                         int max_dispatch)
1062 {
1063         int dispatched = 0;
1064
1065         BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
1066
1067         do {
1068                 struct cfq_rq *crq;
1069
1070                 /*
1071                  * follow the expired path, else take the best next candidate (next_crq)
1072                  */
1073                 if ((crq = cfq_check_fifo(cfqq)) == NULL)
1074                         crq = cfqq->next_crq;
1075
1076                 /*
1077                  * finally, insert request into driver dispatch list
1078                  */
1079                 cfq_dispatch_insert(cfqd->queue, crq);
1080
1081                 cfqd->dispatch_slice++;
1082                 dispatched++;
1083
1084                 if (!cfqd->active_cic) {
1085                         atomic_inc(&crq->io_context->ioc->refcount);
1086                         cfqd->active_cic = crq->io_context;
1087                 }
1088
1089                 if (RB_EMPTY_ROOT(&cfqq->sort_list))
1090                         break;
1091
1092         } while (dispatched < max_dispatch);
1093
1094         /*
1095          * if slice end isn't set yet, set it.
1096          */
1097         if (!cfqq->slice_end)
1098                 cfq_set_prio_slice(cfqd, cfqq);
1099
1100         /*
1101          * expire an async queue immediately if it has used up its slice. an
1102          * idle queue always expires after one dispatch round.
1103          */
1104         if ((!cfq_cfqq_sync(cfqq) &&
1105             cfqd->dispatch_slice >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
1106             cfq_class_idle(cfqq) ||
1107             !cfq_cfqq_idle_window(cfqq))
1108                 cfq_slice_expired(cfqd, 0);
1109
1110         return dispatched;
1111 }
1112
1113 static int
1114 cfq_forced_dispatch_cfqqs(struct list_head *list)
1115 {
1116         struct cfq_queue *cfqq, *next;
1117         struct cfq_rq *crq;
1118         int dispatched;
1119
1120         dispatched = 0;
1121         list_for_each_entry_safe(cfqq, next, list, cfq_list) {
1122                 while ((crq = cfqq->next_crq)) {
1123                         cfq_dispatch_insert(cfqq->cfqd->queue, crq);
1124                         dispatched++;
1125                 }
1126                 BUG_ON(!list_empty(&cfqq->fifo));
1127         }
1128
1129         return dispatched;
1130 }
1131
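/*
 * forced dispatch: push every queued request from every rr list straight to
 * the dispatch list, leaving no busy queues behind.
 */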
1132 static int
1133 cfq_forced_dispatch(struct cfq_data *cfqd)
1134 {
1135         int i, dispatched = 0;
1136
1137         for (i = 0; i < CFQ_PRIO_LISTS; i++)
1138                 dispatched += cfq_forced_dispatch_cfqqs(&cfqd->rr_list[i]);
1139
1140         dispatched += cfq_forced_dispatch_cfqqs(&cfqd->busy_rr);
1141         dispatched += cfq_forced_dispatch_cfqqs(&cfqd->cur_rr);
1142         dispatched += cfq_forced_dispatch_cfqqs(&cfqd->idle_rr);
1143
1144         cfq_slice_expired(cfqd, 0);
1145
1146         BUG_ON(cfqd->busy_queues);
1147
1148         return dispatched;
1149 }
1150
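/*
 * main dispatch hook: select queues via cfq_select_queue() and dispatch up
 * to cfq_quantum requests from each (only one for idle class queues), until
 * no queue is selectable or the active queue prefers to idle.
 */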
1151 static int
1152 cfq_dispatch_requests(request_queue_t *q, int force)
1153 {
1154         struct cfq_data *cfqd = q->elevator->elevator_data;
1155         struct cfq_queue *cfqq, *prev_cfqq;
1156         int dispatched;
1157
1158         if (!cfqd->busy_queues)
1159                 return 0;
1160
1161         if (unlikely(force))
1162                 return cfq_forced_dispatch(cfqd);
1163
1164         dispatched = 0;
1165         prev_cfqq = NULL;
1166         while ((cfqq = cfq_select_queue(cfqd)) != NULL) {
1167                 int max_dispatch;
1168
1169                 /*
1170                  * Don't repeat dispatch from the previous queue.
1171                  */
1172                 if (prev_cfqq == cfqq)
1173                         break;
1174
1175                 cfq_clear_cfqq_must_dispatch(cfqq);
1176                 cfq_clear_cfqq_wait_request(cfqq);
1177                 del_timer(&cfqd->idle_slice_timer);
1178
1179                 max_dispatch = cfqd->cfq_quantum;
1180                 if (cfq_class_idle(cfqq))
1181                         max_dispatch = 1;
1182
1183                 dispatched += __cfq_dispatch_requests(cfqd, cfqq, max_dispatch);
1184
1185                 /*
1186                  * If the dispatch cfqq has idling enabled and is still
1187                  * the active queue, break out.
1188                  */
1189                 if (cfq_cfqq_idle_window(cfqq) && cfqd->active_queue)
1190                         break;
1191
1192                 prev_cfqq = cfqq;
1193         }
1194
1195         return dispatched;
1196 }
1197
1198 /*
1199  * task holds one reference to the queue, dropped when task exits. each crq
1200  * in-flight on this queue also holds a reference, dropped when crq is freed.
1201  *
1202  * queue lock must be held here.
1203  */
1204 static void cfq_put_queue(struct cfq_queue *cfqq)
1205 {
1206         struct cfq_data *cfqd = cfqq->cfqd;
1207
1208         BUG_ON(atomic_read(&cfqq->ref) <= 0);
1209
1210         if (!atomic_dec_and_test(&cfqq->ref))
1211                 return;
1212
1213         BUG_ON(rb_first(&cfqq->sort_list));
1214         BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
1215         BUG_ON(cfq_cfqq_on_rr(cfqq));
1216
1217         if (unlikely(cfqd->active_queue == cfqq))
1218                 __cfq_slice_expired(cfqd, cfqq, 0);
1219
1220         /*
1221          * it's on the empty list and still hashed
1222          */
1223         list_del(&cfqq->cfq_list);
1224         hlist_del(&cfqq->cfq_hash);
1225         kmem_cache_free(cfq_pool, cfqq);
1226 }
1227
1228 static inline struct cfq_queue *
1229 __cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio,
1230                     const int hashval)
1231 {
1232         struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
1233         struct hlist_node *entry;
1234         struct cfq_queue *__cfqq;
1235
1236         hlist_for_each_entry(__cfqq, entry, hash_list, cfq_hash) {
1237                 const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->org_ioprio_class, __cfqq->org_ioprio);
1238
1239                 if (__cfqq->key == key && (__p == prio || !prio))
1240                         return __cfqq;
1241         }
1242
1243         return NULL;
1244 }
1245
1246 static struct cfq_queue *
1247 cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned short prio)
1248 {
1249         return __cfq_find_cfq_hash(cfqd, key, prio, hash_long(key, CFQ_QHASH_SHIFT));
1250 }
1251
1252 static void cfq_free_io_context(struct io_context *ioc)
1253 {
1254         struct cfq_io_context *__cic;
1255         struct rb_node *n;
1256         int freed = 0;
1257
1258         while ((n = rb_first(&ioc->cic_root)) != NULL) {
1259                 __cic = rb_entry(n, struct cfq_io_context, rb_node);
1260                 rb_erase(&__cic->rb_node, &ioc->cic_root);
1261                 kmem_cache_free(cfq_ioc_pool, __cic);
1262                 freed++;
1263         }
1264
1265         if (atomic_sub_and_test(freed, &ioc_count) && ioc_gone)
1266                 complete(ioc_gone);
1267 }
1268
1269 static void cfq_trim(struct io_context *ioc)
1270 {
1271         ioc->set_ioprio = NULL;
1272         cfq_free_io_context(ioc);
1273 }
1274
1275 /*
1276  * Called with interrupts disabled
1277  */
1278 static void cfq_exit_single_io_context(struct cfq_io_context *cic)
1279 {
1280         struct cfq_data *cfqd = cic->key;
1281         request_queue_t *q;
1282
1283         if (!cfqd)
1284                 return;
1285
1286         q = cfqd->queue;
1287
1288         WARN_ON(!irqs_disabled());
1289
1290         spin_lock(q->queue_lock);
1291
1292         if (cic->cfqq[ASYNC]) {
1293                 if (unlikely(cic->cfqq[ASYNC] == cfqd->active_queue))
1294                         __cfq_slice_expired(cfqd, cic->cfqq[ASYNC], 0);
1295                 cfq_put_queue(cic->cfqq[ASYNC]);
1296                 cic->cfqq[ASYNC] = NULL;
1297         }
1298
1299         if (cic->cfqq[SYNC]) {
1300                 if (unlikely(cic->cfqq[SYNC] == cfqd->active_queue))
1301                         __cfq_slice_expired(cfqd, cic->cfqq[SYNC], 0);
1302                 cfq_put_queue(cic->cfqq[SYNC]);
1303                 cic->cfqq[SYNC] = NULL;
1304         }
1305
1306         cic->key = NULL;
1307         list_del_init(&cic->queue_list);
1308         spin_unlock(q->queue_lock);
1309 }
1310
1311 static void cfq_exit_io_context(struct io_context *ioc)
1312 {
1313         struct cfq_io_context *__cic;
1314         unsigned long flags;
1315         struct rb_node *n;
1316
1317         /*
1318          * put the reference this task is holding to the various queues
1319          */
1320         spin_lock_irqsave(&cfq_exit_lock, flags);
1321
1322         n = rb_first(&ioc->cic_root);
1323         while (n != NULL) {
1324                 __cic = rb_entry(n, struct cfq_io_context, rb_node);
1325
1326                 cfq_exit_single_io_context(__cic);
1327                 n = rb_next(n);
1328         }
1329
1330         spin_unlock_irqrestore(&cfq_exit_lock, flags);
1331 }
1332
1333 static struct cfq_io_context *
1334 cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
1335 {
1336         struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_mask);
1337
1338         if (cic) {
1339                 memset(cic, 0, sizeof(*cic));
1340                 cic->last_end_request = jiffies;
1341                 INIT_LIST_HEAD(&cic->queue_list);
1342                 cic->dtor = cfq_free_io_context;
1343                 cic->exit = cfq_exit_io_context;
1344                 atomic_inc(&ioc_count);
1345         }
1346
1347         return cic;
1348 }
1349
1350 static void cfq_init_prio_data(struct cfq_queue *cfqq)
1351 {
1352         struct task_struct *tsk = current;
1353         int ioprio_class;
1354
1355         if (!cfq_cfqq_prio_changed(cfqq))
1356                 return;
1357
1358         ioprio_class = IOPRIO_PRIO_CLASS(tsk->ioprio);
1359         switch (ioprio_class) {
1360                 default:
1361                         printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
1362                 case IOPRIO_CLASS_NONE:
1363                         /*
1364                          * no prio set, place us in the middle of the BE classes
1365                          */
1366                         cfqq->ioprio = task_nice_ioprio(tsk);
1367                         cfqq->ioprio_class = IOPRIO_CLASS_BE;
1368                         break;
1369                 case IOPRIO_CLASS_RT:
1370                         cfqq->ioprio = task_ioprio(tsk);
1371                         cfqq->ioprio_class = IOPRIO_CLASS_RT;
1372                         break;
1373                 case IOPRIO_CLASS_BE:
1374                         cfqq->ioprio = task_ioprio(tsk);
1375                         cfqq->ioprio_class = IOPRIO_CLASS_BE;
1376                         break;
1377                 case IOPRIO_CLASS_IDLE:
1378                         cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
1379                         cfqq->ioprio = 7;
1380                         cfq_clear_cfqq_idle_window(cfqq);
1381                         break;
1382         }
1383
1384         /*
1385          * keep track of original prio settings in case we have to temporarily
1386          * elevate the priority of this queue
1387          */
1388         cfqq->org_ioprio = cfqq->ioprio;
1389         cfqq->org_ioprio_class = cfqq->ioprio_class;
1390
1391         if (cfq_cfqq_on_rr(cfqq))
1392                 cfq_resort_rr_list(cfqq, 0);
1393
1394         cfq_clear_cfqq_prio_changed(cfqq);
1395 }
1396
1397 static inline void changed_ioprio(struct cfq_io_context *cic)
1398 {
1399         struct cfq_data *cfqd = cic->key;
1400         struct cfq_queue *cfqq;
1401
1402         if (unlikely(!cfqd))
1403                 return;
1404
1405         spin_lock(cfqd->queue->queue_lock);
1406
1407         cfqq = cic->cfqq[ASYNC];
1408         if (cfqq) {
1409                 struct cfq_queue *new_cfqq;
1410                 new_cfqq = cfq_get_queue(cfqd, CFQ_KEY_ASYNC, cic->ioc->task,
1411                                          GFP_ATOMIC);
1412                 if (new_cfqq) {
1413                         cic->cfqq[ASYNC] = new_cfqq;
1414                         cfq_put_queue(cfqq);
1415                 }
1416         }
1417
1418         cfqq = cic->cfqq[SYNC];
1419         if (cfqq)
1420                 cfq_mark_cfqq_prio_changed(cfqq);
1421
1422         spin_unlock(cfqd->queue->queue_lock);
1423 }
1424
1425 /*
1426  * callback from sys_ioprio_set, irqs are disabled
1427  */
1428 static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio)
1429 {
1430         struct cfq_io_context *cic;
1431         struct rb_node *n;
1432
1433         spin_lock(&cfq_exit_lock);
1434
1435         n = rb_first(&ioc->cic_root);
1436         while (n != NULL) {
1437                 cic = rb_entry(n, struct cfq_io_context, rb_node);
1438
1439                 changed_ioprio(cic);
1440                 n = rb_next(n);
1441         }
1442
1443         spin_unlock(&cfq_exit_lock);
1444
1445         return 0;
1446 }
1447
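/*
 * find or create the cfq_queue for the given key and priority. If the
 * allocation may sleep, the queue lock is dropped around it and the lookup
 * retried, since another task may have created the queue in the meantime.
 */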
1448 static struct cfq_queue *
1449 cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk,
1450               gfp_t gfp_mask)
1451 {
1452         const int hashval = hash_long(key, CFQ_QHASH_SHIFT);
1453         struct cfq_queue *cfqq, *new_cfqq = NULL;
1454         unsigned short ioprio;
1455
1456 retry:
1457         ioprio = tsk->ioprio;
1458         cfqq = __cfq_find_cfq_hash(cfqd, key, ioprio, hashval);
1459
1460         if (!cfqq) {
1461                 if (new_cfqq) {
1462                         cfqq = new_cfqq;
1463                         new_cfqq = NULL;
1464                 } else if (gfp_mask & __GFP_WAIT) {
1465                         spin_unlock_irq(cfqd->queue->queue_lock);
1466                         new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask);
1467                         spin_lock_irq(cfqd->queue->queue_lock);
1468                         goto retry;
1469                 } else {
1470                         cfqq = kmem_cache_alloc(cfq_pool, gfp_mask);
1471                         if (!cfqq)
1472                                 goto out;
1473                 }
1474
1475                 memset(cfqq, 0, sizeof(*cfqq));
1476
1477                 INIT_HLIST_NODE(&cfqq->cfq_hash);
1478                 INIT_LIST_HEAD(&cfqq->cfq_list);
1479                 INIT_LIST_HEAD(&cfqq->fifo);
1480
1481                 cfqq->key = key;
1482                 hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
1483                 atomic_set(&cfqq->ref, 0);
1484                 cfqq->cfqd = cfqd;
1485                 cfqq->service_last = 0;
1486                 /*
1487                  * set ->slice_left to allow preemption for a new process
1488                  */
1489                 cfqq->slice_left = 2 * cfqd->cfq_slice_idle;
1490                 cfq_mark_cfqq_idle_window(cfqq);
1491                 cfq_mark_cfqq_prio_changed(cfqq);
1492                 cfq_init_prio_data(cfqq);
1493         }
1494
1495         if (new_cfqq)
1496                 kmem_cache_free(cfq_pool, new_cfqq);
1497
1498         atomic_inc(&cfqq->ref);
1499 out:
1500         WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq);
1501         return cfqq;
1502 }
1503
1504 static void
1505 cfq_drop_dead_cic(struct io_context *ioc, struct cfq_io_context *cic)
1506 {
1507         spin_lock(&cfq_exit_lock);
1508         rb_erase(&cic->rb_node, &ioc->cic_root);
1509         list_del_init(&cic->queue_list);
1510         spin_unlock(&cfq_exit_lock);
1511         kmem_cache_free(cfq_ioc_pool, cic);
1512         atomic_dec(&ioc_count);
1513 }
1514
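/*
 * each io_context holds an rbtree (cic_root) of its cfq_io_contexts, keyed
 * by the cfq_data pointer of the device they belong to. Entries whose key
 * has been cleared on queue exit are dead and are dropped lazily here.
 */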
1515 static struct cfq_io_context *
1516 cfq_cic_rb_lookup(struct cfq_data *cfqd, struct io_context *ioc)
1517 {
1518         struct rb_node *n;
1519         struct cfq_io_context *cic;
1520         void *k, *key = cfqd;
1521
1522 restart:
1523         n = ioc->cic_root.rb_node;
1524         while (n) {
1525                 cic = rb_entry(n, struct cfq_io_context, rb_node);
1526                 /* ->key must be copied to avoid race with cfq_exit_queue() */
1527                 k = cic->key;
1528                 if (unlikely(!k)) {
1529                         cfq_drop_dead_cic(ioc, cic);
1530                         goto restart;
1531                 }
1532
1533                 if (key < k)
1534                         n = n->rb_left;
1535                 else if (key > k)
1536                         n = n->rb_right;
1537                 else
1538                         return cic;
1539         }
1540
1541         return NULL;
1542 }
1543
1544 static inline void
1545 cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
1546              struct cfq_io_context *cic)
1547 {
1548         struct rb_node **p;
1549         struct rb_node *parent;
1550         struct cfq_io_context *__cic;
1551         void *k;
1552
1553         cic->ioc = ioc;
1554         cic->key = cfqd;
1555
1556         ioc->set_ioprio = cfq_ioc_set_ioprio;
1557 restart:
1558         parent = NULL;
1559         p = &ioc->cic_root.rb_node;
1560         while (*p) {
1561                 parent = *p;
1562                 __cic = rb_entry(parent, struct cfq_io_context, rb_node);
1563                 /* ->key must be copied to avoid race with cfq_exit_queue() */
1564                 k = __cic->key;
1565                 if (unlikely(!k)) {
1566                         cfq_drop_dead_cic(ioc, __cic);
1567                         goto restart;
1568                 }
1569
1570                 if (cic->key < k)
1571                         p = &(*p)->rb_left;
1572                 else if (cic->key > k)
1573                         p = &(*p)->rb_right;
1574                 else
1575                         BUG();
1576         }
1577
1578         spin_lock(&cfq_exit_lock);
1579         rb_link_node(&cic->rb_node, parent, p);
1580         rb_insert_color(&cic->rb_node, &ioc->cic_root);
1581         list_add(&cic->queue_list, &cfqd->cic_list);
1582         spin_unlock(&cfq_exit_lock);
1583 }
1584
1585 /*
1586  * Setup general io context and cfq io context. There can be several cfq
1587  * io contexts per general io context, if this process is doing io to more
1588  * than one device managed by cfq.
1589  */
1590 static struct cfq_io_context *
1591 cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
1592 {
1593         struct io_context *ioc = NULL;
1594         struct cfq_io_context *cic;
1595
1596         might_sleep_if(gfp_mask & __GFP_WAIT);
1597
1598         ioc = get_io_context(gfp_mask);
1599         if (!ioc)
1600                 return NULL;
1601
1602         cic = cfq_cic_rb_lookup(cfqd, ioc);
1603         if (cic)
1604                 goto out;
1605
1606         cic = cfq_alloc_io_context(cfqd, gfp_mask);
1607         if (cic == NULL)
1608                 goto err;
1609
1610         cfq_cic_link(cfqd, ioc, cic);
1611 out:
1612         return cic;
1613 err:
1614         put_io_context(ioc);
1615         return NULL;
1616 }
1617
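/*
 * think time is tracked as a decaying average in fixed point: samples and
 * totals are scaled by 256 and each new observation is folded in with a
 * weight of 1/8 (7/8 of the old value is kept).
 */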
1618 static void
1619 cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
1620 {
1621         unsigned long elapsed, ttime;
1622
1623         /*
1624          * if this context already has stuff queued, thinktime is from
1625          * last queue not last end
1626          */
1627 #if 0
1628         if (time_after(cic->last_end_request, cic->last_queue))
1629                 elapsed = jiffies - cic->last_end_request;
1630         else
1631                 elapsed = jiffies - cic->last_queue;
1632 #else
1633                 elapsed = jiffies - cic->last_end_request;
1634 #endif
1635
1636         ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
1637
1638         cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
1639         cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
1640         cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
1641 }
1642
1643 static void
1644 cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic,
1645                        struct cfq_rq *crq)
1646 {
1647         sector_t sdist;
1648         u64 total;
1649
1650         if (cic->last_request_pos < crq->request->sector)
1651                 sdist = crq->request->sector - cic->last_request_pos;
1652         else
1653                 sdist = cic->last_request_pos - crq->request->sector;
1654
1655         /*
1656          * Don't allow the seek distance to get too large from the
1657          * odd fragment, pagein, etc
1658          */
1659         if (cic->seek_samples <= 60) /* second&third seek */
1660                 sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024);
1661         else
1662                 sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*64);
1663
1664         cic->seek_samples = (7*cic->seek_samples + 256) / 8;
1665         cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
1666         total = cic->seek_total + (cic->seek_samples/2);
1667         do_div(total, cic->seek_samples);
1668         cic->seek_mean = (sector_t)total;
1669 }
1670
1671 /*
1672  * Disable idle window if the process thinks too long or seeks so much that
1673  * it doesn't matter
1674  */
1675 static void
1676 cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1677                        struct cfq_io_context *cic)
1678 {
1679         int enable_idle = cfq_cfqq_idle_window(cfqq);
1680
1681         if (!cic->ioc->task || !cfqd->cfq_slice_idle ||
1682             (cfqd->hw_tag && CIC_SEEKY(cic)))
1683                 enable_idle = 0;
1684         else if (sample_valid(cic->ttime_samples)) {
1685                 if (cic->ttime_mean > cfqd->cfq_slice_idle)
1686                         enable_idle = 0;
1687                 else
1688                         enable_idle = 1;
1689         }
1690
1691         if (enable_idle)
1692                 cfq_mark_cfqq_idle_window(cfqq);
1693         else
1694                 cfq_clear_cfqq_idle_window(cfqq);
1695 }
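
/*
 * The idle_window flag decided above is what later permits idling: when a
 * sync queue with the flag set runs out of requests, cfq_arm_slice_timer()
 * may wait up to cfq_slice_idle jiffies for its next request instead of
 * switching to another queue (see cfq_completed_request() below).
 */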
1696
1697
1698 /*
1699  * Check if new_cfqq should preempt the currently active queue. Return 0 if
1700  * it should not (or if we aren't sure); returning 1 will cause a preempt.
1701  */
1702 static int
1703 cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
1704                    struct cfq_rq *crq)
1705 {
1706         struct cfq_queue *cfqq = cfqd->active_queue;
1707
1708         if (cfq_class_idle(new_cfqq))
1709                 return 0;
1710
1711         if (!cfqq)
1712                 return 0;
1713
1714         if (cfq_class_idle(cfqq))
1715                 return 1;
1716         if (!cfq_cfqq_wait_request(new_cfqq))
1717                 return 0;
1718         /*
1719          * if it doesn't have slice left, forget it
1720          */
1721         if (new_cfqq->slice_left < cfqd->cfq_slice_idle)
1722                 return 0;
1723         if (cfq_crq_is_sync(crq) && !cfq_cfqq_sync(cfqq))
1724                 return 1;
1725
1726         return 0;
1727 }
1728
1729 /*
1730  * cfqq preempts the active queue. if we allowed preempt with no slice left,
1731  * let it have half of its nominal slice.
1732  */
1733 static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1734 {
1735         struct cfq_queue *__cfqq, *next;
1736
1737         list_for_each_entry_safe(__cfqq, next, &cfqd->cur_rr, cfq_list)
1738                 cfq_resort_rr_list(__cfqq, 1);
1739
1740         if (!cfqq->slice_left)
1741                 cfqq->slice_left = cfq_prio_to_slice(cfqd, cfqq) / 2;
1742
1743         cfqq->slice_end = cfqq->slice_left + jiffies;
1744         cfq_slice_expired(cfqd, 1);
1745         __cfq_set_active_queue(cfqd, cfqq);
1746 }
1747
1748 /*
1749  * should really be an ll_rw_blk.c helper
1750  */
1751 static void cfq_start_queueing(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1752 {
1753         request_queue_t *q = cfqd->queue;
1754
1755         if (!blk_queue_plugged(q))
1756                 q->request_fn(q);
1757         else
1758                 __generic_unplug_device(q);
1759 }
1760
1761 /*
1762  * Called when a new fs request (crq) is added (to cfqq). Check if there's
1763  * something we should do about it
1764  */
1765 static void
1766 cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1767                  struct cfq_rq *crq)
1768 {
1769         struct cfq_io_context *cic = crq->io_context;
1770
1771         /*
1772          * we never wait for an async request and we don't allow preemption
1773          * of an async request. so just return early
1774          */
1775         if (!cfq_crq_is_sync(crq)) {
1776                 /*
1777                  * sync process issued an async request, if it's waiting
1778                  * then expire it and kick rq handling.
1779                  */
1780                 if (cic == cfqd->active_cic &&
1781                     del_timer(&cfqd->idle_slice_timer)) {
1782                         cfq_slice_expired(cfqd, 0);
1783                         cfq_start_queueing(cfqd, cfqq);
1784                 }
1785                 return;
1786         }
1787
1788         cfq_update_io_thinktime(cfqd, cic);
1789         cfq_update_io_seektime(cfqd, cic, crq);
1790         cfq_update_idle_window(cfqd, cfqq, cic);
1791
1792         cic->last_queue = jiffies;
1793         cic->last_request_pos = crq->request->sector + crq->request->nr_sectors;
1794
1795         if (cfqq == cfqd->active_queue) {
1796                 /*
1797                  * if we are waiting for a request for this queue, let it rip
1798                  * immediately and flag that we must not expire this queue
1799                  * just now
1800                  */
1801                 if (cfq_cfqq_wait_request(cfqq)) {
1802                         cfq_mark_cfqq_must_dispatch(cfqq);
1803                         del_timer(&cfqd->idle_slice_timer);
1804                         cfq_start_queueing(cfqd, cfqq);
1805                 }
1806         } else if (cfq_should_preempt(cfqd, cfqq, crq)) {
1807                 /*
1808                  * not the active queue - expire the current slice if it is
1809                  * idle and has expired its mean thinktime, or if this new queue
1810                  * has some old slice time left and is of higher priority
1811                  */
1812                 cfq_preempt_queue(cfqd, cfqq);
1813                 cfq_mark_cfqq_must_dispatch(cfqq);
1814                 cfq_start_queueing(cfqd, cfqq);
1815         }
1816 }
1817
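/*
 * elevator_add_req_fn: a new request enters the scheduler. Hook its crq into
 * the queue's sector-sorted rb-tree, the time-ordered fifo and (if the
 * request is mergeable) the merge hash, then run the enqueue heuristics
 * above.
 */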
1818 static void cfq_insert_request(request_queue_t *q, struct request *rq)
1819 {
1820         struct cfq_data *cfqd = q->elevator->elevator_data;
1821         struct cfq_rq *crq = RQ_DATA(rq);
1822         struct cfq_queue *cfqq = crq->cfq_queue;
1823
1824         cfq_init_prio_data(cfqq);
1825
1826         cfq_add_crq_rb(crq);
1827
1828         list_add_tail(&rq->queuelist, &cfqq->fifo);
1829
1830         if (rq_mergeable(rq))
1831                 cfq_add_crq_hash(cfqd, crq);
1832
1833         cfq_crq_enqueued(cfqd, cfqq, crq);
1834 }
1835
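/*
 * elevator_completed_req_fn: per-completion accounting. Drop the driver and
 * dispatch counts, record the completion time, and for the active queue
 * either expire its slice or arm the idle timer when a sync queue has just
 * run dry.
 */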
1836 static void cfq_completed_request(request_queue_t *q, struct request *rq)
1837 {
1838         struct cfq_rq *crq = RQ_DATA(rq);
1839         struct cfq_queue *cfqq = crq->cfq_queue;
1840         struct cfq_data *cfqd = cfqq->cfqd;
1841         const int sync = cfq_crq_is_sync(crq);
1842         unsigned long now;
1843
1844         now = jiffies;
1845
1846         WARN_ON(!cfqd->rq_in_driver);
1847         WARN_ON(!cfqq->on_dispatch[sync]);
1848         cfqd->rq_in_driver--;
1849         cfqq->on_dispatch[sync]--;
1850
1851         if (!cfq_class_idle(cfqq))
1852                 cfqd->last_end_request = now;
1853
1854         if (!cfq_cfqq_dispatched(cfqq)) {
1855                 if (cfq_cfqq_on_rr(cfqq)) {
1856                         cfqq->service_last = now;
1857                         cfq_resort_rr_list(cfqq, 0);
1858                 }
1859         }
1860
1861         if (sync)
1862                 crq->io_context->last_end_request = now;
1863
1864         /*
1865          * If this is the active queue, check if it needs to be expired,
1866          * or if we want to idle in case it has no pending requests.
1867          */
1868         if (cfqd->active_queue == cfqq) {
1869                 if (time_after(now, cfqq->slice_end))
1870                         cfq_slice_expired(cfqd, 0);
1871                 else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list)) {
1872                         if (!cfq_arm_slice_timer(cfqd, cfqq))
1873                                 cfq_schedule_dispatch(cfqd);
1874                 }
1875         }
1876 }
1877
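/*
 * The former/latter hooks walk the per-queue rb-tree in sector order; the
 * elevator core uses them when probing neighbouring requests for merges.
 */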
1878 static struct request *
1879 cfq_former_request(request_queue_t *q, struct request *rq)
1880 {
1881         struct cfq_rq *crq = RQ_DATA(rq);
1882         struct rb_node *rbprev = rb_prev(&crq->rb_node);
1883
1884         if (rbprev)
1885                 return rb_entry_crq(rbprev)->request;
1886
1887         return NULL;
1888 }
1889
1890 static struct request *
1891 cfq_latter_request(request_queue_t *q, struct request *rq)
1892 {
1893         struct cfq_rq *crq = RQ_DATA(rq);
1894         struct rb_node *rbnext = rb_next(&crq->rb_node);
1895
1896         if (rbnext)
1897                 return rb_entry_crq(rbnext)->request;
1898
1899         return NULL;
1900 }
1901
1902 /*
1903  * we temporarily boost lower priority queues if they are holding fs exclusive
1904  * resources. they are boosted to normal prio (CLASS_BE/4)
1905  */
1906 static void cfq_prio_boost(struct cfq_queue *cfqq)
1907 {
1908         const int ioprio_class = cfqq->ioprio_class;
1909         const int ioprio = cfqq->ioprio;
1910
1911         if (has_fs_excl()) {
1912                 /*
1913                  * boost idle prio on transactions that would lock out other
1914                  * users of the filesystem
1915                  */
1916                 if (cfq_class_idle(cfqq))
1917                         cfqq->ioprio_class = IOPRIO_CLASS_BE;
1918                 if (cfqq->ioprio > IOPRIO_NORM)
1919                         cfqq->ioprio = IOPRIO_NORM;
1920         } else {
1921                 /*
1922                  * check if we need to unboost the queue
1923                  */
1924                 if (cfqq->ioprio_class != cfqq->org_ioprio_class)
1925                         cfqq->ioprio_class = cfqq->org_ioprio_class;
1926                 if (cfqq->ioprio != cfqq->org_ioprio)
1927                         cfqq->ioprio = cfqq->org_ioprio;
1928         }
1929
1930         /*
1931          * refile between round-robin lists if we moved the priority class
1932          */
1933         if ((ioprio_class != cfqq->ioprio_class || ioprio != cfqq->ioprio) &&
1934             cfq_cfqq_on_rr(cfqq))
1935                 cfq_resort_rr_list(cfqq, 0);
1936 }
1937
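/*
 * Returning ELV_MQUEUE_MUST lets a queue we are idling on (or one flagged
 * must_alloc) allocate a request even when the request list is congested;
 * must_alloc_slice limits that escape hatch to roughly once per slice.
 */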
1938 static inline int
1939 __cfq_may_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1940                 struct task_struct *task, int rw)
1941 {
1942         if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) &&
1943             !cfq_cfqq_must_alloc_slice(cfqq)) {
1944                 cfq_mark_cfqq_must_alloc_slice(cfqq);
1945                 return ELV_MQUEUE_MUST;
1946         }
1947
1948         return ELV_MQUEUE_MAY;
1949 }
1950
1951 static int cfq_may_queue(request_queue_t *q, int rw, struct bio *bio)
1952 {
1953         struct cfq_data *cfqd = q->elevator->elevator_data;
1954         struct task_struct *tsk = current;
1955         struct cfq_queue *cfqq;
1956
1957         /*
1958          * don't force setup of a queue from here, as a call to may_queue
1959          * does not necessarily imply that a request actually will be queued.
1960          * so just look up a possibly existing queue, or return 'may queue'
1961          * if that fails
1962          */
1963         cfqq = cfq_find_cfq_hash(cfqd, cfq_queue_pid(tsk, rw), tsk->ioprio);
1964         if (cfqq) {
1965                 cfq_init_prio_data(cfqq);
1966                 cfq_prio_boost(cfqq);
1967
1968                 return __cfq_may_queue(cfqd, cfqq, tsk, rw);
1969         }
1970
1971         return ELV_MQUEUE_MAY;
1972 }
1973
1974 static void cfq_check_waiters(request_queue_t *q, struct cfq_queue *cfqq)
1975 {
1976         struct cfq_data *cfqd = q->elevator->elevator_data;
1977
1978         if (unlikely(cfqd->rq_starved)) {
1979                 struct request_list *rl = &q->rq;
1980
1981                 smp_mb();
1982                 if (waitqueue_active(&rl->wait[READ]))
1983                         wake_up(&rl->wait[READ]);
1984                 if (waitqueue_active(&rl->wait[WRITE]))
1985                         wake_up(&rl->wait[WRITE]);
1986         }
1987 }
1988
1989 /*
1990  * queue lock held here
1991  */
1992 static void cfq_put_request(request_queue_t *q, struct request *rq)
1993 {
1994         struct cfq_data *cfqd = q->elevator->elevator_data;
1995         struct cfq_rq *crq = RQ_DATA(rq);
1996
1997         if (crq) {
1998                 struct cfq_queue *cfqq = crq->cfq_queue;
1999                 const int rw = rq_data_dir(rq);
2000
2001                 BUG_ON(!cfqq->allocated[rw]);
2002                 cfqq->allocated[rw]--;
2003
2004                 put_io_context(crq->io_context->ioc);
2005
2006                 mempool_free(crq, cfqd->crq_pool);
2007                 rq->elevator_private = NULL;
2008
2009                 cfq_check_waiters(q, cfqq);
2010                 cfq_put_queue(cfqq);
2011         }
2012 }
2013
2014 /*
2015  * Allocate cfq data structures associated with this request.
2016  */
2017 static int
2018 cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
2019                 gfp_t gfp_mask)
2020 {
2021         struct cfq_data *cfqd = q->elevator->elevator_data;
2022         struct task_struct *tsk = current;
2023         struct cfq_io_context *cic;
2024         const int rw = rq_data_dir(rq);
2025         pid_t key = cfq_queue_pid(tsk, rw);
2026         struct cfq_queue *cfqq;
2027         struct cfq_rq *crq;
2028         unsigned long flags;
2029         int is_sync = key != CFQ_KEY_ASYNC;
2030
2031         might_sleep_if(gfp_mask & __GFP_WAIT);
2032
2033         cic = cfq_get_io_context(cfqd, gfp_mask);
2034
2035         spin_lock_irqsave(q->queue_lock, flags);
2036
2037         if (!cic)
2038                 goto queue_fail;
2039
2040         if (!cic->cfqq[is_sync]) {
2041                 cfqq = cfq_get_queue(cfqd, key, tsk, gfp_mask);
2042                 if (!cfqq)
2043                         goto queue_fail;
2044
2045                 cic->cfqq[is_sync] = cfqq;
2046         } else
2047                 cfqq = cic->cfqq[is_sync];
2048
2049         cfqq->allocated[rw]++;
2050         cfq_clear_cfqq_must_alloc(cfqq);
2051         cfqd->rq_starved = 0;
2052         atomic_inc(&cfqq->ref);
2053         spin_unlock_irqrestore(q->queue_lock, flags);
2054
2055         crq = mempool_alloc(cfqd->crq_pool, gfp_mask);
2056         if (crq) {
2057                 RB_CLEAR_NODE(&crq->rb_node);
2058                 crq->rb_key = 0;
2059                 crq->request = rq;
2060                 INIT_HLIST_NODE(&crq->hash);
2061                 crq->cfq_queue = cfqq;
2062                 crq->io_context = cic;
2063
2064                 if (is_sync)
2065                         cfq_mark_crq_is_sync(crq);
2066                 else
2067                         cfq_clear_crq_is_sync(crq);
2068
2069                 rq->elevator_private = crq;
2070                 return 0;
2071         }
2072
2073         spin_lock_irqsave(q->queue_lock, flags);
2074         cfqq->allocated[rw]--;
2075         if (!(cfqq->allocated[0] + cfqq->allocated[1]))
2076                 cfq_mark_cfqq_must_alloc(cfqq);
2077         cfq_put_queue(cfqq);
2078 queue_fail:
2079         if (cic)
2080                 put_io_context(cic->ioc);
2081         /*
2082          * mark us rq allocation starved. we need to kickstart the process
2083          * ourselves if there are no pending requests that can do it for us.
2084          * that would be an extremely rare OOM situation
2085          */
2086         cfqd->rq_starved = 1;
2087         cfq_schedule_dispatch(cfqd);
2088         spin_unlock_irqrestore(q->queue_lock, flags);
2089         return 1;
2090 }
2091
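/*
 * Work function hooked up to cfqd->unplug_work (scheduled via
 * cfq_schedule_dispatch()): with the queue lock held, wake any tasks stalled
 * in request allocation if rq_starved was flagged, then unplug the queue and
 * re-run its request_fn.
 */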
2092 static void cfq_kick_queue(void *data)
2093 {
2094         request_queue_t *q = data;
2095         struct cfq_data *cfqd = q->elevator->elevator_data;
2096         unsigned long flags;
2097
2098         spin_lock_irqsave(q->queue_lock, flags);
2099
2100         if (cfqd->rq_starved) {
2101                 struct request_list *rl = &q->rq;
2102
2103                 /*
2104                  * we aren't guaranteed to get a request after this, but we
2105                  * have to be opportunistic
2106                  */
2107                 smp_mb();
2108                 if (waitqueue_active(&rl->wait[READ]))
2109                         wake_up(&rl->wait[READ]);
2110                 if (waitqueue_active(&rl->wait[WRITE]))
2111                         wake_up(&rl->wait[WRITE]);
2112         }
2113
2114         blk_remove_plug(q);
2115         q->request_fn(q);
2116         spin_unlock_irqrestore(q->queue_lock, flags);
2117 }
2118
2119 /*
2120  * Timer running if the active_queue is currently idling inside its time slice
2121  */
2122 static void cfq_idle_slice_timer(unsigned long data)
2123 {
2124         struct cfq_data *cfqd = (struct cfq_data *) data;
2125         struct cfq_queue *cfqq;
2126         unsigned long flags;
2127
2128         spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2129
2130         if ((cfqq = cfqd->active_queue) != NULL) {
2131                 unsigned long now = jiffies;
2132
2133                 /*
2134                  * expired
2135                  */
2136                 if (time_after(now, cfqq->slice_end))
2137                         goto expire;
2138
2139                 /*
2140                  * only expire and reinvoke the request handler if there
2141                  * are other queues with pending requests
2142                  */
2143                 if (!cfqd->busy_queues)
2144                         goto out_cont;
2145
2146                 /*
2147                  * not expired and it has a request pending, let it dispatch
2148                  */
2149                 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) {
2150                         cfq_mark_cfqq_must_dispatch(cfqq);
2151                         goto out_kick;
2152                 }
2153         }
2154 expire:
2155         cfq_slice_expired(cfqd, 0);
2156 out_kick:
2157         cfq_schedule_dispatch(cfqd);
2158 out_cont:
2159         spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2160 }
2161
2162 /*
2163  * Timer running if an idle class queue is waiting for service
2164  */
2165 static void cfq_idle_class_timer(unsigned long data)
2166 {
2167         struct cfq_data *cfqd = (struct cfq_data *) data;
2168         unsigned long flags, end;
2169
2170         spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2171
2172         /*
2173          * race with a non-idle queue, reset timer
2174          */
2175         end = cfqd->last_end_request + CFQ_IDLE_GRACE;
2176         if (!time_after_eq(jiffies, end))
2177                 mod_timer(&cfqd->idle_class_timer, end);
2178         else
2179                 cfq_schedule_dispatch(cfqd);
2180
2181         spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2182 }
2183
2184 static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
2185 {
2186         del_timer_sync(&cfqd->idle_slice_timer);
2187         del_timer_sync(&cfqd->idle_class_timer);
2188         blk_sync_queue(cfqd->queue);
2189 }
2190
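/*
 * elevator_exit_fn: shut down the timers, expire any active queue and, under
 * cfq_exit_lock, detach every cfq_io_context still linked to this device
 * before freeing the mempool and hashes.
 */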
2191 static void cfq_exit_queue(elevator_t *e)
2192 {
2193         struct cfq_data *cfqd = e->elevator_data;
2194         request_queue_t *q = cfqd->queue;
2195
2196         cfq_shutdown_timer_wq(cfqd);
2197
2198         spin_lock(&cfq_exit_lock);
2199         spin_lock_irq(q->queue_lock);
2200
2201         if (cfqd->active_queue)
2202                 __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
2203
2204         while (!list_empty(&cfqd->cic_list)) {
2205                 struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
2206                                                         struct cfq_io_context,
2207                                                         queue_list);
2208                 if (cic->cfqq[ASYNC]) {
2209                         cfq_put_queue(cic->cfqq[ASYNC]);
2210                         cic->cfqq[ASYNC] = NULL;
2211                 }
2212                 if (cic->cfqq[SYNC]) {
2213                         cfq_put_queue(cic->cfqq[SYNC]);
2214                         cic->cfqq[SYNC] = NULL;
2215                 }
2216                 cic->key = NULL;
2217                 list_del_init(&cic->queue_list);
2218         }
2219
2220         spin_unlock_irq(q->queue_lock);
2221         spin_unlock(&cfq_exit_lock);
2222
2223         cfq_shutdown_timer_wq(cfqd);
2224
2225         mempool_destroy(cfqd->crq_pool);
2226         kfree(cfqd->crq_hash);
2227         kfree(cfqd->cfq_hash);
2228         kfree(cfqd);
2229 }
2230
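/*
 * elevator_init_fn: allocate the per-device cfq_data, its request and queue
 * hashes and the crq mempool, set up the idle timers and unplug work, and
 * copy in the default tunables. On failure the allocations are unwound in
 * reverse order via the goto labels at the bottom.
 */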
2231 static void *cfq_init_queue(request_queue_t *q, elevator_t *e)
2232 {
2233         struct cfq_data *cfqd;
2234         int i;
2235
2236         cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL);
2237         if (!cfqd)
2238                 return NULL;
2239
2240         memset(cfqd, 0, sizeof(*cfqd));
2241
2242         for (i = 0; i < CFQ_PRIO_LISTS; i++)
2243                 INIT_LIST_HEAD(&cfqd->rr_list[i]);
2244
2245         INIT_LIST_HEAD(&cfqd->busy_rr);
2246         INIT_LIST_HEAD(&cfqd->cur_rr);
2247         INIT_LIST_HEAD(&cfqd->idle_rr);
2248         INIT_LIST_HEAD(&cfqd->empty_list);
2249         INIT_LIST_HEAD(&cfqd->cic_list);
2250
2251         cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL);
2252         if (!cfqd->crq_hash)
2253                 goto out_crqhash;
2254
2255         cfqd->cfq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL);
2256         if (!cfqd->cfq_hash)
2257                 goto out_cfqhash;
2258
2259         cfqd->crq_pool = mempool_create_slab_pool(BLKDEV_MIN_RQ, crq_pool);
2260         if (!cfqd->crq_pool)
2261                 goto out_crqpool;
2262
2263         for (i = 0; i < CFQ_MHASH_ENTRIES; i++)
2264                 INIT_HLIST_HEAD(&cfqd->crq_hash[i]);
2265         for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
2266                 INIT_HLIST_HEAD(&cfqd->cfq_hash[i]);
2267
2268         cfqd->queue = q;
2269
2270         init_timer(&cfqd->idle_slice_timer);
2271         cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
2272         cfqd->idle_slice_timer.data = (unsigned long) cfqd;
2273
2274         init_timer(&cfqd->idle_class_timer);
2275         cfqd->idle_class_timer.function = cfq_idle_class_timer;
2276         cfqd->idle_class_timer.data = (unsigned long) cfqd;
2277
2278         INIT_WORK(&cfqd->unplug_work, cfq_kick_queue, q);
2279
2280         cfqd->cfq_queued = cfq_queued;
2281         cfqd->cfq_quantum = cfq_quantum;
2282         cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
2283         cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
2284         cfqd->cfq_back_max = cfq_back_max;
2285         cfqd->cfq_back_penalty = cfq_back_penalty;
2286         cfqd->cfq_slice[0] = cfq_slice_async;
2287         cfqd->cfq_slice[1] = cfq_slice_sync;
2288         cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
2289         cfqd->cfq_slice_idle = cfq_slice_idle;
2290
2291         return cfqd;
2292 out_crqpool:
2293         kfree(cfqd->cfq_hash);
2294 out_cfqhash:
2295         kfree(cfqd->crq_hash);
2296 out_crqhash:
2297         kfree(cfqd);
2298         return NULL;
2299 }
2300
2301 static void cfq_slab_kill(void)
2302 {
2303         if (crq_pool)
2304                 kmem_cache_destroy(crq_pool);
2305         if (cfq_pool)
2306                 kmem_cache_destroy(cfq_pool);
2307         if (cfq_ioc_pool)
2308                 kmem_cache_destroy(cfq_ioc_pool);
2309 }
2310
2311 static int __init cfq_slab_setup(void)
2312 {
2313         crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0,
2314                                         NULL, NULL);
2315         if (!crq_pool)
2316                 goto fail;
2317
2318         cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0,
2319                                         NULL, NULL);
2320         if (!cfq_pool)
2321                 goto fail;
2322
2323         cfq_ioc_pool = kmem_cache_create("cfq_ioc_pool",
2324                         sizeof(struct cfq_io_context), 0, 0, NULL, NULL);
2325         if (!cfq_ioc_pool)
2326                 goto fail;
2327
2328         return 0;
2329 fail:
2330         cfq_slab_kill();
2331         return -ENOMEM;
2332 }
2333
2334 /*
2335  * sysfs parts below -->
2336  */
2337
2338 static ssize_t
2339 cfq_var_show(unsigned int var, char *page)
2340 {
2341         return sprintf(page, "%d\n", var);
2342 }
2343
2344 static ssize_t
2345 cfq_var_store(unsigned int *var, const char *page, size_t count)
2346 {
2347         char *p = (char *) page;
2348
2349         *var = simple_strtoul(p, &p, 10);
2350         return count;
2351 }
2352
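/*
 * Each SHOW_FUNCTION() instantiation below generates a small sysfs show
 * helper. For example, with __CONV == 1,
 * SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1) expands to
 * roughly:
 *
 *	static ssize_t cfq_slice_idle_show(elevator_t *e, char *page)
 *	{
 *		struct cfq_data *cfqd = e->elevator_data;
 *		unsigned int __data = cfqd->cfq_slice_idle;
 *		__data = jiffies_to_msecs(__data);
 *		return cfq_var_show(__data, page);
 *	}
 */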
2353 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                            \
2354 static ssize_t __FUNC(elevator_t *e, char *page)                        \
2355 {                                                                       \
2356         struct cfq_data *cfqd = e->elevator_data;                       \
2357         unsigned int __data = __VAR;                                    \
2358         if (__CONV)                                                     \
2359                 __data = jiffies_to_msecs(__data);                      \
2360         return cfq_var_show(__data, (page));                            \
2361 }
2362 SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
2363 SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0);
2364 SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
2365 SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
2366 SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
2367 SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
2368 SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
2369 SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
2370 SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
2371 SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
2372 #undef SHOW_FUNCTION
2373
2374 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                 \
2375 static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)    \
2376 {                                                                       \
2377         struct cfq_data *cfqd = e->elevator_data;                       \
2378         unsigned int __data;                                            \
2379         int ret = cfq_var_store(&__data, (page), count);                \
2380         if (__data < (MIN))                                             \
2381                 __data = (MIN);                                         \
2382         else if (__data > (MAX))                                        \
2383                 __data = (MAX);                                         \
2384         if (__CONV)                                                     \
2385                 *(__PTR) = msecs_to_jiffies(__data);                    \
2386         else                                                            \
2387                 *(__PTR) = __data;                                      \
2388         return ret;                                                     \
2389 }
2390 STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
2391 STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0);
2392 STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1);
2393 STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1);
2394 STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
2395 STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0);
2396 STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
2397 STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
2398 STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
2399 STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0);
2400 #undef STORE_FUNCTION
2401
2402 #define CFQ_ATTR(name) \
2403         __ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
2404
2405 static struct elv_fs_entry cfq_attrs[] = {
2406         CFQ_ATTR(quantum),
2407         CFQ_ATTR(queued),
2408         CFQ_ATTR(fifo_expire_sync),
2409         CFQ_ATTR(fifo_expire_async),
2410         CFQ_ATTR(back_seek_max),
2411         CFQ_ATTR(back_seek_penalty),
2412         CFQ_ATTR(slice_sync),
2413         CFQ_ATTR(slice_async),
2414         CFQ_ATTR(slice_async_rq),
2415         CFQ_ATTR(slice_idle),
2416         __ATTR_NULL
2417 };
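/*
 * These attributes appear under /sys/block/<dev>/queue/iosched/ while cfq is
 * the active scheduler for that device, e.g.:
 *
 *	cat /sys/block/sda/queue/iosched/slice_idle
 *	echo 0 > /sys/block/sda/queue/iosched/slice_idle
 *
 * Time-based tunables are exported in milliseconds (see the __CONV handling
 * above), the rest as plain integers.
 */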
2418
2419 static struct elevator_type iosched_cfq = {
2420         .ops = {
2421                 .elevator_merge_fn =            cfq_merge,
2422                 .elevator_merged_fn =           cfq_merged_request,
2423                 .elevator_merge_req_fn =        cfq_merged_requests,
2424                 .elevator_dispatch_fn =         cfq_dispatch_requests,
2425                 .elevator_add_req_fn =          cfq_insert_request,
2426                 .elevator_activate_req_fn =     cfq_activate_request,
2427                 .elevator_deactivate_req_fn =   cfq_deactivate_request,
2428                 .elevator_queue_empty_fn =      cfq_queue_empty,
2429                 .elevator_completed_req_fn =    cfq_completed_request,
2430                 .elevator_former_req_fn =       cfq_former_request,
2431                 .elevator_latter_req_fn =       cfq_latter_request,
2432                 .elevator_set_req_fn =          cfq_set_request,
2433                 .elevator_put_req_fn =          cfq_put_request,
2434                 .elevator_may_queue_fn =        cfq_may_queue,
2435                 .elevator_init_fn =             cfq_init_queue,
2436                 .elevator_exit_fn =             cfq_exit_queue,
2437                 .trim =                         cfq_trim,
2438         },
2439         .elevator_attrs =       cfq_attrs,
2440         .elevator_name =        "cfq",
2441         .elevator_owner =       THIS_MODULE,
2442 };
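/*
 * cfq can be selected per-device at runtime, e.g.
 *
 *	echo cfq > /sys/block/<dev>/queue/scheduler
 *
 * or made the boot-time default with the "elevator=cfq" kernel parameter.
 */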
2443
2444 static int __init cfq_init(void)
2445 {
2446         int ret;
2447
2448         /*
2449          * could be 0 on HZ < 1000 setups
2450          */
2451         if (!cfq_slice_async)
2452                 cfq_slice_async = 1;
2453         if (!cfq_slice_idle)
2454                 cfq_slice_idle = 1;
2455
2456         if (cfq_slab_setup())
2457                 return -ENOMEM;
2458
2459         ret = elv_register(&iosched_cfq);
2460         if (ret)
2461                 cfq_slab_kill();
2462
2463         return ret;
2464 }
2465
2466 static void __exit cfq_exit(void)
2467 {
2468         DECLARE_COMPLETION_ONSTACK(all_gone);
2469         elv_unregister(&iosched_cfq);
2470         ioc_gone = &all_gone;
2471         /* ioc_gone's update must be visible before reading ioc_count */
2472         smp_wmb();
2473         if (atomic_read(&ioc_count))
2474                 wait_for_completion(ioc_gone);
2475         synchronize_rcu();
2476         cfq_slab_kill();
2477 }
2478
2479 module_init(cfq_init);
2480 module_exit(cfq_exit);
2481
2482 MODULE_AUTHOR("Jens Axboe");
2483 MODULE_LICENSE("GPL");
2484 MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");