1 /*
2  *  linux/drivers/block/cfq-iosched.c
3  *
4  *  CFQ, or complete fairness queueing, disk scheduler.
5  *
6  *  Based on ideas from a previously unfinished io
7  *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
8  *
9  *  IO priorities are supported, from 0% to 100% in 5% increments. Both of
10  *  those values have special meaning - the 0% class is allowed to do io
11  *  only if no one else wants to use the disk. 100% is considered real-time
12  *  io and always gets priority. The default process io rate is 95%. In the
13  *  absence of other io, a class may consume 100% disk bandwidth regardless.
14  *  Within a class, bandwidth is distributed equally among the citizens.
15  *
16  * TODO:
17  *      - cfq_select_requests() needs some work for 5-95% io
18  *      - barriers not supported
19  *      - export grace periods in ms, not jiffies
20  *
21  *  Copyright (C) 2003 Jens Axboe <axboe@suse.de>
22  */
23 #include <linux/kernel.h>
24 #include <linux/fs.h>
25 #include <linux/blkdev.h>
26 #include <linux/elevator.h>
27 #include <linux/bio.h>
28 #include <linux/config.h>
29 #include <linux/module.h>
30 #include <linux/slab.h>
31 #include <linux/init.h>
32 #include <linux/compiler.h>
33 #include <linux/hash.h>
34 #include <linux/rbtree.h>
35 #include <linux/mempool.h>
36 #include <asm/div64.h>
37
38 #if IOPRIO_NR > BITS_PER_LONG
39 #error Cannot support this many io priority levels
40 #endif
41
42 /*
43  * tunables
44  */
45 static int cfq_quantum = 6;
46 static int cfq_quantum_io = 256;
47 static int cfq_idle_quantum = 1;
48 static int cfq_idle_quantum_io = 64;
49 static int cfq_queued = 4;
50 static int cfq_grace_rt = HZ / 100 ?: 1;
51 static int cfq_grace_idle = HZ / 10;
52
53 #define CFQ_EPOCH               1000000000
54 #define CFQ_SECTORATE           1000   
55 #define CFQ_HMAX_PCT            80
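/*
 * Rough guide to the units involved (as the arithmetic in
 * __cfq_check_limit() works out): cfq_epoch is in nanoseconds, so CFQ_EPOCH
 * is a one second epoch.  A "sectorate" is a sector budget per NS4SCALE
 * nanoseconds (100ms, defined further down); with the default CFQ_SECTORATE
 * of 1000 a queue is allowed roughly 1000 sectors (512KB) per 100ms, about
 * 5MB/s, before it starts getting skipped.  CFQ_HMAX_PCT feeds the pause
 * calculation for queues that exceed their rate.
 */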
56
57 #define CFQ_QHASH_SHIFT         6
58 #define CFQ_QHASH_ENTRIES       (1 << CFQ_QHASH_SHIFT)
59 #define list_entry_qhash(entry) hlist_entry((entry), struct cfq_queue, cfq_hash)
60
61 #define CFQ_MHASH_SHIFT         8
62 #define CFQ_MHASH_BLOCK(sec)    ((sec) >> 3)
63 #define CFQ_MHASH_ENTRIES       (1 << CFQ_MHASH_SHIFT)
64 #define CFQ_MHASH_FN(sec)       (hash_long(CFQ_MHASH_BLOCK((sec)),CFQ_MHASH_SHIFT))
65 #define rq_hash_key(rq)         ((rq)->sector + (rq)->nr_sectors)
66 #define list_entry_hash(ptr)    hlist_entry((ptr), struct cfq_rq, hash)
67
68 #define list_entry_cfqq(ptr)    list_entry((ptr), struct cfq_queue, cfq_list)
69 #define list_entry_prio(ptr)    list_entry((ptr), struct cfq_rq, prio_list)
70
71 #define cfq_account_io(crq)     \
72         ((crq)->ioprio != IOPRIO_IDLE && (crq)->ioprio != IOPRIO_RT)
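/*
 * Only the regular priority levels take part in the proportional
 * busy_rq/busy_sectors bookkeeping; idle and real-time requests are
 * excluded by cfq_account_io() above.
 */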
73
74 /*
75  * defines how we distribute bandwidth (can be tgid, uid, etc)
76  */
77
78 /* FIXME: change hash_key to be sizeof(void *) rather than sizeof(int) 
79  * otherwise the cast of cki_tsk_icls will not work reliably on 64-bit arches.
80  * OR, change cki_tsk_icls to return ints (will need another id space to be 
81  * managed)
82  */
83
84 #if defined(CONFIG_CKRM_RES_BLKIO) || defined(CONFIG_CKRM_RES_BLKIO_MODULE)
85 extern void *cki_hash_key(struct task_struct *tsk);
86 extern int cki_ioprio(struct task_struct *tsk);
87 extern void *cki_cfqpriv(struct task_struct *tsk); 
88
89 #define cfq_hash_key(tsk)   ((int)cki_hash_key((tsk)))
90 #define cfq_ioprio(tsk) (cki_ioprio((tsk)))
91 #define cfq_cfqpriv(cfqd,tsk)   (cki_cfqpriv((tsk)))
92
93 #else
94 #define cfq_hash_key(tsk)       ((tsk)->tgid)
95 #define cfq_cfqpriv(cfqd,tsk)   (&(((cfqd)->cid[(tsk)->ioprio]).cfqpriv))
96
97 /*
98  * move to io_context
99  */
100 #define cfq_ioprio(tsk) ((tsk)->ioprio)
101 #endif
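/*
 * With CONFIG_CKRM_RES_BLKIO the fairness key and io priority come from the
 * task's CKRM io class via the cki_* hooks above; otherwise requests are
 * grouped per thread group (tgid) and the per-task ioprio is used directly.
 */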
102
103 #define CFQ_WAIT_RT     0
104 #define CFQ_WAIT_NORM   1
105
106 static kmem_cache_t *crq_pool;
107 static kmem_cache_t *cfq_pool;
108 static mempool_t *cfq_mpool;
109
110 /*
111  * defines an io priority level
112  */
113 struct io_prio_data {
114         struct list_head rr_list;
115         int busy_queues;
116         int busy_rq;
117         unsigned long busy_sectors;
118         
119         /* requests, sectors and queues 
120          * added(in),dispatched/deleted(out) 
121          * at this priority level. 
122          */
123         atomic_t cum_rq_in,cum_rq_out;              
124         atomic_t cum_sectors_in,cum_sectors_out;    
125         atomic_t cum_queues_in,cum_queues_out;
126
127         cfqlim_t cfqpriv;       /* data for enforcing limits */
128
129         struct list_head prio_list;
130         int last_rq;
131         int last_sectors;
132
133 };
134
135 /*
136  * per-request queue structure
137  */
138 struct cfq_data {
139         struct list_head rr_list;
140         struct list_head *dispatch;
141         struct hlist_head *cfq_hash;
142         struct hlist_head *crq_hash;
143         mempool_t *crq_pool;
144
145         struct io_prio_data cid[IOPRIO_NR];
146
147         /*
148          * total number of busy queues and requests
149          */
150         int busy_rq;
151         int busy_queues;
152         unsigned long busy_sectors;
153
154
155         request_queue_t *queue;
156         unsigned long rq_starved_mask;
157
158         /*
159          * grace period handling
160          */
161         struct timer_list timer;
162         unsigned long wait_end;
163         unsigned long flags;
164         struct work_struct work;
165
166         /*
167          * tunables
168          */
169         unsigned int cfq_quantum;
170         unsigned int cfq_quantum_io;
171         unsigned int cfq_idle_quantum;
172         unsigned int cfq_idle_quantum_io;
173         unsigned int cfq_queued;
174         unsigned int cfq_grace_rt;
175         unsigned int cfq_grace_idle;
176
177         unsigned int cfq_epoch;
178         unsigned int cfq_hmax_pct;
179         unsigned int cfq_qsectorate;
180 };
181
182 /*
183  * per-class structure
184  */
185 struct cfq_queue {
186         struct list_head cfq_list;
187         struct hlist_node cfq_hash;
188         int hash_key;
189         struct rb_root sort_list;
190         int queued[2];
191         int ioprio;
192
193         /* limit related settings/stats obtained 
194            either from io_prio_data or ckrm I/O class
195         */
196         struct cfqlim *cfqpriv; 
197
198         u64 epstart;            /* current epoch's starting timestamp (ns) */
199         u64 epsector[2];        /* Total sectors dispatched in [0] previous
200                                  * and [1] current epoch
201                                  */
202         
203         unsigned long avsec;            /* avg sectors dispatched/epoch */
204 //      unsigned long long lastime;     /* timestamp of last request served */
205 //      unsigned long sectorate;        /* limit for sectors served/epoch */
206         int skipped;                    /* queue skipped at last dispatch ? */
207
208         /* Per queue timer to suspend/resume queue from processing */
209         struct timer_list timer;
210         unsigned long wait_end;
211         unsigned long flags;
212         struct work_struct work;
213
214         struct cfq_data *cfqd;
215 };
216
217
218
219 /*
220  * Per-request structure
221  */
222 struct cfq_rq {
223         struct cfq_queue *cfq_queue;
224         struct rb_node rb_node;
225         struct hlist_node hash;
226         sector_t rb_key;
227
228         struct request *request;
229         struct list_head prio_list;
230         unsigned long nr_sectors;
231         int ioprio;
232 };
233
234 static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq);
235 static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid);
236 static void cfq_dispatch_sort(struct cfq_data *cfqd, struct cfq_queue *cfqq,
237                               struct cfq_rq *crq);
238
239 /*
240  * lots of deadline iosched dupes, can be abstracted later...
241  */
242 static inline void cfq_del_crq_hash(struct cfq_rq *crq)
243 {
244         hlist_del_init(&crq->hash);
245 }
246
247 static inline void
248 cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq)
249 {
250         cfq_del_crq_hash(crq);
251
252         if (q->last_merge == crq->request)
253                 q->last_merge = NULL;
254 }
255
256 static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq)
257 {
258         struct request *rq = crq->request;
259         const int hash_idx = CFQ_MHASH_FN(rq_hash_key(rq));
260
261         BUG_ON(!hlist_unhashed(&crq->hash));
262  
263         hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]);
264 }
265
266 static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset)
267 {
268         struct hlist_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)];
269         struct hlist_node *entry, *next;
270
271         hlist_for_each_safe(entry, next, hash_list) {
272                 struct cfq_rq *crq = list_entry_hash(entry);
273                 struct request *__rq = crq->request;
274
275                 BUG_ON(hlist_unhashed(&crq->hash));
276
277                 if (!rq_mergeable(__rq)) {
278                         cfq_del_crq_hash(crq);
279                         continue;
280                 }
281
282                 if (rq_hash_key(__rq) == offset)
283                         return __rq;
284         }
285
286         return NULL;
287 }
288
289 /*
290  * rb tree support functions
291  */
292 #define RB_EMPTY(node)          ((node)->rb_node == NULL)
293 #define rb_entry_crq(node)      rb_entry((node), struct cfq_rq, rb_node)
294 #define rq_rb_key(rq)           (rq)->sector
295
296 static void
297 cfq_del_crq_rb(struct cfq_data *cfqd, struct cfq_queue *cfqq,struct cfq_rq *crq)
298 {
299         if (crq->cfq_queue) {
300                 crq->cfq_queue = NULL;
301
302                 if (cfq_account_io(crq)) {
303                         cfqd->busy_rq--;
304                         cfqd->busy_sectors -= crq->nr_sectors;
305                         cfqd->cid[crq->ioprio].busy_rq--;
306                         cfqd->cid[crq->ioprio].busy_sectors -= crq->nr_sectors;
307                 }
308                 atomic_inc(&(cfqd->cid[crq->ioprio].cum_rq_out));
309                 atomic_add(crq->nr_sectors,
310                            &(cfqd->cid[crq->ioprio].cum_sectors_out));
311                 cfqq->queued[rq_data_dir(crq->request)]--;
312                 rb_erase(&crq->rb_node, &cfqq->sort_list);
313         }
314 }
315
316 static struct cfq_rq *
317 __cfq_add_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
318 {
319         struct rb_node **p = &cfqq->sort_list.rb_node;
320         struct rb_node *parent = NULL;
321         struct cfq_rq *__crq;
322
323         while (*p) {
324                 parent = *p;
325                 __crq = rb_entry_crq(parent);
326
327                 if (crq->rb_key < __crq->rb_key)
328                         p = &(*p)->rb_left;
329                 else if (crq->rb_key > __crq->rb_key)
330                         p = &(*p)->rb_right;
331                 else
332                         return __crq;
333         }
334
335         rb_link_node(&crq->rb_node, parent, p);
336         return NULL;
337 }
338
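/*
 * Insert a request into the queue's sort tree.  If a request with the same
 * sort key is already present (an alias), the existing one is moved out to
 * the dispatch list and the insert is retried, so the tree never holds two
 * requests with the same key.
 */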
339 static void
340 cfq_add_crq_rb(struct cfq_data *cfqd, struct cfq_queue *cfqq,struct cfq_rq *crq)
341 {
342         struct request *rq = crq->request;
343         struct cfq_rq *__alias;
344
345
346         cfqq->queued[rq_data_dir(rq)]++;
347         if (cfq_account_io(crq)) {
348                 cfqd->busy_rq++;
349                 cfqd->busy_sectors += crq->nr_sectors;
350                 cfqd->cid[crq->ioprio].busy_rq++;
351                 cfqd->cid[crq->ioprio].busy_sectors += crq->nr_sectors;
352         }
353         atomic_inc(&(cfqd->cid[crq->ioprio].cum_rq_in));
354         atomic_add(crq->nr_sectors,
355                    &(cfqd->cid[crq->ioprio].cum_sectors_in));
356 retry:
357         __alias = __cfq_add_crq_rb(cfqq, crq);
358         if (!__alias) {
359                 rb_insert_color(&crq->rb_node, &cfqq->sort_list);
360                 crq->rb_key = rq_rb_key(rq);
361                 crq->cfq_queue = cfqq;
362                 return;
363         }
364
365         cfq_dispatch_sort(cfqd, cfqq, __alias);
366         goto retry;
367 }
368
369 static struct request *
370 cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector)
371 {
372         struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(current));
373         struct rb_node *n;
374
375         if (!cfqq)
376                 goto out;
377
378         n = cfqq->sort_list.rb_node;
379         while (n) {
380                 struct cfq_rq *crq = rb_entry_crq(n);
381
382                 if (sector < crq->rb_key)
383                         n = n->rb_left;
384                 else if (sector > crq->rb_key)
385                         n = n->rb_right;
386                 else
387                         return crq->request;
388         }
389
390 out:
391         return NULL;
392 }
393
394 static void cfq_remove_request(request_queue_t *q, struct request *rq)
395 {
396         struct cfq_data *cfqd = q->elevator.elevator_data;
397         struct cfq_rq *crq = RQ_ELV_DATA(rq);
398
399         if (crq) {
400
401                 cfq_remove_merge_hints(q, crq);
402                 list_del_init(&crq->prio_list);
403                 list_del_init(&rq->queuelist);
404
405                 /*
406                  * set a grace period timer to allow realtime io to make real
407                  * progress, if we release an rt request. for normal request,
408                  * set timer so idle io doesn't interfere with other io
409                  */
410                 if (crq->ioprio == IOPRIO_RT) {
411                         set_bit(CFQ_WAIT_RT, &cfqd->flags);
412                         cfqd->wait_end = jiffies + cfqd->cfq_grace_rt;
413                 } else if (crq->ioprio != IOPRIO_IDLE) {
414                         set_bit(CFQ_WAIT_NORM, &cfqd->flags);
415                         cfqd->wait_end = jiffies + cfqd->cfq_grace_idle;
416                 }
417
418                 if (crq->cfq_queue) {
419                         struct cfq_queue *cfqq = crq->cfq_queue;
420
421                         cfq_del_crq_rb(cfqd, cfqq, crq);
422
423                         if (RB_EMPTY(&cfqq->sort_list))
424                                 cfq_put_queue(cfqd, cfqq);
425                 }
426         }
427 }
428
429 static int
430 cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
431 {
432         struct cfq_data *cfqd = q->elevator.elevator_data;
433         struct request *__rq;
434         int ret;
435
436         ret = elv_try_last_merge(q, bio);
437         if (ret != ELEVATOR_NO_MERGE) {
438                 __rq = q->last_merge;
439                 goto out_insert;
440         }
441
442         __rq = cfq_find_rq_hash(cfqd, bio->bi_sector);
443         if (__rq) {
444                 BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector);
445
446                 if (elv_rq_merge_ok(__rq, bio)) {
447                         ret = ELEVATOR_BACK_MERGE;
448                         goto out;
449                 }
450         }
451
452         __rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio));
453         if (__rq) {
454                 if (elv_rq_merge_ok(__rq, bio)) {
455                         ret = ELEVATOR_FRONT_MERGE;
456                         goto out;
457                 }
458         }
459
460         return ELEVATOR_NO_MERGE;
461 out:
462         q->last_merge = __rq;
463 out_insert:
464         *req = __rq;
465         return ret;
466 }
467
468 static void cfq_merged_request(request_queue_t *q, struct request *req)
469 {
470         struct cfq_data *cfqd = q->elevator.elevator_data;
471         struct cfq_rq *crq = RQ_ELV_DATA(req);
472         int tmp;
473
474         cfq_del_crq_hash(crq);
475         cfq_add_crq_hash(cfqd, crq);
476
477         if (crq->cfq_queue && (rq_rb_key(req) != crq->rb_key)) {
478                 struct cfq_queue *cfqq = crq->cfq_queue;
479
480                 cfq_del_crq_rb(cfqd, cfqq, crq);
481                 cfq_add_crq_rb(cfqd, cfqq, crq);
482         }
483
484         tmp = req->hard_nr_sectors - crq->nr_sectors;
485         cfqd->busy_sectors += tmp;
486         cfqd->cid[crq->ioprio].busy_sectors += tmp;
487         atomic_add(tmp,&(cfqd->cid[crq->ioprio].cum_sectors_in));
488
489         crq->nr_sectors = req->hard_nr_sectors;
490
491         q->last_merge = req;
492 }
493
494 static void
495 cfq_merged_requests(request_queue_t *q, struct request *req,
496                     struct request *next)
497 {
498         cfq_merged_request(q, req);
499         cfq_remove_request(q, next);
500 }
501
502 /*
503  * sort into dispatch list, in optimal ascending order
504  */
505 static void
506 cfq_dispatch_sort(struct cfq_data *cfqd, struct cfq_queue *cfqq,
507                   struct cfq_rq *crq)
508 {
509         struct list_head *head = cfqd->dispatch, *entry = head;
510         struct request *__rq;
511
512         cfq_del_crq_rb(cfqd, cfqq, crq);
513         cfq_remove_merge_hints(cfqd->queue, crq);
514
515         if (!list_empty(head)) {
516                 __rq = list_entry_rq(head->next);
517
518                 if (crq->request->sector < __rq->sector) {
519                         entry = head->prev;
520                         goto link;
521                 }
522         }
523
524         while ((entry = entry->prev) != head) {
525                 __rq = list_entry_rq(entry);
526
527                 if (crq->request->sector <= __rq->sector)
528                         break;
529         }
530
531 link:
532         list_add_tail(&crq->request->queuelist, entry);
533 }
534
535 struct cfq_queue *dcfqq;
536 u64 dtmp;
537
538
539
540 /* Over how many ns is sectorate defined */
541 #define NS4SCALE  (100000000)
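/*
 * __cfq_check_limit() keeps, per queue, the sectors dispatched in the
 * previous and current epoch (epsector[0]/[1]) and scales that total to a
 * NS4SCALE (100ms) window:
 *
 *      avsec ~= sectors dispatched * NS4SCALE / elapsed ns
 *
 * and compares the result against the queue's sectorate.  As a worked
 * example of the scaling, 2000 sectors over a 250ms span come out to
 * 2000 * 100ms / 250ms = 800 sectors per 100ms, under the default
 * sectorate of 1000, so such a queue would not be skipped.
 */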
542
543 static inline void
544 __cfq_check_limit(struct cfq_data *cfqd, struct cfq_queue *cfqq, int dontskip)
545 {
546         struct cfq_rq *crq;
547         unsigned long long ts, gap, epoch, tmp;
548         unsigned long newavsec, sectorate;
549
550         crq = rb_entry_crq(rb_first(&cfqq->sort_list));
551
552         ts = sched_clock();
553         gap = ts - cfqq->epstart;
554         epoch = cfqd->cfq_epoch;
555
556         sectorate = atomic_read(&cfqq->cfqpriv->sectorate);
557 //      sectorate = atomic_read(&(cfqd->cid[crq->ioprio].sectorate));
558
559         dcfqq = cfqq;
560
561         if ((gap >= epoch) || (gap < 0)) {
562
563                 if (gap >= (epoch << 1)) {
564                         cfqq->epsector[0] = 0;
565                         cfqq->epstart = ts ; 
566                 } else {
567                         cfqq->epsector[0] = cfqq->epsector[1];
568                         cfqq->epstart += epoch;
569                 } 
570                 cfqq->epsector[1] = 0;
571                 gap = ts - cfqq->epstart;
572
573                 tmp  = (cfqq->epsector[0] + crq->nr_sectors) * NS4SCALE;
574                 do_div(tmp,epoch+gap);
575
576                 cfqq->avsec = (unsigned long)tmp;
577                 cfqq->skipped = 0;
578                 cfqq->epsector[1] += crq->nr_sectors;
579                 
580                 cfqq->cfqpriv->navsec = cfqq->avsec;
581                 cfqq->cfqpriv->sec[0] = cfqq->epsector[0];
582                 cfqq->cfqpriv->sec[1] = cfqq->epsector[1];
583                 cfqq->cfqpriv->timedout++;
584                 /*
585                 cfqd->cid[crq->ioprio].navsec = cfqq->avsec;
586                 cfqd->cid[crq->ioprio].sec[0] = cfqq->epsector[0];
587                 cfqd->cid[crq->ioprio].sec[1] = cfqq->epsector[1];
588                 cfqd->cid[crq->ioprio].timedout++;
589                 */
590                 return;
591         } else {
592                 
593                 tmp = (cfqq->epsector[0] + cfqq->epsector[1] + crq->nr_sectors)
594                         * NS4SCALE;
595                 do_div(tmp,epoch+gap);
596
597                 newavsec = (unsigned long)tmp;
598                 if ((newavsec < sectorate) || dontskip) {
599                         cfqq->avsec = newavsec ;
600                         cfqq->skipped = 0;
601                         cfqq->epsector[1] += crq->nr_sectors;
602                         cfqq->cfqpriv->navsec = cfqq->avsec;
603                         cfqq->cfqpriv->sec[1] = cfqq->epsector[1];
604                         /*
605                         cfqd->cid[crq->ioprio].navsec = cfqq->avsec;
606                         cfqd->cid[crq->ioprio].sec[1] = cfqq->epsector[1];
607                         */
608                 } else {
609                         cfqq->skipped = 1;
610                         /* pause q's processing till avsec drops to 
611                            cfq_hmax_pct % of its value */
612                         tmp = (epoch+gap) * (100-cfqd->cfq_hmax_pct);
613                         do_div(tmp,1000000*cfqd->cfq_hmax_pct);
614                         cfqq->wait_end = jiffies+msecs_to_jiffies(tmp);
615                 }
616         }                       
617 }
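/*
 * The pause length chosen above follows from asking how long the queue must
 * sit idle before its average drops to cfq_hmax_pct percent of its current
 * value:
 *
 *      sectors * NS4SCALE / (elapsed + delta) = (hmax_pct/100) * avsec
 *   => delta = elapsed * (100 - hmax_pct) / hmax_pct
 *
 * converted from ns to ms before being handed to msecs_to_jiffies().  With
 * the default cfq_hmax_pct of 80 that is a quarter of the elapsed window.
 */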
618
619 /*
620  * remove from io scheduler core and put on dispatch list for service
621  */
622 static inline int
623 __cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd,
624                         struct cfq_queue *cfqq)
625 {
626         struct cfq_rq *crq;
627
628         crq = rb_entry_crq(rb_first(&cfqq->sort_list));
629
630         cfq_dispatch_sort(cfqd, cfqq, crq);
631
632         /*
633          * technically, for IOPRIO_RT we don't need to add it to the list.
634          */
635         list_add_tail(&crq->prio_list, &cfqd->cid[cfqq->ioprio].prio_list);
636         return crq->nr_sectors;
637 }
638
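/*
 * Dispatch up to max_rq requests / max_sectors sectors from one priority
 * level's round-robin list.  Roughly: the first pass honours each queue's
 * sectorate; queues over their rate are marked skipped, pulled off the
 * rr_list and re-armed by their pause timer.  Later passes start with the
 * limit check disabled (dontskip) so progress is still made when every
 * remaining queue is over its rate, and the loop repeats until the quanta
 * are used up or nothing is left to serve.
 */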
639 static int
640 cfq_dispatch_requests(request_queue_t *q, int prio, int max_rq, int max_sectors)
641 {
642         struct cfq_data *cfqd = q->elevator.elevator_data;
643         struct list_head *plist = &cfqd->cid[prio].rr_list;
644         struct cfq_queue *cfqq;
645         struct list_head *entry, *nxt;
646         int q_rq, q_io;
647         int first_round,busy_queues,busy_unlimited;
648
649
650         /*
651          * for each queue at this prio level, dispatch a request
652          */
653         q_rq = q_io = 0;
654         first_round=1;
655  restart:
656         busy_unlimited = 0;
657         busy_queues = 0;
658         list_for_each_safe(entry, nxt, plist) {
659                 cfqq = list_entry_cfqq(entry);
660
661                 BUG_ON(RB_EMPTY(&cfqq->sort_list));
662                 busy_queues++;
663
664                 
665                 if (first_round || busy_unlimited)
666                         __cfq_check_limit(cfqd,cfqq,0);
667                 else
668                         __cfq_check_limit(cfqd,cfqq,1);
669
670                 if (cfqq->skipped) {
671                         cfqq->cfqpriv->nskip++;
672                         /* cfqd->cid[prio].nskip++; */
673                         busy_queues--;
674                         if (time_before(jiffies, cfqq->wait_end)) {
675                                 list_del(&cfqq->cfq_list);
676                                 mod_timer(&cfqq->timer,cfqq->wait_end);
677                         }
678                         continue;
679                 }
680                 busy_unlimited++;
681
682                 q_io += __cfq_dispatch_requests(q, cfqd, cfqq);
683                 q_rq++;
684
685                 if (RB_EMPTY(&cfqq->sort_list)) {
686                         busy_unlimited--;
687                         busy_queues--;
688                         cfq_put_queue(cfqd, cfqq);
689                 } 
690
691                 if (q_io >= max_sectors || q_rq >= max_rq) {
692 #if 0
693                         struct list_head *prv = nxt->prev;
694
695                         if (prv != plist) {
696                                 list_del(plist);
697                                 list_add(plist, prv);
698                         }
699 #endif
700                         break;
701                 }
702         }
703
704         if ((q_io < max_sectors) && (q_rq < max_rq) && 
705             (busy_queues || first_round))
706         {
707                 first_round = 0;
708                 goto restart;
709         } else {
710                 /*
711                  * if we hit the queue limit, put the string of serviced
712                  * queues at the back of the pending list
713                  */
714                 struct list_head *prv = nxt->prev;
715                 if (prv != plist) {
716                         list_del(plist);
717                         list_add(plist, prv);
718                 }
719         }
720
721         cfqd->cid[prio].last_rq = q_rq;
722         cfqd->cid[prio].last_sectors = q_io;
723         return q_rq;
724 }
725
726 /*
727  * try to move some requests to the dispatch list. returns 1 if any were moved, 0 otherwise
728  */
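/*
 * Each normal level i is offered a share of the quanta scaled by
 * (i + 1) / IOPRIO_NR, then averaged with what it dispatched last round.
 * Assuming IOPRIO_NR is 21 (the p0..p20 sysfs entries below suggest as
 * much), a level-10 class with the default cfq_quantum of 6 and no
 * competing io would get at most 6 * 11 / 21 = 3 requests per round before
 * the averaging with last_rq kicks in.
 */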
729 static int cfq_select_requests(request_queue_t *q, struct cfq_data *cfqd)
730 {
731         int queued, busy_rq, busy_sectors, i;
732
733         /*
734          * if there's any realtime io, only schedule that
735          */
736         if (cfq_dispatch_requests(q, IOPRIO_RT, cfqd->cfq_quantum, cfqd->cfq_quantum_io))
737                 return 1;
738
739         /*
740          * if RT io was last serviced and grace time hasn't expired,
741          * arm the timer to restart queueing if no other RT io has been
742          * submitted in the mean time
743          */
744         if (test_bit(CFQ_WAIT_RT, &cfqd->flags)) {
745                 if (time_before(jiffies, cfqd->wait_end)) {
746                         mod_timer(&cfqd->timer, cfqd->wait_end);
747                         return 0;
748                 }
749                 clear_bit(CFQ_WAIT_RT, &cfqd->flags);
750         }
751
752         /*
753          * for each priority level, calculate number of requests we
754          * are allowed to put into service.
755          */
756         queued = 0;
757         busy_rq = cfqd->busy_rq;
758         busy_sectors = cfqd->busy_sectors;
759         for (i = IOPRIO_RT - 1; i > IOPRIO_IDLE; i--) {
760                 const int o_rq = busy_rq - cfqd->cid[i].busy_rq;
761                 const int o_sectors = busy_sectors - cfqd->cid[i].busy_sectors;
762                 int q_rq = cfqd->cfq_quantum * (i + 1) / IOPRIO_NR;
763                 int q_io = cfqd->cfq_quantum_io * (i + 1) / IOPRIO_NR;
764
765                 /*
766                  * no need to keep iterating the list, if there are no
767                  * requests pending anymore
768                  */
769                 if (!cfqd->busy_rq)
770                         break;
771
772                 /*
773                  * find out how many requests and sectors we are allowed to
774                  * service
775                  */
776                 if (o_rq)
777                         q_rq = o_sectors * (i + 1) / IOPRIO_NR;
778                 if (q_rq > cfqd->cfq_quantum)
779                         q_rq = cfqd->cfq_quantum;
780
781                 if (o_sectors)
782                         q_io = o_sectors * (i + 1) / IOPRIO_NR;
783                 if (q_io > cfqd->cfq_quantum_io)
784                         q_io = cfqd->cfq_quantum_io;
785
786                 /*
787                  * average with last dispatched for fairness
788                  */
789                 if (cfqd->cid[i].last_rq != -1)
790                         q_rq = (cfqd->cid[i].last_rq + q_rq) / 2;
791                 if (cfqd->cid[i].last_sectors != -1)
792                         q_io = (cfqd->cid[i].last_sectors + q_io) / 2;
793
794                 queued += cfq_dispatch_requests(q, i, q_rq, q_io);
795         }
796
797         if (queued)
798                 return 1;
799
800         /*
801          * only allow dispatch of idle io, if the queue has been idle from
802          * servicing RT or normal io for the grace period
803          */
804         if (test_bit(CFQ_WAIT_NORM, &cfqd->flags)) {
805                 if (time_before(jiffies, cfqd->wait_end)) {
806                         mod_timer(&cfqd->timer, cfqd->wait_end);
807                         return 0;
808                 }
809                 clear_bit(CFQ_WAIT_NORM, &cfqd->flags);
810         }
811
812         /*
813          * if we found nothing to do, allow idle io to be serviced
814          */
815         if (cfq_dispatch_requests(q, IOPRIO_IDLE, cfqd->cfq_idle_quantum, cfqd->cfq_idle_quantum_io))
816                 return 1;
817
818         return 0;
819 }
820
821 static struct request *cfq_next_request(request_queue_t *q)
822 {
823         struct cfq_data *cfqd = q->elevator.elevator_data;
824         struct request *rq;
825
826         if (!list_empty(cfqd->dispatch)) {
827                 struct cfq_rq *crq;
828 dispatch:
829                 /*
830                  * end grace period, we are servicing a request
831                  */
832                 del_timer(&cfqd->timer);
833                 clear_bit(CFQ_WAIT_RT, &cfqd->flags);
834                 clear_bit(CFQ_WAIT_NORM, &cfqd->flags);
835
836                 BUG_ON(list_empty(cfqd->dispatch));
837                 rq = list_entry_rq(cfqd->dispatch->next);
838
839                 BUG_ON(q->last_merge == rq);
840                 crq = RQ_ELV_DATA(rq);
841                 if (crq) {
842                         BUG_ON(!hlist_unhashed(&crq->hash));
843                         list_del_init(&crq->prio_list);
844                 }
845
846                 return rq;
847         }
848
849         /*
850  * we moved requests to the dispatch list, go back and serve one
851          */
852         if (cfq_select_requests(q, cfqd))
853                 goto dispatch;
854
855         return NULL;
856 }
857
858 static inline struct cfq_queue *
859 __cfq_find_cfq_hash(struct cfq_data *cfqd, int hashkey, const int hashval)
860 {
861         struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
862         struct hlist_node *entry;
863
864         hlist_for_each(entry, hash_list) {
865                 struct cfq_queue *__cfqq = list_entry_qhash(entry);
866
867                 if (__cfqq->hash_key == hashkey)
868                         return __cfqq;
869         }
870
871         return NULL;
872 }
873
874
875 static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int hashkey)
876 {
877         const int hashval = hash_long(hashkey, CFQ_QHASH_SHIFT);
878
879         return __cfq_find_cfq_hash(cfqd, hashkey, hashval);
880 }
881
882 static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
883 {
884         cfqd->busy_queues--;
885         WARN_ON(cfqd->busy_queues < 0);
886
887         cfqd->cid[cfqq->ioprio].busy_queues--;
888         WARN_ON(cfqd->cid[cfqq->ioprio].busy_queues < 0);
889         atomic_inc(&(cfqd->cid[cfqq->ioprio].cum_queues_out));
890
891         list_del(&cfqq->cfq_list);
892         hlist_del(&cfqq->cfq_hash);
893         mempool_free(cfqq, cfq_mpool);
894 }
895
896 static void cfq_pauseq_timer(unsigned long data)
897 {
898         struct cfq_queue *cfqq = (struct cfq_queue *) data;
899         kblockd_schedule_work(&cfqq->work);
900 }
901
902 static void cfq_pauseq_work(void *data)
903 {
904         struct cfq_queue *cfqq = (struct cfq_queue *) data;
905         struct cfq_data *cfqd = cfqq->cfqd;
906         request_queue_t *q = cfqd->queue;
907         unsigned long flags;
908         
909         spin_lock_irqsave(q->queue_lock, flags);
910         list_add_tail(&cfqq->cfq_list,&cfqd->cid[cfqq->ioprio].rr_list);
911         cfqq->skipped = 0;
912         if (cfq_next_request(q))
913                 q->request_fn(q);
914         spin_unlock_irqrestore(q->queue_lock, flags);
915
916         //del_timer(&cfqq->timer);
917 }       
918
919 static struct cfq_queue *__cfq_get_queue(struct cfq_data *cfqd, int hashkey,
920                                          int gfp_mask)
921 {
922         const int hashval = hash_long(hashkey, CFQ_QHASH_SHIFT);
923         struct cfq_queue *cfqq, *new_cfqq = NULL;
924         request_queue_t *q = cfqd->queue;
925
926 retry:
927         cfqq = __cfq_find_cfq_hash(cfqd, hashkey, hashval);
928
929         if (!cfqq) {
930                 if (new_cfqq) {
931                         cfqq = new_cfqq;
932                         new_cfqq = NULL;
933                 } else if (gfp_mask & __GFP_WAIT) {
934                         spin_unlock_irq(q->queue_lock);
935                         new_cfqq = mempool_alloc(cfq_mpool, gfp_mask);
936                         spin_lock_irq(q->queue_lock);
937                         goto retry;
938                 } else
939                         return NULL;
940
941                 memset(cfqq, 0, sizeof(*cfqq));
942                 INIT_HLIST_NODE(&cfqq->cfq_hash);
943                 INIT_LIST_HEAD(&cfqq->cfq_list);
944                 cfqq->hash_key = cfq_hash_key(current);
945                 cfqq->ioprio = cfq_ioprio(current);
946                 
947                 cfqq->cfqpriv = cfq_cfqpriv(cfqd,current);
948                 if (!cfqq->cfqpriv)
949                         cfqq->cfqpriv = &((cfqd->cid[cfqq->ioprio]).cfqpriv);
950
951                 cfqq->epstart = sched_clock();
952                 /* epsector, avsec, skipped initialized to zero by memset */
953                 
954                 init_timer(&cfqq->timer);
955                 cfqq->timer.function = cfq_pauseq_timer;
956                 cfqq->timer.data = (unsigned long) cfqq;
957
958                 INIT_WORK(&cfqq->work, cfq_pauseq_work, cfqq); 
959
960                 cfqq->cfqd = cfqd ;
961
962                 hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
963         }
964
965         if (new_cfqq)
966                 mempool_free(new_cfqq, cfq_mpool);
967
968         return cfqq;
969 }
970
971 static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, int hashkey,
972                                        int gfp_mask)
973 {
974         request_queue_t *q = cfqd->queue;
975         struct cfq_queue *cfqq;
976
977         spin_lock_irq(q->queue_lock);
978         cfqq = __cfq_get_queue(cfqd, hashkey, gfp_mask);
979         spin_unlock_irq(q->queue_lock);
980
981         return cfqq;
982 }
983
984 static void
985 __cfq_enqueue(request_queue_t *q, struct cfq_data *cfqd, struct cfq_rq *crq)
986 {
987         const int prio = crq->ioprio;
988         struct cfq_queue *cfqq;
989
990         cfqq = __cfq_get_queue(cfqd, cfq_hash_key(current), GFP_ATOMIC);
991         if (cfqq) {
992
993                 /*
994                  * not too good...
995                  */
996                 if (prio > cfqq->ioprio) {
997                         printk("prio hash collision %d %d\n", 
998                                prio, cfqq->ioprio);
999                         if (!list_empty(&cfqq->cfq_list)) {
1000                                 cfqd->cid[cfqq->ioprio].busy_queues--;
1001                                 WARN_ON(cfqd->cid[cfqq->ioprio].busy_queues<0);
1002                                 atomic_inc(&(cfqd->cid[cfqq->ioprio].cum_queues_out));
1003                                 cfqd->cid[prio].busy_queues++;
1004                                 atomic_inc(&(cfqd->cid[prio].cum_queues_in));
1005                                 list_move_tail(&cfqq->cfq_list, 
1006                                                &cfqd->cid[prio].rr_list);
1007                         }
1008                         cfqq->ioprio = prio;
1009                 }
1010
1011                 cfq_add_crq_rb(cfqd, cfqq, crq);
1012
1013                 if (list_empty(&cfqq->cfq_list)) {
1014                         list_add_tail(&cfqq->cfq_list, 
1015                                       &cfqd->cid[prio].rr_list);
1016                         cfqd->cid[prio].busy_queues++;
1017                         atomic_inc(&(cfqd->cid[prio].cum_queues_in));
1018                         cfqd->busy_queues++;
1019                 }
1020
1021                 if (rq_mergeable(crq->request)) {
1022                         cfq_add_crq_hash(cfqd, crq);
1023                         
1024                         if (!q->last_merge)
1025                                 q->last_merge = crq->request;
1026                 }
1027
1028         } else {
1029                 /*
1030                  * this can only happen if the request wasn't allocated
1031                  * through blk_alloc_request(), eg stack requests from ide-cd
1032                  * (those should be removed) _and_ we are in OOM.
1033                  */
1034                 list_add_tail(&crq->request->queuelist, cfqd->dispatch);
1035         }
1036 }
1037
1038 static void cfq_reenqueue(request_queue_t *q, struct cfq_data *cfqd, int prio)
1039 {
1040         struct list_head *prio_list = &cfqd->cid[prio].prio_list;
1041         struct list_head *entry, *tmp;
1042
1043         list_for_each_safe(entry, tmp, prio_list) {
1044                 struct cfq_rq *crq = list_entry_prio(entry);
1045
1046                 list_del_init(entry);
1047                 list_del_init(&crq->request->queuelist);
1048                 __cfq_enqueue(q, cfqd, crq);
1049         }
1050 }
1051
1052 static void
1053 cfq_enqueue(request_queue_t *q, struct cfq_data *cfqd, struct cfq_rq *crq)
1054 {
1055         const int prio = cfq_ioprio(current);
1056
1057         crq->ioprio = prio;
1058         crq->nr_sectors = crq->request->hard_nr_sectors;
1059         __cfq_enqueue(q, cfqd, crq);
1060
1061         if (prio == IOPRIO_RT) {
1062                 int i;
1063
1064                 /*
1065                  * realtime io gets priority, move all other io back
1066                  */
1067                 for (i = IOPRIO_IDLE; i < IOPRIO_RT; i++)
1068                         cfq_reenqueue(q, cfqd, i);
1069         } else if (prio != IOPRIO_IDLE) {
1070                 /*
1071                  * check if we need to move idle io back into queue
1072                  */
1073                 cfq_reenqueue(q, cfqd, IOPRIO_IDLE);
1074         }
1075 }
1076
1077 static void
1078 cfq_insert_request(request_queue_t *q, struct request *rq, int where)
1079 {
1080         struct cfq_data *cfqd = q->elevator.elevator_data;
1081         struct cfq_rq *crq = RQ_ELV_DATA(rq);
1082
1083         switch (where) {
1084                 case ELEVATOR_INSERT_BACK:
1085 #if 0
1086                         while (cfq_dispatch_requests(q, cfqd))
1087                                 ;
1088 #endif
1089                         list_add_tail(&rq->queuelist, cfqd->dispatch);
1090                         break;
1091                 case ELEVATOR_INSERT_FRONT:
1092                         list_add(&rq->queuelist, cfqd->dispatch);
1093                         break;
1094                 case ELEVATOR_INSERT_SORT:
1095                         BUG_ON(!blk_fs_request(rq));
1096                         cfq_enqueue(q, cfqd, crq);
1097                         break;
1098                 default:
1099                         printk("%s: bad insert point %d\n", 
1100                                __FUNCTION__,where);
1101                         return;
1102         }
1103 }
1104
1105 static int cfq_queue_empty(request_queue_t *q)
1106 {
1107         struct cfq_data *cfqd = q->elevator.elevator_data;
1108
1109         if (list_empty(cfqd->dispatch) && !cfqd->busy_queues)
1110                 return 1;
1111
1112         return 0;
1113 }
1114
1115 static struct request *
1116 cfq_former_request(request_queue_t *q, struct request *rq)
1117 {
1118         struct cfq_rq *crq = RQ_ELV_DATA(rq);
1119         struct rb_node *rbprev = rb_prev(&crq->rb_node);
1120
1121         if (rbprev)
1122                 return rb_entry_crq(rbprev)->request;
1123
1124         return NULL;
1125 }
1126
1127 static struct request *
1128 cfq_latter_request(request_queue_t *q, struct request *rq)
1129 {
1130         struct cfq_rq *crq = RQ_ELV_DATA(rq);
1131         struct rb_node *rbnext = rb_next(&crq->rb_node);
1132
1133         if (rbnext)
1134                 return rb_entry_crq(rbnext)->request;
1135
1136         return NULL;
1137 }
1138
1139 static void cfq_queue_congested(request_queue_t *q)
1140 {
1141         struct cfq_data *cfqd = q->elevator.elevator_data;
1142
1143         set_bit(cfq_ioprio(current), &cfqd->rq_starved_mask);
1144 }
1145
1146 static int cfq_may_queue(request_queue_t *q, int rw)
1147 {
1148         struct cfq_data *cfqd = q->elevator.elevator_data;
1149         struct cfq_queue *cfqq;
1150         const int prio = cfq_ioprio(current);
1151         int limit, ret = 1;
1152
1153         if (!cfqd->busy_queues)
1154                 goto out;
1155
1156         cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(current));
1157         if (!cfqq)
1158                 goto out;
1159
1164         /*
1165          * if higher or equal prio io is sleeping waiting for a request, don't
1166          * allow this one to allocate one. as long as ll_rw_blk does fifo
1167          * waitqueue wakeups this should work...
1168          */
1169         if (cfqd->rq_starved_mask & ~((1 << prio) - 1))
1170                 goto out;
1171
1172         if (cfqq->queued[rw] < cfqd->cfq_queued || !cfqd->cid[prio].busy_queues)
1173                 goto out;
1174
1175         limit = q->nr_requests * (prio + 1) / IOPRIO_NR;
1176         limit /= cfqd->cid[prio].busy_queues;
1177         if (cfqq->queued[rw] > limit)
1178                 ret = 0;
1179 out:
1180         return ret;
1181 }
1182
1183 static void cfq_put_request(request_queue_t *q, struct request *rq)
1184 {
1185         struct cfq_data *cfqd = q->elevator.elevator_data;
1186         struct cfq_rq *crq = RQ_ELV_DATA(rq);
1187         struct request_list *rl;
1188         int other_rw;
1189
1190         if (crq) {
1191                 BUG_ON(q->last_merge == rq);
1192                 BUG_ON(!hlist_unhashed(&crq->hash));
1193
1194                 mempool_free(crq, cfqd->crq_pool);
1195                 rq->elevator_private = NULL;
1196         }
1197
1198         /*
1199          * work-around for may_queue "bug": if a read gets issued and refused
1200          * to queue because writes ate all the allowed slots and no other
1201          * reads are pending for this queue, it could get stuck infinitely
1202          * since freed_request() only checks the waitqueue for writes when
1203          * freeing them. or vice versa for a single write vs many reads.
1204          * so check here whether "the other" data direction might be able
1205          * to queue and wake them
1206          */
1207         rl = &q->rq;
1208         other_rw = rq_data_dir(rq) ^ 1;
1209         if (rl->count[other_rw] <= q->nr_requests) {
1210                 smp_mb();
1211                 if (waitqueue_active(&rl->wait[other_rw]))
1212                         wake_up(&rl->wait[other_rw]);
1213         }
1214 }
1215
1216 static int cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
1217 {
1218         struct cfq_data *cfqd = q->elevator.elevator_data;
1219         struct cfq_queue *cfqq;
1220         struct cfq_rq *crq;
1221
1222         /*
1223          * prepare a queue up front, so cfq_enqueue() doesn't have to
1224          */
1225         cfqq = cfq_get_queue(cfqd, cfq_hash_key(current), gfp_mask);
1226         if (!cfqq)
1227                 return 1;
1228
1229         crq = mempool_alloc(cfqd->crq_pool, gfp_mask);
1230         if (crq) {
1231                 /*
1232                  * process now has one request
1233                  */
1234                 clear_bit(cfq_ioprio(current), &cfqd->rq_starved_mask);
1235
1236                 memset(crq, 0, sizeof(*crq));
1237                 crq->request = rq;
1238                 INIT_HLIST_NODE(&crq->hash);
1239                 INIT_LIST_HEAD(&crq->prio_list);
1240                 rq->elevator_private = crq;
1241                 return 0;
1242         }
1243
1244         return 1;
1245 }
1246
1247 static void cfq_exit(request_queue_t *q, elevator_t *e)
1248 {
1249         struct cfq_data *cfqd = e->elevator_data;
1250
1251         e->elevator_data = NULL;
1252         mempool_destroy(cfqd->crq_pool);
1253         kfree(cfqd->crq_hash);
1254         kfree(cfqd->cfq_hash);
1255         kfree(cfqd);
1256 }
1257
1258         
1259
1260 static void cfq_timer(unsigned long data)
1261 {
1262         struct cfq_data *cfqd = (struct cfq_data *) data;
1263
1264         clear_bit(CFQ_WAIT_RT, &cfqd->flags);
1265         clear_bit(CFQ_WAIT_NORM, &cfqd->flags);
1266         kblockd_schedule_work(&cfqd->work);
1267 }
1268
1269 static void cfq_work(void *data)
1270 {
1271         request_queue_t *q = data;
1272         unsigned long flags;
1273
1274         spin_lock_irqsave(q->queue_lock, flags);
1275         if (cfq_next_request(q))
1276                 q->request_fn(q);
1277         spin_unlock_irqrestore(q->queue_lock, flags);
1278 }
1279
1280 static int cfq_init(request_queue_t *q, elevator_t *e)
1281 {
1282         struct cfq_data *cfqd;
1283         int i;
1284
1285         cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL);
1286         if (!cfqd)
1287                 return -ENOMEM;
1288
1289         memset(cfqd, 0, sizeof(*cfqd));
1290         init_timer(&cfqd->timer);
1291         cfqd->timer.function = cfq_timer;
1292         cfqd->timer.data = (unsigned long) cfqd;
1293
1294         INIT_WORK(&cfqd->work, cfq_work, q);
1295
1296         for (i = 0; i < IOPRIO_NR; i++) {
1297                 struct io_prio_data *cid = &cfqd->cid[i];
1298
1299                 INIT_LIST_HEAD(&cid->rr_list);
1300                 INIT_LIST_HEAD(&cid->prio_list);
1301                 cid->last_rq = -1;
1302                 cid->last_sectors = -1;
1303
1304                 atomic_set(&cid->cum_rq_in,0);          
1305                 atomic_set(&cid->cum_rq_out,0);
1306                 atomic_set(&cid->cum_sectors_in,0);
1307                 atomic_set(&cid->cum_sectors_out,0);            
1308                 atomic_set(&cid->cum_queues_in,0);
1309                 atomic_set(&cid->cum_queues_out,0);
1310
1311                 
1312                 atomic_set(&((cid->cfqpriv).sectorate),CFQ_SECTORATE);
1313                 (cid->cfqpriv).nskip = 0;
1314                 (cid->cfqpriv).navsec = 0;
1315                 (cid->cfqpriv).timedout = 0;
1316         }
1317
1318         cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES,
1319                                  GFP_KERNEL);
1320         if (!cfqd->crq_hash)
1321                 goto out_crqhash;
1322
1323         cfqd->cfq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES,
1324                                  GFP_KERNEL);
1325         if (!cfqd->cfq_hash)
1326                 goto out_cfqhash;
1327
1328         cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, 
1329                                         mempool_free_slab, crq_pool);
1330         if (!cfqd->crq_pool)
1331                 goto out_crqpool;
1332
1333         for (i = 0; i < CFQ_MHASH_ENTRIES; i++)
1334                 INIT_HLIST_HEAD(&cfqd->crq_hash[i]);
1335         for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
1336                 INIT_HLIST_HEAD(&cfqd->cfq_hash[i]);
1337
1338         cfqd->cfq_queued = cfq_queued;
1339         cfqd->cfq_quantum = cfq_quantum;
1340         cfqd->cfq_quantum_io = cfq_quantum_io;
1341         cfqd->cfq_idle_quantum = cfq_idle_quantum;
1342         cfqd->cfq_idle_quantum_io = cfq_idle_quantum_io;
1343         cfqd->cfq_grace_rt = cfq_grace_rt;
1344         cfqd->cfq_grace_idle = cfq_grace_idle;
1345         
1346         cfqd->cfq_epoch = CFQ_EPOCH;
1347         cfqd->cfq_hmax_pct = CFQ_HMAX_PCT;
1348
1349         q->nr_requests <<= 2;
1350
1351         cfqd->dispatch = &q->queue_head;
1352         e->elevator_data = cfqd;
1353         cfqd->queue = q;
1354
1355         return 0;
1356 out_crqpool:
1357         kfree(cfqd->cfq_hash);
1358 out_cfqhash:
1359         kfree(cfqd->crq_hash);
1360 out_crqhash:
1361         kfree(cfqd);
1362         return -ENOMEM;
1363 }
1364
1365 static int __init cfq_slab_setup(void)
1366 {
1367         crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0,
1368                                         NULL, NULL);
1369
1370         if (!crq_pool)
1371                 panic("cfq_iosched: can't init crq pool\n");
1372
1373         cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0,
1374                                         NULL, NULL);
1375
1376         if (!cfq_pool)
1377                 panic("cfq_iosched: can't init cfq pool\n");
1378
1379         cfq_mpool = mempool_create(64, mempool_alloc_slab, mempool_free_slab, cfq_pool);
1380
1381         if (!cfq_mpool)
1382                 panic("cfq_iosched: can't init cfq mpool\n");
1383
1384         return 0;
1385 }
1386
1387 subsys_initcall(cfq_slab_setup);
1388
1389 /*
1390  * sysfs parts below -->
1391  */
1392 struct cfq_fs_entry {
1393         struct attribute attr;
1394         ssize_t (*show)(struct cfq_data *, char *);
1395         ssize_t (*store)(struct cfq_data *, const char *, size_t);
1396 };
1397
1398 static ssize_t
1399 cfq_var_show(unsigned int var, char *page)
1400 {
1401         return sprintf(page, "%d\n", var);
1402 }
1403
1404 static ssize_t
1405 cfq_var_store(unsigned int *var, const char *page, size_t count)
1406 {
1407         char *p = (char *) page;
1408
1409         *var = simple_strtoul(p, &p, 10);
1410         return count;
1411 }
1412
1413 #define SHOW_FUNCTION(__FUNC, __VAR)                                    \
1414 static ssize_t __FUNC(struct cfq_data *cfqd, char *page)                \
1415 {                                                                       \
1416         return cfq_var_show(__VAR, (page));                             \
1417 }
1418 SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum);
1419 SHOW_FUNCTION(cfq_quantum_io_show, cfqd->cfq_quantum_io);
1420 SHOW_FUNCTION(cfq_idle_quantum_show, cfqd->cfq_idle_quantum);
1421 SHOW_FUNCTION(cfq_idle_quantum_io_show, cfqd->cfq_idle_quantum_io);
1422 SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued);
1423 SHOW_FUNCTION(cfq_grace_rt_show, cfqd->cfq_grace_rt);
1424 SHOW_FUNCTION(cfq_grace_idle_show, cfqd->cfq_grace_idle);
1425 SHOW_FUNCTION(cfq_epoch_show, cfqd->cfq_epoch);
1426 SHOW_FUNCTION(cfq_hmax_pct_show, cfqd->cfq_hmax_pct);
1427 #undef SHOW_FUNCTION
1428
1429 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)                         \
1430 static ssize_t __FUNC(struct cfq_data *cfqd, const char *page, size_t count)    \
1431 {                                                                       \
1432         int ret = cfq_var_store(__PTR, (page), count);                  \
1433         if (*(__PTR) < (MIN))                                           \
1434                 *(__PTR) = (MIN);                                       \
1435         else if (*(__PTR) > (MAX))                                      \
1436                 *(__PTR) = (MAX);                                       \
1437         return ret;                                                     \
1438 }
1439 STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, INT_MAX);
1440 STORE_FUNCTION(cfq_quantum_io_store, &cfqd->cfq_quantum_io, 4, INT_MAX);
1441 STORE_FUNCTION(cfq_idle_quantum_store, &cfqd->cfq_idle_quantum, 1, INT_MAX);
1442 STORE_FUNCTION(cfq_idle_quantum_io_store, &cfqd->cfq_idle_quantum_io, 4, INT_MAX);
1443 STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, INT_MAX);
1444 STORE_FUNCTION(cfq_grace_rt_store, &cfqd->cfq_grace_rt, 0, INT_MAX);
1445 STORE_FUNCTION(cfq_grace_idle_store, &cfqd->cfq_grace_idle, 0, INT_MAX);
1446 STORE_FUNCTION(cfq_epoch_store, &cfqd->cfq_epoch, 0, INT_MAX);
1447 STORE_FUNCTION(cfq_hmax_pct_store, &cfqd->cfq_hmax_pct, 1, 100);
1448 #undef STORE_FUNCTION
1449
1450
1451 /* Additional entries to get priority level data */
1452 static ssize_t
1453 cfq_prio_show(struct cfq_data *cfqd, char *page, unsigned int priolvl)
1454 {
1455     //int r1,r2,s1,s2,q1,q2;
1456
1457         if (!(priolvl >= IOPRIO_IDLE && priolvl <= IOPRIO_RT)) 
1458                 return 0;
1459         
1460         /*
1461         r1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_rq_in));
1462         r2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_rq_out));
1463         s1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_sectors_in));
1464         s2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_sectors_out));
1465         q1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_queues_in)); 
1466         q2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_queues_out));
1467         */
1468
1469         return sprintf(page,"skip %d timdout %d avsec %lu rate %ld "
1470                        " sec0 %lu sec1 %lu\n",
1471                        cfqd->cid[priolvl].cfqpriv.nskip,
1472                        cfqd->cid[priolvl].cfqpriv.timedout,
1473                        cfqd->cid[priolvl].cfqpriv.navsec,
1474                        atomic_read(&(cfqd->cid[priolvl].cfqpriv.sectorate)),
1475                        (unsigned long)cfqd->cid[priolvl].cfqpriv.sec[0],
1476                        (unsigned long)cfqd->cid[priolvl].cfqpriv.sec[1]);
1477
1478 }
1479
1480 #define SHOW_PRIO_DATA(__PRIOLVL)                                               \
1481 static ssize_t cfq_prio_##__PRIOLVL##_show(struct cfq_data *cfqd, char *page)   \
1482 {                                                                               \
1483         return cfq_prio_show(cfqd,page,__PRIOLVL);                              \
1484 }
1485 SHOW_PRIO_DATA(0);
1486 SHOW_PRIO_DATA(1);
1487 SHOW_PRIO_DATA(2);
1488 SHOW_PRIO_DATA(3);
1489 SHOW_PRIO_DATA(4);
1490 SHOW_PRIO_DATA(5);
1491 SHOW_PRIO_DATA(6);
1492 SHOW_PRIO_DATA(7);
1493 SHOW_PRIO_DATA(8);
1494 SHOW_PRIO_DATA(9);
1495 SHOW_PRIO_DATA(10);
1496 SHOW_PRIO_DATA(11);
1497 SHOW_PRIO_DATA(12);
1498 SHOW_PRIO_DATA(13);
1499 SHOW_PRIO_DATA(14);
1500 SHOW_PRIO_DATA(15);
1501 SHOW_PRIO_DATA(16);
1502 SHOW_PRIO_DATA(17);
1503 SHOW_PRIO_DATA(18);
1504 SHOW_PRIO_DATA(19);
1505 SHOW_PRIO_DATA(20);
1506 #undef SHOW_PRIO_DATA
1507
1508
1509 static ssize_t cfq_prio_store(struct cfq_data *cfqd, const char *page, size_t count, int priolvl)
1510 {       
1511
1512         char *p = (char *) page;
1513         int val;
1514
1515         val = (int) simple_strtoul(p, &p, 10);
1516
1517         atomic_set(&(cfqd->cid[priolvl].cfqpriv.sectorate),val);
1518         cfqd->cid[priolvl].cfqpriv.nskip = 0;
1519         cfqd->cid[priolvl].cfqpriv.navsec = 0;
1520         cfqd->cid[priolvl].cfqpriv.timedout = 0;
1521
1522 #if 0
1523         atomic_set(&(cfqd->cid[priolvl].cum_rq_in),0);
1524         atomic_set(&(cfqd->cid[priolvl].cum_rq_out),0);
1525         atomic_set(&(cfqd->cid[priolvl].cum_sectors_in),0);
1526         atomic_set(&(cfqd->cid[priolvl].cum_sectors_out),0);
1527         atomic_set(&(cfqd->cid[priolvl].cum_queues_in),0);
1528         atomic_set(&(cfqd->cid[priolvl].cum_queues_out),0);
1529 #endif
1530
1531         return count;
1532 }
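/*
 * Writing an integer to one of the p<N> sysfs attributes (for instance
 * "echo 2000 > /sys/block/<dev>/queue/iosched/p10", assuming the usual
 * elevator sysfs location) sets that priority level's sectorate and clears
 * its nskip/navsec/timedout statistics; reading the attribute dumps those
 * counters via cfq_prio_show().
 */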
1533
1534
1535 #define STORE_PRIO_DATA(__PRIOLVL)                                                                 \
1536 static ssize_t cfq_prio_##__PRIOLVL##_store(struct cfq_data *cfqd, const char *page, size_t count) \
1537 {                                                                                                  \
1538         return cfq_prio_store(cfqd,page,count,__PRIOLVL);                                          \
1539 }                  
1540 STORE_PRIO_DATA(0);     
1541 STORE_PRIO_DATA(1);
1542 STORE_PRIO_DATA(2);
1543 STORE_PRIO_DATA(3);
1544 STORE_PRIO_DATA(4);
1545 STORE_PRIO_DATA(5);
1546 STORE_PRIO_DATA(6);
1547 STORE_PRIO_DATA(7);
1548 STORE_PRIO_DATA(8);
1549 STORE_PRIO_DATA(9);
1550 STORE_PRIO_DATA(10);
1551 STORE_PRIO_DATA(11);
1552 STORE_PRIO_DATA(12);
1553 STORE_PRIO_DATA(13);
1554 STORE_PRIO_DATA(14);
1555 STORE_PRIO_DATA(15);
1556 STORE_PRIO_DATA(16);
1557 STORE_PRIO_DATA(17);
1558 STORE_PRIO_DATA(18);
1559 STORE_PRIO_DATA(19);
1560 STORE_PRIO_DATA(20);
1561 #undef STORE_PRIO_DATA
1562
1563
1564 static struct cfq_fs_entry cfq_quantum_entry = {
1565         .attr = {.name = "quantum", .mode = S_IRUGO | S_IWUSR },
1566         .show = cfq_quantum_show,
1567         .store = cfq_quantum_store,
1568 };
1569 static struct cfq_fs_entry cfq_quantum_io_entry = {
1570         .attr = {.name = "quantum_io", .mode = S_IRUGO | S_IWUSR },
1571         .show = cfq_quantum_io_show,
1572         .store = cfq_quantum_io_store,
1573 };
1574 static struct cfq_fs_entry cfq_idle_quantum_entry = {
1575         .attr = {.name = "idle_quantum", .mode = S_IRUGO | S_IWUSR },
1576         .show = cfq_idle_quantum_show,
1577         .store = cfq_idle_quantum_store,
1578 };
1579 static struct cfq_fs_entry cfq_idle_quantum_io_entry = {
1580         .attr = {.name = "idle_quantum_io", .mode = S_IRUGO | S_IWUSR },
1581         .show = cfq_idle_quantum_io_show,
1582         .store = cfq_idle_quantum_io_store,
1583 };
1584 static struct cfq_fs_entry cfq_queued_entry = {
1585         .attr = {.name = "queued", .mode = S_IRUGO | S_IWUSR },
1586         .show = cfq_queued_show,
1587         .store = cfq_queued_store,
1588 };
1589 static struct cfq_fs_entry cfq_grace_rt_entry = {
1590         .attr = {.name = "grace_rt", .mode = S_IRUGO | S_IWUSR },
1591         .show = cfq_grace_rt_show,
1592         .store = cfq_grace_rt_store,
1593 };
1594 static struct cfq_fs_entry cfq_grace_idle_entry = {
1595         .attr = {.name = "grace_idle", .mode = S_IRUGO | S_IWUSR },
1596         .show = cfq_grace_idle_show,
1597         .store = cfq_grace_idle_store,
1598 };
1599 static struct cfq_fs_entry cfq_epoch_entry = {
1600         .attr = {.name = "epoch", .mode = S_IRUGO | S_IWUSR },
1601         .show = cfq_epoch_show,
1602         .store = cfq_epoch_store,
1603 };
1604 static struct cfq_fs_entry cfq_hmax_pct_entry = {
1605         .attr = {.name = "hmaxpct", .mode = S_IRUGO | S_IWUSR },
1606         .show = cfq_hmax_pct_show,
1607         .store = cfq_hmax_pct_store,
1608 };
1609
1610 #define P_0_STR   "p0"
1611 #define P_1_STR   "p1"
1612 #define P_2_STR   "p2"
1613 #define P_3_STR   "p3"
1614 #define P_4_STR   "p4"
1615 #define P_5_STR   "p5"
1616 #define P_6_STR   "p6"
1617 #define P_7_STR   "p7"
1618 #define P_8_STR   "p8"
1619 #define P_9_STR   "p9"
1620 #define P_10_STR  "p10"
1621 #define P_11_STR  "p11"
1622 #define P_12_STR  "p12"
1623 #define P_13_STR  "p13"
1624 #define P_14_STR  "p14"
1625 #define P_15_STR  "p15"
1626 #define P_16_STR  "p16"
1627 #define P_17_STR  "p17"
1628 #define P_18_STR  "p18"
1629 #define P_19_STR  "p19"
1630 #define P_20_STR  "p20"
1631
1632
1633 #define CFQ_PRIO_SYSFS_ENTRY(__PRIOLVL)                                    \
1634 static struct cfq_fs_entry cfq_prio_##__PRIOLVL##_entry = {                \
1635         .attr = {.name = P_##__PRIOLVL##_STR, .mode = S_IRUGO | S_IWUSR }, \
1636         .show = cfq_prio_##__PRIOLVL##_show,                               \
1637         .store = cfq_prio_##__PRIOLVL##_store,                             \
1638 };
1639 CFQ_PRIO_SYSFS_ENTRY(0);
1640 CFQ_PRIO_SYSFS_ENTRY(1);
1641 CFQ_PRIO_SYSFS_ENTRY(2);
1642 CFQ_PRIO_SYSFS_ENTRY(3);
1643 CFQ_PRIO_SYSFS_ENTRY(4);
1644 CFQ_PRIO_SYSFS_ENTRY(5);
1645 CFQ_PRIO_SYSFS_ENTRY(6);
1646 CFQ_PRIO_SYSFS_ENTRY(7);
1647 CFQ_PRIO_SYSFS_ENTRY(8);
1648 CFQ_PRIO_SYSFS_ENTRY(9);
1649 CFQ_PRIO_SYSFS_ENTRY(10);
1650 CFQ_PRIO_SYSFS_ENTRY(11);
1651 CFQ_PRIO_SYSFS_ENTRY(12);
1652 CFQ_PRIO_SYSFS_ENTRY(13);
1653 CFQ_PRIO_SYSFS_ENTRY(14);
1654 CFQ_PRIO_SYSFS_ENTRY(15);
1655 CFQ_PRIO_SYSFS_ENTRY(16);
1656 CFQ_PRIO_SYSFS_ENTRY(17);
1657 CFQ_PRIO_SYSFS_ENTRY(18);
1658 CFQ_PRIO_SYSFS_ENTRY(19);
1659 CFQ_PRIO_SYSFS_ENTRY(20);
1660 #undef CFQ_PRIO_SYSFS_ENTRY
1661
1662 static struct attribute *default_attrs[] = {
1663         &cfq_quantum_entry.attr,
1664         &cfq_quantum_io_entry.attr,
1665         &cfq_idle_quantum_entry.attr,
1666         &cfq_idle_quantum_io_entry.attr,
1667         &cfq_queued_entry.attr,
1668         &cfq_grace_rt_entry.attr,
1669         &cfq_grace_idle_entry.attr,
1670         &cfq_epoch_entry.attr,
1671         &cfq_hmax_pct_entry.attr,
1672         &cfq_prio_0_entry.attr,
1673         &cfq_prio_1_entry.attr,
1674         &cfq_prio_2_entry.attr,
1675         &cfq_prio_3_entry.attr,
1676         &cfq_prio_4_entry.attr,
1677         &cfq_prio_5_entry.attr,
1678         &cfq_prio_6_entry.attr,
1679         &cfq_prio_7_entry.attr,
1680         &cfq_prio_8_entry.attr,
1681         &cfq_prio_9_entry.attr,
1682         &cfq_prio_10_entry.attr,
1683         &cfq_prio_11_entry.attr,
1684         &cfq_prio_12_entry.attr,
1685         &cfq_prio_13_entry.attr,
1686         &cfq_prio_14_entry.attr,
1687         &cfq_prio_15_entry.attr,
1688         &cfq_prio_16_entry.attr,
1689         &cfq_prio_17_entry.attr,
1690         &cfq_prio_18_entry.attr,
1691         &cfq_prio_19_entry.attr,
1692         &cfq_prio_20_entry.attr,
1693         NULL,
1694 };
1695
1696 #define to_cfq(atr) container_of((atr), struct cfq_fs_entry, attr)
1697
1698 static ssize_t
1699 cfq_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
1700 {
1701         elevator_t *e = container_of(kobj, elevator_t, kobj);
1702         struct cfq_fs_entry *entry = to_cfq(attr);
1703
1704         if (!entry->show)
1705                 return 0;
1706
1707         return entry->show(e->elevator_data, page);
1708 }
1709
1710 static ssize_t
1711 cfq_attr_store(struct kobject *kobj, struct attribute *attr,
1712                const char *page, size_t length)
1713 {
1714         elevator_t *e = container_of(kobj, elevator_t, kobj);
1715         struct cfq_fs_entry *entry = to_cfq(attr);
1716
1717         if (!entry->store)
1718                 return -EINVAL;
1719
1720         return entry->store(e->elevator_data, page, length);
1721 }
1722
1723 static struct sysfs_ops cfq_sysfs_ops = {
1724         .show   = cfq_attr_show,
1725         .store  = cfq_attr_store,
1726 };
1727
1728 struct kobj_type cfq_ktype = {
1729         .sysfs_ops      = &cfq_sysfs_ops,
1730         .default_attrs  = default_attrs,
1731 };
1732
1733 elevator_t iosched_cfq = {
1734         .elevator_name =                "cfq",
1735         .elevator_ktype =               &cfq_ktype,
1736         .elevator_merge_fn =            cfq_merge,
1737         .elevator_merged_fn =           cfq_merged_request,
1738         .elevator_merge_req_fn =        cfq_merged_requests,
1739         .elevator_next_req_fn =         cfq_next_request,
1740         .elevator_add_req_fn =          cfq_insert_request,
1741         .elevator_remove_req_fn =       cfq_remove_request,
1742         .elevator_queue_empty_fn =      cfq_queue_empty,
1743         .elevator_former_req_fn =       cfq_former_request,
1744         .elevator_latter_req_fn =       cfq_latter_request,
1745         .elevator_set_req_fn =          cfq_set_request,
1746         .elevator_put_req_fn =          cfq_put_request,
1747         .elevator_may_queue_fn =        cfq_may_queue,
1748         .elevator_set_congested_fn =    cfq_queue_congested,
1749         .elevator_init_fn =             cfq_init,
1750         .elevator_exit_fn =             cfq_exit,
1751 };
1752
1753 EXPORT_SYMBOL(iosched_cfq);