1 /*
2  *  linux/drivers/block/cfq-iosched.c
3  *
4  *  CFQ, or complete fairness queueing, disk scheduler.
5  *
6  *  Based on ideas from a previously unfinished io
7  *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
8  *
9  *  IO priorities are supported, from 0% to 100% in 5% increments. Both of
10  *  those values have special meaning - the 0% class is allowed to do io
11  *  only if no one else wants to use the disk. 100% is considered real-time
12  *  io and always gets priority. The default process io rate is 95%. In the
13  *  absence of other io, a class may consume 100% disk bandwidth regardless.
14  *  Within a class, bandwidth is distributed equally among the citizens.
15  *
16  * TODO:
17  *      - cfq_select_requests() needs some work for 5-95% io
18  *      - barriers not supported
19  *      - export grace periods in ms, not jiffies
20  *
21  *  Copyright (C) 2003 Jens Axboe <axboe@suse.de>
22  */
23 #include <linux/kernel.h>
24 #include <linux/fs.h>
25 #include <linux/blkdev.h>
26 #include <linux/elevator.h>
27 #include <linux/bio.h>
28 #include <linux/config.h>
29 #include <linux/module.h>
30 #include <linux/slab.h>
31 #include <linux/init.h>
32 #include <linux/compiler.h>
33 #include <linux/hash.h>
34 #include <linux/rbtree.h>
35 #include <linux/mempool.h>
36 #include <asm/div64.h>
37
38 #if IOPRIO_NR > BITS_PER_LONG
39 #error Cannot support this many io priority levels
40 #endif
41
42 /*
43  * tunables
44  */
45 static int cfq_quantum = 6;
46 static int cfq_quantum_io = 256;
47 static int cfq_idle_quantum = 1;
48 static int cfq_idle_quantum_io = 64;
49 static int cfq_queued = 4;
50 static int cfq_grace_rt = HZ / 100 ?: 1;
51 static int cfq_grace_idle = HZ / 10;
52
53 #define CFQ_EPOCH               1000000000
54 #define CFQ_SECTORATE           1000   
55 #define CFQ_HMAX_PCT            80
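/*
 * Rough guide to the units involved (as the arithmetic in
 * __cfq_check_limit() works out): cfq_epoch is in nanoseconds, so CFQ_EPOCH
 * is a one second epoch.  A "sectorate" is a sector budget per NS4SCALE
 * nanoseconds (100ms, defined further down); with the default CFQ_SECTORATE
 * of 1000 a queue is allowed roughly 1000 sectors (512KB) per 100ms, about
 * 5MB/s, before it starts getting skipped.  CFQ_HMAX_PCT feeds the pause
 * calculation for queues that exceed their rate.
 */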
56
57 #define CFQ_QHASH_SHIFT         6
58 #define CFQ_QHASH_ENTRIES       (1 << CFQ_QHASH_SHIFT)
59 #define list_entry_qhash(entry) hlist_entry((entry), struct cfq_queue, cfq_hash)
60
61 #define CFQ_MHASH_SHIFT         8
62 #define CFQ_MHASH_BLOCK(sec)    ((sec) >> 3)
63 #define CFQ_MHASH_ENTRIES       (1 << CFQ_MHASH_SHIFT)
64 #define CFQ_MHASH_FN(sec)       (hash_long(CFQ_MHASH_BLOCK((sec)),CFQ_MHASH_SHIFT))
65 #define rq_hash_key(rq)         ((rq)->sector + (rq)->nr_sectors)
66 #define list_entry_hash(ptr)    hlist_entry((ptr), struct cfq_rq, hash)
67
68 #define list_entry_cfqq(ptr)    list_entry((ptr), struct cfq_queue, cfq_list)
69 #define list_entry_prio(ptr)    list_entry((ptr), struct cfq_rq, prio_list)
70
71 #define cfq_account_io(crq)     \
72         ((crq)->ioprio != IOPRIO_IDLE && (crq)->ioprio != IOPRIO_RT)
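/*
 * Only the regular priority levels take part in the proportional
 * busy_rq/busy_sectors bookkeeping; idle and real-time requests are
 * excluded by cfq_account_io() above.
 */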
73
74 /*
75  * defines how we distribute bandwidth (can be tgid, uid, etc)
76  */
77
78 /* FIXME: change hash_key to be sizeof(void *) rather than sizeof(int) 
79  * otherwise the cast of cki_tsk_icls will not work reliably on 64-bit arches.
80  * OR, change cki_tsk_icls to return ints (will need another id space to be 
81  * managed)
82  */
83
84 #if defined(CONFIG_CKRM_RES_BLKIO) || defined(CONFIG_CKRM_RES_BLKIO_MODULE)
85 extern void *cki_hash_key(struct task_struct *tsk);
86 extern int cki_ioprio(struct task_struct *tsk);
87 extern void *cki_cfqpriv(struct task_struct *tsk); 
88
89 #define cfq_hash_key(tsk)   ((int)cki_hash_key((tsk)))
90 #define cfq_ioprio(tsk) (cki_ioprio((tsk)))
91 #define cfq_cfqpriv(cfqd,tsk)   (cki_cfqpriv((tsk)))
92
93 #else
94 #define cfq_hash_key(tsk)       ((tsk)->tgid)
95 #define cfq_cfqpriv(cfqd,tsk)   (&(((cfqd)->cid[(tsk)->ioprio]).cfqpriv))
96
97 /*
98  * move to io_context
99  */
100 #define cfq_ioprio(tsk) ((tsk)->ioprio)
101 #endif
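/*
 * With CONFIG_CKRM_RES_BLKIO the fairness key and io priority come from the
 * task's CKRM io class via the cki_* hooks above; otherwise requests are
 * grouped per thread group (tgid) and the per-task ioprio is used directly.
 */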
102
103 #define CFQ_WAIT_RT     0
104 #define CFQ_WAIT_NORM   1
105
106 static kmem_cache_t *crq_pool;
107 static kmem_cache_t *cfq_pool;
108 static mempool_t *cfq_mpool;
109
110 /*
111  * defines an io priority level
112  */
113 struct io_prio_data {
114         struct list_head rr_list;
115         int busy_queues;
116         int busy_rq;
117         unsigned long busy_sectors;
118         
119         /* requests, sectors and queues 
120          * added(in),dispatched/deleted(out) 
121          * at this priority level. 
122          */
123         atomic_t cum_rq_in,cum_rq_out;              
124         atomic_t cum_sectors_in,cum_sectors_out;    
125         atomic_t cum_queues_in,cum_queues_out;
126
127         cfqlim_t cfqpriv;       /* data for enforcing limits */
128
129         struct list_head prio_list;
130         int last_rq;
131         int last_sectors;
132
133 };
134
135 /*
136  * per-request queue structure
137  */
138 struct cfq_data {
139         struct list_head rr_list;
140         struct list_head *dispatch;
141         struct hlist_head *cfq_hash;
142         struct hlist_head *crq_hash;
143         mempool_t *crq_pool;
144
145         struct io_prio_data cid[IOPRIO_NR];
146
147         /*
148          * total number of busy queues and requests
149          */
150         int busy_rq;
151         int busy_queues;
152         unsigned long busy_sectors;
153
154
155         request_queue_t *queue;
156         unsigned long rq_starved_mask;
157
158         /*
159          * grace period handling
160          */
161         struct timer_list timer;
162         unsigned long wait_end;
163         unsigned long flags;
164         struct work_struct work;
165
166         /*
167          * tunables
168          */
169         unsigned int cfq_quantum;
170         unsigned int cfq_quantum_io;
171         unsigned int cfq_idle_quantum;
172         unsigned int cfq_idle_quantum_io;
173         unsigned int cfq_queued;
174         unsigned int cfq_grace_rt;
175         unsigned int cfq_grace_idle;
176
177         unsigned int cfq_epoch;
178         unsigned int cfq_hmax_pct;
179         unsigned int cfq_qsectorate;
180 };
181
182 /*
183  * per-class structure
184  */
185 struct cfq_queue {
186         struct list_head cfq_list;
187         struct hlist_node cfq_hash;
188         int hash_key;
189         struct rb_root sort_list;
190         int queued[2];
191         int ioprio;
192
193         /* limit related settings/stats obtained 
194            either from io_prio_data or ckrm I/O class
195         */
196         struct cfqlim *cfqpriv; 
197
198         u64 epstart;            /* current epoch's starting timestamp (ns) */
199         u64 epsector[2];        /* Total sectors dispatched in [0] previous
200                                  * and [1] current epoch
201                                  */
202         
203         unsigned long avsec;            /* avg sectors dispatched/epoch */
204 //      unsigned long long lastime;     /* timestamp of last request served */
205 //      unsigned long sectorate;        /* limit for sectors served/epoch */
206         int skipped;                    /* queue skipped at last dispatch ? */
207
208         /* Per queue timer to suspend/resume queue from processing */
209         struct timer_list timer;
210         unsigned long wait_end;
211         unsigned long flags;
212         struct work_struct work;
213
214         struct cfq_data *cfqd;
215 };
216
217
218
219 /*
220  * Per-request structure
221  */
222 struct cfq_rq {
223         struct cfq_queue *cfq_queue;
224         struct rb_node rb_node;
225         struct hlist_node hash;
226         sector_t rb_key;
227
228         struct request *request;
229         struct list_head prio_list;
230         unsigned long nr_sectors;
231         int ioprio;
232 };
233
234 static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq);
235 static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid);
236 static void cfq_dispatch_sort(struct cfq_data *cfqd, struct cfq_queue *cfqq,
237                               struct cfq_rq *crq);
238
239 /*
240  * lots of deadline iosched dupes, can be abstracted later...
241  */
242 static inline void cfq_del_crq_hash(struct cfq_rq *crq)
243 {
244         hlist_del_init(&crq->hash);
245 }
246
247 static inline void
248 cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq)
249 {
250         cfq_del_crq_hash(crq);
251
252         if (q->last_merge == crq->request)
253                 q->last_merge = NULL;
254 }
255
256 static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq)
257 {
258         struct request *rq = crq->request;
259         const int hash_idx = CFQ_MHASH_FN(rq_hash_key(rq));
260
261         BUG_ON(!hlist_unhashed(&crq->hash));
262  
263         hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]);
264 }
265
266 static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset)
267 {
268         struct hlist_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)];
269         struct hlist_node *entry, *next;
270
271         hlist_for_each_safe(entry, next, hash_list) {
272                 struct cfq_rq *crq = list_entry_hash(entry);
273                 struct request *__rq = crq->request;
274
275                 BUG_ON(hlist_unhashed(&crq->hash));
276
277                 if (!rq_mergeable(__rq)) {
278                         cfq_del_crq_hash(crq);
279                         continue;
280                 }
281
282                 if (rq_hash_key(__rq) == offset)
283                         return __rq;
284         }
285
286         return NULL;
287 }
288
289 /*
290  * rb tree support functions
291  */
292 #define RB_EMPTY(node)          ((node)->rb_node == NULL)
293 #define rb_entry_crq(node)      rb_entry((node), struct cfq_rq, rb_node)
294 #define rq_rb_key(rq)           (rq)->sector
295
296 static void
297 cfq_del_crq_rb(struct cfq_data *cfqd, struct cfq_queue *cfqq,struct cfq_rq *crq)
298 {
299         if (crq->cfq_queue) {
300                 crq->cfq_queue = NULL;
301
302                 if (cfq_account_io(crq)) {
303                         cfqd->busy_rq--;
304                         cfqd->busy_sectors -= crq->nr_sectors;
305                         cfqd->cid[crq->ioprio].busy_rq--;
306                         cfqd->cid[crq->ioprio].busy_sectors -= crq->nr_sectors;
307                 }
308                 atomic_inc(&(cfqd->cid[crq->ioprio].cum_rq_out));
309                 atomic_add(crq->nr_sectors,
310                            &(cfqd->cid[crq->ioprio].cum_sectors_out));
311                 cfqq->queued[rq_data_dir(crq->request)]--;
312                 rb_erase(&crq->rb_node, &cfqq->sort_list);
313         }
314 }
315
316 static struct cfq_rq *
317 __cfq_add_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
318 {
319         struct rb_node **p = &cfqq->sort_list.rb_node;
320         struct rb_node *parent = NULL;
321         struct cfq_rq *__crq;
322
323         while (*p) {
324                 parent = *p;
325                 __crq = rb_entry_crq(parent);
326
327                 if (crq->rb_key < __crq->rb_key)
328                         p = &(*p)->rb_left;
329                 else if (crq->rb_key > __crq->rb_key)
330                         p = &(*p)->rb_right;
331                 else
332                         return __crq;
333         }
334
335         rb_link_node(&crq->rb_node, parent, p);
336         return NULL;
337 }
338
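/*
 * Insert a request into the queue's sort tree.  If a request with the same
 * sort key is already present (an alias), the existing one is moved out to
 * the dispatch list and the insert is retried, so the tree never holds two
 * requests with the same key.
 */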
339 static void
340 cfq_add_crq_rb(struct cfq_data *cfqd, struct cfq_queue *cfqq,struct cfq_rq *crq)
341 {
342         struct request *rq = crq->request;
343         struct cfq_rq *__alias;
344
345
346         cfqq->queued[rq_data_dir(rq)]++;
347         if (cfq_account_io(crq)) {
348                 cfqd->busy_rq++;
349                 cfqd->busy_sectors += crq->nr_sectors;
350                 cfqd->cid[crq->ioprio].busy_rq++;
351                 cfqd->cid[crq->ioprio].busy_sectors += crq->nr_sectors;
352         }
353         atomic_inc(&(cfqd->cid[crq->ioprio].cum_rq_in));
354         atomic_add(crq->nr_sectors,
355                    &(cfqd->cid[crq->ioprio].cum_sectors_in));
356 retry:
357         __alias = __cfq_add_crq_rb(cfqq, crq);
358         if (!__alias) {
359                 rb_insert_color(&crq->rb_node, &cfqq->sort_list);
360                 crq->rb_key = rq_rb_key(rq);
361                 crq->cfq_queue = cfqq;
362                 return;
363         }
364
365         cfq_dispatch_sort(cfqd, cfqq, __alias);
366         goto retry;
367 }
368
369 static struct request *
370 cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector)
371 {
372         struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(current));
373         struct rb_node *n;
374
375         if (!cfqq)
376                 goto out;
377
378         n = cfqq->sort_list.rb_node;
379         while (n) {
380                 struct cfq_rq *crq = rb_entry_crq(n);
381
382                 if (sector < crq->rb_key)
383                         n = n->rb_left;
384                 else if (sector > crq->rb_key)
385                         n = n->rb_right;
386                 else
387                         return crq->request;
388         }
389
390 out:
391         return NULL;
392 }
393
394 static void cfq_remove_request(request_queue_t *q, struct request *rq)
395 {
396         struct cfq_data *cfqd = q->elevator.elevator_data;
397         struct cfq_rq *crq = RQ_ELV_DATA(rq);
398
399         if (crq) {
400
401                 cfq_remove_merge_hints(q, crq);
402                 list_del_init(&crq->prio_list);
403                 list_del_init(&rq->queuelist);
404
405                 /*
406                  * set a grace period timer to allow realtime io to make real
407                  * progress, if we release an rt request. for normal request,
408                  * set timer so idle io doesn't interfere with other io
409                  */
410                 if (crq->ioprio == IOPRIO_RT) {
411                         set_bit(CFQ_WAIT_RT, &cfqd->flags);
412                         cfqd->wait_end = jiffies + cfqd->cfq_grace_rt;
413                 } else if (crq->ioprio != IOPRIO_IDLE) {
414                         set_bit(CFQ_WAIT_NORM, &cfqd->flags);
415                         cfqd->wait_end = jiffies + cfqd->cfq_grace_idle;
416                 }
417
418                 if (crq->cfq_queue) {
419                         struct cfq_queue *cfqq = crq->cfq_queue;
420
421                         cfq_del_crq_rb(cfqd, cfqq, crq);
422
423                         if (RB_EMPTY(&cfqq->sort_list))
424                                 cfq_put_queue(cfqd, cfqq);
425                 }
426         }
427 }
428
429 static int
430 cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
431 {
432         struct cfq_data *cfqd = q->elevator.elevator_data;
433         struct request *__rq;
434         int ret;
435
436         ret = elv_try_last_merge(q, bio);
437         if (ret != ELEVATOR_NO_MERGE) {
438                 __rq = q->last_merge;
439                 goto out_insert;
440         }
441
442         __rq = cfq_find_rq_hash(cfqd, bio->bi_sector);
443         if (__rq) {
444                 BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector);
445
446                 if (elv_rq_merge_ok(__rq, bio)) {
447                         ret = ELEVATOR_BACK_MERGE;
448                         goto out;
449                 }
450         }
451
452         __rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio));
453         if (__rq) {
454                 if (elv_rq_merge_ok(__rq, bio)) {
455                         ret = ELEVATOR_FRONT_MERGE;
456                         goto out;
457                 }
458         }
459
460         return ELEVATOR_NO_MERGE;
461 out:
462         q->last_merge = __rq;
463 out_insert:
464         *req = __rq;
465         return ret;
466 }
467
468 static void cfq_merged_request(request_queue_t *q, struct request *req)
469 {
470         struct cfq_data *cfqd = q->elevator.elevator_data;
471         struct cfq_rq *crq = RQ_ELV_DATA(req);
472         int tmp;
473
474         cfq_del_crq_hash(crq);
475         cfq_add_crq_hash(cfqd, crq);
476
477         if (crq->cfq_queue && (rq_rb_key(req) != crq->rb_key)) {
478                 struct cfq_queue *cfqq = crq->cfq_queue;
479
480                 cfq_del_crq_rb(cfqd, cfqq, crq);
481                 cfq_add_crq_rb(cfqd, cfqq, crq);
482         }
483
484         tmp = req->hard_nr_sectors - crq->nr_sectors;
485         cfqd->busy_sectors += tmp;
486         cfqd->cid[crq->ioprio].busy_sectors += tmp;
487         atomic_add(tmp,&(cfqd->cid[crq->ioprio].cum_sectors_in));
488
489         crq->nr_sectors = req->hard_nr_sectors;
490
491         q->last_merge = req;
492 }
493
494 static void
495 cfq_merged_requests(request_queue_t *q, struct request *req,
496                     struct request *next)
497 {
498         cfq_merged_request(q, req);
499         cfq_remove_request(q, next);
500 }
501
502 /*
503  * sort into dispatch list, in optimal ascending order
504  */
505 static void
506 cfq_dispatch_sort(struct cfq_data *cfqd, struct cfq_queue *cfqq,
507                   struct cfq_rq *crq)
508 {
509         struct list_head *head = cfqd->dispatch, *entry = head;
510         struct request *__rq;
511
512         cfq_del_crq_rb(cfqd, cfqq, crq);
513         cfq_remove_merge_hints(cfqd->queue, crq);
514
515         if (!list_empty(head)) {
516                 __rq = list_entry_rq(head->next);
517
518                 if (crq->request->sector < __rq->sector) {
519                         entry = head->prev;
520                         goto link;
521                 }
522         }
523
524         while ((entry = entry->prev) != head) {
525                 __rq = list_entry_rq(entry);
526
527                 if (crq->request->sector <= __rq->sector)
528                         break;
529         }
530
531 link:
532         list_add_tail(&crq->request->queuelist, entry);
533 }
534
535 struct cfq_queue *dcfqq;
536 u64 dtmp;
537
538
539
540 /* Over how many ns is sectorate defined */
541 #define NS4SCALE  (100000000)
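/*
 * __cfq_check_limit() keeps, per queue, the sectors dispatched in the
 * previous and current epoch (epsector[0]/[1]) and scales that total to a
 * NS4SCALE (100ms) window:
 *
 *      avsec ~= sectors dispatched * NS4SCALE / elapsed ns
 *
 * and compares the result against the queue's sectorate.  As a worked
 * example of the scaling, 2000 sectors over a 250ms span come out to
 * 2000 * 100ms / 250ms = 800 sectors per 100ms, under the default
 * sectorate of 1000, so such a queue would not be skipped.
 */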
542
543 static inline void
544 __cfq_check_limit(struct cfq_data *cfqd, struct cfq_queue *cfqq, int dontskip)
545 {
546         struct cfq_rq *crq;
547         unsigned long long ts, gap, epoch, tmp;
548         unsigned long newavsec, sectorate;
549
550         crq = rb_entry_crq(rb_first(&cfqq->sort_list));
551
552         ts = sched_clock();
553         gap = ts - cfqq->epstart;
554         epoch = cfqd->cfq_epoch;
555
556         sectorate = atomic_read(&cfqq->cfqpriv->sectorate);
557 //      sectorate = atomic_read(&(cfqd->cid[crq->ioprio].sectorate));
558
559         dcfqq = cfqq;
560
561         if ((gap >= epoch) || (gap < 0)) {
562
563                 if (gap >= (epoch << 1)) {
564                         cfqq->epsector[0] = 0;
565                         cfqq->epstart = ts ; 
566                 } else {
567                         cfqq->epsector[0] = cfqq->epsector[1];
568                         cfqq->epstart += epoch;
569                 } 
570                 cfqq->epsector[1] = 0;
571                 gap = ts - cfqq->epstart;
572
573                 tmp  = (cfqq->epsector[0] + crq->nr_sectors) * NS4SCALE;
574                 do_div(tmp,epoch+gap);
575
576                 cfqq->avsec = (unsigned long)tmp;
577                 cfqq->skipped = 0;
578                 cfqq->epsector[1] += crq->nr_sectors;
579                 
580                 cfqq->cfqpriv->navsec = cfqq->avsec;
581                 cfqq->cfqpriv->sec[0] = cfqq->epsector[0];
582                 cfqq->cfqpriv->sec[1] = cfqq->epsector[1];
583                 cfqq->cfqpriv->timedout++;
584                 /*
585                 cfqd->cid[crq->ioprio].navsec = cfqq->avsec;
586                 cfqd->cid[crq->ioprio].sec[0] = cfqq->epsector[0];
587                 cfqd->cid[crq->ioprio].sec[1] = cfqq->epsector[1];
588                 cfqd->cid[crq->ioprio].timedout++;
589                 */
590                 return;
591         } else {
592                 
593                 tmp = (cfqq->epsector[0] + cfqq->epsector[1] + crq->nr_sectors)
594                         * NS4SCALE;
595                 do_div(tmp,epoch+gap);
596
597                 newavsec = (unsigned long)tmp;
598                 if ((newavsec < sectorate) || dontskip) {
599                         cfqq->avsec = newavsec ;
600                         cfqq->skipped = 0;
601                         cfqq->epsector[1] += crq->nr_sectors;
602                         cfqq->cfqpriv->navsec = cfqq->avsec;
603                         cfqq->cfqpriv->sec[1] = cfqq->epsector[1];
604                         /*
605                         cfqd->cid[crq->ioprio].navsec = cfqq->avsec;
606                         cfqd->cid[crq->ioprio].sec[1] = cfqq->epsector[1];
607                         */
608                 } else {
609                         cfqq->skipped = 1;
610                         /* pause q's processing till avsec drops to 
611                            cfq_hmax_pct % of its value */
612                         tmp = (epoch+gap) * (100-cfqd->cfq_hmax_pct);
613                         do_div(tmp,1000000*cfqd->cfq_hmax_pct);
614                         cfqq->wait_end = jiffies+msecs_to_jiffies(tmp);
615                 }
616         }                       
617 }
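/*
 * The pause length chosen above follows from asking how long the queue must
 * sit idle before its average drops to cfq_hmax_pct percent of its current
 * value:
 *
 *      sectors * NS4SCALE / (elapsed + delta) = (hmax_pct/100) * avsec
 *   => delta = elapsed * (100 - hmax_pct) / hmax_pct
 *
 * converted from ns to ms before being handed to msecs_to_jiffies().  With
 * the default cfq_hmax_pct of 80 that is a quarter of the elapsed window.
 */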
618
619 /*
620  * remove from io scheduler core and put on dispatch list for service
621  */
622 static inline int
623 __cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd,
624                         struct cfq_queue *cfqq)
625 {
626         struct cfq_rq *crq;
627
628         crq = rb_entry_crq(rb_first(&cfqq->sort_list));
629
630         cfq_dispatch_sort(cfqd, cfqq, crq);
631
632         /*
633          * technically, for IOPRIO_RT we don't need to add it to the list.
634          */
635         list_add_tail(&crq->prio_list, &cfqd->cid[cfqq->ioprio].prio_list);
636         return crq->nr_sectors;
637 }
638
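/*
 * Dispatch up to max_rq requests / max_sectors sectors from one priority
 * level's round-robin list.  Roughly: the first pass honours each queue's
 * sectorate; queues over their rate are marked skipped, pulled off the
 * rr_list and re-armed by their pause timer.  Later passes start with the
 * limit check disabled (dontskip) so progress is still made when every
 * remaining queue is over its rate, and the loop repeats until the quanta
 * are used up or nothing is left to serve.
 */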
639 static int
640 cfq_dispatch_requests(request_queue_t *q, int prio, int max_rq, int max_sectors)
641 {
642         struct cfq_data *cfqd = q->elevator.elevator_data;
643         struct list_head *plist = &cfqd->cid[prio].rr_list;
644         struct cfq_queue *cfqq;
645         struct list_head *entry, *nxt;
646         int q_rq, q_io;
647         int first_round,busy_queues,busy_unlimited;
648
649
650         /*
651          * for each queue at this prio level, dispatch a request
652          */
653         q_rq = q_io = 0;
654         first_round=1;
655  restart:
656         busy_unlimited = 0;
657         busy_queues = 0;
658         list_for_each_safe(entry, nxt, plist) {
659                 cfqq = list_entry_cfqq(entry);
660
661                 BUG_ON(RB_EMPTY(&cfqq->sort_list));
662                 busy_queues++;
663
664                 
665                 if (first_round || busy_unlimited)
666                         __cfq_check_limit(cfqd,cfqq,0);
667                 else
668                         __cfq_check_limit(cfqd,cfqq,1);
669
670                 if (cfqq->skipped) {
671                         cfqq->cfqpriv->nskip++;
672                         /* cfqd->cid[prio].nskip++; */
673                         busy_queues--;
674                         if (time_before(jiffies, cfqq->wait_end)) {
675                                 list_del(&cfqq->cfq_list);
676                                 mod_timer(&cfqq->timer,cfqq->wait_end);
677                         }
678                         continue;
679                 }
680                 busy_unlimited++;
681
682                 q_io += __cfq_dispatch_requests(q, cfqd, cfqq);
683                 q_rq++;
684
685                 if (RB_EMPTY(&cfqq->sort_list)) {
686                         busy_unlimited--;
687                         busy_queues--;
688                         cfq_put_queue(cfqd, cfqq);
689                 } 
690
691                 if (q_io >= max_sectors || q_rq >= max_rq) {
692 #if 0
693                         struct list_head *prv = nxt->prev;
694
695                         if (prv != plist) {
696                                 list_del(plist);
697                                 list_add(plist, prv);
698                         }
699 #endif
700                         break;
701                 }
702         }
703
704         if ((q_io < max_sectors) && (q_rq < max_rq) && 
705             (busy_queues || first_round))
706         {
707                 first_round = 0;
708                 goto restart;
709         } else {
710                 /*
711                  * if we hit the queue limit, put the string of serviced
712                  * queues at the back of the pending list
713                  */
714                 struct list_head *prv = nxt->prev;
715                 if (prv != plist) {
716                         list_del(plist);
717                         list_add(plist, prv);
718                 }
719         }
720
721         cfqd->cid[prio].last_rq = q_rq;
722         cfqd->cid[prio].last_sectors = q_io;
723         return q_rq;
724 }
725
726 /*
727  * try to move some requests to the dispatch list. returns 1 if any were moved, 0 otherwise
728  */
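/*
 * Each normal level i is offered a share of the quanta scaled by
 * (i + 1) / IOPRIO_NR, then averaged with what it dispatched last round.
 * Assuming IOPRIO_NR is 21 (the p0..p20 sysfs entries below suggest as
 * much), a level-10 class with the default cfq_quantum of 6 and no
 * competing io would get at most 6 * 11 / 21 = 3 requests per round before
 * the averaging with last_rq kicks in.
 */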
729 static int cfq_select_requests(request_queue_t *q, struct cfq_data *cfqd)
730 {
731         int queued, busy_rq, busy_sectors, i;
732
733         /*
734          * if there's any realtime io, only schedule that
735          */
736         if (cfq_dispatch_requests(q, IOPRIO_RT, cfqd->cfq_quantum, cfqd->cfq_quantum_io))
737                 return 1;
738
739         /*
740          * if RT io was last serviced and grace time hasn't expired,
741          * arm the timer to restart queueing if no other RT io has been
742          * submitted in the mean time
743          */
744         if (test_bit(CFQ_WAIT_RT, &cfqd->flags)) {
745                 if (time_before(jiffies, cfqd->wait_end)) {
746                         mod_timer(&cfqd->timer, cfqd->wait_end);
747                         return 0;
748                 }
749                 clear_bit(CFQ_WAIT_RT, &cfqd->flags);
750         }
751
752         /*
753          * for each priority level, calculate number of requests we
754          * are allowed to put into service.
755          */
756         queued = 0;
757         busy_rq = cfqd->busy_rq;
758         busy_sectors = cfqd->busy_sectors;
759         for (i = IOPRIO_RT - 1; i > IOPRIO_IDLE; i--) {
760                 const int o_rq = busy_rq - cfqd->cid[i].busy_rq;
761                 const int o_sectors = busy_sectors - cfqd->cid[i].busy_sectors;
762                 int q_rq = cfqd->cfq_quantum * (i + 1) / IOPRIO_NR;
763                 int q_io = cfqd->cfq_quantum_io * (i + 1) / IOPRIO_NR;
764
765                 /*
766                  * no need to keep iterating the list, if there are no
767                  * requests pending anymore
768                  */
769                 if (!cfqd->busy_rq)
770                         break;
771
772                 /*
773                  * find out how many requests and sectors we are allowed to
774                  * service
775                  */
776                 if (o_rq)
777                         q_rq = o_sectors * (i + 1) / IOPRIO_NR;
778                 if (q_rq > cfqd->cfq_quantum)
779                         q_rq = cfqd->cfq_quantum;
780
781                 if (o_sectors)
782                         q_io = o_sectors * (i + 1) / IOPRIO_NR;
783                 if (q_io > cfqd->cfq_quantum_io)
784                         q_io = cfqd->cfq_quantum_io;
785
786                 /*
787                  * average with last dispatched for fairness
788                  */
789                 if (cfqd->cid[i].last_rq != -1)
790                         q_rq = (cfqd->cid[i].last_rq + q_rq) / 2;
791                 if (cfqd->cid[i].last_sectors != -1)
792                         q_io = (cfqd->cid[i].last_sectors + q_io) / 2;
793
794                 queued += cfq_dispatch_requests(q, i, q_rq, q_io);
795         }
796
797         if (queued)
798                 return 1;
799
800         /*
801          * only allow dispatch of idle io, if the queue has been idle from
802          * servicing RT or normal io for the grace period
803          */
804         if (test_bit(CFQ_WAIT_NORM, &cfqd->flags)) {
805                 if (time_before(jiffies, cfqd->wait_end)) {
806                         mod_timer(&cfqd->timer, cfqd->wait_end);
807                         return 0;
808                 }
809                 clear_bit(CFQ_WAIT_NORM, &cfqd->flags);
810         }
811
812         /*
813          * if we found nothing to do, allow idle io to be serviced
814          */
815         if (cfq_dispatch_requests(q, IOPRIO_IDLE, cfqd->cfq_idle_quantum, cfqd->cfq_idle_quantum_io))
816                 return 1;
817
818         return 0;
819 }
820
821 static struct request *cfq_next_request(request_queue_t *q)
822 {
823         struct cfq_data *cfqd = q->elevator.elevator_data;
824         struct request *rq;
825
826         if (!list_empty(cfqd->dispatch)) {
827                 struct cfq_rq *crq;
828 dispatch:
829                 /*
830                  * end grace period, we are servicing a request
831                  */
832                 del_timer(&cfqd->timer);
833                 clear_bit(CFQ_WAIT_RT, &cfqd->flags);
834                 clear_bit(CFQ_WAIT_NORM, &cfqd->flags);
835
836                 BUG_ON(list_empty(cfqd->dispatch));
837                 rq = list_entry_rq(cfqd->dispatch->next);
838
839                 BUG_ON(q->last_merge == rq);
840                 crq = RQ_ELV_DATA(rq);
841                 if (crq) {
842                         BUG_ON(!hlist_unhashed(&crq->hash));
843                         list_del_init(&crq->prio_list);
844                 }
845
846                 return rq;
847         }
848
849         /*
850  * we moved requests to the dispatch list, go back and serve one
851          */
852         if (cfq_select_requests(q, cfqd))
853                 goto dispatch;
854
855         return NULL;
856 }
857
858 static inline struct cfq_queue *
859 __cfq_find_cfq_hash(struct cfq_data *cfqd, int hashkey, const int hashval)
860 {
861         struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
862         struct hlist_node *entry;
863
864         hlist_for_each(entry, hash_list) {
865                 struct cfq_queue *__cfqq = list_entry_qhash(entry);
866
867                 if (__cfqq->hash_key == hashkey)
868                         return __cfqq;
869         }
870
871         return NULL;
872 }
873
874
875 static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int hashkey)
876 {
877         const int hashval = hash_long(hashkey, CFQ_QHASH_SHIFT);
878
879         return __cfq_find_cfq_hash(cfqd, hashkey, hashval);
880 }
881
882 static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
883 {
884         cfqd->busy_queues--;
885         WARN_ON(cfqd->busy_queues < 0);
886
887         cfqd->cid[cfqq->ioprio].busy_queues--;
888         WARN_ON(cfqd->cid[cfqq->ioprio].busy_queues < 0);
889         atomic_inc(&(cfqd->cid[cfqq->ioprio].cum_queues_out));
890
891         list_del(&cfqq->cfq_list);
892         hlist_del(&cfqq->cfq_hash);
893         mempool_free(cfqq, cfq_mpool);
894 }
895
896 static void cfq_pauseq_timer(unsigned long data)
897 {
898         struct cfq_queue *cfqq = (struct cfq_queue *) data;
899         kblockd_schedule_work(&cfqq->work);
900 }
901
902 static void cfq_pauseq_work(void *data)
903 {
904         struct cfq_queue *cfqq = (struct cfq_queue *) data;
905         struct cfq_data *cfqd = cfqq->cfqd;
906         request_queue_t *q = cfqd->queue;
907         unsigned long flags;
908         
909         spin_lock_irqsave(q->queue_lock, flags);
910         list_add_tail(&cfqq->cfq_list,&cfqd->cid[cfqq->ioprio].rr_list);
911         cfqq->skipped = 0;
912         if (cfq_next_request(q))
913                 q->request_fn(q);
914         spin_unlock_irqrestore(q->queue_lock, flags);
915
916         //del_timer(&cfqq->timer);
917 }       
918
919 static struct cfq_queue *__cfq_get_queue(struct cfq_data *cfqd, int hashkey,
920                                          int gfp_mask)
921 {
922         const int hashval = hash_long(hashkey, CFQ_QHASH_SHIFT);
923         struct cfq_queue *cfqq, *new_cfqq = NULL;
924         request_queue_t *q = cfqd->queue;
925
926 retry:
927         cfqq = __cfq_find_cfq_hash(cfqd, hashkey, hashval);
928
929         if (!cfqq) {
930                 if (new_cfqq) {
931                         cfqq = new_cfqq;
932                         new_cfqq = NULL;
933                 } else if (gfp_mask & __GFP_WAIT) {
934                         spin_unlock_irq(q->queue_lock);
935                         new_cfqq = mempool_alloc(cfq_mpool, gfp_mask);
936                         spin_lock_irq(q->queue_lock);
937                         goto retry;
938                 } else
939                         return NULL;
940
941                 memset(cfqq, 0, sizeof(*cfqq));
942                 INIT_HLIST_NODE(&cfqq->cfq_hash);
943                 INIT_LIST_HEAD(&cfqq->cfq_list);
944                 cfqq->hash_key = cfq_hash_key(current);
945                 cfqq->ioprio = cfq_ioprio(current);
946                 
947                 cfqq->cfqpriv = cfq_cfqpriv(cfqd,current);
948                 if (!cfqq->cfqpriv)
949                         cfqq->cfqpriv = &((cfqd->cid[cfqq->ioprio]).cfqpriv);
950
951                 cfqq->epstart = sched_clock();
952                 /* epsector, avsec, skipped initialized to zero by memset */
953                 
954                 init_timer(&cfqq->timer);
955                 cfqq->timer.function = cfq_pauseq_timer;
956                 cfqq->timer.data = (unsigned long) cfqq;
957
958                 INIT_WORK(&cfqq->work, cfq_pauseq_work, cfqq); 
959
960                 cfqq->cfqd = cfqd ;
961
962                 hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
963         }
964
965         if (new_cfqq)
966                 mempool_free(new_cfqq, cfq_mpool);
967
968         return cfqq;
969 }
970
971 static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, int hashkey,
972                                        int gfp_mask)
973 {
974         request_queue_t *q = cfqd->queue;
975         struct cfq_queue *cfqq;
976
977         spin_lock_irq(q->queue_lock);
978         cfqq = __cfq_get_queue(cfqd, hashkey, gfp_mask);
979         spin_unlock_irq(q->queue_lock);
980
981         return cfqq;
982 }
983
984 static void
985 __cfq_enqueue(request_queue_t *q, struct cfq_data *cfqd, struct cfq_rq *crq)
986 {
987         const int prio = crq->ioprio;
988         struct cfq_queue *cfqq;
989
990         cfqq = __cfq_get_queue(cfqd, cfq_hash_key(current), GFP_ATOMIC);
991         if (cfqq) {
992
993                 /*
994                  * not too good...
995                  */
996                 if (prio > cfqq->ioprio) {
997                         printk("prio hash collision %d %d\n", 
998                                prio, cfqq->ioprio);
999                         if (!list_empty(&cfqq->cfq_list)) {
1000                                 cfqd->cid[cfqq->ioprio].busy_queues--;
1001                                 WARN_ON(cfqd->cid[cfqq->ioprio].busy_queues<0);
1002                                 atomic_inc(&(cfqd->cid[cfqq->ioprio].cum_queues_out));
1003                                 cfqd->cid[prio].busy_queues++;
1004                                 atomic_inc(&(cfqd->cid[prio].cum_queues_in));
1005                                 list_move_tail(&cfqq->cfq_list, 
1006                                                &cfqd->cid[prio].rr_list);
1007                         }
1008                         cfqq->ioprio = prio;
1009                 }
1010
1011                 cfq_add_crq_rb(cfqd, cfqq, crq);
1012
1013                 if (list_empty(&cfqq->cfq_list)) {
1014                         list_add_tail(&cfqq->cfq_list, 
1015                                       &cfqd->cid[prio].rr_list);
1016                         cfqd->cid[prio].busy_queues++;
1017                         atomic_inc(&(cfqd->cid[prio].cum_queues_in));
1018                         cfqd->busy_queues++;
1019                 }
1020
1021                 if (rq_mergeable(crq->request)) {
1022                         cfq_add_crq_hash(cfqd, crq);
1023                         
1024                         if (!q->last_merge)
1025                                 q->last_merge = crq->request;
1026                 }
1027
1028         } else {
1029                 /*
1030                  * this can only happen if the request wasn't allocated
1031                  * through blk_alloc_request(), eg stack requests from ide-cd
1032                  * (those should be removed) _and_ we are in OOM.
1033                  */
1034                 list_add_tail(&crq->request->queuelist, cfqd->dispatch);
1035         }
1036 }
1037
1038 static void cfq_reenqueue(request_queue_t *q, struct cfq_data *cfqd, int prio)
1039 {
1040         struct list_head *prio_list = &cfqd->cid[prio].prio_list;
1041         struct list_head *entry, *tmp;
1042
1043         list_for_each_safe(entry, tmp, prio_list) {
1044                 struct cfq_rq *crq = list_entry_prio(entry);
1045
1046                 list_del_init(entry);
1047                 list_del_init(&crq->request->queuelist);
1048                 __cfq_enqueue(q, cfqd, crq);
1049         }
1050 }
1051
1052 static void
1053 cfq_enqueue(request_queue_t *q, struct cfq_data *cfqd, struct cfq_rq *crq)
1054 {
1055         const int prio = cfq_ioprio(current);
1056
1057         crq->ioprio = prio;
1058         crq->nr_sectors = crq->request->hard_nr_sectors;
1059         __cfq_enqueue(q, cfqd, crq);
1060
1061         if (prio == IOPRIO_RT) {
1062                 int i;
1063
1064                 /*
1065                  * realtime io gets priority, move all other io back
1066                  */
1067                 for (i = IOPRIO_IDLE; i < IOPRIO_RT; i++)
1068                         cfq_reenqueue(q, cfqd, i);
1069         } else if (prio != IOPRIO_IDLE) {
1070                 /*
1071                  * check if we need to move idle io back into queue
1072                  */
1073                 cfq_reenqueue(q, cfqd, IOPRIO_IDLE);
1074         }
1075 }
1076
1077 static void
1078 cfq_insert_request(request_queue_t *q, struct request *rq, int where)
1079 {
1080         struct cfq_data *cfqd = q->elevator.elevator_data;
1081         struct cfq_rq *crq = RQ_ELV_DATA(rq);
1082
1083         switch (where) {
1084                 case ELEVATOR_INSERT_BACK:
1085 #if 0
1086                         while (cfq_dispatch_requests(q, cfqd))
1087                                 ;
1088 #endif
1089                         list_add_tail(&rq->queuelist, cfqd->dispatch);
1090                         break;
1091                 case ELEVATOR_INSERT_FRONT:
1092                         list_add(&rq->queuelist, cfqd->dispatch);
1093                         break;
1094                 case ELEVATOR_INSERT_SORT:
1095                         BUG_ON(!blk_fs_request(rq));
1096                         cfq_enqueue(q, cfqd, crq);
1097                         break;
1098                 default:
1099                         printk("%s: bad insert point %d\n", 
1100                                __FUNCTION__,where);
1101                         return;
1102         }
1103 }
1104
1105 static int cfq_queue_empty(request_queue_t *q)
1106 {
1107         struct cfq_data *cfqd = q->elevator.elevator_data;
1108
1109         if (list_empty(cfqd->dispatch) && !cfqd->busy_queues)
1110                 return 1;
1111
1112         return 0;
1113 }
1114
1115 static struct request *
1116 cfq_former_request(request_queue_t *q, struct request *rq)
1117 {
1118         struct cfq_rq *crq = RQ_ELV_DATA(rq);
1119         struct rb_node *rbprev = rb_prev(&crq->rb_node);
1120
1121         if (rbprev)
1122                 return rb_entry_crq(rbprev)->request;
1123
1124         return NULL;
1125 }
1126
1127 static struct request *
1128 cfq_latter_request(request_queue_t *q, struct request *rq)
1129 {
1130         struct cfq_rq *crq = RQ_ELV_DATA(rq);
1131         struct rb_node *rbnext = rb_next(&crq->rb_node);
1132
1133         if (rbnext)
1134                 return rb_entry_crq(rbnext)->request;
1135
1136         return NULL;
1137 }
1138
1139 static void cfq_queue_congested(request_queue_t *q)
1140 {
1141         struct cfq_data *cfqd = q->elevator.elevator_data;
1142
1143         set_bit(cfq_ioprio(current), &cfqd->rq_starved_mask);
1144 }
1145
1146 static int cfq_may_queue(request_queue_t *q, int rw)
1147 {
1148         struct cfq_data *cfqd = q->elevator.elevator_data;
1149         struct cfq_queue *cfqq;
1150         const int prio = cfq_ioprio(current);
1151         int limit, ret = 1;
1152
1153         if (!cfqd->busy_queues)
1154                 goto out;
1155
1156         cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(current));
1157         if (!cfqq)
1158                 goto out;
1159
1164         /*
1165          * if higher or equal prio io is sleeping waiting for a request, don't
1166          * allow this one to allocate one. as long as ll_rw_blk does fifo
1167          * waitqueue wakeups this should work...
1168          */
1169         if (cfqd->rq_starved_mask & ~((1 << prio) - 1))
1170                 goto out;
1171
1172         if (cfqq->queued[rw] < cfqd->cfq_queued || !cfqd->cid[prio].busy_queues)
1173                 goto out;
1174
1175         limit = q->nr_requests * (prio + 1) / IOPRIO_NR;
1176         limit /= cfqd->cid[prio].busy_queues;
1177         if (cfqq->queued[rw] > limit)
1178                 ret = 0;
1179 out:
1180         return ret;
1181 }
1182
1183 static void cfq_put_request(request_queue_t *q, struct request *rq)
1184 {
1185         struct cfq_data *cfqd = q->elevator.elevator_data;
1186         struct cfq_rq *crq = RQ_ELV_DATA(rq);
1187         struct request_list *rl;
1188         int other_rw;
1189
1190         if (crq) {
1191                 BUG_ON(q->last_merge == rq);
1192                 BUG_ON(!hlist_unhashed(&crq->hash));
1193
1194                 mempool_free(crq, cfqd->crq_pool);
1195                 rq->elevator_private = NULL;
1196         }
1197
1198         /*
1199          * work-around for may_queue "bug": if a read gets issued and refused
1200          * to queue because writes ate all the allowed slots and no other
1201          * reads are pending for this queue, it could get stuck infinitely
1202          * since freed_request() only checks the waitqueue for writes when
1203          * freeing them. or vice versa for a single write vs many reads.
1204          * so check here whether "the other" data direction might be able
1205          * to queue and wake them
1206          */
1207         rl = &q->rq;
1208         other_rw = rq_data_dir(rq) ^ 1;
1209         if (rl->count[other_rw] <= q->nr_requests) {
1210                 smp_mb();
1211                 if (waitqueue_active(&rl->wait[other_rw]))
1212                         wake_up(&rl->wait[other_rw]);
1213         }
1214 }
1215
1216 static int cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
1217 {
1218         struct cfq_data *cfqd = q->elevator.elevator_data;
1219         struct cfq_queue *cfqq;
1220         struct cfq_rq *crq;
1221
1222         /*
1223          * prepare a queue up front, so cfq_enqueue() doesn't have to
1224          */
1225         cfqq = cfq_get_queue(cfqd, cfq_hash_key(current), gfp_mask);
1226         if (!cfqq)
1227                 return 1;
1228
1229         crq = mempool_alloc(cfqd->crq_pool, gfp_mask);
1230         if (crq) {
1231                 /*
1232                  * process now has one request
1233                  */
1234                 clear_bit(cfq_ioprio(current), &cfqd->rq_starved_mask);
1235
1236                 memset(crq, 0, sizeof(*crq));
1237                 crq->request = rq;
1238                 INIT_HLIST_NODE(&crq->hash);
1239                 INIT_LIST_HEAD(&crq->prio_list);
1240                 rq->elevator_private = crq;
1241                 return 0;
1242         }
1243
1244         return 1;
1245 }
1246
1247 static void cfq_exit(request_queue_t *q, elevator_t *e)
1248 {
1249         struct cfq_data *cfqd = e->elevator_data;
1250
1251         e->elevator_data = NULL;
1252         mempool_destroy(cfqd->crq_pool);
1253         kfree(cfqd->crq_hash);
1254         kfree(cfqd->cfq_hash);
1255         kfree(cfqd);
1256 }
1257
1258         
1259
1260 static void cfq_timer(unsigned long data)
1261 {
1262         struct cfq_data *cfqd = (struct cfq_data *) data;
1263
1264         clear_bit(CFQ_WAIT_RT, &cfqd->flags);
1265         clear_bit(CFQ_WAIT_NORM, &cfqd->flags);
1266         kblockd_schedule_work(&cfqd->work);
1267 }
1268
1269 static void cfq_work(void *data)
1270 {
1271         request_queue_t *q = data;
1272         unsigned long flags;
1273
1274         spin_lock_irqsave(q->queue_lock, flags);
1275         if (cfq_next_request(q))
1276                 q->request_fn(q);
1277         spin_unlock_irqrestore(q->queue_lock, flags);
1278 }
1279
1280 static int cfq_init(request_queue_t *q, elevator_t *e)
1281 {
1282         struct cfq_data *cfqd;
1283         int i;
1284
1285         cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL);
1286         if (!cfqd)
1287                 return -ENOMEM;
1288
1289         memset(cfqd, 0, sizeof(*cfqd));
1290         init_timer(&cfqd->timer);
1291         cfqd->timer.function = cfq_timer;
1292         cfqd->timer.data = (unsigned long) cfqd;
1293
1294         INIT_WORK(&cfqd->work, cfq_work, q);
1295
1296         for (i = 0; i < IOPRIO_NR; i++) {
1297                 struct io_prio_data *cid = &cfqd->cid[i];
1298
1299                 INIT_LIST_HEAD(&cid->rr_list);
1300                 INIT_LIST_HEAD(&cid->prio_list);
1301                 cid->last_rq = -1;
1302                 cid->last_sectors = -1;
1303
1304                 atomic_set(&cid->cum_rq_in,0);          
1305                 atomic_set(&cid->cum_rq_out,0);
1306                 atomic_set(&cid->cum_sectors_in,0);
1307                 atomic_set(&cid->cum_sectors_out,0);            
1308                 atomic_set(&cid->cum_queues_in,0);
1309                 atomic_set(&cid->cum_queues_out,0);
1310
1311                 
1312                 atomic_set(&((cid->cfqpriv).sectorate),CFQ_SECTORATE);
1313                 (cid->cfqpriv).nskip = 0;
1314                 (cid->cfqpriv).navsec = 0;
1315                 (cid->cfqpriv).timedout = 0;
1316         }
1317
1318         cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES,
1319                                  GFP_KERNEL);
1320         if (!cfqd->crq_hash)
1321                 goto out_crqhash;
1322
1323         cfqd->cfq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES,
1324                                  GFP_KERNEL);
1325         if (!cfqd->cfq_hash)
1326                 goto out_cfqhash;
1327
1328         cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, 
1329                                         mempool_free_slab, crq_pool);
1330         if (!cfqd->crq_pool)
1331                 goto out_crqpool;
1332
1333         for (i = 0; i < CFQ_MHASH_ENTRIES; i++)
1334                 INIT_HLIST_HEAD(&cfqd->crq_hash[i]);
1335         for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
1336                 INIT_HLIST_HEAD(&cfqd->cfq_hash[i]);
1337
1338         cfqd->cfq_queued = cfq_queued;
1339         cfqd->cfq_quantum = cfq_quantum;
1340         cfqd->cfq_quantum_io = cfq_quantum_io;
1341         cfqd->cfq_idle_quantum = cfq_idle_quantum;
1342         cfqd->cfq_idle_quantum_io = cfq_idle_quantum_io;
1343         cfqd->cfq_grace_rt = cfq_grace_rt;
1344         cfqd->cfq_grace_idle = cfq_grace_idle;
1345         
1346         cfqd->cfq_epoch = CFQ_EPOCH;
1347         cfqd->cfq_hmax_pct = CFQ_HMAX_PCT;
1348
1349         q->nr_requests <<= 2;
1350
1351         cfqd->dispatch = &q->queue_head;
1352         e->elevator_data = cfqd;
1353         cfqd->queue = q;
1354
1355         return 0;
1356 out_crqpool:
1357         kfree(cfqd->cfq_hash);
1358 out_cfqhash:
1359         kfree(cfqd->crq_hash);
1360 out_crqhash:
1361         kfree(cfqd);
1362         return -ENOMEM;
1363 }
1364
1365 static int __init cfq_slab_setup(void)
1366 {
1367         crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0,
1368                                         NULL, NULL);
1369
1370         if (!crq_pool)
1371                 panic("cfq_iosched: can't init crq pool\n");
1372
1373         cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0,
1374                                         NULL, NULL);
1375
1376         if (!cfq_pool)
1377                 panic("cfq_iosched: can't init cfq pool\n");
1378
1379         cfq_mpool = mempool_create(64, mempool_alloc_slab, mempool_free_slab, cfq_pool);
1380
1381         if (!cfq_mpool)
1382                 panic("cfq_iosched: can't init cfq mpool\n");
1383
1384         return 0;
1385 }
1386
1387 subsys_initcall(cfq_slab_setup);
1388
1389 /*
1390  * sysfs parts below -->
1391  */
1392 struct cfq_fs_entry {
1393         struct attribute attr;
1394         ssize_t (*show)(struct cfq_data *, char *);
1395         ssize_t (*store)(struct cfq_data *, const char *, size_t);
1396 };
1397
1398 static ssize_t
1399 cfq_var_show(unsigned int var, char *page)
1400 {
1401         return sprintf(page, "%d\n", var);
1402 }
1403
1404 static ssize_t
1405 cfq_var_store(unsigned int *var, const char *page, size_t count)
1406 {
1407         char *p = (char *) page;
1408
1409         *var = simple_strtoul(p, &p, 10);
1410         return count;
1411 }
1412
1413 #define SHOW_FUNCTION(__FUNC, __VAR)                                    \
1414 static ssize_t __FUNC(struct cfq_data *cfqd, char *page)                \
1415 {                                                                       \
1416         return cfq_var_show(__VAR, (page));                             \
1417 }
1418 SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum);
1419 SHOW_FUNCTION(cfq_quantum_io_show, cfqd->cfq_quantum_io);
1420 SHOW_FUNCTION(cfq_idle_quantum_show, cfqd->cfq_idle_quantum);
1421 SHOW_FUNCTION(cfq_idle_quantum_io_show, cfqd->cfq_idle_quantum_io);
1422 SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued);
1423 SHOW_FUNCTION(cfq_grace_rt_show, cfqd->cfq_grace_rt);
1424 SHOW_FUNCTION(cfq_grace_idle_show, cfqd->cfq_grace_idle);
1425 SHOW_FUNCTION(cfq_epoch_show, cfqd->cfq_epoch);
1426 SHOW_FUNCTION(cfq_hmax_pct_show, cfqd->cfq_hmax_pct);
1427 #undef SHOW_FUNCTION
1428
1429 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)                         \
1430 static ssize_t __FUNC(struct cfq_data *cfqd, const char *page, size_t count)    \
1431 {                                                                       \
1432         int ret = cfq_var_store(__PTR, (page), count);                  \
1433         if (*(__PTR) < (MIN))                                           \
1434                 *(__PTR) = (MIN);                                       \
1435         else if (*(__PTR) > (MAX))                                      \
1436                 *(__PTR) = (MAX);                                       \
1437         return ret;                                                     \
1438 }
1439 STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, INT_MAX);
1440 STORE_FUNCTION(cfq_quantum_io_store, &cfqd->cfq_quantum_io, 4, INT_MAX);
1441 STORE_FUNCTION(cfq_idle_quantum_store, &cfqd->cfq_idle_quantum, 1, INT_MAX);
1442 STORE_FUNCTION(cfq_idle_quantum_io_store, &cfqd->cfq_idle_quantum_io, 4, INT_MAX);
1443 STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, INT_MAX);
1444 STORE_FUNCTION(cfq_grace_rt_store, &cfqd->cfq_grace_rt, 0, INT_MAX);
1445 STORE_FUNCTION(cfq_grace_idle_store, &cfqd->cfq_grace_idle, 0, INT_MAX);
1446 STORE_FUNCTION(cfq_epoch_store, &cfqd->cfq_epoch, 0, INT_MAX);
1447 STORE_FUNCTION(cfq_hmax_pct_store, &cfqd->cfq_hmax_pct, 1, 100);
1448 #undef STORE_FUNCTION
1449
1450
1451 /* Additional entries to get priority level data */
1452 static ssize_t
1453 cfq_prio_show(struct cfq_data *cfqd, char *page, unsigned int priolvl)
1454 {
1455     //int r1,r2,s1,s2,q1,q2;
1456
1457         if (!(priolvl >= IOPRIO_IDLE && priolvl <= IOPRIO_RT)) 
1458                 return 0;
1459         
1460         /*
1461         r1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_rq_in));
1462         r2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_rq_out));
1463         s1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_sectors_in));
1464         s2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_sectors_out));
1465         q1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_queues_in)); 
1466         q2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_queues_out));
1467         */
1468
1469         return sprintf(page,"skip %d timdout %d avsec %lu rate %ld "
1470                        " sec0 %lu sec1 %lu\n",
1471                        cfqd->cid[priolvl].cfqpriv.nskip,
1472                        cfqd->cid[priolvl].cfqpriv.timedout,
1473                        cfqd->cid[priolvl].cfqpriv.navsec,
1474                        atomic_read(&(cfqd->cid[priolvl].cfqpriv.sectorate)),
1475                        (unsigned long)cfqd->cid[priolvl].cfqpriv.sec[0],
1476                        (unsigned long)cfqd->cid[priolvl].cfqpriv.sec[1]);
1477
1478 }
1479
1480 #define SHOW_PRIO_DATA(__PRIOLVL)                                               \
1481 static ssize_t cfq_prio_##__PRIOLVL##_show(struct cfq_data *cfqd, char *page)   \
1482 {                                                                               \
1483         return cfq_prio_show(cfqd,page,__PRIOLVL);                              \
1484 }
1485 SHOW_PRIO_DATA(0);
1486 SHOW_PRIO_DATA(1);
1487 SHOW_PRIO_DATA(2);
1488 SHOW_PRIO_DATA(3);
1489 SHOW_PRIO_DATA(4);
1490 SHOW_PRIO_DATA(5);
1491 SHOW_PRIO_DATA(6);
1492 SHOW_PRIO_DATA(7);
1493 SHOW_PRIO_DATA(8);
1494 SHOW_PRIO_DATA(9);
1495 SHOW_PRIO_DATA(10);
1496 SHOW_PRIO_DATA(11);
1497 SHOW_PRIO_DATA(12);
1498 SHOW_PRIO_DATA(13);
1499 SHOW_PRIO_DATA(14);
1500 SHOW_PRIO_DATA(15);
1501 SHOW_PRIO_DATA(16);
1502 SHOW_PRIO_DATA(17);
1503 SHOW_PRIO_DATA(18);
1504 SHOW_PRIO_DATA(19);
1505 SHOW_PRIO_DATA(20);
1506 #undef SHOW_PRIO_DATA
1507
1508
1509 static ssize_t cfq_prio_store(struct cfq_data *cfqd, const char *page, size_t count, int priolvl)
1510 {       
1511
1512         char *p = (char *) page;
1513         int val;
1514
1515         val = (int) simple_strtoul(p, &p, 10);
1516
1517         atomic_set(&(cfqd->cid[priolvl].cfqpriv.sectorate),val);
1518         cfqd->cid[priolvl].cfqpriv.nskip = 0;
1519         cfqd->cid[priolvl].cfqpriv.navsec = 0;
1520         cfqd->cid[priolvl].cfqpriv.timedout = 0;
1521
1522 #if 0
1523         atomic_set(&(cfqd->cid[priolvl].cum_rq_in),0);
1524         atomic_set(&(cfqd->cid[priolvl].cum_rq_out),0);
1525         atomic_set(&(cfqd->cid[priolvl].cum_sectors_in),0);
1526         atomic_set(&(cfqd->cid[priolvl].cum_sectors_out),0);
1527         atomic_set(&(cfqd->cid[priolvl].cum_queues_in),0);
1528         atomic_set(&(cfqd->cid[priolvl].cum_queues_out),0);
1529 #endif
1530
1531         return count;
1532 }
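/*
 * Writing an integer to one of the p<N> sysfs attributes (for instance
 * "echo 2000 > /sys/block/<dev>/queue/iosched/p10", assuming the usual
 * elevator sysfs location) sets that priority level's sectorate and clears
 * its nskip/navsec/timedout statistics; reading the attribute dumps those
 * counters via cfq_prio_show().
 */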
1533
1534
1535 #define STORE_PRIO_DATA(__PRIOLVL)                                                                 \
1536 static ssize_t cfq_prio_##__PRIOLVL##_store(struct cfq_data *cfqd, const char *page, size_t count) \
1537 {                                                                                                  \
1538         return cfq_prio_store(cfqd,page,count,__PRIOLVL);                                          \
1539 }                  
1540 STORE_PRIO_DATA(0);     
1541 STORE_PRIO_DATA(1);
1542 STORE_PRIO_DATA(2);
1543 STORE_PRIO_DATA(3);
1544 STORE_PRIO_DATA(4);
1545 STORE_PRIO_DATA(5);
1546 STORE_PRIO_DATA(6);
1547 STORE_PRIO_DATA(7);
1548 STORE_PRIO_DATA(8);
1549 STORE_PRIO_DATA(9);
1550 STORE_PRIO_DATA(10);
1551 STORE_PRIO_DATA(11);
1552 STORE_PRIO_DATA(12);
1553 STORE_PRIO_DATA(13);
1554 STORE_PRIO_DATA(14);
1555 STORE_PRIO_DATA(15);
1556 STORE_PRIO_DATA(16);
1557 STORE_PRIO_DATA(17);
1558 STORE_PRIO_DATA(18);
1559 STORE_PRIO_DATA(19);
1560 STORE_PRIO_DATA(20);
1561 #undef STORE_PRIO_DATA
1562
1563
1564 static struct cfq_fs_entry cfq_quantum_entry = {
1565         .attr = {.name = "quantum", .mode = S_IRUGO | S_IWUSR },
1566         .show = cfq_quantum_show,
1567         .store = cfq_quantum_store,
1568 };
1569 static struct cfq_fs_entry cfq_quantum_io_entry = {
1570         .attr = {.name = "quantum_io", .mode = S_IRUGO | S_IWUSR },
1571         .show = cfq_quantum_io_show,
1572         .store = cfq_quantum_io_store,
1573 };
1574 static struct cfq_fs_entry cfq_idle_quantum_entry = {
1575         .attr = {.name = "idle_quantum", .mode = S_IRUGO | S_IWUSR },
1576         .show = cfq_idle_quantum_show,
1577         .store = cfq_idle_quantum_store,
1578 };
1579 static struct cfq_fs_entry cfq_idle_quantum_io_entry = {
1580         .attr = {.name = "idle_quantum_io", .mode = S_IRUGO | S_IWUSR },
1581         .show = cfq_idle_quantum_io_show,
1582         .store = cfq_idle_quantum_io_store,
1583 };
1584 static struct cfq_fs_entry cfq_queued_entry = {
1585         .attr = {.name = "queued", .mode = S_IRUGO | S_IWUSR },
1586         .show = cfq_queued_show,
1587         .store = cfq_queued_store,
1588 };
1589 static struct cfq_fs_entry cfq_grace_rt_entry = {
1590         .attr = {.name = "grace_rt", .mode = S_IRUGO | S_IWUSR },
1591         .show = cfq_grace_rt_show,
1592         .store = cfq_grace_rt_store,
1593 };
1594 static struct cfq_fs_entry cfq_grace_idle_entry = {
1595         .attr = {.name = "grace_idle", .mode = S_IRUGO | S_IWUSR },
1596         .show = cfq_grace_idle_show,
1597         .store = cfq_grace_idle_store,
1598 };
1599 static struct cfq_fs_entry cfq_epoch_entry = {
1600         .attr = {.name = "epoch", .mode = S_IRUGO | S_IWUSR },
1601         .show = cfq_epoch_show,
1602         .store = cfq_epoch_store,
1603 };
1604 static struct cfq_fs_entry cfq_hmax_pct_entry = {
1605         .attr = {.name = "hmaxpct", .mode = S_IRUGO | S_IWUSR },
1606         .show = cfq_hmax_pct_show,
1607         .store = cfq_hmax_pct_store,
1608 };
1609
1610 #define P_0_STR   "p0"
1611 #define P_1_STR   "p1"
1612 #define P_2_STR   "p2"
1613 #define P_3_STR   "p3"
1614 #define P_4_STR   "p4"
1615 #define P_5_STR   "p5"
1616 #define P_6_STR   "p6"
1617 #define P_7_STR   "p7"
1618 #define P_8_STR   "p8"
1619 #define P_9_STR   "p9"
1620 #define P_10_STR  "p10"
1621 #define P_11_STR  "p11"
1622 #define P_12_STR  "p12"
1623 #define P_13_STR  "p13"
1624 #define P_14_STR  "p14"
1625 #define P_15_STR  "p15"
1626 #define P_16_STR  "p16"
1627 #define P_17_STR  "p17"
1628 #define P_18_STR  "p18"
1629 #define P_19_STR  "p19"
1630 #define P_20_STR  "p20"
1631
1632
1633 #define CFQ_PRIO_SYSFS_ENTRY(__PRIOLVL)                                    \
1634 static struct cfq_fs_entry cfq_prio_##__PRIOLVL##_entry = {                \
1635         .attr = {.name = P_##__PRIOLVL##_STR, .mode = S_IRUGO | S_IWUSR }, \
1636         .show = cfq_prio_##__PRIOLVL##_show,                               \
1637         .store = cfq_prio_##__PRIOLVL##_store,                             \
1638 };
1639 CFQ_PRIO_SYSFS_ENTRY(0);
1640 CFQ_PRIO_SYSFS_ENTRY(1);
1641 CFQ_PRIO_SYSFS_ENTRY(2);
1642 CFQ_PRIO_SYSFS_ENTRY(3);
1643 CFQ_PRIO_SYSFS_ENTRY(4);
1644 CFQ_PRIO_SYSFS_ENTRY(5);
1645 CFQ_PRIO_SYSFS_ENTRY(6);
1646 CFQ_PRIO_SYSFS_ENTRY(7);
1647 CFQ_PRIO_SYSFS_ENTRY(8);
1648 CFQ_PRIO_SYSFS_ENTRY(9);
1649 CFQ_PRIO_SYSFS_ENTRY(10);
1650 CFQ_PRIO_SYSFS_ENTRY(11);
1651 CFQ_PRIO_SYSFS_ENTRY(12);
1652 CFQ_PRIO_SYSFS_ENTRY(13);
1653 CFQ_PRIO_SYSFS_ENTRY(14);
1654 CFQ_PRIO_SYSFS_ENTRY(15);
1655 CFQ_PRIO_SYSFS_ENTRY(16);
1656 CFQ_PRIO_SYSFS_ENTRY(17);
1657 CFQ_PRIO_SYSFS_ENTRY(18);
1658 CFQ_PRIO_SYSFS_ENTRY(19);
1659 CFQ_PRIO_SYSFS_ENTRY(20);
1660 #undef CFQ_PRIO_SYSFS_ENTRY
1661
1662 static struct attribute *default_attrs[] = {
1663         &cfq_quantum_entry.attr,
1664         &cfq_quantum_io_entry.attr,
1665         &cfq_idle_quantum_entry.attr,
1666         &cfq_idle_quantum_io_entry.attr,
1667         &cfq_queued_entry.attr,
1668         &cfq_grace_rt_entry.attr,
1669         &cfq_grace_idle_entry.attr,
1670         &cfq_epoch_entry.attr,
1671         &cfq_hmax_pct_entry.attr,
1672         &cfq_prio_0_entry.attr,
1673         &cfq_prio_1_entry.attr,
1674         &cfq_prio_2_entry.attr,
1675         &cfq_prio_3_entry.attr,
1676         &cfq_prio_4_entry.attr,
1677         &cfq_prio_5_entry.attr,
1678         &cfq_prio_6_entry.attr,
1679         &cfq_prio_7_entry.attr,
1680         &cfq_prio_8_entry.attr,
1681         &cfq_prio_9_entry.attr,
1682         &cfq_prio_10_entry.attr,
1683         &cfq_prio_11_entry.attr,
1684         &cfq_prio_12_entry.attr,
1685         &cfq_prio_13_entry.attr,
1686         &cfq_prio_14_entry.attr,
1687         &cfq_prio_15_entry.attr,
1688         &cfq_prio_16_entry.attr,
1689         &cfq_prio_17_entry.attr,
1690         &cfq_prio_18_entry.attr,
1691         &cfq_prio_19_entry.attr,
1692         &cfq_prio_20_entry.attr,
1693         NULL,
1694 };
1695
1696 #define to_cfq(atr) container_of((atr), struct cfq_fs_entry, attr)
1697
1698 static ssize_t
1699 cfq_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
1700 {
1701         elevator_t *e = container_of(kobj, elevator_t, kobj);
1702         struct cfq_fs_entry *entry = to_cfq(attr);
1703
1704         if (!entry->show)
1705                 return 0;
1706
1707         return entry->show(e->elevator_data, page);
1708 }
1709
1710 static ssize_t
1711 cfq_attr_store(struct kobject *kobj, struct attribute *attr,
1712                const char *page, size_t length)
1713 {
1714         elevator_t *e = container_of(kobj, elevator_t, kobj);
1715         struct cfq_fs_entry *entry = to_cfq(attr);
1716
1717         if (!entry->store)
1718                 return -EINVAL;
1719
1720         return entry->store(e->elevator_data, page, length);
1721 }
1722
1723 static struct sysfs_ops cfq_sysfs_ops = {
1724         .show   = cfq_attr_show,
1725         .store  = cfq_attr_store,
1726 };
1727
1728 struct kobj_type cfq_ktype = {
1729         .sysfs_ops      = &cfq_sysfs_ops,
1730         .default_attrs  = default_attrs,
1731 };
1732
1733 elevator_t iosched_cfq = {
1734         .elevator_name =                "cfq",
1735         .elevator_ktype =               &cfq_ktype,
1736         .elevator_merge_fn =            cfq_merge,
1737         .elevator_merged_fn =           cfq_merged_request,
1738         .elevator_merge_req_fn =        cfq_merged_requests,
1739         .elevator_next_req_fn =         cfq_next_request,
1740         .elevator_add_req_fn =          cfq_insert_request,
1741         .elevator_remove_req_fn =       cfq_remove_request,
1742         .elevator_queue_empty_fn =      cfq_queue_empty,
1743         .elevator_former_req_fn =       cfq_former_request,
1744         .elevator_latter_req_fn =       cfq_latter_request,
1745         .elevator_set_req_fn =          cfq_set_request,
1746         .elevator_put_req_fn =          cfq_put_request,
1747         .elevator_may_queue_fn =        cfq_may_queue,
1748         .elevator_set_congested_fn =    cfq_queue_congested,
1749         .elevator_init_fn =             cfq_init,
1750         .elevator_exit_fn =             cfq_exit,
1751 };
1752
1753 EXPORT_SYMBOL(iosched_cfq);