/*
 *  linux/drivers/block/cfq-iosched.c
 *
 *  CFQ, or complete fairness queueing, disk scheduler.
 *
 *  Based on ideas from a previously unfinished io
 *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
 *
 *  IO priorities are supported, from 0% to 100% in 5% increments. Both of
 *  those values have special meaning - the 0% class is allowed to do io if
 *  no one else wants to use the disk. 100% is considered real-time io, and
 *  always gets priority. The default process io rate is 95%. In the absence
 *  of other io, a class may consume 100% of disk bandwidth regardless.
 *  Within a class, bandwidth is distributed equally among the citizens.
 *
 * TODO:
 * - cfq_select_requests() needs some work for 5-95% io
 * - barriers not supported
 * - export grace periods in ms, not jiffies
 *
 * Copyright (C) 2003 Jens Axboe <axboe@suse.de>
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/config.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/hash.h>
#include <linux/rbtree.h>
#include <linux/mempool.h>
#include <asm/div64.h>
#if IOPRIO_NR > BITS_PER_LONG
#error Cannot support this many io priority levels
#endif
static int cfq_quantum = 6;
static int cfq_quantum_io = 256;
static int cfq_idle_quantum = 1;
static int cfq_idle_quantum_io = 64;
static int cfq_queued = 4;
static int cfq_grace_rt = HZ / 100 ?: 1;
static int cfq_grace_idle = HZ / 10;
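
/*
 * Example with the defaults above: a dispatch round for a busy level
 * may move up to 6 requests / 256 sectors, idle io is capped at
 * 1 request / 64 sectors per round, and the grace periods come out to
 * roughly 10ms (HZ/100, at least one jiffy) for rt and 100ms (HZ/10)
 * before idle io is allowed to run again.
 */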
#define CFQ_QHASH_SHIFT		6
#define CFQ_QHASH_ENTRIES	(1 << CFQ_QHASH_SHIFT)
#define list_entry_qhash(entry)	hlist_entry((entry), struct cfq_queue, cfq_hash)

#define CFQ_MHASH_SHIFT		8
#define CFQ_MHASH_BLOCK(sec)	((sec) >> 3)
#define CFQ_MHASH_ENTRIES	(1 << CFQ_MHASH_SHIFT)
#define CFQ_MHASH_FN(sec)	(hash_long(CFQ_MHASH_BLOCK((sec)), CFQ_MHASH_SHIFT))
#define rq_hash_key(rq)		((rq)->sector + (rq)->nr_sectors)
#define list_entry_hash(ptr)	hlist_entry((ptr), struct cfq_rq, hash)
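
/*
 * The merge hash is keyed on a request's *end* sector (rq_hash_key),
 * bucketed by 8-sector block: e.g. a request starting at sector 100
 * with nr_sectors == 8 is stored under key 108, so an incoming bio
 * with bi_sector == 108 finds it as a back merge candidate via
 * cfq_find_rq_hash(cfqd, 108).
 */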
#define list_entry_cfqq(ptr)	list_entry((ptr), struct cfq_queue, cfq_list)
#define list_entry_prio(ptr)	list_entry((ptr), struct cfq_rq, prio_list)
#define cfq_account_io(crq)	\
	((crq)->ioprio != IOPRIO_IDLE && (crq)->ioprio != IOPRIO_RT)
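
/*
 * rt and idle requests are deliberately left out of the busy_rq /
 * busy_sectors accounting; the proportional-share math in
 * cfq_select_requests() only splits bandwidth among the levels
 * in between.
 */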
/* define to be 50 ms for now; make tunable later */
#define CFQ_EPOCH		50000
/* Needs to be made tunable right away, in MiB/s */
#define CFQ_DISKBW		10
/* Temporary global limit, as percent of available b/w, for each "class" */
#define CFQ_TEMPLIM		10
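
/*
 * A queue's per-epoch budget is then sectorate =
 * cfq_epochsectors * CFQ_TEMPLIM / 100, i.e. each "class" may consume
 * at most 10% of the estimated per-epoch sector throughput before it
 * gets skipped for a dispatch round.
 */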
/*
 * defines how we distribute bandwidth (can be tgid, uid, etc)
 */

/* FIXME: change hash_key to be sizeof(void *) rather than sizeof(int)
 * otherwise the cast of cki_tsk_icls will not work reliably on 64-bit arches.
 * OR, change cki_tsk_icls to return ints (will need another id space to be
 * managed)
 */
#if defined(CONFIG_CKRM_RES_BLKIO) || defined(CONFIG_CKRM_RES_BLKIO_MODULE)
extern inline void *cki_hash_key(struct task_struct *tsk);
extern inline int cki_ioprio(struct task_struct *tsk);
#define cfq_hash_key(current)	((int)cki_hash_key((current)))
#define cfq_ioprio(current)	(cki_ioprio((current)))
#else
#define cfq_hash_key(current)	((current)->tgid)
#define cfq_ioprio(current)	((current)->ioprio)
#endif
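
/*
 * With CKRM block io resource control built in, queues are keyed by
 * the task's class (cki_hash_key) and prioritized by the class ioprio;
 * otherwise each thread group (tgid) gets its own queue, scheduled at
 * the task's own ->ioprio.
 */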
#define CFQ_WAIT_RT	0
#define CFQ_WAIT_NORM	1

static kmem_cache_t *crq_pool;
static kmem_cache_t *cfq_pool;
static mempool_t *cfq_mpool;
/*
 * defines an io priority level
 */
struct io_prio_data {
	struct list_head rr_list;
	int busy_queues;
	int busy_rq;
	unsigned long busy_sectors;

	/* requests, sectors and queues
	 * added(in), dispatched/deleted(out)
	 * at this priority level.
	 */
	atomic_t cum_rq_in, cum_rq_out;
	atomic_t cum_sectors_in, cum_sectors_out;
	atomic_t cum_queues_in, cum_queues_out;

	atomic_t nskip;
	unsigned long navsec;
	unsigned long csectorate;
	unsigned long lsectorate;

	int last_rq;
	int last_sectors;
	struct list_head prio_list;
};
/*
 * per request-queue structure
 */
struct cfq_data {
	struct list_head rr_list;
	struct list_head *dispatch;
	struct hlist_head *cfq_hash;
	struct hlist_head *crq_hash;
	mempool_t *crq_pool;

	struct io_prio_data cid[IOPRIO_NR];

	/*
	 * total number of busy queues and requests
	 */
	int busy_rq;
	int busy_queues;
	unsigned long busy_sectors;

	request_queue_t *queue;
	unsigned long rq_starved_mask;

	/*
	 * grace period handling
	 */
	struct timer_list timer;
	unsigned long wait_end;
	unsigned long flags;
	struct work_struct work;

	/*
	 * tunables
	 */
	unsigned int cfq_quantum;
	unsigned int cfq_quantum_io;
	unsigned int cfq_idle_quantum;
	unsigned int cfq_idle_quantum_io;
	unsigned int cfq_queued;
	unsigned int cfq_grace_rt;
	unsigned int cfq_grace_idle;

	unsigned long cfq_epoch;	/* duration for limit enforcement */
	unsigned long cfq_epochsectors;	/* max sectors dispatchable/epoch */
};
/*
 * per-class structure
 */
struct cfq_queue {
	struct list_head cfq_list;
	struct hlist_node cfq_hash;
	int hash_key;
	struct rb_root sort_list;
	int queued[2];
	int ioprio;

	unsigned long avsec;		/* avg sectors dispatched/epoch */
	unsigned long long lastime;	/* timestamp of last request served */
	unsigned long sectorate;	/* limit for sectors served/epoch */
	int skipped;			/* queue skipped at last dispatch ? */
};
/*
 * per-request structure
 */
struct cfq_rq {
	struct cfq_queue *cfq_queue;
	struct rb_node rb_node;
	struct hlist_node hash;
	sector_t rb_key;

	struct request *request;
	struct list_head prio_list;
	unsigned long nr_sectors;
	int ioprio;
};
static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq);
static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid);
static void cfq_dispatch_sort(struct cfq_data *cfqd, struct cfq_queue *cfqq,
			      struct cfq_rq *crq);
/*
 * lots of deadline iosched dupes, can be abstracted later...
 */
static inline void cfq_del_crq_hash(struct cfq_rq *crq)
{
	hlist_del_init(&crq->hash);
}

static inline void
cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq)
{
	cfq_del_crq_hash(crq);

	if (q->last_merge == crq->request)
		q->last_merge = NULL;
}
static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq)
{
	struct request *rq = crq->request;
	const int hash_idx = CFQ_MHASH_FN(rq_hash_key(rq));

	BUG_ON(!hlist_unhashed(&crq->hash));

	hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]);
}
static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset)
{
	struct hlist_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)];
	struct hlist_node *entry, *next;

	hlist_for_each_safe(entry, next, hash_list) {
		struct cfq_rq *crq = list_entry_hash(entry);
		struct request *__rq = crq->request;

		BUG_ON(hlist_unhashed(&crq->hash));

		if (!rq_mergeable(__rq)) {
			cfq_del_crq_hash(crq);
			continue;
		}

		if (rq_hash_key(__rq) == offset)
			return __rq;
	}

	return NULL;
}
/*
 * rb tree support functions
 */
#define RB_EMPTY(node)		((node)->rb_node == NULL)
#define rb_entry_crq(node)	rb_entry((node), struct cfq_rq, rb_node)
#define rq_rb_key(rq)		(rq)->sector
static void
cfq_del_crq_rb(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_rq *crq)
{
	if (crq->cfq_queue) {
		crq->cfq_queue = NULL;

		if (cfq_account_io(crq)) {
			cfqd->busy_rq--;
			cfqd->busy_sectors -= crq->nr_sectors;
			cfqd->cid[crq->ioprio].busy_rq--;
			cfqd->cid[crq->ioprio].busy_sectors -= crq->nr_sectors;
		}

		atomic_inc(&(cfqd->cid[crq->ioprio].cum_rq_out));
		atomic_add(crq->nr_sectors,
			   &(cfqd->cid[crq->ioprio].cum_sectors_out));

		cfqq->queued[rq_data_dir(crq->request)]--;
		rb_erase(&crq->rb_node, &cfqq->sort_list);
	}
}
static struct cfq_rq *
__cfq_add_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
{
	struct rb_node **p = &cfqq->sort_list.rb_node;
	struct rb_node *parent = NULL;
	struct cfq_rq *__crq;

	while (*p) {
		parent = *p;
		__crq = rb_entry_crq(parent);

		if (crq->rb_key < __crq->rb_key)
			p = &(*p)->rb_left;
		else if (crq->rb_key > __crq->rb_key)
			p = &(*p)->rb_right;
		else
			return __crq;
	}

	rb_link_node(&crq->rb_node, parent, p);
	return NULL;
}
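
/*
 * __cfq_add_crq_rb() returns NULL on a clean insert, or the existing
 * crq ("alias") when a request with the same sort key is already in
 * the tree; the caller is expected to dispatch the alias and retry.
 */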
static void
cfq_add_crq_rb(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_rq *crq)
{
	struct request *rq = crq->request;
	struct cfq_rq *__alias;

	cfqq->queued[rq_data_dir(rq)]++;
	if (cfq_account_io(crq)) {
		cfqd->busy_rq++;
		cfqd->busy_sectors += crq->nr_sectors;
		cfqd->cid[crq->ioprio].busy_rq++;
		cfqd->cid[crq->ioprio].busy_sectors += crq->nr_sectors;
	}
	atomic_inc(&(cfqd->cid[crq->ioprio].cum_rq_in));
	atomic_add(crq->nr_sectors,
		   &(cfqd->cid[crq->ioprio].cum_sectors_in));
retry:
	__alias = __cfq_add_crq_rb(cfqq, crq);
	if (!__alias) {
		rb_insert_color(&crq->rb_node, &cfqq->sort_list);
		crq->rb_key = rq_rb_key(rq);
		crq->cfq_queue = cfqq;
		return;
	}

	cfq_dispatch_sort(cfqd, cfqq, __alias);
	goto retry;
}
static struct request *
cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector)
{
	struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(current));

	if (cfqq) {
		struct rb_node *n = cfqq->sort_list.rb_node;

		while (n) {
			struct cfq_rq *crq = rb_entry_crq(n);

			if (sector < crq->rb_key)
				n = n->rb_left;
			else if (sector > crq->rb_key)
				n = n->rb_right;
			else
				return crq->request;
		}
	}

	return NULL;
}
static void cfq_remove_request(request_queue_t *q, struct request *rq)
{
	struct cfq_data *cfqd = q->elevator.elevator_data;
	struct cfq_rq *crq = RQ_ELV_DATA(rq);

	cfq_remove_merge_hints(q, crq);
	list_del_init(&crq->prio_list);
	list_del_init(&rq->queuelist);

	/*
	 * set a grace period timer to allow realtime io to make real
	 * progress, if we release an rt request. for normal requests,
	 * set timer so idle io doesn't interfere with other io
	 */
	if (crq->ioprio == IOPRIO_RT) {
		set_bit(CFQ_WAIT_RT, &cfqd->flags);
		cfqd->wait_end = jiffies + cfqd->cfq_grace_rt;
	} else if (crq->ioprio != IOPRIO_IDLE) {
		set_bit(CFQ_WAIT_NORM, &cfqd->flags);
		cfqd->wait_end = jiffies + cfqd->cfq_grace_idle;
	}

	if (crq->cfq_queue) {
		struct cfq_queue *cfqq = crq->cfq_queue;

		cfq_del_crq_rb(cfqd, cfqq, crq);

		if (RB_EMPTY(&cfqq->sort_list))
			cfq_put_queue(cfqd, cfqq);
	}
}
static int
cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
{
	struct cfq_data *cfqd = q->elevator.elevator_data;
	struct request *__rq;
	int ret;

	ret = elv_try_last_merge(q, bio);
	if (ret != ELEVATOR_NO_MERGE) {
		__rq = q->last_merge;
		goto out_insert;
	}

	__rq = cfq_find_rq_hash(cfqd, bio->bi_sector);
	if (__rq) {
		BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector);

		if (elv_rq_merge_ok(__rq, bio)) {
			ret = ELEVATOR_BACK_MERGE;
			goto out;
		}
	}

	__rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio));
	if (__rq) {
		if (elv_rq_merge_ok(__rq, bio)) {
			ret = ELEVATOR_FRONT_MERGE;
			goto out;
		}
	}

	return ELEVATOR_NO_MERGE;
out:
	q->last_merge = __rq;
out_insert:
	*req = __rq;
	return ret;
}
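
/*
 * Merge lookups are three-tier: the q->last_merge hint first, then the
 * end-sector hash for back merges, and finally the per-queue rb tree
 * (keyed by start sector) for front merges.
 */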
static void cfq_merged_request(request_queue_t *q, struct request *req)
{
	struct cfq_data *cfqd = q->elevator.elevator_data;
	struct cfq_rq *crq = RQ_ELV_DATA(req);
	int tmp;

	cfq_del_crq_hash(crq);
	cfq_add_crq_hash(cfqd, crq);

	if (crq->cfq_queue && (rq_rb_key(req) != crq->rb_key)) {
		struct cfq_queue *cfqq = crq->cfq_queue;

		cfq_del_crq_rb(cfqd, cfqq, crq);
		cfq_add_crq_rb(cfqd, cfqq, crq);
	}

	tmp = req->hard_nr_sectors - crq->nr_sectors;
	cfqd->busy_sectors += tmp;
	cfqd->cid[crq->ioprio].busy_sectors += tmp;
	atomic_add(tmp, &(cfqd->cid[crq->ioprio].cum_sectors_in));

	crq->nr_sectors = req->hard_nr_sectors;

	q->last_merge = req;
}
static void
cfq_merged_requests(request_queue_t *q, struct request *req,
		    struct request *next)
{
	cfq_merged_request(q, req);
	cfq_remove_request(q, next);
}
/*
 * sort into dispatch list, in optimal ascending order
 */
static void
cfq_dispatch_sort(struct cfq_data *cfqd, struct cfq_queue *cfqq,
		  struct cfq_rq *crq)
{
	struct list_head *head = cfqd->dispatch, *entry = head;
	struct request *__rq;

	cfq_del_crq_rb(cfqd, cfqq, crq);
	cfq_remove_merge_hints(cfqd->queue, crq);

	if (!list_empty(head)) {
		__rq = list_entry_rq(head->next);

		if (crq->request->sector < __rq->sector) {
			entry = head->prev;
			goto link;
		}
	}

	while ((entry = entry->prev) != head) {
		__rq = list_entry_rq(entry);

		if (crq->request->sector <= __rq->sector)
			break;
	}

link:
	list_add_tail(&crq->request->queuelist, entry);
}
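
/*
 * Worked example (illustrative): with the dispatch list holding
 * sectors [10, 20, 30], a new request at sector 25 is linked between
 * 20 and 30; one at sector 5, which sorts below the head of the
 * ascending run, is pushed toward the back so the drive keeps sweeping
 * forward instead of seeking backwards.
 */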
/*
 * remove from io scheduler core and put on dispatch list for service
 */
static int
__cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd,
			struct cfq_queue *cfqq)
{
	struct cfq_rq *crq;
	unsigned long long ts, gap;
	unsigned long newavsec;

	crq = rb_entry_crq(rb_first(&cfqq->sort_list));

	/* Determine if queue should be skipped for being overshare */
	ts = sched_clock();
	gap = ts - cfqq->lastime;
	cfqq->sectorate = (cfqd->cfq_epochsectors * CFQ_TEMPLIM) / 100;

	if (gap >= cfqd->cfq_epoch) {
		/* new epoch: restart the average */
		cfqq->avsec = crq->nr_sectors;
		cfqq->lastime = ts;
	} else {
		/* Age old average and accumulate request to be served */
		newavsec = (cfqq->avsec >> 1) + crq->nr_sectors;

		if ((newavsec < cfqq->sectorate) || cfqq->skipped) {
			cfqq->avsec = newavsec;
			cfqq->lastime = ts;
			cfqq->skipped = 0;
		} else {
			/* queue over share; skip once */
			cfqq->skipped = 1;
			return 0;
		}
	}

	cfq_dispatch_sort(cfqd, cfqq, crq);

	/*
	 * technically, for IOPRIO_RT we don't need to add it to the list.
	 */
	list_add_tail(&crq->prio_list, &cfqd->cid[cfqq->ioprio].prio_list);
	return crq->nr_sectors;
}
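
/*
 * The per-queue average is a cheap half-life filter: with
 * cfqq->avsec == 100 and an incoming 8-sector request,
 * newavsec = (100 >> 1) + 8 = 58. Only while this stays under
 * cfqq->sectorate (or right after a skip) does the queue dispatch.
 */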
static int
cfq_dispatch_requests(request_queue_t *q, int prio, int max_rq, int max_sectors)
{
	struct cfq_data *cfqd = q->elevator.elevator_data;
	struct list_head *plist = &cfqd->cid[prio].rr_list;
	struct list_head *entry, *nxt;
	int q_rq = 0, q_io = 0;
	int ret;

	/*
	 * for each queue at this prio level, dispatch a request
	 */
	list_for_each_safe(entry, nxt, plist) {
		struct cfq_queue *cfqq = list_entry_cfqq(entry);

		BUG_ON(RB_EMPTY(&cfqq->sort_list));

		ret = __cfq_dispatch_requests(q, cfqd, cfqq);
		if (!ret)
			continue; /* skip queue */
			/* can optimize more by moving q to end of plist ? */

		q_io += ret;
		q_rq++;

		if (RB_EMPTY(&cfqq->sort_list))
			cfq_put_queue(cfqd, cfqq);
		/*
		 * if we hit the queue limit, put the string of serviced
		 * queues at the back of the pending list
		 */
		if (q_io >= max_sectors || q_rq >= max_rq) {
			struct list_head *prv = nxt->prev;

			if (prv != plist) {
				list_del(plist);
				list_add(plist, prv);
			}
			break;
		}
	}

	cfqd->cid[prio].last_rq = q_rq;
	cfqd->cid[prio].last_sectors = q_io;
	return q_rq;
}
/*
 * try to move some requests to the dispatch list. returns non-zero when
 * requests were moved
 */
static int cfq_select_requests(request_queue_t *q, struct cfq_data *cfqd)
{
	int queued, busy_rq, busy_sectors, i;

	/*
	 * if there's any realtime io, only schedule that
	 */
	if (cfq_dispatch_requests(q, IOPRIO_RT, cfqd->cfq_quantum, cfqd->cfq_quantum_io))
		return 1;

	/*
	 * if RT io was last serviced and grace time hasn't expired,
	 * arm the timer to restart queueing if no other RT io has been
	 * submitted in the mean time
	 */
	if (test_bit(CFQ_WAIT_RT, &cfqd->flags)) {
		if (time_before(jiffies, cfqd->wait_end)) {
			mod_timer(&cfqd->timer, cfqd->wait_end);
			return 0;
		}
		clear_bit(CFQ_WAIT_RT, &cfqd->flags);
	}

	/*
	 * for each priority level, calculate number of requests we
	 * are allowed to put into service.
	 */
	queued = 0;
	busy_rq = cfqd->busy_rq;
	busy_sectors = cfqd->busy_sectors;
	for (i = IOPRIO_RT - 1; i > IOPRIO_IDLE; i--) {
		const int o_rq = busy_rq - cfqd->cid[i].busy_rq;
		const int o_sectors = busy_sectors - cfqd->cid[i].busy_sectors;
		int q_rq = cfqd->cfq_quantum * (i + 1) / IOPRIO_NR;
		int q_io = cfqd->cfq_quantum_io * (i + 1) / IOPRIO_NR;

		/*
		 * no need to keep iterating the list, if there are no
		 * requests pending anymore
		 */
		if (!cfqd->busy_rq)
			break;

		/*
		 * find out how many requests and sectors we are allowed to
		 * put into service, given the io still pending elsewhere
		 */
		if (o_rq) {
			q_rq = o_rq * (i + 1) / IOPRIO_NR;
			if (q_rq > cfqd->cfq_quantum)
				q_rq = cfqd->cfq_quantum;
		}

		if (o_sectors) {
			q_io = o_sectors * (i + 1) / IOPRIO_NR;
			if (q_io > cfqd->cfq_quantum_io)
				q_io = cfqd->cfq_quantum_io;
		}

		/*
		 * average with last dispatched for fairness
		 */
		if (cfqd->cid[i].last_rq != -1)
			q_rq = (cfqd->cid[i].last_rq + q_rq) / 2;
		if (cfqd->cid[i].last_sectors != -1)
			q_io = (cfqd->cid[i].last_sectors + q_io) / 2;

		queued += cfq_dispatch_requests(q, i, q_rq, q_io);
	}

	if (queued)
		return 1;

	/*
	 * only allow dispatch of idle io, if the queue has been idle from
	 * servicing RT or normal io for the grace period
	 */
	if (test_bit(CFQ_WAIT_NORM, &cfqd->flags)) {
		if (time_before(jiffies, cfqd->wait_end)) {
			mod_timer(&cfqd->timer, cfqd->wait_end);
			return 0;
		}
		clear_bit(CFQ_WAIT_NORM, &cfqd->flags);
	}

	/*
	 * if we found nothing to do, allow idle io to be serviced
	 */
	if (cfq_dispatch_requests(q, IOPRIO_IDLE, cfqd->cfq_idle_quantum, cfqd->cfq_idle_quantum_io))
		return 1;

	return 0;
}
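
/*
 * Example of the per-level first cut (IOPRIO_NR == 21): at prio 10
 * with cfq_quantum == 6 and cfq_quantum_io == 256, q_rq = 6 * 11 / 21
 * = 3 requests and q_io = 256 * 11 / 21 = 134 sectors, before scaling
 * by competing io and averaging with the previous round.
 */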
static struct request *cfq_next_request(request_queue_t *q)
{
	struct cfq_data *cfqd = q->elevator.elevator_data;
	struct request *rq;

	if (!list_empty(cfqd->dispatch)) {
		struct cfq_rq *crq;
dispatch:
		/*
		 * end grace period, we are servicing a request
		 */
		del_timer(&cfqd->timer);
		clear_bit(CFQ_WAIT_RT, &cfqd->flags);
		clear_bit(CFQ_WAIT_NORM, &cfqd->flags);

		BUG_ON(list_empty(cfqd->dispatch));
		rq = list_entry_rq(cfqd->dispatch->next);

		BUG_ON(q->last_merge == rq);
		crq = RQ_ELV_DATA(rq);
		if (crq) {
			BUG_ON(!hlist_unhashed(&crq->hash));
			list_del_init(&crq->prio_list);
		}

		return rq;
	}

	/*
	 * if we moved requests to the dispatch list, go back and serve one
	 */
	if (cfq_select_requests(q, cfqd))
		goto dispatch;

	return NULL;
}
static inline struct cfq_queue *
__cfq_find_cfq_hash(struct cfq_data *cfqd, int hashkey, const int hashval)
{
	struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
	struct hlist_node *entry;

	hlist_for_each(entry, hash_list) {
		struct cfq_queue *__cfqq = list_entry_qhash(entry);

		if (__cfqq->hash_key == hashkey)
			return __cfqq;
	}

	return NULL;
}

static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int hashkey)
{
	const int hashval = hash_long(hashkey, CFQ_QHASH_SHIFT);

	return __cfq_find_cfq_hash(cfqd, hashkey, hashval);
}
static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	cfqd->busy_queues--;
	WARN_ON(cfqd->busy_queues < 0);

	cfqd->cid[cfqq->ioprio].busy_queues--;
	WARN_ON(cfqd->cid[cfqq->ioprio].busy_queues < 0);
	atomic_inc(&(cfqd->cid[cfqq->ioprio].cum_queues_out));

	list_del(&cfqq->cfq_list);
	hlist_del(&cfqq->cfq_hash);
	mempool_free(cfqq, cfq_mpool);
}
static struct cfq_queue *__cfq_get_queue(struct cfq_data *cfqd, int hashkey,
					 int gfp_mask)
{
	const int hashval = hash_long(hashkey, CFQ_QHASH_SHIFT);
	struct cfq_queue *cfqq, *new_cfqq = NULL;
	request_queue_t *q = cfqd->queue;

retry:
	cfqq = __cfq_find_cfq_hash(cfqd, hashkey, hashval);

	if (!cfqq) {
		if (new_cfqq) {
			cfqq = new_cfqq;
			new_cfqq = NULL;
		} else if (gfp_mask & __GFP_WAIT) {
			spin_unlock_irq(q->queue_lock);
			new_cfqq = mempool_alloc(cfq_mpool, gfp_mask);
			spin_lock_irq(q->queue_lock);
			goto retry;
		} else
			goto out;

		memset(cfqq, 0, sizeof(*cfqq));
		INIT_HLIST_NODE(&cfqq->cfq_hash);
		INIT_LIST_HEAD(&cfqq->cfq_list);
		cfqq->hash_key = cfq_hash_key(current);
		cfqq->ioprio = cfq_ioprio(current);
		cfqq->lastime = sched_clock();
		cfqq->sectorate = (cfqd->cfq_epochsectors * CFQ_TEMPLIM) / 100;
		hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
	}

	if (new_cfqq)
		mempool_free(new_cfqq, cfq_mpool);
out:
	return cfqq;
}
static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, int hashkey,
				       int gfp_mask)
{
	request_queue_t *q = cfqd->queue;
	struct cfq_queue *cfqq;

	spin_lock_irq(q->queue_lock);
	cfqq = __cfq_get_queue(cfqd, hashkey, gfp_mask);
	spin_unlock_irq(q->queue_lock);

	return cfqq;
}
static void
__cfq_enqueue(request_queue_t *q, struct cfq_data *cfqd, struct cfq_rq *crq)
{
	const int prio = crq->ioprio;
	struct cfq_queue *cfqq;

	cfqq = __cfq_get_queue(cfqd, cfq_hash_key(current), GFP_ATOMIC);
	if (cfqq) {
		/*
		 * not too pretty: a lower-prio task reused this queue's
		 * hash key, so promote the queue to the new priority
		 */
		if (prio > cfqq->ioprio) {
			printk("prio hash collision %d %d\n",
			       cfqq->ioprio, prio);
			if (!list_empty(&cfqq->cfq_list)) {
				cfqd->cid[cfqq->ioprio].busy_queues--;
				WARN_ON(cfqd->cid[cfqq->ioprio].busy_queues < 0);
				atomic_inc(&(cfqd->cid[cfqq->ioprio].cum_queues_out));
				cfqd->cid[prio].busy_queues++;
				atomic_inc(&(cfqd->cid[prio].cum_queues_in));
				list_move_tail(&cfqq->cfq_list,
					       &cfqd->cid[prio].rr_list);
			}
			cfqq->ioprio = prio;
		}

		cfq_add_crq_rb(cfqd, cfqq, crq);

		if (list_empty(&cfqq->cfq_list)) {
			list_add_tail(&cfqq->cfq_list,
				      &cfqd->cid[prio].rr_list);
			cfqd->cid[prio].busy_queues++;
			atomic_inc(&(cfqd->cid[prio].cum_queues_in));
			cfqd->busy_queues++;
		}

		if (rq_mergeable(crq->request)) {
			cfq_add_crq_hash(cfqd, crq);

			if (!q->last_merge)
				q->last_merge = crq->request;
		}

		return;
	}

	/*
	 * this can only happen if the request wasn't allocated
	 * through blk_alloc_request(), eg stack requests from ide-cd
	 * (those should be removed) _and_ we are in OOM.
	 */
	list_add_tail(&crq->request->queuelist, cfqd->dispatch);
}
static void cfq_reenqueue(request_queue_t *q, struct cfq_data *cfqd, int prio)
{
	struct list_head *prio_list = &cfqd->cid[prio].prio_list;
	struct list_head *entry, *tmp;

	list_for_each_safe(entry, tmp, prio_list) {
		struct cfq_rq *crq = list_entry_prio(entry);

		list_del_init(entry);
		list_del_init(&crq->request->queuelist);
		__cfq_enqueue(q, cfqd, crq);
	}
}
static void
cfq_enqueue(request_queue_t *q, struct cfq_data *cfqd, struct cfq_rq *crq)
{
	const int prio = cfq_ioprio(current);

	crq->ioprio = prio;
	crq->nr_sectors = crq->request->hard_nr_sectors;
	__cfq_enqueue(q, cfqd, crq);

	if (prio == IOPRIO_RT) {
		int i;

		/*
		 * realtime io gets priority, move all other io back
		 */
		for (i = IOPRIO_IDLE; i < IOPRIO_RT; i++)
			cfq_reenqueue(q, cfqd, i);
	} else if (prio != IOPRIO_IDLE) {
		/*
		 * check if we need to move idle io back into queue
		 */
		cfq_reenqueue(q, cfqd, IOPRIO_IDLE);
	}
}
static void
cfq_insert_request(request_queue_t *q, struct request *rq, int where)
{
	struct cfq_data *cfqd = q->elevator.elevator_data;
	struct cfq_rq *crq = RQ_ELV_DATA(rq);

	switch (where) {
		case ELEVATOR_INSERT_BACK:
			while (cfq_select_requests(q, cfqd))
				;
			list_add_tail(&rq->queuelist, cfqd->dispatch);
			break;
		case ELEVATOR_INSERT_FRONT:
			list_add(&rq->queuelist, cfqd->dispatch);
			break;
		case ELEVATOR_INSERT_SORT:
			BUG_ON(!blk_fs_request(rq));
			cfq_enqueue(q, cfqd, crq);
			break;
		default:
			printk("%s: bad insert point %d\n",
			       __FUNCTION__, where);
			return;
	}
}
static int cfq_queue_empty(request_queue_t *q)
{
	struct cfq_data *cfqd = q->elevator.elevator_data;

	if (list_empty(cfqd->dispatch) && !cfqd->busy_queues)
		return 1;

	return 0;
}
static struct request *
cfq_former_request(request_queue_t *q, struct request *rq)
{
	struct cfq_rq *crq = RQ_ELV_DATA(rq);
	struct rb_node *rbprev = rb_prev(&crq->rb_node);

	if (rbprev)
		return rb_entry_crq(rbprev)->request;

	return NULL;
}

static struct request *
cfq_latter_request(request_queue_t *q, struct request *rq)
{
	struct cfq_rq *crq = RQ_ELV_DATA(rq);
	struct rb_node *rbnext = rb_next(&crq->rb_node);

	if (rbnext)
		return rb_entry_crq(rbnext)->request;

	return NULL;
}
static void cfq_queue_congested(request_queue_t *q)
{
	struct cfq_data *cfqd = q->elevator.elevator_data;

	set_bit(cfq_ioprio(current), &cfqd->rq_starved_mask);
}
static int cfq_may_queue(request_queue_t *q, int rw)
{
	struct cfq_data *cfqd = q->elevator.elevator_data;
	struct cfq_queue *cfqq;
	const int prio = cfq_ioprio(current);
	int limit;

	if (!cfqd->busy_queues)
		return 1;

	cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(current));
	if (!cfqq)
		return 1;

	/*
	 * if higher or equal prio io is sleeping waiting for a request, don't
	 * allow this one to allocate one. as long as ll_rw_blk does fifo
	 * waitqueue wakeups this should work...
	 */
	if (cfqd->rq_starved_mask & ~((1 << prio) - 1))
		return 0;

	if (cfqq->queued[rw] < cfqd->cfq_queued || !cfqd->cid[prio].busy_queues)
		return 1;

	limit = q->nr_requests * (prio + 1) / IOPRIO_NR;
	limit /= cfqd->cid[prio].busy_queues;
	if (cfqq->queued[rw] > limit)
		return 0;

	return 1;
}
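
/*
 * Illustrative numbers: with q->nr_requests == 512, prio 10 and four
 * busy queues at that level, limit = 512 * 11 / 21 / 4 = 67; a queue
 * already holding more than 67 requests in one data direction is
 * refused a new allocation.
 */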
static void cfq_put_request(request_queue_t *q, struct request *rq)
{
	struct cfq_data *cfqd = q->elevator.elevator_data;
	struct cfq_rq *crq = RQ_ELV_DATA(rq);
	struct request_list *rl;
	int other_rw;

	if (crq) {
		BUG_ON(q->last_merge == rq);
		BUG_ON(!hlist_unhashed(&crq->hash));

		mempool_free(crq, cfqd->crq_pool);
		rq->elevator_private = NULL;
	}

	/*
	 * work-around for may_queue "bug": if a read gets issued and refused
	 * to queue because writes ate all the allowed slots and no other
	 * reads are pending for this queue, it could get stuck infinitely
	 * since freed_request() only checks the waitqueue for writes when
	 * freeing them. or vice versa for a single write vs many reads.
	 * so check here whether "the other" data direction might be able
	 * to queue and wake them
	 */
	rl = &q->rq;
	other_rw = rq_data_dir(rq) ^ 1;
	if (rl->count[other_rw] <= q->nr_requests) {
		smp_mb();
		if (waitqueue_active(&rl->wait[other_rw]))
			wake_up(&rl->wait[other_rw]);
	}
}
static int cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
{
	struct cfq_data *cfqd = q->elevator.elevator_data;
	struct cfq_queue *cfqq;
	struct cfq_rq *crq;

	/*
	 * prepare a queue up front, so cfq_enqueue() doesn't have to
	 */
	cfqq = cfq_get_queue(cfqd, cfq_hash_key(current), gfp_mask);
	if (!cfqq)
		return 1;

	crq = mempool_alloc(cfqd->crq_pool, gfp_mask);
	if (crq) {
		/*
		 * process now has one request
		 */
		clear_bit(cfq_ioprio(current), &cfqd->rq_starved_mask);

		memset(crq, 0, sizeof(*crq));
		crq->request = rq;
		INIT_HLIST_NODE(&crq->hash);
		INIT_LIST_HEAD(&crq->prio_list);
		rq->elevator_private = crq;
		return 0;
	}

	return 1;
}
static void cfq_exit(request_queue_t *q, elevator_t *e)
{
	struct cfq_data *cfqd = e->elevator_data;

	e->elevator_data = NULL;
	mempool_destroy(cfqd->crq_pool);
	kfree(cfqd->crq_hash);
	kfree(cfqd->cfq_hash);
	kfree(cfqd);
}
static void cfq_timer(unsigned long data)
{
	struct cfq_data *cfqd = (struct cfq_data *) data;

	clear_bit(CFQ_WAIT_RT, &cfqd->flags);
	clear_bit(CFQ_WAIT_NORM, &cfqd->flags);
	kblockd_schedule_work(&cfqd->work);
}

static void cfq_work(void *data)
{
	request_queue_t *q = data;
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	if (cfq_next_request(q))
		q->request_fn(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}
static int cfq_init(request_queue_t *q, elevator_t *e)
{
	struct cfq_data *cfqd;
	int i;

	cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL);
	if (!cfqd)
		return -ENOMEM;

	memset(cfqd, 0, sizeof(*cfqd));
	init_timer(&cfqd->timer);
	cfqd->timer.function = cfq_timer;
	cfqd->timer.data = (unsigned long) cfqd;

	INIT_WORK(&cfqd->work, cfq_work, q);

	for (i = 0; i < IOPRIO_NR; i++) {
		struct io_prio_data *cid = &cfqd->cid[i];

		INIT_LIST_HEAD(&cid->rr_list);
		INIT_LIST_HEAD(&cid->prio_list);
		cid->last_rq = -1;
		cid->last_sectors = -1;

		atomic_set(&cid->cum_rq_in,0);
		atomic_set(&cid->cum_rq_out,0);
		atomic_set(&cid->cum_sectors_in,0);
		atomic_set(&cid->cum_sectors_out,0);
		atomic_set(&cid->cum_queues_in,0);
		atomic_set(&cid->cum_queues_out,0);

		atomic_set(&cid->nskip,0);
		cid->navsec = 0;
		cid->csectorate = 0;
		cid->lsectorate = 0;
	}

	cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES,
				 GFP_KERNEL);
	if (!cfqd->crq_hash)
		goto out_crqhash;

	cfqd->cfq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES,
				 GFP_KERNEL);
	if (!cfqd->cfq_hash)
		goto out_cfqhash;

	cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab,
					mempool_free_slab, crq_pool);
	if (!cfqd->crq_pool)
		goto out_crqpool;

	for (i = 0; i < CFQ_MHASH_ENTRIES; i++)
		INIT_HLIST_HEAD(&cfqd->crq_hash[i]);
	for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
		INIT_HLIST_HEAD(&cfqd->cfq_hash[i]);

	cfqd->cfq_queued = cfq_queued;
	cfqd->cfq_quantum = cfq_quantum;
	cfqd->cfq_quantum_io = cfq_quantum_io;
	cfqd->cfq_idle_quantum = cfq_idle_quantum;
	cfqd->cfq_idle_quantum_io = cfq_idle_quantum_io;
	cfqd->cfq_grace_rt = cfq_grace_rt;
	cfqd->cfq_grace_idle = cfq_grace_idle;

	q->nr_requests <<= 2;

	cfqd->dispatch = &q->queue_head;
	e->elevator_data = cfqd;
	cfqd->queue = q;

	cfqd->cfq_epoch = CFQ_EPOCH;
	if (q->hardsect_size)
		cfqd->cfq_epochsectors = ((CFQ_DISKBW * 1000000) /
					  q->hardsect_size) * (1000000 / CFQ_EPOCH);
	else
		cfqd->cfq_epochsectors = ((CFQ_DISKBW * 1000000) / 512)
					 * (1000000 / CFQ_EPOCH);

	return 0;
out_crqpool:
	kfree(cfqd->cfq_hash);
out_cfqhash:
	kfree(cfqd->crq_hash);
out_crqhash:
	kfree(cfqd);
	return -ENOMEM;
}
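
/*
 * Rough units check (assuming CFQ_DISKBW is MiB/s and CFQ_EPOCH is in
 * microseconds): CFQ_DISKBW * 1000000 / hardsect_size approximates
 * sectors per second, and 1000000 / CFQ_EPOCH (20 with the 50ms
 * default) is epochs per second. Note the expression multiplies where
 * a strict sectors-per-epoch figure would divide, so cfq_epochsectors
 * is an approximate cap; it is exported via the epochsectors sysfs
 * attribute for tuning.
 */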
static int __init cfq_slab_setup(void)
{
	crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0,
				     NULL, NULL);
	if (!crq_pool)
		panic("cfq_iosched: can't init crq pool\n");

	cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0,
				     NULL, NULL);
	if (!cfq_pool)
		panic("cfq_iosched: can't init cfq pool\n");

	cfq_mpool = mempool_create(64, mempool_alloc_slab, mempool_free_slab, cfq_pool);
	if (!cfq_mpool)
		panic("cfq_iosched: can't init cfq mpool\n");

	return 0;
}

subsys_initcall(cfq_slab_setup);
/*
 * sysfs parts below -->
 */
struct cfq_fs_entry {
	struct attribute attr;
	ssize_t (*show)(struct cfq_data *, char *);
	ssize_t (*store)(struct cfq_data *, const char *, size_t);
};
static ssize_t
cfq_var_show(unsigned int var, char *page)
{
	return sprintf(page, "%d\n", var);
}

static ssize_t
cfq_var_store(unsigned int *var, const char *page, size_t count)
{
	char *p = (char *) page;

	*var = simple_strtoul(p, &p, 10);
	return count;
}
#define SHOW_FUNCTION(__FUNC, __VAR)					\
static ssize_t __FUNC(struct cfq_data *cfqd, char *page)		\
{									\
	return cfq_var_show(__VAR, (page));				\
}
SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum);
SHOW_FUNCTION(cfq_quantum_io_show, cfqd->cfq_quantum_io);
SHOW_FUNCTION(cfq_idle_quantum_show, cfqd->cfq_idle_quantum);
SHOW_FUNCTION(cfq_idle_quantum_io_show, cfqd->cfq_idle_quantum_io);
SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued);
SHOW_FUNCTION(cfq_grace_rt_show, cfqd->cfq_grace_rt);
SHOW_FUNCTION(cfq_grace_idle_show, cfqd->cfq_grace_idle);
#undef SHOW_FUNCTION
#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)				\
static ssize_t __FUNC(struct cfq_data *cfqd, const char *page, size_t count) \
{									\
	int ret = cfq_var_store(__PTR, (page), count);			\
	if (*(__PTR) < (MIN))						\
		*(__PTR) = (MIN);					\
	else if (*(__PTR) > (MAX))					\
		*(__PTR) = (MAX);					\
	return ret;							\
}
STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, INT_MAX);
STORE_FUNCTION(cfq_quantum_io_store, &cfqd->cfq_quantum_io, 4, INT_MAX);
STORE_FUNCTION(cfq_idle_quantum_store, &cfqd->cfq_idle_quantum, 1, INT_MAX);
STORE_FUNCTION(cfq_idle_quantum_io_store, &cfqd->cfq_idle_quantum_io, 4, INT_MAX);
STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, INT_MAX);
STORE_FUNCTION(cfq_grace_rt_store, &cfqd->cfq_grace_rt, 0, INT_MAX);
STORE_FUNCTION(cfq_grace_idle_store, &cfqd->cfq_grace_idle, 0, INT_MAX);
#undef STORE_FUNCTION
static ssize_t cfq_epoch_show(struct cfq_data *cfqd, char *page)
{
	return sprintf(page, "%lu\n", cfqd->cfq_epoch);
}

static ssize_t cfq_epoch_store(struct cfq_data *cfqd, const char *page, size_t count)
{
	char *p = (char *) page;
	cfqd->cfq_epoch = simple_strtoul(p, &p, 10);
	return count;
}

static ssize_t cfq_epochsectors_show(struct cfq_data *cfqd, char *page)
{
	return sprintf(page, "%lu\n", cfqd->cfq_epochsectors);
}

static ssize_t
cfq_epochsectors_store(struct cfq_data *cfqd, const char *page, size_t count)
{
	char *p = (char *) page;
	cfqd->cfq_epochsectors = simple_strtoul(p, &p, 10);
	return count;
}
/* Additional entries to get priority level data */
static ssize_t
cfq_prio_show(struct cfq_data *cfqd, char *page, unsigned int priolvl)
{
	int r1, r2, s1, s2, q1, q2;

	if (!(priolvl >= IOPRIO_IDLE && priolvl <= IOPRIO_RT))
		return 0;

	r1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_rq_in));
	r2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_rq_out));
	s1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_sectors_in));
	s2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_sectors_out));
	q1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_queues_in));
	q2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_queues_out));

	return sprintf(page, "skip %d avsec %lu rate %lu new %lu "
		       "rq (%d,%d) sec (%d,%d) q (%d,%d)\n",
		       atomic_read(&cfqd->cid[priolvl].nskip),
		       cfqd->cid[priolvl].navsec,
		       cfqd->cid[priolvl].csectorate,
		       cfqd->cid[priolvl].lsectorate,
		       r1, r2, s1, s2, q1, q2);
}
#define SHOW_PRIO_DATA(__PRIOLVL)					\
static ssize_t cfq_prio_##__PRIOLVL##_show(struct cfq_data *cfqd, char *page) \
{									\
	return cfq_prio_show(cfqd, page, __PRIOLVL);			\
}
SHOW_PRIO_DATA(0);
SHOW_PRIO_DATA(1);
SHOW_PRIO_DATA(2);
SHOW_PRIO_DATA(3);
SHOW_PRIO_DATA(4);
SHOW_PRIO_DATA(5);
SHOW_PRIO_DATA(6);
SHOW_PRIO_DATA(7);
SHOW_PRIO_DATA(8);
SHOW_PRIO_DATA(9);
SHOW_PRIO_DATA(10);
SHOW_PRIO_DATA(11);
SHOW_PRIO_DATA(12);
SHOW_PRIO_DATA(13);
SHOW_PRIO_DATA(14);
SHOW_PRIO_DATA(15);
SHOW_PRIO_DATA(16);
SHOW_PRIO_DATA(17);
SHOW_PRIO_DATA(18);
SHOW_PRIO_DATA(19);
SHOW_PRIO_DATA(20);
#undef SHOW_PRIO_DATA
static ssize_t cfq_prio_store(struct cfq_data *cfqd, const char *page, size_t count, int priolvl)
{
	atomic_set(&(cfqd->cid[priolvl].cum_rq_in),0);
	atomic_set(&(cfqd->cid[priolvl].cum_rq_out),0);
	atomic_set(&(cfqd->cid[priolvl].cum_sectors_in),0);
	atomic_set(&(cfqd->cid[priolvl].cum_sectors_out),0);
	atomic_set(&(cfqd->cid[priolvl].cum_queues_in),0);
	atomic_set(&(cfqd->cid[priolvl].cum_queues_out),0);

	return count;
}
#define STORE_PRIO_DATA(__PRIOLVL)					\
static ssize_t cfq_prio_##__PRIOLVL##_store(struct cfq_data *cfqd, const char *page, size_t count) \
{									\
	return cfq_prio_store(cfqd, page, count, __PRIOLVL);		\
}
STORE_PRIO_DATA(0);
STORE_PRIO_DATA(1);
STORE_PRIO_DATA(2);
STORE_PRIO_DATA(3);
STORE_PRIO_DATA(4);
STORE_PRIO_DATA(5);
STORE_PRIO_DATA(6);
STORE_PRIO_DATA(7);
STORE_PRIO_DATA(8);
STORE_PRIO_DATA(9);
STORE_PRIO_DATA(10);
STORE_PRIO_DATA(11);
STORE_PRIO_DATA(12);
STORE_PRIO_DATA(13);
STORE_PRIO_DATA(14);
STORE_PRIO_DATA(15);
STORE_PRIO_DATA(16);
STORE_PRIO_DATA(17);
STORE_PRIO_DATA(18);
STORE_PRIO_DATA(19);
STORE_PRIO_DATA(20);
#undef STORE_PRIO_DATA
static struct cfq_fs_entry cfq_quantum_entry = {
	.attr = {.name = "quantum", .mode = S_IRUGO | S_IWUSR },
	.show = cfq_quantum_show,
	.store = cfq_quantum_store,
};
static struct cfq_fs_entry cfq_quantum_io_entry = {
	.attr = {.name = "quantum_io", .mode = S_IRUGO | S_IWUSR },
	.show = cfq_quantum_io_show,
	.store = cfq_quantum_io_store,
};
static struct cfq_fs_entry cfq_idle_quantum_entry = {
	.attr = {.name = "idle_quantum", .mode = S_IRUGO | S_IWUSR },
	.show = cfq_idle_quantum_show,
	.store = cfq_idle_quantum_store,
};
static struct cfq_fs_entry cfq_idle_quantum_io_entry = {
	.attr = {.name = "idle_quantum_io", .mode = S_IRUGO | S_IWUSR },
	.show = cfq_idle_quantum_io_show,
	.store = cfq_idle_quantum_io_store,
};
static struct cfq_fs_entry cfq_queued_entry = {
	.attr = {.name = "queued", .mode = S_IRUGO | S_IWUSR },
	.show = cfq_queued_show,
	.store = cfq_queued_store,
};
static struct cfq_fs_entry cfq_grace_rt_entry = {
	.attr = {.name = "grace_rt", .mode = S_IRUGO | S_IWUSR },
	.show = cfq_grace_rt_show,
	.store = cfq_grace_rt_store,
};
static struct cfq_fs_entry cfq_grace_idle_entry = {
	.attr = {.name = "grace_idle", .mode = S_IRUGO | S_IWUSR },
	.show = cfq_grace_idle_show,
	.store = cfq_grace_idle_store,
};
static struct cfq_fs_entry cfq_epoch_entry = {
	.attr = {.name = "epoch", .mode = S_IRUGO | S_IWUSR },
	.show = cfq_epoch_show,
	.store = cfq_epoch_store,
};
static struct cfq_fs_entry cfq_epochsectors_entry = {
	.attr = {.name = "epochsectors", .mode = S_IRUGO | S_IWUSR },
	.show = cfq_epochsectors_show,
	.store = cfq_epochsectors_store,
};
#define P_0_STR		"p0"
#define P_1_STR		"p1"
#define P_2_STR		"p2"
#define P_3_STR		"p3"
#define P_4_STR		"p4"
#define P_5_STR		"p5"
#define P_6_STR		"p6"
#define P_7_STR		"p7"
#define P_8_STR		"p8"
#define P_9_STR		"p9"
#define P_10_STR	"p10"
#define P_11_STR	"p11"
#define P_12_STR	"p12"
#define P_13_STR	"p13"
#define P_14_STR	"p14"
#define P_15_STR	"p15"
#define P_16_STR	"p16"
#define P_17_STR	"p17"
#define P_18_STR	"p18"
#define P_19_STR	"p19"
#define P_20_STR	"p20"
#define CFQ_PRIO_SYSFS_ENTRY(__PRIOLVL)					\
static struct cfq_fs_entry cfq_prio_##__PRIOLVL##_entry = {		\
	.attr = {.name = P_##__PRIOLVL##_STR, .mode = S_IRUGO | S_IWUSR }, \
	.show = cfq_prio_##__PRIOLVL##_show,				\
	.store = cfq_prio_##__PRIOLVL##_store,				\
};
CFQ_PRIO_SYSFS_ENTRY(0);
CFQ_PRIO_SYSFS_ENTRY(1);
CFQ_PRIO_SYSFS_ENTRY(2);
CFQ_PRIO_SYSFS_ENTRY(3);
CFQ_PRIO_SYSFS_ENTRY(4);
CFQ_PRIO_SYSFS_ENTRY(5);
CFQ_PRIO_SYSFS_ENTRY(6);
CFQ_PRIO_SYSFS_ENTRY(7);
CFQ_PRIO_SYSFS_ENTRY(8);
CFQ_PRIO_SYSFS_ENTRY(9);
CFQ_PRIO_SYSFS_ENTRY(10);
CFQ_PRIO_SYSFS_ENTRY(11);
CFQ_PRIO_SYSFS_ENTRY(12);
CFQ_PRIO_SYSFS_ENTRY(13);
CFQ_PRIO_SYSFS_ENTRY(14);
CFQ_PRIO_SYSFS_ENTRY(15);
CFQ_PRIO_SYSFS_ENTRY(16);
CFQ_PRIO_SYSFS_ENTRY(17);
CFQ_PRIO_SYSFS_ENTRY(18);
CFQ_PRIO_SYSFS_ENTRY(19);
CFQ_PRIO_SYSFS_ENTRY(20);
#undef CFQ_PRIO_SYSFS_ENTRY
static struct attribute *default_attrs[] = {
	&cfq_quantum_entry.attr,
	&cfq_quantum_io_entry.attr,
	&cfq_idle_quantum_entry.attr,
	&cfq_idle_quantum_io_entry.attr,
	&cfq_queued_entry.attr,
	&cfq_grace_rt_entry.attr,
	&cfq_grace_idle_entry.attr,
	&cfq_epoch_entry.attr,
	&cfq_epochsectors_entry.attr,
	&cfq_prio_0_entry.attr,
	&cfq_prio_1_entry.attr,
	&cfq_prio_2_entry.attr,
	&cfq_prio_3_entry.attr,
	&cfq_prio_4_entry.attr,
	&cfq_prio_5_entry.attr,
	&cfq_prio_6_entry.attr,
	&cfq_prio_7_entry.attr,
	&cfq_prio_8_entry.attr,
	&cfq_prio_9_entry.attr,
	&cfq_prio_10_entry.attr,
	&cfq_prio_11_entry.attr,
	&cfq_prio_12_entry.attr,
	&cfq_prio_13_entry.attr,
	&cfq_prio_14_entry.attr,
	&cfq_prio_15_entry.attr,
	&cfq_prio_16_entry.attr,
	&cfq_prio_17_entry.attr,
	&cfq_prio_18_entry.attr,
	&cfq_prio_19_entry.attr,
	&cfq_prio_20_entry.attr,
	NULL,
};
#define to_cfq(atr) container_of((atr), struct cfq_fs_entry, attr)

static ssize_t
cfq_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
	elevator_t *e = container_of(kobj, elevator_t, kobj);
	struct cfq_fs_entry *entry = to_cfq(attr);

	if (!entry->show)
		return 0;

	return entry->show(e->elevator_data, page);
}

static ssize_t
cfq_attr_store(struct kobject *kobj, struct attribute *attr,
	       const char *page, size_t length)
{
	elevator_t *e = container_of(kobj, elevator_t, kobj);
	struct cfq_fs_entry *entry = to_cfq(attr);

	if (!entry->store)
		return -EINVAL;

	return entry->store(e->elevator_data, page, length);
}
static struct sysfs_ops cfq_sysfs_ops = {
	.show	= cfq_attr_show,
	.store	= cfq_attr_store,
};

struct kobj_type cfq_ktype = {
	.sysfs_ops	= &cfq_sysfs_ops,
	.default_attrs	= default_attrs,
};
elevator_t iosched_cfq = {
	.elevator_name = "cfq",
	.elevator_ktype = &cfq_ktype,
	.elevator_merge_fn = cfq_merge,
	.elevator_merged_fn = cfq_merged_request,
	.elevator_merge_req_fn = cfq_merged_requests,
	.elevator_next_req_fn = cfq_next_request,
	.elevator_add_req_fn = cfq_insert_request,
	.elevator_remove_req_fn = cfq_remove_request,
	.elevator_queue_empty_fn = cfq_queue_empty,
	.elevator_former_req_fn = cfq_former_request,
	.elevator_latter_req_fn = cfq_latter_request,
	.elevator_set_req_fn = cfq_set_request,
	.elevator_put_req_fn = cfq_put_request,
	.elevator_may_queue_fn = cfq_may_queue,
	.elevator_set_congested_fn = cfq_queue_congested,
	.elevator_init_fn = cfq_init,
	.elevator_exit_fn = cfq_exit,
};

EXPORT_SYMBOL(iosched_cfq);