block/cfq-iosched.c

   1 /*
   2  *  CFQ, or complete fairness queueing, disk scheduler.
   3  *
   4  *  Based on ideas from a previously unfinished io
   5  *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
   6  *
   7  *  Copyright (C) 2003 Jens Axboe <axboe@suse.de>
   8  */
   9 #include <linux/module.h>
  10 #include <linux/blkdev.h>
  11 #include <linux/elevator.h>
  12 #include <linux/hash.h>
  13 #include <linux/rbtree.h>
  14 #include <linux/ioprio.h>
  15
  16 /*
  17  * tunables
  18  */
  19 static const int cfq_quantum = 4;               /* max queue in one round of service */
  20 static const int cfq_queued = 8;                /* minimum rq allocate limit per-queue*/
  21 static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
  22 static const int cfq_back_max = 16 * 1024;      /* maximum backwards seek, in KiB */
  23 static const int cfq_back_penalty = 2;          /* penalty of a backwards seek */
  24
  25 static const int cfq_slice_sync = HZ / 10;
  26 static int cfq_slice_async = HZ / 25;
  27 static const int cfq_slice_async_rq = 2;
  28 static int cfq_slice_idle = HZ / 125;
  29
  30 #define CFQ_IDLE_GRACE          (HZ / 10)
  31 #define CFQ_SLICE_SCALE         (5)
  32
  33 #define CFQ_KEY_ASYNC           (0)
  34
  35 static DEFINE_SPINLOCK(cfq_exit_lock);
  36
  37 /*
  38  * for the hash of cfqq inside the cfqd
  39  */
  40 #define CFQ_QHASH_SHIFT         6
  41 #define CFQ_QHASH_ENTRIES       (1 << CFQ_QHASH_SHIFT)
  42 #define list_entry_qhash(entry) hlist_entry((entry), struct cfq_queue, cfq_hash)
  43
  44 /*
  45  * for the hash of crq inside the cfqq
  46  */
  47 #define CFQ_MHASH_SHIFT         6
  48 #define CFQ_MHASH_BLOCK(sec)    ((sec) >> 3)
  49 #define CFQ_MHASH_ENTRIES       (1 << CFQ_MHASH_SHIFT)
  50 #define CFQ_MHASH_FN(sec)       hash_long(CFQ_MHASH_BLOCK(sec), CFQ_MHASH_SHIFT)
  51 #define rq_hash_key(rq)         ((rq)->sector + (rq)->nr_sectors)
  52 #define list_entry_hash(ptr)    hlist_entry((ptr), struct cfq_rq, hash)
  53
  54 #define list_entry_cfqq(ptr)    list_entry((ptr), struct cfq_queue, cfq_list)
  55 #define list_entry_fifo(ptr)    list_entry((ptr), struct request, queuelist)
  56
  57 #define RQ_DATA(rq)             (rq)->elevator_private
  58
  59 /*
  60  * rb-tree defines
  61  */
  62 #define rb_entry_crq(node)      rb_entry((node), struct cfq_rq, rb_node)
  63 #define rq_rb_key(rq)           (rq)->sector
  64
  65 static kmem_cache_t *crq_pool;
  66 static kmem_cache_t *cfq_pool;
  67 static kmem_cache_t *cfq_ioc_pool;
  68
  69 static atomic_t ioc_count = ATOMIC_INIT(0);
  70 static struct completion *ioc_gone;
  71
  72 #define CFQ_PRIO_LISTS          IOPRIO_BE_NR
  73 #define cfq_class_idle(cfqq)    ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
  74 #define cfq_class_be(cfqq)      ((cfqq)->ioprio_class == IOPRIO_CLASS_BE)
  75 #define cfq_class_rt(cfqq)      ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
  76
  77 #define ASYNC                   (0)
  78 #define SYNC                    (1)
  79
  80 #define cfq_cfqq_dispatched(cfqq)       \
  81         ((cfqq)->on_dispatch[ASYNC] + (cfqq)->on_dispatch[SYNC])
  82
  83 #define cfq_cfqq_class_sync(cfqq)       ((cfqq)->key != CFQ_KEY_ASYNC)
  84
  85 #define cfq_cfqq_sync(cfqq)             \
  86         (cfq_cfqq_class_sync(cfqq) || (cfqq)->on_dispatch[SYNC])
  87
  88 #define sample_valid(samples)   ((samples) > 80)
  89
  90 /*
  91  * Per block device queue structure
  92  */
  93 struct cfq_data {
  94         request_queue_t *queue;
  95
  96         /*
  97          * rr list of queues with requests and the count of them
  98          */
  99         struct list_head rr_list[CFQ_PRIO_LISTS];
 100         struct list_head busy_rr;
 101         struct list_head cur_rr;
 102         struct list_head idle_rr;
 103         unsigned int busy_queues;
 104
 105         /*
 106          * non-ordered list of empty cfqq's
 107          */
 108         struct list_head empty_list;
 109
 110         /*
 111          * cfqq lookup hash
 112          */
 113         struct hlist_head *cfq_hash;
 114
 115         /*
 116          * global crq hash for all queues
 117          */
 118         struct hlist_head *crq_hash;
 119
 120         mempool_t *crq_pool;
 121
 122         int rq_in_driver;
 123         int hw_tag;
 124
 125         /*
 126          * schedule slice state info
 127          */
 128         /*
 129          * idle window management
 130          */
 131         struct timer_list idle_slice_timer;
 132         struct work_struct unplug_work;
 133
 134         struct cfq_queue *active_queue;
 135         struct cfq_io_context *active_cic;
 136         int cur_prio, cur_end_prio;
 137         unsigned int dispatch_slice;
 138
 139         struct timer_list idle_class_timer;
 140
 141         sector_t last_sector;
 142         unsigned long last_end_request;
 143
 144         unsigned int rq_starved;
 145
 146         /*
 147          * tunables, see top of file
 148          */
 149         unsigned int cfq_quantum;
 150         unsigned int cfq_queued;
 151         unsigned int cfq_fifo_expire[2];
 152         unsigned int cfq_back_penalty;
 153         unsigned int cfq_back_max;
 154         unsigned int cfq_slice[2];
 155         unsigned int cfq_slice_async_rq;
 156         unsigned int cfq_slice_idle;
 157
 158         struct list_head cic_list;
 159 };
 160
 161 /*
 162  * Per process-grouping structure
 163  */
 164 struct cfq_queue {
 165         /* reference count */
 166         atomic_t ref;
 167         /* parent cfq_data */
 168         struct cfq_data *cfqd;
 169         /* cfqq lookup hash */
 170         struct hlist_node cfq_hash;
 171         /* hash key */
 172         unsigned int key;
 173         /* on either rr or empty list of cfqd */
 174         struct list_head cfq_list;
 175         /* sorted list of pending requests */
 176         struct rb_root sort_list;
 177         /* if fifo isn't expired, next request to serve */
 178         struct cfq_rq *next_crq;
 179         /* requests queued in sort_list */
 180         int queued[2];
 181         /* currently allocated requests */
 182         int allocated[2];
 183         /* fifo list of requests in sort_list */
 184         struct list_head fifo;
 185
 186         unsigned long slice_start;
 187         unsigned long slice_end;
 188         unsigned long slice_left;
 189         unsigned long service_last;
 190
 191         /* number of requests that are on the dispatch list */
 192         int on_dispatch[2];
 193
 194         /* io prio of this group */
 195         unsigned short ioprio, org_ioprio;
 196         unsigned short ioprio_class, org_ioprio_class;
 197
 198         /* various state flags, see below */
 199         unsigned int flags;
 200 };
 201
 202 struct cfq_rq {
 203         struct rb_node rb_node;
 204         sector_t rb_key;
 205         struct request *request;
 206         struct hlist_node hash;
 207
 208         struct cfq_queue *cfq_queue;
 209         struct cfq_io_context *io_context;
 210
 211         unsigned int crq_flags;
 212 };
 213
 214 enum cfqq_state_flags {
 215         CFQ_CFQQ_FLAG_on_rr = 0,
 216         CFQ_CFQQ_FLAG_wait_request,
 217         CFQ_CFQQ_FLAG_must_alloc,
 218         CFQ_CFQQ_FLAG_must_alloc_slice,
 219         CFQ_CFQQ_FLAG_must_dispatch,
 220         CFQ_CFQQ_FLAG_fifo_expire,
 221         CFQ_CFQQ_FLAG_idle_window,
 222         CFQ_CFQQ_FLAG_prio_changed,
 223 };
 224
 225 #define CFQ_CFQQ_FNS(name)                                              \
 226 static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)         \
 227 {                                                                       \
 228         cfqq->flags |= (1 << CFQ_CFQQ_FLAG_##name);                     \
 229 }                                                                       \
 230 static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)        \
 231 {                                                                       \
 232         cfqq->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);                    \
 233 }                                                                       \
 234 static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)         \
 235 {                                                                       \
 236         return (cfqq->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;        \
 237 }
 238
 239 CFQ_CFQQ_FNS(on_rr);
 240 CFQ_CFQQ_FNS(wait_request);
 241 CFQ_CFQQ_FNS(must_alloc);
 242 CFQ_CFQQ_FNS(must_alloc_slice);
 243 CFQ_CFQQ_FNS(must_dispatch);
 244 CFQ_CFQQ_FNS(fifo_expire);
 245 CFQ_CFQQ_FNS(idle_window);
 246 CFQ_CFQQ_FNS(prio_changed);
 247 #undef CFQ_CFQQ_FNS
 248
 249 enum cfq_rq_state_flags {
 250         CFQ_CRQ_FLAG_is_sync = 0,
 251 };
 252
 253 #define CFQ_CRQ_FNS(name)                                               \
 254 static inline void cfq_mark_crq_##name(struct cfq_rq *crq)              \
 255 {                                                                       \
 256         crq->crq_flags |= (1 << CFQ_CRQ_FLAG_##name);                   \
 257 }                                                                       \
 258 static inline void cfq_clear_crq_##name(struct cfq_rq *crq)             \
 259 {                                                                       \
 260         crq->crq_flags &= ~(1 << CFQ_CRQ_FLAG_##name);                  \
 261 }                                                                       \
 262 static inline int cfq_crq_##name(const struct cfq_rq *crq)              \
 263 {                                                                       \
 264         return (crq->crq_flags & (1 << CFQ_CRQ_FLAG_##name)) != 0;      \
 265 }
 266
 267 CFQ_CRQ_FNS(is_sync);
 268 #undef CFQ_CRQ_FNS
 269
 270 static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short);
 271 static void cfq_dispatch_insert(request_queue_t *, struct cfq_rq *);
 272 static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask);
 273
 274 /*
 275  * lots of deadline iosched dupes, can be abstracted later...
 276  */
 277 static inline void cfq_del_crq_hash(struct cfq_rq *crq)
 278 {
 279         hlist_del_init(&crq->hash);
 280 }
 281
 282 static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq)
 283 {
 284         const int hash_idx = CFQ_MHASH_FN(rq_hash_key(crq->request));
 285
 286         hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]);
 287 }
 288
 289 static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset)
 290 {
 291         struct hlist_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)];
 292         struct hlist_node *entry, *next;
 293
 294         hlist_for_each_safe(entry, next, hash_list) {
 295                 struct cfq_rq *crq = list_entry_hash(entry);
 296                 struct request *__rq = crq->request;
 297
 298                 if (!rq_mergeable(__rq)) {
 299                         cfq_del_crq_hash(crq);
 300                         continue;
 301                 }
 302
 303                 if (rq_hash_key(__rq) == offset)
 304                         return __rq;
 305         }
 306
 307         return NULL;
 308 }
 309
 310 /*
 311  * scheduler run of queue, if there are requests pending and no one in the
 312  * driver that will restart queueing
 313  */
 314 static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
 315 {
 316         if (cfqd->busy_queues)
 317                 kblockd_schedule_work(&cfqd->unplug_work);
 318 }
 319
 320 static int cfq_queue_empty(request_queue_t *q)
 321 {
 322         struct cfq_data *cfqd = q->elevator->elevator_data;
 323
 324         return !cfqd->busy_queues;
 325 }
 326
 327 static inline pid_t cfq_queue_pid(struct task_struct *task, int rw)
 328 {
 329         if (task->xid)
 330                 return task->xid + (1 << 16);
 331         if (rw == READ || rw == WRITE_SYNC)
 332                 return task->pid;
 333
 334         return CFQ_KEY_ASYNC;
 335 }
 336
 337 /*
 338  * Lifted from AS - choose which of crq1 and crq2 that is best served now.
 339  * We choose the request that is closest to the head right now. Distance
 340  * behind the head is penalized and only allowed to a certain extent.
 341  */
 342 static struct cfq_rq *
 343 cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2)
 344 {
 345         sector_t last, s1, s2, d1 = 0, d2 = 0;
 346         unsigned long back_max;
 347 #define CFQ_RQ1_WRAP    0x01 /* request 1 wraps */
 348 #define CFQ_RQ2_WRAP    0x02 /* request 2 wraps */
 349         unsigned wrap = 0; /* bit mask: requests behind the disk head? */
 350
 351         if (crq1 == NULL || crq1 == crq2)
 352                 return crq2;
 353         if (crq2 == NULL)
 354                 return crq1;
 355
 356         if (cfq_crq_is_sync(crq1) && !cfq_crq_is_sync(crq2))
 357                 return crq1;
 358         else if (cfq_crq_is_sync(crq2) && !cfq_crq_is_sync(crq1))
 359                 return crq2;
 360
 361         s1 = crq1->request->sector;
 362         s2 = crq2->request->sector;
 363
 364         last = cfqd->last_sector;
 365
 366         /*
 367          * by definition, 1KiB is 2 sectors
 368          */
 369         back_max = cfqd->cfq_back_max * 2;
 370
 371         /*
 372          * Strict one way elevator _except_ in the case where we allow
 373          * short backward seeks which are biased as twice the cost of a
 374          * similar forward seek.
 375          */
 376         if (s1 >= last)
 377                 d1 = s1 - last;
 378         else if (s1 + back_max >= last)
 379                 d1 = (last - s1) * cfqd->cfq_back_penalty;
 380         else
 381                 wrap |= CFQ_RQ1_WRAP;
 382
 383         if (s2 >= last)
 384                 d2 = s2 - last;
 385         else if (s2 + back_max >= last)
 386                 d2 = (last - s2) * cfqd->cfq_back_penalty;
 387         else
 388                 wrap |= CFQ_RQ2_WRAP;
 389
 390         /* Found required data */
 391
 392         /*
 393          * By doing switch() on the bit mask "wrap" we avoid having to
 394          * check two variables for all permutations: --> faster!
 395          */
 396         switch (wrap) {
 397         case 0: /* common case for CFQ: crq1 and crq2 not wrapped */
 398                 if (d1 < d2)
 399                         return crq1;
 400                 else if (d2 < d1)
 401                         return crq2;
 402                 else {
 403                         if (s1 >= s2)
 404                                 return crq1;
 405                         else
 406                                 return crq2;
 407                 }
 408
 409         case CFQ_RQ2_WRAP:
 410                 return crq1;
 411         case CFQ_RQ1_WRAP:
 412                 return crq2;
 413         case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both crqs wrapped */
 414         default:
 415                 /*
 416                  * Since both rqs are wrapped,
 417                  * start with the one that's further behind head
 418                  * (--> only *one* back seek required),
 419                  * since back seek takes more time than forward.
 420                  */
 421                 if (s1 <= s2)
 422                         return crq1;
 423                 else
 424                         return crq2;
 425         }
 426 }
 427
 428 /*
 429  * would be nice to take fifo expire time into account as well
 430  */
 431 static struct cfq_rq *
 432 cfq_find_next_crq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 433                   struct cfq_rq *last)
 434 {
 435         struct cfq_rq *crq_next = NULL, *crq_prev = NULL;
 436         struct rb_node *rbnext, *rbprev;
 437
 438         if (!(rbnext = rb_next(&last->rb_node))) {
 439                 rbnext = rb_first(&cfqq->sort_list);
 440                 if (rbnext == &last->rb_node)
 441                         rbnext = NULL;
 442         }
 443
 444         rbprev = rb_prev(&last->rb_node);
 445
 446         if (rbprev)
 447                 crq_prev = rb_entry_crq(rbprev);
 448         if (rbnext)
 449                 crq_next = rb_entry_crq(rbnext);
 450
 451         return cfq_choose_req(cfqd, crq_next, crq_prev);
 452 }
 453
 454 static void cfq_update_next_crq(struct cfq_rq *crq)
 455 {
 456         struct cfq_queue *cfqq = crq->cfq_queue;
 457
 458         if (cfqq->next_crq == crq)
 459                 cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq);
 460 }
 461
 462 static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
 463 {
 464         struct cfq_data *cfqd = cfqq->cfqd;
 465         struct list_head *list, *entry;
 466
 467         BUG_ON(!cfq_cfqq_on_rr(cfqq));
 468
 469         list_del(&cfqq->cfq_list);
 470
 471         if (cfq_class_rt(cfqq))
 472                 list = &cfqd->cur_rr;
 473         else if (cfq_class_idle(cfqq))
 474                 list = &cfqd->idle_rr;
 475         else {
 476                 /*
 477                  * if cfqq has requests in flight, don't allow it to be
 478                  * found in cfq_set_active_queue before it has finished them.
 479                  * this is done to increase fairness between a process that
 480                  * has lots of io pending vs one that only generates one
 481                  * sporadically or synchronously
 482                  */
 483                 if (cfq_cfqq_dispatched(cfqq))
 484                         list = &cfqd->busy_rr;
 485                 else
 486                         list = &cfqd->rr_list[cfqq->ioprio];
 487         }
 488
 489         /*
 490          * if queue was preempted, just add to front to be fair. busy_rr
 491          * isn't sorted, but insert at the back for fairness.
 492          */
 493         if (preempted || list == &cfqd->busy_rr) {
 494                 if (preempted)
 495                         list = list->prev;
 496
 497                 list_add_tail(&cfqq->cfq_list, list);
 498                 return;
 499         }
 500
 501         /*
 502          * sort by when queue was last serviced
 503          */
 504         entry = list;
 505         while ((entry = entry->prev) != list) {
 506                 struct cfq_queue *__cfqq = list_entry_cfqq(entry);
 507
 508                 if (!__cfqq->service_last)
 509                         break;
 510                 if (time_before(__cfqq->service_last, cfqq->service_last))
 511                         break;
 512         }
 513
 514         list_add(&cfqq->cfq_list, entry);
 515 }
 516
 517 /*
 518  * add to busy list of queues for service, trying to be fair in ordering
 519  * the pending list according to last request service
 520  */
 521 static inline void
 522 cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 523 {
 524         BUG_ON(cfq_cfqq_on_rr(cfqq));
 525         cfq_mark_cfqq_on_rr(cfqq);
 526         cfqd->busy_queues++;
 527
 528         cfq_resort_rr_list(cfqq, 0);
 529 }
 530
 531 static inline void
 532 cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 533 {
 534         BUG_ON(!cfq_cfqq_on_rr(cfqq));
 535         cfq_clear_cfqq_on_rr(cfqq);
 536         list_move(&cfqq->cfq_list, &cfqd->empty_list);
 537
 538         BUG_ON(!cfqd->busy_queues);
 539         cfqd->busy_queues--;
 540 }
 541
 542 /*
 543  * rb tree support functions
 544  */
 545 static inline void cfq_del_crq_rb(struct cfq_rq *crq)
 546 {
 547         struct cfq_queue *cfqq = crq->cfq_queue;
 548         struct cfq_data *cfqd = cfqq->cfqd;
 549         const int sync = cfq_crq_is_sync(crq);
 550
 551         BUG_ON(!cfqq->queued[sync]);
 552         cfqq->queued[sync]--;
 553
 554         cfq_update_next_crq(crq);
 555
 556         rb_erase(&crq->rb_node, &cfqq->sort_list);
 557
 558         if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
 559                 cfq_del_cfqq_rr(cfqd, cfqq);
 560 }
 561
 562 static struct cfq_rq *
 563 __cfq_add_crq_rb(struct cfq_rq *crq)
 564 {
 565         struct rb_node **p = &crq->cfq_queue->sort_list.rb_node;
 566         struct rb_node *parent = NULL;
 567         struct cfq_rq *__crq;
 568
 569         while (*p) {
 570                 parent = *p;
 571                 __crq = rb_entry_crq(parent);
 572
 573                 if (crq->rb_key < __crq->rb_key)
 574                         p = &(*p)->rb_left;
 575                 else if (crq->rb_key > __crq->rb_key)
 576                         p = &(*p)->rb_right;
 577                 else
 578                         return __crq;
 579         }
 580
 581         rb_link_node(&crq->rb_node, parent, p);
 582         return NULL;
 583 }
 584
 585 static void cfq_add_crq_rb(struct cfq_rq *crq)
 586 {
 587         struct cfq_queue *cfqq = crq->cfq_queue;
 588         struct cfq_data *cfqd = cfqq->cfqd;
 589         struct request *rq = crq->request;
 590         struct cfq_rq *__alias;
 591
 592         crq->rb_key = rq_rb_key(rq);
 593         cfqq->queued[cfq_crq_is_sync(crq)]++;
 594
 595         /*
 596          * looks a little odd, but the first insert might return an alias.
 597          * if that happens, put the alias on the dispatch list
 598          */
 599         while ((__alias = __cfq_add_crq_rb(crq)) != NULL)
 600                 cfq_dispatch_insert(cfqd->queue, __alias);
 601
 602         rb_insert_color(&crq->rb_node, &cfqq->sort_list);
 603
 604         if (!cfq_cfqq_on_rr(cfqq))
 605                 cfq_add_cfqq_rr(cfqd, cfqq);
 606
 607         /*
 608          * check if this request is a better next-serve candidate
 609          */
 610         cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq);
 611 }
 612
 613 static inline void
 614 cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
 615 {
 616         rb_erase(&crq->rb_node, &cfqq->sort_list);
 617         cfqq->queued[cfq_crq_is_sync(crq)]--;
 618
 619         cfq_add_crq_rb(crq);
 620 }
 621
 622 static struct request *
 623 cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
 624 {
 625         struct task_struct *tsk = current;
 626         pid_t key = cfq_queue_pid(tsk, bio_data_dir(bio));
 627         struct cfq_queue *cfqq;
 628         struct rb_node *n;
 629         sector_t sector;
 630
 631         cfqq = cfq_find_cfq_hash(cfqd, key, tsk->ioprio);
 632         if (!cfqq)
 633                 goto out;
 634
 635         sector = bio->bi_sector + bio_sectors(bio);
 636         n = cfqq->sort_list.rb_node;
 637         while (n) {
 638                 struct cfq_rq *crq = rb_entry_crq(n);
 639
 640                 if (sector < crq->rb_key)
 641                         n = n->rb_left;
 642                 else if (sector > crq->rb_key)
 643                         n = n->rb_right;
 644                 else
 645                         return crq->request;
 646         }
 647
 648 out:
 649         return NULL;
 650 }
 651
 652 static void cfq_activate_request(request_queue_t *q, struct request *rq)
 653 {
 654         struct cfq_data *cfqd = q->elevator->elevator_data;
 655
 656         cfqd->rq_in_driver++;
 657
 658         /*
 659          * If the depth is larger 1, it really could be queueing. But lets
 660          * make the mark a little higher - idling could still be good for
 661          * low queueing, and a low queueing number could also just indicate
 662          * a SCSI mid layer like behaviour where limit+1 is often seen.
 663          */
 664         if (!cfqd->hw_tag && cfqd->rq_in_driver > 4)
 665                 cfqd->hw_tag = 1;
 666 }
 667
 668 static void cfq_deactivate_request(request_queue_t *q, struct request *rq)
 669 {
 670         struct cfq_data *cfqd = q->elevator->elevator_data;
 671
 672         WARN_ON(!cfqd->rq_in_driver);
 673         cfqd->rq_in_driver--;
 674 }
 675
 676 static void cfq_remove_request(struct request *rq)
 677 {
 678         struct cfq_rq *crq = RQ_DATA(rq);
 679
 680         list_del_init(&rq->queuelist);
 681         cfq_del_crq_rb(crq);
 682         cfq_del_crq_hash(crq);
 683 }
 684
 685 static int
 686 cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
 687 {
 688         struct cfq_data *cfqd = q->elevator->elevator_data;
 689         struct request *__rq;
 690         int ret;
 691
 692         __rq = cfq_find_rq_hash(cfqd, bio->bi_sector);
 693         if (__rq && elv_rq_merge_ok(__rq, bio)) {
 694                 ret = ELEVATOR_BACK_MERGE;
 695                 goto out;
 696         }
 697
 698         __rq = cfq_find_rq_fmerge(cfqd, bio);
 699         if (__rq && elv_rq_merge_ok(__rq, bio)) {
 700                 ret = ELEVATOR_FRONT_MERGE;
 701                 goto out;
 702         }
 703
 704         return ELEVATOR_NO_MERGE;
 705 out:
 706         *req = __rq;
 707         return ret;
 708 }
 709
 710 static void cfq_merged_request(request_queue_t *q, struct request *req)
 711 {
 712         struct cfq_data *cfqd = q->elevator->elevator_data;
 713         struct cfq_rq *crq = RQ_DATA(req);
 714
 715         cfq_del_crq_hash(crq);
 716         cfq_add_crq_hash(cfqd, crq);
 717
 718         if (rq_rb_key(req) != crq->rb_key) {
 719                 struct cfq_queue *cfqq = crq->cfq_queue;
 720
 721                 cfq_update_next_crq(crq);
 722                 cfq_reposition_crq_rb(cfqq, crq);
 723         }
 724 }
 725
 726 static void
 727 cfq_merged_requests(request_queue_t *q, struct request *rq,
 728                     struct request *next)
 729 {
 730         cfq_merged_request(q, rq);
 731
 732         /*
 733          * reposition in fifo if next is older than rq
 734          */
 735         if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
 736             time_before(next->start_time, rq->start_time))
 737                 list_move(&rq->queuelist, &next->queuelist);
 738
 739         cfq_remove_request(next);
 740 }
 741
 742 static inline void
 743 __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 744 {
 745         if (cfqq) {
 746                 /*
 747                  * stop potential idle class queues waiting service
 748                  */
 749                 del_timer(&cfqd->idle_class_timer);
 750
 751                 cfqq->slice_start = jiffies;
 752                 cfqq->slice_end = 0;
 753                 cfqq->slice_left = 0;
 754                 cfq_clear_cfqq_must_alloc_slice(cfqq);
 755                 cfq_clear_cfqq_fifo_expire(cfqq);
 756         }
 757
 758         cfqd->active_queue = cfqq;
 759 }
 760
 761 /*
 762  * current cfqq expired its slice (or was too idle), select new one
 763  */
 764 static void
 765 __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 766                     int preempted)
 767 {
 768         unsigned long now = jiffies;
 769
 770         if (cfq_cfqq_wait_request(cfqq))
 771                 del_timer(&cfqd->idle_slice_timer);
 772
 773         if (!preempted && !cfq_cfqq_dispatched(cfqq)) {
 774                 cfqq->service_last = now;
 775                 cfq_schedule_dispatch(cfqd);
 776         }
 777
 778         cfq_clear_cfqq_must_dispatch(cfqq);
 779         cfq_clear_cfqq_wait_request(cfqq);
 780
 781         /*
 782          * store what was left of this slice, if the queue idled out
 783          * or was preempted
 784          */
 785         if (time_after(cfqq->slice_end, now))
 786                 cfqq->slice_left = cfqq->slice_end - now;
 787         else
 788                 cfqq->slice_left = 0;
 789
 790         if (cfq_cfqq_on_rr(cfqq))
 791                 cfq_resort_rr_list(cfqq, preempted);
 792
 793         if (cfqq == cfqd->active_queue)
 794                 cfqd->active_queue = NULL;
 795
 796         if (cfqd->active_cic) {
 797                 put_io_context(cfqd->active_cic->ioc);
 798                 cfqd->active_cic = NULL;
 799         }
 800
 801         cfqd->dispatch_slice = 0;
 802 }
 803
 804 static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted)
 805 {
 806         struct cfq_queue *cfqq = cfqd->active_queue;
 807
 808         if (cfqq)
 809                 __cfq_slice_expired(cfqd, cfqq, preempted);
 810 }
 811
 812 /*
 813  * 0
 814  * 0,1
 815  * 0,1,2
 816  * 0,1,2,3
 817  * 0,1,2,3,4
 818  * 0,1,2,3,4,5
 819  * 0,1,2,3,4,5,6
 820  * 0,1,2,3,4,5,6,7
 821  */
 822 static int cfq_get_next_prio_level(struct cfq_data *cfqd)
 823 {
 824         int prio, wrap;
 825
 826         prio = -1;
 827         wrap = 0;
 828         do {
 829                 int p;
 830
 831                 for (p = cfqd->cur_prio; p <= cfqd->cur_end_prio; p++) {
 832                         if (!list_empty(&cfqd->rr_list[p])) {
 833                                 prio = p;
 834                                 break;
 835                         }
 836                 }
 837
 838                 if (prio != -1)
 839                         break;
 840                 cfqd->cur_prio = 0;
 841                 if (++cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
 842                         cfqd->cur_end_prio = 0;
 843                         if (wrap)
 844                                 break;
 845                         wrap = 1;
 846                 }
 847         } while (1);
 848
 849         if (unlikely(prio == -1))
 850                 return -1;
 851
 852         BUG_ON(prio >= CFQ_PRIO_LISTS);
 853
 854         list_splice_init(&cfqd->rr_list[prio], &cfqd->cur_rr);
 855
 856         cfqd->cur_prio = prio + 1;
 857         if (cfqd->cur_prio > cfqd->cur_end_prio) {
 858                 cfqd->cur_end_prio = cfqd->cur_prio;
 859                 cfqd->cur_prio = 0;
 860         }
 861         if (cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
 862                 cfqd->cur_prio = 0;
 863                 cfqd->cur_end_prio = 0;
 864         }
 865
 866         return prio;
 867 }
 868
 869 static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
 870 {
 871         struct cfq_queue *cfqq = NULL;
 872
 873         /*
 874          * if current list is non-empty, grab first entry. if it is empty,
 875          * get next prio level and grab first entry then if any are spliced
 876          */
 877         if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1)
 878                 cfqq = list_entry_cfqq(cfqd->cur_rr.next);
 879
 880         /*
 881          * If no new queues are available, check if the busy list has some
 882          * before falling back to idle io.
 883          */
 884         if (!cfqq && !list_empty(&cfqd->busy_rr))
 885                 cfqq = list_entry_cfqq(cfqd->busy_rr.next);
 886
 887         /*
 888          * if we have idle queues and no rt or be queues had pending
 889          * requests, either allow immediate service if the grace period
 890          * has passed or arm the idle grace timer
 891          */
 892         if (!cfqq && !list_empty(&cfqd->idle_rr)) {
 893                 unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE;
 894
 895                 if (time_after_eq(jiffies, end))
 896                         cfqq = list_entry_cfqq(cfqd->idle_rr.next);
 897                 else
 898                         mod_timer(&cfqd->idle_class_timer, end);
 899         }
 900
 901         __cfq_set_active_queue(cfqd, cfqq);
 902         return cfqq;
 903 }
 904
 905 #define CIC_SEEKY(cic) ((cic)->seek_mean > (128 * 1024))
 906
 907 static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 908
 909 {
 910         struct cfq_io_context *cic;
 911         unsigned long sl;
 912
 913         WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
 914         WARN_ON(cfqq != cfqd->active_queue);
 915
 916         /*
 917          * idle is disabled, either manually or by past process history
 918          */
 919         if (!cfqd->cfq_slice_idle)
 920                 return 0;
 921         if (!cfq_cfqq_idle_window(cfqq))
 922                 return 0;
 923         /*
 924          * task has exited, don't wait
 925          */
 926         cic = cfqd->active_cic;
 927         if (!cic || !cic->ioc->task)
 928                 return 0;
 929
 930         cfq_mark_cfqq_must_dispatch(cfqq);
 931         cfq_mark_cfqq_wait_request(cfqq);
 932
 933         sl = min(cfqq->slice_end - 1, (unsigned long) cfqd->cfq_slice_idle);
 934
 935         /*
 936          * we don't want to idle for seeks, but we do want to allow
 937          * fair distribution of slice time for a process doing back-to-back
 938          * seeks. so allow a little bit of time for him to submit a new rq
 939          */
 940         if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
 941                 sl = min(sl, msecs_to_jiffies(2));
 942
 943         mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
 944         return 1;
 945 }
 946
 947 static void cfq_dispatch_insert(request_queue_t *q, struct cfq_rq *crq)
 948 {
 949         struct cfq_data *cfqd = q->elevator->elevator_data;
 950         struct cfq_queue *cfqq = crq->cfq_queue;
 951         struct request *rq;
 952
 953         cfqq->next_crq = cfq_find_next_crq(cfqd, cfqq, crq);
 954         cfq_remove_request(crq->request);
 955         cfqq->on_dispatch[cfq_crq_is_sync(crq)]++;
 956         elv_dispatch_sort(q, crq->request);
 957
 958         rq = list_entry(q->queue_head.prev, struct request, queuelist);
 959         cfqd->last_sector = rq->sector + rq->nr_sectors;
 960 }
 961
 962 /*
 963  * return expired entry, or NULL to just start from scratch in rbtree
 964  */
 965 static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq)
 966 {
 967         struct cfq_data *cfqd = cfqq->cfqd;
 968         struct request *rq;
 969         struct cfq_rq *crq;
 970
 971         if (cfq_cfqq_fifo_expire(cfqq))
 972                 return NULL;
 973
 974         if (!list_empty(&cfqq->fifo)) {
 975                 int fifo = cfq_cfqq_class_sync(cfqq);
 976
 977                 crq = RQ_DATA(list_entry_fifo(cfqq->fifo.next));
 978                 rq = crq->request;
 979                 if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) {
 980                         cfq_mark_cfqq_fifo_expire(cfqq);
 981                         return crq;
 982                 }
 983         }
 984
 985         return NULL;
 986 }
 987
 988 /*
 989  * Scale schedule slice based on io priority. Use the sync time slice only
 990  * if a queue is marked sync and has sync io queued. A sync queue with async
 991  * io only, should not get full sync slice length.
 992  */
 993 static inline int
 994 cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 995 {
 996         const int base_slice = cfqd->cfq_slice[cfq_cfqq_sync(cfqq)];
 997
 998         WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
 999
1000         return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - cfqq->ioprio));
1001 }
1002
1003 static inline void
1004 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1005 {
1006         cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
1007 }
1008
1009 static inline int
1010 cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1011 {
1012         const int base_rq = cfqd->cfq_slice_async_rq;
1013
1014         WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
1015
1016         return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
1017 }
1018
1019 /*
1020  * get next queue for service
1021  */
1022 static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
1023 {
1024         unsigned long now = jiffies;
1025         struct cfq_queue *cfqq;
1026
1027         cfqq = cfqd->active_queue;
1028         if (!cfqq)
1029                 goto new_queue;
1030
1031         /*
1032          * slice has expired
1033          */
1034         if (!cfq_cfqq_must_dispatch(cfqq) && time_after(now, cfqq->slice_end))
1035                 goto expire;
1036
1037         /*
1038          * if queue has requests, dispatch one. if not, check if
1039          * enough slice is left to wait for one
1040          */
1041         if (!RB_EMPTY_ROOT(&cfqq->sort_list))
1042                 goto keep_queue;
1043         else if (cfq_cfqq_dispatched(cfqq)) {
1044                 cfqq = NULL;
1045                 goto keep_queue;
1046         } else if (cfq_cfqq_class_sync(cfqq)) {
1047                 if (cfq_arm_slice_timer(cfqd, cfqq))
1048                         return NULL;
1049         }
1050
1051 expire:
1052         cfq_slice_expired(cfqd, 0);
1053 new_queue:
1054         cfqq = cfq_set_active_queue(cfqd);
1055 keep_queue:
1056         return cfqq;
1057 }
1058
1059 static int
1060 __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1061                         int max_dispatch)
1062 {
1063         int dispatched = 0;
1064
1065         BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
1066
1067         do {
1068                 struct cfq_rq *crq;
1069
1070                 /*
1071                  * follow expired path, else get first next available
1072                  */
1073                 if ((crq = cfq_check_fifo(cfqq)) == NULL)
1074                         crq = cfqq->next_crq;
1075
1076                 /*
1077                  * finally, insert request into driver dispatch list
1078                  */
1079                 cfq_dispatch_insert(cfqd->queue, crq);
1080
1081                 cfqd->dispatch_slice++;
1082                 dispatched++;
1083
1084                 if (!cfqd->active_cic) {
1085                         atomic_inc(&crq->io_context->ioc->refcount);
1086                         cfqd->active_cic = crq->io_context;
1087                 }
1088
1089                 if (RB_EMPTY_ROOT(&cfqq->sort_list))
1090                         break;
1091
1092         } while (dispatched < max_dispatch);
1093
1094         /*
1095          * if slice end isn't set yet, set it.
1096          */
1097         if (!cfqq->slice_end)
1098                 cfq_set_prio_slice(cfqd, cfqq);
1099
1100         /*
1101          * expire an async queue immediately if it has used up its slice. idle
1102          * queue always expire after 1 dispatch round.
1103          */
1104         if ((!cfq_cfqq_sync(cfqq) &&
1105             cfqd->dispatch_slice >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
1106             cfq_class_idle(cfqq) ||
1107             !cfq_cfqq_idle_window(cfqq))
1108                 cfq_slice_expired(cfqd, 0);
1109
1110         return dispatched;
1111 }
1112
1113 static int
1114 cfq_forced_dispatch_cfqqs(struct list_head *list)
1115 {
1116         struct cfq_queue *cfqq, *next;
1117         struct cfq_rq *crq;
1118         int dispatched;
1119
1120         dispatched = 0;
1121         list_for_each_entry_safe(cfqq, next, list, cfq_list) {
1122                 while ((crq = cfqq->next_crq)) {
1123                         cfq_dispatch_insert(cfqq->cfqd->queue, crq);
1124                         dispatched++;
1125                 }
1126                 BUG_ON(!list_empty(&cfqq->fifo));
1127         }
1128
1129         return dispatched;
1130 }
1131
1132 static int
1133 cfq_forced_dispatch(struct cfq_data *cfqd)
1134 {
1135         int i, dispatched = 0;
1136
1137         for (i = 0; i < CFQ_PRIO_LISTS; i++)
1138                 dispatched += cfq_forced_dispatch_cfqqs(&cfqd->rr_list[i]);
1139
1140         dispatched += cfq_forced_dispatch_cfqqs(&cfqd->busy_rr);
1141         dispatched += cfq_forced_dispatch_cfqqs(&cfqd->cur_rr);
1142         dispatched += cfq_forced_dispatch_cfqqs(&cfqd->idle_rr);
1143
1144         cfq_slice_expired(cfqd, 0);
1145
1146         BUG_ON(cfqd->busy_queues);
1147
1148         return dispatched;
1149 }
1150
1151 static int
1152 cfq_dispatch_requests(request_queue_t *q, int force)
1153 {
1154         struct cfq_data *cfqd = q->elevator->elevator_data;
1155         struct cfq_queue *cfqq, *prev_cfqq;
1156         int dispatched;
1157
1158         if (!cfqd->busy_queues)
1159                 return 0;
1160
1161         if (unlikely(force))
1162                 return cfq_forced_dispatch(cfqd);
1163
1164         dispatched = 0;
1165         prev_cfqq = NULL;
1166         while ((cfqq = cfq_select_queue(cfqd)) != NULL) {
1167                 int max_dispatch;
1168
1169                 /*
1170                  * Don't repeat dispatch from the previous queue.
1171                  */
1172                 if (prev_cfqq == cfqq)
1173                         break;
1174
1175                 cfq_clear_cfqq_must_dispatch(cfqq);
1176                 cfq_clear_cfqq_wait_request(cfqq);
1177                 del_timer(&cfqd->idle_slice_timer);
1178
1179                 max_dispatch = cfqd->cfq_quantum;
1180                 if (cfq_class_idle(cfqq))
1181                         max_dispatch = 1;
1182
1183                 dispatched += __cfq_dispatch_requests(cfqd, cfqq, max_dispatch);
1184
1185                 /*
1186                  * If the dispatch cfqq has idling enabled and is still
1187                  * the active queue, break out.
1188                  */
1189                 if (cfq_cfqq_idle_window(cfqq) && cfqd->active_queue)
1190                         break;
1191
1192                 prev_cfqq = cfqq;
1193         }
1194
1195         return dispatched;
1196 }
1197
1198 /*
1199  * task holds one reference to the queue, dropped when task exits. each crq
1200  * in-flight on this queue also holds a reference, dropped when crq is freed.
1201  *
1202  * queue lock must be held here.
1203  */
1204 static void cfq_put_queue(struct cfq_queue *cfqq)
1205 {
1206         struct cfq_data *cfqd = cfqq->cfqd;
1207
1208         BUG_ON(atomic_read(&cfqq->ref) <= 0);
1209
1210         if (!atomic_dec_and_test(&cfqq->ref))
1211                 return;
1212
1213         BUG_ON(rb_first(&cfqq->sort_list));
1214         BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
1215         BUG_ON(cfq_cfqq_on_rr(cfqq));
1216
1217         if (unlikely(cfqd->active_queue == cfqq))
1218                 __cfq_slice_expired(cfqd, cfqq, 0);
1219
1220         /*
1221          * it's on the empty list and still hashed
1222          */
1223         list_del(&cfqq->cfq_list);
1224         hlist_del(&cfqq->cfq_hash);
1225         kmem_cache_free(cfq_pool, cfqq);
1226 }
1227
1228 static inline struct cfq_queue *
1229 __cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio,
1230                     const int hashval)
1231 {
1232         struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
1233         struct hlist_node *entry;
1234         struct cfq_queue *__cfqq;
1235
1236         hlist_for_each_entry(__cfqq, entry, hash_list, cfq_hash) {
1237                 const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->org_ioprio_class, __cfqq->org_ioprio);
1238
1239                 if (__cfqq->key == key && (__p == prio || !prio))
1240                         return __cfqq;
1241         }
1242
1243         return NULL;
1244 }
1245
1246 static struct cfq_queue *
1247 cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned short prio)
1248 {
1249         return __cfq_find_cfq_hash(cfqd, key, prio, hash_long(key, CFQ_QHASH_SHIFT));
1250 }
1251
1252 static void cfq_free_io_context(struct io_context *ioc)
1253 {
1254         struct cfq_io_context *__cic;
1255         struct rb_node *n;
1256         int freed = 0;
1257
1258         while ((n = rb_first(&ioc->cic_root)) != NULL) {
1259                 __cic = rb_entry(n, struct cfq_io_context, rb_node);
1260                 rb_erase(&__cic->rb_node, &ioc->cic_root);
1261                 kmem_cache_free(cfq_ioc_pool, __cic);
1262                 freed++;
1263         }
1264
1265         if (atomic_sub_and_test(freed, &ioc_count) && ioc_gone)
1266                 complete(ioc_gone);
1267 }
1268
1269 static void cfq_trim(struct io_context *ioc)
1270 {
1271         ioc->set_ioprio = NULL;
1272         cfq_free_io_context(ioc);
1273 }
1274
1275 /*
1276  * Called with interrupts disabled
1277  */
1278 static void cfq_exit_single_io_context(struct cfq_io_context *cic)
1279 {
1280         struct cfq_data *cfqd = cic->key;
1281         request_queue_t *q;
1282
1283         if (!cfqd)
1284                 return;
1285
1286         q = cfqd->queue;
1287
1288         WARN_ON(!irqs_disabled());
1289
1290         spin_lock(q->queue_lock);
1291
1292         if (cic->cfqq[ASYNC]) {
1293                 if (unlikely(cic->cfqq[ASYNC] == cfqd->active_queue))
1294                         __cfq_slice_expired(cfqd, cic->cfqq[ASYNC], 0);
1295                 cfq_put_queue(cic->cfqq[ASYNC]);
1296                 cic->cfqq[ASYNC] = NULL;
1297         }
1298
1299         if (cic->cfqq[SYNC]) {
1300                 if (unlikely(cic->cfqq[SYNC] == cfqd->active_queue))
1301                         __cfq_slice_expired(cfqd, cic->cfqq[SYNC], 0);
1302                 cfq_put_queue(cic->cfqq[SYNC]);
1303                 cic->cfqq[SYNC] = NULL;
1304         }
1305
1306         cic->key = NULL;
1307         list_del_init(&cic->queue_list);
1308         spin_unlock(q->queue_lock);
1309 }
1310
1311 static void cfq_exit_io_context(struct io_context *ioc)
1312 {
1313         struct cfq_io_context *__cic;
1314         unsigned long flags;
1315         struct rb_node *n;
1316
1317         /*
1318          * put the reference this task is holding to the various queues
1319          */
1320         spin_lock_irqsave(&cfq_exit_lock, flags);
1321
1322         n = rb_first(&ioc->cic_root);
1323         while (n != NULL) {
1324                 __cic = rb_entry(n, struct cfq_io_context, rb_node);
1325
1326                 cfq_exit_single_io_context(__cic);
1327                 n = rb_next(n);
1328         }
1329
1330         spin_unlock_irqrestore(&cfq_exit_lock, flags);
1331 }
1332
1333 static struct cfq_io_context *
1334 cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
1335 {
1336         struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_mask);
1337
1338         if (cic) {
1339                 memset(cic, 0, sizeof(*cic));
1340                 cic->last_end_request = jiffies;
1341                 INIT_LIST_HEAD(&cic->queue_list);
1342                 cic->dtor = cfq_free_io_context;
1343                 cic->exit = cfq_exit_io_context;
1344                 atomic_inc(&ioc_count);
1345         }
1346
1347         return cic;
1348 }
1349
1350 static void cfq_init_prio_data(struct cfq_queue *cfqq)
1351 {
1352         struct task_struct *tsk = current;
1353         int ioprio_class;
1354
1355         if (!cfq_cfqq_prio_changed(cfqq))
1356                 return;
1357
1358         ioprio_class = IOPRIO_PRIO_CLASS(tsk->ioprio);
1359         switch (ioprio_class) {
1360                 default:
1361                         printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
1362                 case IOPRIO_CLASS_NONE:
1363                         /*
1364                          * no prio set, place us in the middle of the BE classes
1365                          */
1366                         cfqq->ioprio = task_nice_ioprio(tsk);
1367                         cfqq->ioprio_class = IOPRIO_CLASS_BE;
1368                         break;
1369                 case IOPRIO_CLASS_RT:
1370                         cfqq->ioprio = task_ioprio(tsk);
1371                         cfqq->ioprio_class = IOPRIO_CLASS_RT;
1372                         break;
1373                 case IOPRIO_CLASS_BE:
1374                         cfqq->ioprio = task_ioprio(tsk);
1375                         cfqq->ioprio_class = IOPRIO_CLASS_BE;
1376                         break;
1377                 case IOPRIO_CLASS_IDLE:
1378                         cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
1379                         cfqq->ioprio = 7;
1380                         cfq_clear_cfqq_idle_window(cfqq);
1381                         break;
1382         }
1383
1384         /*
1385          * keep track of original prio settings in case we have to temporarily
1386          * elevate the priority of this queue
1387          */
1388         cfqq->org_ioprio = cfqq->ioprio;
1389         cfqq->org_ioprio_class = cfqq->ioprio_class;
1390
1391         if (cfq_cfqq_on_rr(cfqq))
1392                 cfq_resort_rr_list(cfqq, 0);
1393
1394         cfq_clear_cfqq_prio_changed(cfqq);
1395 }
1396
1397 static inline void changed_ioprio(struct cfq_io_context *cic)
1398 {
1399         struct cfq_data *cfqd = cic->key;
1400         struct cfq_queue *cfqq;
1401
1402         if (unlikely(!cfqd))
1403                 return;
1404
1405         spin_lock(cfqd->queue->queue_lock);
1406
1407         cfqq = cic->cfqq[ASYNC];
1408         if (cfqq) {
1409                 struct cfq_queue *new_cfqq;
1410                 new_cfqq = cfq_get_queue(cfqd, CFQ_KEY_ASYNC, cic->ioc->task,
1411                                          GFP_ATOMIC);
1412                 if (new_cfqq) {
1413                         cic->cfqq[ASYNC] = new_cfqq;
1414                         cfq_put_queue(cfqq);
1415                 }
1416         }
1417
1418         cfqq = cic->cfqq[SYNC];
1419         if (cfqq)
1420                 cfq_mark_cfqq_prio_changed(cfqq);
1421
1422         spin_unlock(cfqd->queue->queue_lock);
1423 }
1424
1425 /*
1426  * callback from sys_ioprio_set, irqs are disabled
1427  */
1428 static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio)
1429 {
1430         struct cfq_io_context *cic;
1431         struct rb_node *n;
1432
1433         spin_lock(&cfq_exit_lock);
1434
1435         n = rb_first(&ioc->cic_root);
1436         while (n != NULL) {
1437                 cic = rb_entry(n, struct cfq_io_context, rb_node);
1438
1439                 changed_ioprio(cic);
1440                 n = rb_next(n);
1441         }
1442
1443         spin_unlock(&cfq_exit_lock);
1444
1445         return 0;
1446 }
1447
1448 static struct cfq_queue *
1449 cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk,
1450               gfp_t gfp_mask)
1451 {
1452         const int hashval = hash_long(key, CFQ_QHASH_SHIFT);
1453         struct cfq_queue *cfqq, *new_cfqq = NULL;
1454         unsigned short ioprio;
1455
1456 retry:
1457         ioprio = tsk->ioprio;
1458         cfqq = __cfq_find_cfq_hash(cfqd, key, ioprio, hashval);
1459
1460         if (!cfqq) {
1461                 if (new_cfqq) {
1462                         cfqq = new_cfqq;
1463                         new_cfqq = NULL;
1464                 } else if (gfp_mask & __GFP_WAIT) {
1465                         spin_unlock_irq(cfqd->queue->queue_lock);
1466                         new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask);
1467                         spin_lock_irq(cfqd->queue->queue_lock);
1468                         goto retry;
1469                 } else {
1470                         cfqq = kmem_cache_alloc(cfq_pool, gfp_mask);
1471                         if (!cfqq)
1472                                 goto out;
1473                 }
1474
1475                 memset(cfqq, 0, sizeof(*cfqq));
1476
1477                 INIT_HLIST_NODE(&cfqq->cfq_hash);
1478                 INIT_LIST_HEAD(&cfqq->cfq_list);
1479                 INIT_LIST_HEAD(&cfqq->fifo);
1480
1481                 cfqq->key = key;
1482                 hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
1483                 atomic_set(&cfqq->ref, 0);
1484                 cfqq->cfqd = cfqd;
1485                 cfqq->service_last = 0;
1486                 /*
1487                  * set ->slice_left to allow preemption for a new process
1488                  */
1489                 cfqq->slice_left = 2 * cfqd->cfq_slice_idle;
1490                 cfq_mark_cfqq_idle_window(cfqq);
1491                 cfq_mark_cfqq_prio_changed(cfqq);
1492                 cfq_init_prio_data(cfqq);
1493         }
1494
1495         if (new_cfqq)
1496                 kmem_cache_free(cfq_pool, new_cfqq);
1497
1498         atomic_inc(&cfqq->ref);
1499 out:
1500         WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq);
1501         return cfqq;
1502 }
1503
1504 static void
1505 cfq_drop_dead_cic(struct io_context *ioc, struct cfq_io_context *cic)
1506 {
1507         spin_lock(&cfq_exit_lock);
1508         rb_erase(&cic->rb_node, &ioc->cic_root);
1509         list_del_init(&cic->queue_list);
1510         spin_unlock(&cfq_exit_lock);
1511         kmem_cache_free(cfq_ioc_pool, cic);
1512         atomic_dec(&ioc_count);
1513 }
1514
1515 static struct cfq_io_context *
1516 cfq_cic_rb_lookup(struct cfq_data *cfqd, struct io_context *ioc)
1517 {
1518         struct rb_node *n;
1519         struct cfq_io_context *cic;
1520         void *k, *key = cfqd;
1521
1522 restart:
1523         n = ioc->cic_root.rb_node;
1524         while (n) {
1525                 cic = rb_entry(n, struct cfq_io_context, rb_node);
1526                 /* ->key must be copied to avoid race with cfq_exit_queue() */
1527                 k = cic->key;
1528                 if (unlikely(!k)) {
1529                         cfq_drop_dead_cic(ioc, cic);
1530                         goto restart;
1531                 }
1532
1533                 if (key < k)
1534                         n = n->rb_left;
1535                 else if (key > k)
1536                         n = n->rb_right;
1537                 else
1538                         return cic;
1539         }
1540
1541         return NULL;
1542 }
1543
1544 static inline void
1545 cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
1546              struct cfq_io_context *cic)
1547 {
1548         struct rb_node **p;
1549         struct rb_node *parent;
1550         struct cfq_io_context *__cic;
1551         void *k;
1552
1553         cic->ioc = ioc;
1554         cic->key = cfqd;
1555
1556         ioc->set_ioprio = cfq_ioc_set_ioprio;
1557 restart:
1558         parent = NULL;
1559         p = &ioc->cic_root.rb_node;
1560         while (*p) {
1561                 parent = *p;
1562                 __cic = rb_entry(parent, struct cfq_io_context, rb_node);
1563                 /* ->key must be copied to avoid race with cfq_exit_queue() */
1564                 k = __cic->key;
1565                 if (unlikely(!k)) {
1566                         cfq_drop_dead_cic(ioc, __cic);
1567                         goto restart;
1568                 }
1569
1570                 if (cic->key < k)
1571                         p = &(*p)->rb_left;
1572                 else if (cic->key > k)
1573                         p = &(*p)->rb_right;
1574                 else
1575                         BUG();
1576         }
1577
1578         spin_lock(&cfq_exit_lock);
1579         rb_link_node(&cic->rb_node, parent, p);
1580         rb_insert_color(&cic->rb_node, &ioc->cic_root);
1581         list_add(&cic->queue_list, &cfqd->cic_list);
1582         spin_unlock(&cfq_exit_lock);
1583 }
1584
1585 /*
1586  * Setup general io context and cfq io context. There can be several cfq
1587  * io contexts per general io context, if this process is doing io to more
1588  * than one device managed by cfq.
1589  */
1590 static struct cfq_io_context *
1591 cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
1592 {
1593         struct io_context *ioc = NULL;
1594         struct cfq_io_context *cic;
1595
1596         might_sleep_if(gfp_mask & __GFP_WAIT);
1597
1598         ioc = get_io_context(gfp_mask);
1599         if (!ioc)
1600                 return NULL;
1601
1602         cic = cfq_cic_rb_lookup(cfqd, ioc);
1603         if (cic)
1604                 goto out;
1605
1606         cic = cfq_alloc_io_context(cfqd, gfp_mask);
1607         if (cic == NULL)
1608                 goto err;
1609
1610         cfq_cic_link(cfqd, ioc, cic);
1611 out:
1612         return cic;
1613 err:
1614         put_io_context(ioc);
1615         return NULL;
1616 }
1617
1618 static void
1619 cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
1620 {
1621         unsigned long elapsed, ttime;
1622
1623         /*
1624          * if this context already has stuff queued, thinktime is from
1625          * last queue not last end
1626          */
1627 #if 0
1628         if (time_after(cic->last_end_request, cic->last_queue))
1629                 elapsed = jiffies - cic->last_end_request;
1630         else
1631                 elapsed = jiffies - cic->last_queue;
1632 #else
1633                 elapsed = jiffies - cic->last_end_request;
1634 #endif
1635
1636         ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
1637
1638         cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
1639         cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
1640         cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
1641 }
1642
1643 static void
1644 cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic,
1645                        struct cfq_rq *crq)
1646 {
1647         sector_t sdist;
1648         u64 total;
1649
1650         if (cic->last_request_pos < crq->request->sector)
1651                 sdist = crq->request->sector - cic->last_request_pos;
1652         else
1653                 sdist = cic->last_request_pos - crq->request->sector;
1654
1655         /*
1656          * Don't allow the seek distance to get too large from the
1657          * odd fragment, pagein, etc
1658          */
1659         if (cic->seek_samples <= 60) /* second&third seek */
1660                 sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024);
1661         else
1662                 sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*64);
1663
1664         cic->seek_samples = (7*cic->seek_samples + 256) / 8;
1665         cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
1666         total = cic->seek_total + (cic->seek_samples/2);
1667         do_div(total, cic->seek_samples);
1668         cic->seek_mean = (sector_t)total;
1669 }
1670
1671 /*
1672  * Disable idle window if the process thinks too long or seeks so much that
1673  * it doesn't matter
1674  */
1675 static void
1676 cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1677                        struct cfq_io_context *cic)
1678 {
1679         int enable_idle = cfq_cfqq_idle_window(cfqq);
1680
1681         if (!cic->ioc->task || !cfqd->cfq_slice_idle ||
1682             (cfqd->hw_tag && CIC_SEEKY(cic)))
1683                 enable_idle = 0;
1684         else if (sample_valid(cic->ttime_samples)) {
1685                 if (cic->ttime_mean > cfqd->cfq_slice_idle)
1686                         enable_idle = 0;
1687                 else
1688                         enable_idle = 1;
1689         }
1690
1691         if (enable_idle)
1692                 cfq_mark_cfqq_idle_window(cfqq);
1693         else
1694                 cfq_clear_cfqq_idle_window(cfqq);
1695 }
1696
1697
1698 /*
1699  * Check if new_cfqq should preempt the currently active queue. Return 0 for
1700  * no or if we aren't sure, a 1 will cause a preempt.
1701  */
1702 static int
1703 cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
1704                    struct cfq_rq *crq)
1705 {
1706         struct cfq_queue *cfqq = cfqd->active_queue;
1707
1708         if (cfq_class_idle(new_cfqq))
1709                 return 0;
1710
1711         if (!cfqq)
1712                 return 0;
1713
1714         if (cfq_class_idle(cfqq))
1715                 return 1;
1716         if (!cfq_cfqq_wait_request(new_cfqq))
1717                 return 0;
1718         /*
1719          * if it doesn't have slice left, forget it
1720          */
1721         if (new_cfqq->slice_left < cfqd->cfq_slice_idle)
1722                 return 0;
1723         if (cfq_crq_is_sync(crq) && !cfq_cfqq_sync(cfqq))
1724                 return 1;
1725
1726         return 0;
1727 }
1728
1729 /*
1730  * cfqq preempts the active queue. if we allowed preempt with no slice left,
1731  * let it have half of its nominal slice.
1732  */
1733 static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1734 {
1735         struct cfq_queue *__cfqq, *next;
1736
1737         list_for_each_entry_safe(__cfqq, next, &cfqd->cur_rr, cfq_list)
1738                 cfq_resort_rr_list(__cfqq, 1);
1739
1740         if (!cfqq->slice_left)
1741                 cfqq->slice_left = cfq_prio_to_slice(cfqd, cfqq) / 2;
1742
1743         cfqq->slice_end = cfqq->slice_left + jiffies;
1744         cfq_slice_expired(cfqd, 1);
1745         __cfq_set_active_queue(cfqd, cfqq);
1746 }
1747
1748 /*
1749  * should really be a ll_rw_blk.c helper
1750  */
1751 static void cfq_start_queueing(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1752 {
1753         request_queue_t *q = cfqd->queue;
1754
1755         if (!blk_queue_plugged(q))
1756                 q->request_fn(q);
1757         else
1758                 __generic_unplug_device(q);
1759 }
1760
1761 /*
1762  * Called when a new fs request (crq) is added (to cfqq). Check if there's
1763  * something we should do about it
1764  */
1765 static void
1766 cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1767                  struct cfq_rq *crq)
1768 {
1769         struct cfq_io_context *cic = crq->io_context;
1770
1771         /*
1772          * we never wait for an async request and we don't allow preemption
1773          * of an async request. so just return early
1774          */
1775         if (!cfq_crq_is_sync(crq)) {
1776                 /*
1777                  * sync process issued an async request, if it's waiting
1778                  * then expire it and kick rq handling.
1779                  */
1780                 if (cic == cfqd->active_cic &&
1781                     del_timer(&cfqd->idle_slice_timer)) {
1782                         cfq_slice_expired(cfqd, 0);
1783                         cfq_start_queueing(cfqd, cfqq);
1784                 }
1785                 return;
1786         }
1787
1788         cfq_update_io_thinktime(cfqd, cic);
1789         cfq_update_io_seektime(cfqd, cic, crq);
1790         cfq_update_idle_window(cfqd, cfqq, cic);
1791
1792         cic->last_queue = jiffies;
1793         cic->last_request_pos = crq->request->sector + crq->request->nr_sectors;
1794
1795         if (cfqq == cfqd->active_queue) {
1796                 /*
1797                  * if we are waiting for a request for this queue, let it rip
1798                  * immediately and flag that we must not expire this queue
1799                  * just now
1800                  */
1801                 if (cfq_cfqq_wait_request(cfqq)) {
1802                         cfq_mark_cfqq_must_dispatch(cfqq);
1803                         del_timer(&cfqd->idle_slice_timer);
1804                         cfq_start_queueing(cfqd, cfqq);
1805                 }
1806         } else if (cfq_should_preempt(cfqd, cfqq, crq)) {
1807                 /*
1808                  * not the active queue - expire current slice if it is
1809                  * idle and has expired it's mean thinktime or this new queue
1810                  * has some old slice time left and is of higher priority
1811                  */
1812                 cfq_preempt_queue(cfqd, cfqq);
1813                 cfq_mark_cfqq_must_dispatch(cfqq);
1814                 cfq_start_queueing(cfqd, cfqq);
1815         }
1816 }
1817
1818 static void cfq_insert_request(request_queue_t *q, struct request *rq)
1819 {
1820         struct cfq_data *cfqd = q->elevator->elevator_data;
1821         struct cfq_rq *crq = RQ_DATA(rq);
1822         struct cfq_queue *cfqq = crq->cfq_queue;
1823
1824         cfq_init_prio_data(cfqq);
1825
1826         cfq_add_crq_rb(crq);
1827
1828         list_add_tail(&rq->queuelist, &cfqq->fifo);
1829
1830         if (rq_mergeable(rq))
1831                 cfq_add_crq_hash(cfqd, crq);
1832
1833         cfq_crq_enqueued(cfqd, cfqq, crq);
1834 }
1835
1836 static void cfq_completed_request(request_queue_t *q, struct request *rq)
1837 {
1838         struct cfq_rq *crq = RQ_DATA(rq);
1839         struct cfq_queue *cfqq = crq->cfq_queue;
1840         struct cfq_data *cfqd = cfqq->cfqd;
1841         const int sync = cfq_crq_is_sync(crq);
1842         unsigned long now;
1843
1844         now = jiffies;
1845
1846         WARN_ON(!cfqd->rq_in_driver);
1847         WARN_ON(!cfqq->on_dispatch[sync]);
1848         cfqd->rq_in_driver--;
1849         cfqq->on_dispatch[sync]--;
1850
1851         if (!cfq_class_idle(cfqq))
1852                 cfqd->last_end_request = now;
1853
1854         if (!cfq_cfqq_dispatched(cfqq)) {
1855                 if (cfq_cfqq_on_rr(cfqq)) {
1856                         cfqq->service_last = now;
1857                         cfq_resort_rr_list(cfqq, 0);
1858                 }
1859         }
1860
1861         if (sync)
1862                 crq->io_context->last_end_request = now;
1863
1864         /*
1865          * If this is the active queue, check if it needs to be expired,
1866          * or if we want to idle in case it has no pending requests.
1867          */
1868         if (cfqd->active_queue == cfqq) {
1869                 if (time_after(now, cfqq->slice_end))
1870                         cfq_slice_expired(cfqd, 0);
1871                 else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list)) {
1872                         if (!cfq_arm_slice_timer(cfqd, cfqq))
1873                                 cfq_schedule_dispatch(cfqd);
1874                 }
1875         }
1876 }
1877
1878 static struct request *
1879 cfq_former_request(request_queue_t *q, struct request *rq)
1880 {
1881         struct cfq_rq *crq = RQ_DATA(rq);
1882         struct rb_node *rbprev = rb_prev(&crq->rb_node);
1883
1884         if (rbprev)
1885                 return rb_entry_crq(rbprev)->request;
1886
1887         return NULL;
1888 }
1889
1890 static struct request *
1891 cfq_latter_request(request_queue_t *q, struct request *rq)
1892 {
1893         struct cfq_rq *crq = RQ_DATA(rq);
1894         struct rb_node *rbnext = rb_next(&crq->rb_node);
1895
1896         if (rbnext)
1897                 return rb_entry_crq(rbnext)->request;
1898
1899         return NULL;
1900 }
1901
1902 /*
1903  * we temporarily boost lower priority queues if they are holding fs exclusive
1904  * resources. they are boosted to normal prio (CLASS_BE/4)
1905  */
1906 static void cfq_prio_boost(struct cfq_queue *cfqq)
1907 {
1908         const int ioprio_class = cfqq->ioprio_class;
1909         const int ioprio = cfqq->ioprio;
1910
1911         if (has_fs_excl()) {
1912                 /*
1913                  * boost idle prio on transactions that would lock out other
1914                  * users of the filesystem
1915                  */
1916                 if (cfq_class_idle(cfqq))
1917                         cfqq->ioprio_class = IOPRIO_CLASS_BE;
1918                 if (cfqq->ioprio > IOPRIO_NORM)
1919                         cfqq->ioprio = IOPRIO_NORM;
1920         } else {
1921                 /*
1922                  * check if we need to unboost the queue
1923                  */
1924                 if (cfqq->ioprio_class != cfqq->org_ioprio_class)
1925                         cfqq->ioprio_class = cfqq->org_ioprio_class;
1926                 if (cfqq->ioprio != cfqq->org_ioprio)
1927                         cfqq->ioprio = cfqq->org_ioprio;
1928         }
1929
1930         /*
1931          * refile between round-robin lists if we moved the priority class
1932          */
1933         if ((ioprio_class != cfqq->ioprio_class || ioprio != cfqq->ioprio) &&
1934             cfq_cfqq_on_rr(cfqq))
1935                 cfq_resort_rr_list(cfqq, 0);
1936 }
1937
1938 static inline int
1939 __cfq_may_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1940                 struct task_struct *task, int rw)
1941 {
1942         if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) &&
1943             !cfq_cfqq_must_alloc_slice(cfqq)) {
1944                 cfq_mark_cfqq_must_alloc_slice(cfqq);
1945                 return ELV_MQUEUE_MUST;
1946         }
1947
1948         return ELV_MQUEUE_MAY;
1949 }
1950
1951 static int cfq_may_queue(request_queue_t *q, int rw, struct bio *bio)
1952 {
1953         struct cfq_data *cfqd = q->elevator->elevator_data;
1954         struct task_struct *tsk = current;
1955         struct cfq_queue *cfqq;
1956
1957         /*
1958          * don't force setup of a queue from here, as a call to may_queue
1959          * does not necessarily imply that a request actually will be queued.
1960          * so just lookup a possibly existing queue, or return 'may queue'
1961          * if that fails
1962          */
1963         cfqq = cfq_find_cfq_hash(cfqd, cfq_queue_pid(tsk, rw), tsk->ioprio);
1964         if (cfqq) {
1965                 cfq_init_prio_data(cfqq);
1966                 cfq_prio_boost(cfqq);
1967
1968                 return __cfq_may_queue(cfqd, cfqq, tsk, rw);
1969         }
1970
1971         return ELV_MQUEUE_MAY;
1972 }
1973
1974 static void cfq_check_waiters(request_queue_t *q, struct cfq_queue *cfqq)
1975 {
1976         struct cfq_data *cfqd = q->elevator->elevator_data;
1977
1978         if (unlikely(cfqd->rq_starved)) {
1979                 struct request_list *rl = &q->rq;
1980
1981                 smp_mb();
1982                 if (waitqueue_active(&rl->wait[READ]))
1983                         wake_up(&rl->wait[READ]);
1984                 if (waitqueue_active(&rl->wait[WRITE]))
1985                         wake_up(&rl->wait[WRITE]);
1986         }
1987 }
1988
1989 /*
1990  * queue lock held here
1991  */
1992 static void cfq_put_request(request_queue_t *q, struct request *rq)
1993 {
1994         struct cfq_data *cfqd = q->elevator->elevator_data;
1995         struct cfq_rq *crq = RQ_DATA(rq);
1996
1997         if (crq) {
1998                 struct cfq_queue *cfqq = crq->cfq_queue;
1999                 const int rw = rq_data_dir(rq);
2000
2001                 BUG_ON(!cfqq->allocated[rw]);
2002                 cfqq->allocated[rw]--;
2003
2004                 put_io_context(crq->io_context->ioc);
2005
2006                 mempool_free(crq, cfqd->crq_pool);
2007                 rq->elevator_private = NULL;
2008
2009                 cfq_check_waiters(q, cfqq);
2010                 cfq_put_queue(cfqq);
2011         }
2012 }
2013
2014 /*
2015  * Allocate cfq data structures associated with this request.
2016  */
2017 static int
2018 cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
2019                 gfp_t gfp_mask)
2020 {
2021         struct cfq_data *cfqd = q->elevator->elevator_data;
2022         struct task_struct *tsk = current;
2023         struct cfq_io_context *cic;
2024         const int rw = rq_data_dir(rq);
2025         pid_t key = cfq_queue_pid(tsk, rw);
2026         struct cfq_queue *cfqq;
2027         struct cfq_rq *crq;
2028         unsigned long flags;
2029         int is_sync = key != CFQ_KEY_ASYNC;
2030
2031         might_sleep_if(gfp_mask & __GFP_WAIT);
2032
2033         cic = cfq_get_io_context(cfqd, gfp_mask);
2034
2035         spin_lock_irqsave(q->queue_lock, flags);
2036
2037         if (!cic)
2038                 goto queue_fail;
2039
2040         if (!cic->cfqq[is_sync]) {
2041                 cfqq = cfq_get_queue(cfqd, key, tsk, gfp_mask);
2042                 if (!cfqq)
2043                         goto queue_fail;
2044
2045                 cic->cfqq[is_sync] = cfqq;
2046         } else
2047                 cfqq = cic->cfqq[is_sync];
2048
2049         cfqq->allocated[rw]++;
2050         cfq_clear_cfqq_must_alloc(cfqq);
2051         cfqd->rq_starved = 0;
2052         atomic_inc(&cfqq->ref);
2053         spin_unlock_irqrestore(q->queue_lock, flags);
2054
2055         crq = mempool_alloc(cfqd->crq_pool, gfp_mask);
2056         if (crq) {
2057                 RB_CLEAR_NODE(&crq->rb_node);
2058                 crq->rb_key = 0;
2059                 crq->request = rq;
2060                 INIT_HLIST_NODE(&crq->hash);
2061                 crq->cfq_queue = cfqq;
2062                 crq->io_context = cic;
2063
2064                 if (is_sync)
2065                         cfq_mark_crq_is_sync(crq);
2066                 else
2067                         cfq_clear_crq_is_sync(crq);
2068
2069                 rq->elevator_private = crq;
2070                 return 0;
2071         }
2072
2073         spin_lock_irqsave(q->queue_lock, flags);
2074         cfqq->allocated[rw]--;
2075         if (!(cfqq->allocated[0] + cfqq->allocated[1]))
2076                 cfq_mark_cfqq_must_alloc(cfqq);
2077         cfq_put_queue(cfqq);
2078 queue_fail:
2079         if (cic)
2080                 put_io_context(cic->ioc);
2081         /*
2082          * mark us rq allocation starved. we need to kickstart the process
2083          * ourselves if there are no pending requests that can do it for us.
2084          * that would be an extremely rare OOM situation
2085          */
2086         cfqd->rq_starved = 1;
2087         cfq_schedule_dispatch(cfqd);
2088         spin_unlock_irqrestore(q->queue_lock, flags);
2089         return 1;
2090 }
2091
2092 static void cfq_kick_queue(void *data)
2093 {
2094         request_queue_t *q = data;
2095         struct cfq_data *cfqd = q->elevator->elevator_data;
2096         unsigned long flags;
2097
2098         spin_lock_irqsave(q->queue_lock, flags);
2099
2100         if (cfqd->rq_starved) {
2101                 struct request_list *rl = &q->rq;
2102
2103                 /*
2104                  * we aren't guaranteed to get a request after this, but we
2105                  * have to be opportunistic
2106                  */
2107                 smp_mb();
2108                 if (waitqueue_active(&rl->wait[READ]))
2109                         wake_up(&rl->wait[READ]);
2110                 if (waitqueue_active(&rl->wait[WRITE]))
2111                         wake_up(&rl->wait[WRITE]);
2112         }
2113
2114         blk_remove_plug(q);
2115         q->request_fn(q);
2116         spin_unlock_irqrestore(q->queue_lock, flags);
2117 }
2118
2119 /*
2120  * Timer running if the active_queue is currently idling inside its time slice
2121  */
2122 static void cfq_idle_slice_timer(unsigned long data)
2123 {
2124         struct cfq_data *cfqd = (struct cfq_data *) data;
2125         struct cfq_queue *cfqq;
2126         unsigned long flags;
2127
2128         spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2129
2130         if ((cfqq = cfqd->active_queue) != NULL) {
2131                 unsigned long now = jiffies;
2132
2133                 /*
2134                  * expired
2135                  */
2136                 if (time_after(now, cfqq->slice_end))
2137                         goto expire;
2138
2139                 /*
2140                  * only expire and reinvoke request handler, if there are
2141                  * other queues with pending requests
2142                  */
2143                 if (!cfqd->busy_queues)
2144                         goto out_cont;
2145
2146                 /*
2147                  * not expired and it has a request pending, let it dispatch
2148                  */
2149                 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) {
2150                         cfq_mark_cfqq_must_dispatch(cfqq);
2151                         goto out_kick;
2152                 }
2153         }
2154 expire:
2155         cfq_slice_expired(cfqd, 0);
2156 out_kick:
2157         cfq_schedule_dispatch(cfqd);
2158 out_cont:
2159         spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2160 }
2161
2162 /*
2163  * Timer running if an idle class queue is waiting for service
2164  */
2165 static void cfq_idle_class_timer(unsigned long data)
2166 {
2167         struct cfq_data *cfqd = (struct cfq_data *) data;
2168         unsigned long flags, end;
2169
2170         spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2171
2172         /*
2173          * race with a non-idle queue, reset timer
2174          */
2175         end = cfqd->last_end_request + CFQ_IDLE_GRACE;
2176         if (!time_after_eq(jiffies, end))
2177                 mod_timer(&cfqd->idle_class_timer, end);
2178         else
2179                 cfq_schedule_dispatch(cfqd);
2180
2181         spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2182 }
2183
2184 static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
2185 {
2186         del_timer_sync(&cfqd->idle_slice_timer);
2187         del_timer_sync(&cfqd->idle_class_timer);
2188         blk_sync_queue(cfqd->queue);
2189 }
2190
2191 static void cfq_exit_queue(elevator_t *e)
2192 {
2193         struct cfq_data *cfqd = e->elevator_data;
2194         request_queue_t *q = cfqd->queue;
2195
2196         cfq_shutdown_timer_wq(cfqd);
2197
2198         spin_lock(&cfq_exit_lock);
2199         spin_lock_irq(q->queue_lock);
2200
2201         if (cfqd->active_queue)
2202                 __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
2203
2204         while (!list_empty(&cfqd->cic_list)) {
2205                 struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
2206                                                         struct cfq_io_context,
2207                                                         queue_list);
2208                 if (cic->cfqq[ASYNC]) {
2209                         cfq_put_queue(cic->cfqq[ASYNC]);
2210                         cic->cfqq[ASYNC] = NULL;
2211                 }
2212                 if (cic->cfqq[SYNC]) {
2213                         cfq_put_queue(cic->cfqq[SYNC]);
2214                         cic->cfqq[SYNC] = NULL;
2215                 }
2216                 cic->key = NULL;
2217                 list_del_init(&cic->queue_list);
2218         }
2219
2220         spin_unlock_irq(q->queue_lock);
2221         spin_unlock(&cfq_exit_lock);
2222
2223         cfq_shutdown_timer_wq(cfqd);
2224
2225         mempool_destroy(cfqd->crq_pool);
2226         kfree(cfqd->crq_hash);
2227         kfree(cfqd->cfq_hash);
2228         kfree(cfqd);
2229 }
2230
2231 static void *cfq_init_queue(request_queue_t *q, elevator_t *e)
2232 {
2233         struct cfq_data *cfqd;
2234         int i;
2235
2236         cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL);
2237         if (!cfqd)
2238                 return NULL;
2239
2240         memset(cfqd, 0, sizeof(*cfqd));
2241
2242         for (i = 0; i < CFQ_PRIO_LISTS; i++)
2243                 INIT_LIST_HEAD(&cfqd->rr_list[i]);
2244
2245         INIT_LIST_HEAD(&cfqd->busy_rr);
2246         INIT_LIST_HEAD(&cfqd->cur_rr);
2247         INIT_LIST_HEAD(&cfqd->idle_rr);
2248         INIT_LIST_HEAD(&cfqd->empty_list);
2249         INIT_LIST_HEAD(&cfqd->cic_list);
2250
2251         cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL);
2252         if (!cfqd->crq_hash)
2253                 goto out_crqhash;
2254
2255         cfqd->cfq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL);
2256         if (!cfqd->cfq_hash)
2257                 goto out_cfqhash;
2258
2259         cfqd->crq_pool = mempool_create_slab_pool(BLKDEV_MIN_RQ, crq_pool);
2260         if (!cfqd->crq_pool)
2261                 goto out_crqpool;
2262
2263         for (i = 0; i < CFQ_MHASH_ENTRIES; i++)
2264                 INIT_HLIST_HEAD(&cfqd->crq_hash[i]);
2265         for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
2266                 INIT_HLIST_HEAD(&cfqd->cfq_hash[i]);
2267
2268         cfqd->queue = q;
2269
2270         init_timer(&cfqd->idle_slice_timer);
2271         cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
2272         cfqd->idle_slice_timer.data = (unsigned long) cfqd;
2273
2274         init_timer(&cfqd->idle_class_timer);
2275         cfqd->idle_class_timer.function = cfq_idle_class_timer;
2276         cfqd->idle_class_timer.data = (unsigned long) cfqd;
2277
2278         INIT_WORK(&cfqd->unplug_work, cfq_kick_queue, q);
2279
2280         cfqd->cfq_queued = cfq_queued;
2281         cfqd->cfq_quantum = cfq_quantum;
2282         cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
2283         cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
2284         cfqd->cfq_back_max = cfq_back_max;
2285         cfqd->cfq_back_penalty = cfq_back_penalty;
2286         cfqd->cfq_slice[0] = cfq_slice_async;
2287         cfqd->cfq_slice[1] = cfq_slice_sync;
2288         cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
2289         cfqd->cfq_slice_idle = cfq_slice_idle;
2290
2291         return cfqd;
2292 out_crqpool:
2293         kfree(cfqd->cfq_hash);
2294 out_cfqhash:
2295         kfree(cfqd->crq_hash);
2296 out_crqhash:
2297         kfree(cfqd);
2298         return NULL;
2299 }
2300
2301 static void cfq_slab_kill(void)
2302 {
2303         if (crq_pool)
2304                 kmem_cache_destroy(crq_pool);
2305         if (cfq_pool)
2306                 kmem_cache_destroy(cfq_pool);
2307         if (cfq_ioc_pool)
2308                 kmem_cache_destroy(cfq_ioc_pool);
2309 }
2310
2311 static int __init cfq_slab_setup(void)
2312 {
2313         crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0,
2314                                         NULL, NULL);
2315         if (!crq_pool)
2316                 goto fail;
2317
2318         cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0,
2319                                         NULL, NULL);
2320         if (!cfq_pool)
2321                 goto fail;
2322
2323         cfq_ioc_pool = kmem_cache_create("cfq_ioc_pool",
2324                         sizeof(struct cfq_io_context), 0, 0, NULL, NULL);
2325         if (!cfq_ioc_pool)
2326                 goto fail;
2327
2328         return 0;
2329 fail:
2330         cfq_slab_kill();
2331         return -ENOMEM;
2332 }
2333
2334 /*
2335  * sysfs parts below -->
2336  */
2337
2338 static ssize_t
2339 cfq_var_show(unsigned int var, char *page)
2340 {
2341         return sprintf(page, "%d\n", var);
2342 }
2343
2344 static ssize_t
2345 cfq_var_store(unsigned int *var, const char *page, size_t count)
2346 {
2347         char *p = (char *) page;
2348
2349         *var = simple_strtoul(p, &p, 10);
2350         return count;
2351 }
2352
2353 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)                            \
2354 static ssize_t __FUNC(elevator_t *e, char *page)                        \
2355 {                                                                       \
2356         struct cfq_data *cfqd = e->elevator_data;                       \
2357         unsigned int __data = __VAR;                                    \
2358         if (__CONV)                                                     \
2359                 __data = jiffies_to_msecs(__data);                      \
2360         return cfq_var_show(__data, (page));                            \
2361 }
2362 SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
2363 SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0);
2364 SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
2365 SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
2366 SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
2367 SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
2368 SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
2369 SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
2370 SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
2371 SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
2372 #undef SHOW_FUNCTION
2373
2374 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                 \
2375 static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)    \
2376 {                                                                       \
2377         struct cfq_data *cfqd = e->elevator_data;                       \
2378         unsigned int __data;                                            \
2379         int ret = cfq_var_store(&__data, (page), count);                \
2380         if (__data < (MIN))                                             \
2381                 __data = (MIN);                                         \
2382         else if (__data > (MAX))                                        \
2383                 __data = (MAX);                                         \
2384         if (__CONV)                                                     \
2385                 *(__PTR) = msecs_to_jiffies(__data);                    \
2386         else                                                            \
2387                 *(__PTR) = __data;                                      \
2388         return ret;                                                     \
2389 }
2390 STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
2391 STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0);
2392 STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1);
2393 STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1);
2394 STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
2395 STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0);
2396 STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
2397 STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
2398 STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
2399 STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0);
2400 #undef STORE_FUNCTION
2401
2402 #define CFQ_ATTR(name) \
2403         __ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
2404
2405 static struct elv_fs_entry cfq_attrs[] = {
2406         CFQ_ATTR(quantum),
2407         CFQ_ATTR(queued),
2408         CFQ_ATTR(fifo_expire_sync),
2409         CFQ_ATTR(fifo_expire_async),
2410         CFQ_ATTR(back_seek_max),
2411         CFQ_ATTR(back_seek_penalty),
2412         CFQ_ATTR(slice_sync),
2413         CFQ_ATTR(slice_async),
2414         CFQ_ATTR(slice_async_rq),
2415         CFQ_ATTR(slice_idle),
2416         __ATTR_NULL
2417 };
2418
2419 static struct elevator_type iosched_cfq = {
2420         .ops = {
2421                 .elevator_merge_fn =            cfq_merge,
2422                 .elevator_merged_fn =           cfq_merged_request,
2423                 .elevator_merge_req_fn =        cfq_merged_requests,
2424                 .elevator_dispatch_fn =         cfq_dispatch_requests,
2425                 .elevator_add_req_fn =          cfq_insert_request,
2426                 .elevator_activate_req_fn =     cfq_activate_request,
2427                 .elevator_deactivate_req_fn =   cfq_deactivate_request,
2428                 .elevator_queue_empty_fn =      cfq_queue_empty,
2429                 .elevator_completed_req_fn =    cfq_completed_request,
2430                 .elevator_former_req_fn =       cfq_former_request,
2431                 .elevator_latter_req_fn =       cfq_latter_request,
2432                 .elevator_set_req_fn =          cfq_set_request,
2433                 .elevator_put_req_fn =          cfq_put_request,
2434                 .elevator_may_queue_fn =        cfq_may_queue,
2435                 .elevator_init_fn =             cfq_init_queue,
2436                 .elevator_exit_fn =             cfq_exit_queue,
2437                 .trim =                         cfq_trim,
2438         },
2439         .elevator_attrs =       cfq_attrs,
2440         .elevator_name =        "cfq",
2441         .elevator_owner =       THIS_MODULE,
2442 };
2443
2444 static int __init cfq_init(void)
2445 {
2446         int ret;
2447
2448         /*
2449          * could be 0 on HZ < 1000 setups
2450          */
2451         if (!cfq_slice_async)
2452                 cfq_slice_async = 1;
2453         if (!cfq_slice_idle)
2454                 cfq_slice_idle = 1;
2455
2456         if (cfq_slab_setup())
2457                 return -ENOMEM;
2458
2459         ret = elv_register(&iosched_cfq);
2460         if (ret)
2461                 cfq_slab_kill();
2462
2463         return ret;
2464 }
2465
2466 static void __exit cfq_exit(void)
2467 {
2468         DECLARE_COMPLETION_ONSTACK(all_gone);
2469         elv_unregister(&iosched_cfq);
2470         ioc_gone = &all_gone;
2471         /* ioc_gone's update must be visible before reading ioc_count */
2472         smp_wmb();
2473         if (atomic_read(&ioc_count))
2474                 wait_for_completion(ioc_gone);
2475         synchronize_rcu();
2476         cfq_slab_kill();
2477 }
2478
2479 module_init(cfq_init);
2480 module_exit(cfq_exit);
2481
2482 MODULE_AUTHOR("Jens Axboe");
2483 MODULE_LICENSE("GPL");
2484 MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");