ckrm e16 io controller merge v5
authorMarc Fiuczynski <mef@cs.princeton.edu>
Mon, 27 Dec 2004 22:13:16 +0000 (22:13 +0000)
committerMarc Fiuczynski <mef@cs.princeton.edu>
Mon, 27 Dec 2004 22:13:16 +0000 (22:13 +0000)
drivers/block/cfq-iosched-orig.c [deleted file]
drivers/block/cfq-iosched.c
drivers/block/ckrm-io.c
drivers/block/ckrm-iostub.c
include/linux/ckrm-io.h
include/linux/fs.h
kernel/ckrm/Makefile
kernel/ckrm/ckrm_laq.c [deleted file]
kernel/ckrm/ckrm_listenaq.c

diff --git a/drivers/block/cfq-iosched-orig.c b/drivers/block/cfq-iosched-orig.c
deleted file mode 100644 (file)
index 977d32d..0000000
+++ /dev/null
@@ -1,706 +0,0 @@
-/*
- *  linux/drivers/block/cfq-iosched.c
- *
- *  CFQ, or complete fairness queueing, disk scheduler.
- *
- *  Based on ideas from a previously unfinished io
- *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
- *
- *  Copyright (C) 2003 Jens Axboe <axboe@suse.de>
- */
-#include <linux/kernel.h>
-#include <linux/fs.h>
-#include <linux/blkdev.h>
-#include <linux/elevator.h>
-#include <linux/bio.h>
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/compiler.h>
-#include <linux/hash.h>
-#include <linux/rbtree.h>
-#include <linux/mempool.h>
-
-/*
- * tunables
- */
-static int cfq_quantum = 4;
-static int cfq_queued = 8;
-
-#define CFQ_QHASH_SHIFT                6
-#define CFQ_QHASH_ENTRIES      (1 << CFQ_QHASH_SHIFT)
-#define list_entry_qhash(entry)        list_entry((entry), struct cfq_queue, cfq_hash)
-
-#define CFQ_MHASH_SHIFT                8
-#define CFQ_MHASH_BLOCK(sec)   ((sec) >> 3)
-#define CFQ_MHASH_ENTRIES      (1 << CFQ_MHASH_SHIFT)
-#define CFQ_MHASH_FN(sec)      (hash_long(CFQ_MHASH_BLOCK((sec)),CFQ_MHASH_SHIFT))
-#define ON_MHASH(crq)          !list_empty(&(crq)->hash)
-#define rq_hash_key(rq)                ((rq)->sector + (rq)->nr_sectors)
-#define list_entry_hash(ptr)   list_entry((ptr), struct cfq_rq, hash)
-
-#define list_entry_cfqq(ptr)   list_entry((ptr), struct cfq_queue, cfq_list)
-
-#define RQ_DATA(rq)            ((struct cfq_rq *) (rq)->elevator_private)
-
-static kmem_cache_t *crq_pool;
-static kmem_cache_t *cfq_pool;
-static mempool_t *cfq_mpool;
-
-struct cfq_data {
-       struct list_head rr_list;
-       struct list_head *dispatch;
-       struct list_head *cfq_hash;
-
-       struct list_head *crq_hash;
-
-       unsigned int busy_queues;
-       unsigned int max_queued;
-
-       mempool_t *crq_pool;
-};
-
-struct cfq_queue {
-       struct list_head cfq_hash;
-       struct list_head cfq_list;
-       struct rb_root sort_list;
-       int pid;
-       int queued[2];
-#if 0
-       /*
-        * with a simple addition like this, we can do io priorities. almost.
-        * does need a split request free list, too.
-        */
-       int io_prio
-#endif
-};
-
-struct cfq_rq {
-       struct rb_node rb_node;
-       sector_t rb_key;
-
-       struct request *request;
-
-       struct cfq_queue *cfq_queue;
-
-       struct list_head hash;
-};
-
-static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq);
-static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid);
-static void cfq_dispatch_sort(struct list_head *head, struct cfq_rq *crq);
-
-/*
- * lots of deadline iosched dupes, can be abstracted later...
- */
-static inline void __cfq_del_crq_hash(struct cfq_rq *crq)
-{
-       list_del_init(&crq->hash);
-}
-
-static inline void cfq_del_crq_hash(struct cfq_rq *crq)
-{
-       if (ON_MHASH(crq))
-               __cfq_del_crq_hash(crq);
-}
-
-static void cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq)
-{
-       cfq_del_crq_hash(crq);
-
-       if (q->last_merge == crq->request)
-               q->last_merge = NULL;
-}
-
-static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq)
-{
-       struct request *rq = crq->request;
-
-       BUG_ON(ON_MHASH(crq));
-
-       list_add(&crq->hash, &cfqd->crq_hash[CFQ_MHASH_FN(rq_hash_key(rq))]);
-}
-
-static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset)
-{
-       struct list_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)];
-       struct list_head *entry, *next = hash_list->next;
-
-       while ((entry = next) != hash_list) {
-               struct cfq_rq *crq = list_entry_hash(entry);
-               struct request *__rq = crq->request;
-
-               next = entry->next;
-
-               BUG_ON(!ON_MHASH(crq));
-
-               if (!rq_mergeable(__rq)) {
-                       __cfq_del_crq_hash(crq);
-                       continue;
-               }
-
-               if (rq_hash_key(__rq) == offset)
-                       return __rq;
-       }
-
-       return NULL;
-}
-
-/*
- * rb tree support functions
- */
-#define RB_NONE                (2)
-#define RB_EMPTY(node) ((node)->rb_node == NULL)
-#define RB_CLEAR(node) ((node)->rb_color = RB_NONE)
-#define RB_CLEAR_ROOT(root)    ((root)->rb_node = NULL)
-#define ON_RB(node)    ((node)->rb_color != RB_NONE)
-#define rb_entry_crq(node)     rb_entry((node), struct cfq_rq, rb_node)
-#define rq_rb_key(rq)          (rq)->sector
-
-static inline void cfq_del_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
-{
-       if (ON_RB(&crq->rb_node)) {
-               cfqq->queued[rq_data_dir(crq->request)]--;
-               rb_erase(&crq->rb_node, &cfqq->sort_list);
-               crq->cfq_queue = NULL;
-       }
-}
-
-static struct cfq_rq *
-__cfq_add_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
-{
-       struct rb_node **p = &cfqq->sort_list.rb_node;
-       struct rb_node *parent = NULL;
-       struct cfq_rq *__crq;
-
-       while (*p) {
-               parent = *p;
-               __crq = rb_entry_crq(parent);
-
-               if (crq->rb_key < __crq->rb_key)
-                       p = &(*p)->rb_left;
-               else if (crq->rb_key > __crq->rb_key)
-                       p = &(*p)->rb_right;
-               else
-                       return __crq;
-       }
-
-       rb_link_node(&crq->rb_node, parent, p);
-       return 0;
-}
-
-static void
-cfq_add_crq_rb(struct cfq_data *cfqd, struct cfq_queue *cfqq,struct cfq_rq *crq)
-{
-       struct request *rq = crq->request;
-       struct cfq_rq *__alias;
-
-       crq->rb_key = rq_rb_key(rq);
-       cfqq->queued[rq_data_dir(rq)]++;
-retry:
-       __alias = __cfq_add_crq_rb(cfqq, crq);
-       if (!__alias) {
-               rb_insert_color(&crq->rb_node, &cfqq->sort_list);
-               crq->cfq_queue = cfqq;
-               return;
-       }
-
-       cfq_del_crq_rb(cfqq, __alias);
-       cfq_dispatch_sort(cfqd->dispatch, __alias);
-       goto retry;
-}
-
-static struct request *
-cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector)
-{
-       struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->tgid);
-       struct rb_node *n;
-
-       if (!cfqq)
-               goto out;
-
-       n = cfqq->sort_list.rb_node;
-       while (n) {
-               struct cfq_rq *crq = rb_entry_crq(n);
-
-               if (sector < crq->rb_key)
-                       n = n->rb_left;
-               else if (sector > crq->rb_key)
-                       n = n->rb_right;
-               else
-                       return crq->request;
-       }
-
-out:
-       return NULL;
-}
-
-static void cfq_remove_request(request_queue_t *q, struct request *rq)
-{
-       struct cfq_data *cfqd = q->elevator.elevator_data;
-       struct cfq_rq *crq = RQ_DATA(rq);
-
-       if (crq) {
-               struct cfq_queue *cfqq = crq->cfq_queue;
-
-               cfq_remove_merge_hints(q, crq);
-               list_del_init(&rq->queuelist);
-
-               if (cfqq) {
-                       cfq_del_crq_rb(cfqq, crq);
-
-                       if (RB_EMPTY(&cfqq->sort_list))
-                               cfq_put_queue(cfqd, cfqq);
-               }
-       }
-}
-
-static int
-cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
-{
-       struct cfq_data *cfqd = q->elevator.elevator_data;
-       struct request *__rq;
-       int ret;
-
-       ret = elv_try_last_merge(q, bio);
-       if (ret != ELEVATOR_NO_MERGE) {
-               __rq = q->last_merge;
-               goto out_insert;
-       }
-
-       __rq = cfq_find_rq_hash(cfqd, bio->bi_sector);
-       if (__rq) {
-               BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector);
-
-               if (elv_rq_merge_ok(__rq, bio)) {
-                       ret = ELEVATOR_BACK_MERGE;
-                       goto out;
-               }
-       }
-
-       __rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio));
-       if (__rq) {
-               if (elv_rq_merge_ok(__rq, bio)) {
-                       ret = ELEVATOR_FRONT_MERGE;
-                       goto out;
-               }
-       }
-
-       return ELEVATOR_NO_MERGE;
-out:
-       q->last_merge = __rq;
-out_insert:
-       *req = __rq;
-       return ret;
-}
-
-static void cfq_merged_request(request_queue_t *q, struct request *req)
-{
-       struct cfq_data *cfqd = q->elevator.elevator_data;
-       struct cfq_rq *crq = RQ_DATA(req);
-
-       cfq_del_crq_hash(crq);
-       cfq_add_crq_hash(cfqd, crq);
-
-       if (ON_RB(&crq->rb_node) && (rq_rb_key(req) != crq->rb_key)) {
-               struct cfq_queue *cfqq = crq->cfq_queue;
-
-               cfq_del_crq_rb(cfqq, crq);
-               cfq_add_crq_rb(cfqd, cfqq, crq);
-       }
-
-       q->last_merge = req;
-}
-
-static void
-cfq_merged_requests(request_queue_t *q, struct request *req,
-                   struct request *next)
-{
-       cfq_merged_request(q, req);
-       cfq_remove_request(q, next);
-}
-
-static void cfq_dispatch_sort(struct list_head *head, struct cfq_rq *crq)
-{
-       struct list_head *entry = head;
-       struct request *__rq;
-
-       if (!list_empty(head)) {
-               __rq = list_entry_rq(head->next);
-
-               if (crq->request->sector < __rq->sector) {
-                       entry = head->prev;
-                       goto link;
-               }
-       }
-
-       while ((entry = entry->prev) != head) {
-               __rq = list_entry_rq(entry);
-
-               if (crq->request->sector <= __rq->sector)
-                       break;
-       }
-
-link:
-       list_add_tail(&crq->request->queuelist, entry);
-}
-
-static inline void
-__cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd,
-                       struct cfq_queue *cfqq)
-{
-       struct cfq_rq *crq = rb_entry_crq(rb_first(&cfqq->sort_list));
-
-       cfq_del_crq_rb(cfqq, crq);
-       cfq_remove_merge_hints(q, crq);
-       cfq_dispatch_sort(cfqd->dispatch, crq);
-}
-
-static int cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd)
-{
-       struct cfq_queue *cfqq;
-       struct list_head *entry, *tmp;
-       int ret, queued, good_queues;
-
-       if (list_empty(&cfqd->rr_list))
-               return 0;
-
-       queued = ret = 0;
-restart:
-       good_queues = 0;
-       list_for_each_safe(entry, tmp, &cfqd->rr_list) {
-               cfqq = list_entry_cfqq(cfqd->rr_list.next);
-
-               BUG_ON(RB_EMPTY(&cfqq->sort_list));
-
-               __cfq_dispatch_requests(q, cfqd, cfqq);
-
-               if (RB_EMPTY(&cfqq->sort_list))
-                       cfq_put_queue(cfqd, cfqq);
-               else
-                       good_queues++;
-
-               queued++;
-               ret = 1;
-       }
-
-       if ((queued < cfq_quantum) && good_queues)
-               goto restart;
-
-       return ret;
-}
-
-static struct request *cfq_next_request(request_queue_t *q)
-{
-       struct cfq_data *cfqd = q->elevator.elevator_data;
-       struct request *rq;
-
-       if (!list_empty(cfqd->dispatch)) {
-               struct cfq_rq *crq;
-dispatch:
-               rq = list_entry_rq(cfqd->dispatch->next);
-
-               crq = RQ_DATA(rq);
-               if (crq)
-                       cfq_remove_merge_hints(q, crq);
-
-               return rq;
-       }
-
-       if (cfq_dispatch_requests(q, cfqd))
-               goto dispatch;
-
-       return NULL;
-}
-
-static inline struct cfq_queue *
-__cfq_find_cfq_hash(struct cfq_data *cfqd, int pid, const int hashval)
-{
-       struct list_head *hash_list = &cfqd->cfq_hash[hashval];
-       struct list_head *entry;
-
-       list_for_each(entry, hash_list) {
-               struct cfq_queue *__cfqq = list_entry_qhash(entry);
-
-               if (__cfqq->pid == pid)
-                       return __cfqq;
-       }
-
-       return NULL;
-}
-
-static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid)
-{
-       const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT);
-
-       return __cfq_find_cfq_hash(cfqd, pid, hashval);
-}
-
-static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
-{
-       cfqd->busy_queues--;
-       list_del(&cfqq->cfq_list);
-       list_del(&cfqq->cfq_hash);
-       mempool_free(cfqq, cfq_mpool);
-}
-
-static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, int pid)
-{
-       const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT);
-       struct cfq_queue *cfqq = __cfq_find_cfq_hash(cfqd, pid, hashval);
-
-       if (!cfqq) {
-               cfqq = mempool_alloc(cfq_mpool, GFP_NOIO);
-
-               INIT_LIST_HEAD(&cfqq->cfq_hash);
-               INIT_LIST_HEAD(&cfqq->cfq_list);
-               RB_CLEAR_ROOT(&cfqq->sort_list);
-
-               cfqq->pid = pid;
-               cfqq->queued[0] = cfqq->queued[1] = 0;
-               list_add(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
-       }
-
-       return cfqq;
-}
-
-static void cfq_enqueue(struct cfq_data *cfqd, struct cfq_rq *crq)
-{
-       struct cfq_queue *cfqq;
-
-       cfqq = cfq_get_queue(cfqd, current->tgid);
-
-       cfq_add_crq_rb(cfqd, cfqq, crq);
-
-       if (list_empty(&cfqq->cfq_list)) {
-               list_add(&cfqq->cfq_list, &cfqd->rr_list);
-               cfqd->busy_queues++;
-       }
-}
-
-static void
-cfq_insert_request(request_queue_t *q, struct request *rq, int where)
-{
-       struct cfq_data *cfqd = q->elevator.elevator_data;
-       struct cfq_rq *crq = RQ_DATA(rq);
-
-       switch (where) {
-               case ELEVATOR_INSERT_BACK:
-                       while (cfq_dispatch_requests(q, cfqd))
-                               ;
-                       list_add_tail(&rq->queuelist, cfqd->dispatch);
-                       break;
-               case ELEVATOR_INSERT_FRONT:
-                       list_add(&rq->queuelist, cfqd->dispatch);
-                       break;
-               case ELEVATOR_INSERT_SORT:
-                       BUG_ON(!blk_fs_request(rq));
-                       cfq_enqueue(cfqd, crq);
-                       break;
-               default:
-                       printk("%s: bad insert point %d\n", __FUNCTION__,where);
-                       return;
-       }
-
-       if (rq_mergeable(rq)) {
-               cfq_add_crq_hash(cfqd, crq);
-
-               if (!q->last_merge)
-                       q->last_merge = rq;
-       }
-}
-
-static int cfq_queue_empty(request_queue_t *q)
-{
-       struct cfq_data *cfqd = q->elevator.elevator_data;
-
-       if (list_empty(cfqd->dispatch) && list_empty(&cfqd->rr_list))
-               return 1;
-
-       return 0;
-}
-
-static struct request *
-cfq_former_request(request_queue_t *q, struct request *rq)
-{
-       struct cfq_rq *crq = RQ_DATA(rq);
-       struct rb_node *rbprev = rb_prev(&crq->rb_node);
-
-       if (rbprev)
-               return rb_entry_crq(rbprev)->request;
-
-       return NULL;
-}
-
-static struct request *
-cfq_latter_request(request_queue_t *q, struct request *rq)
-{
-       struct cfq_rq *crq = RQ_DATA(rq);
-       struct rb_node *rbnext = rb_next(&crq->rb_node);
-
-       if (rbnext)
-               return rb_entry_crq(rbnext)->request;
-
-       return NULL;
-}
-
-static int cfq_may_queue(request_queue_t *q, int rw)
-{
-       struct cfq_data *cfqd = q->elevator.elevator_data;
-       struct cfq_queue *cfqq;
-       int ret = 1;
-
-       if (!cfqd->busy_queues)
-               goto out;
-
-       cfqq = cfq_find_cfq_hash(cfqd, current->tgid);
-       if (cfqq) {
-               int limit = (q->nr_requests - cfq_queued) / cfqd->busy_queues;
-
-               if (limit < 3)
-                       limit = 3;
-               else if (limit > cfqd->max_queued)
-                       limit = cfqd->max_queued;
-
-               if (cfqq->queued[rw] > limit)
-                       ret = 0;
-       }
-out:
-       return ret;
-}
-
-static void cfq_put_request(request_queue_t *q, struct request *rq)
-{
-       struct cfq_data *cfqd = q->elevator.elevator_data;
-       struct cfq_rq *crq = RQ_DATA(rq);
-
-       if (crq) {
-               BUG_ON(q->last_merge == rq);
-               BUG_ON(ON_MHASH(crq));
-
-               mempool_free(crq, cfqd->crq_pool);
-               rq->elevator_private = NULL;
-       }
-}
-
-static int cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
-{
-       struct cfq_data *cfqd = q->elevator.elevator_data;
-       struct cfq_rq *crq = mempool_alloc(cfqd->crq_pool, gfp_mask);
-
-       if (crq) {
-               RB_CLEAR(&crq->rb_node);
-               crq->request = rq;
-               crq->cfq_queue = NULL;
-               INIT_LIST_HEAD(&crq->hash);
-               rq->elevator_private = crq;
-               return 0;
-       }
-
-       return 1;
-}
-
-static void cfq_exit(request_queue_t *q, elevator_t *e)
-{
-       struct cfq_data *cfqd = e->elevator_data;
-
-       e->elevator_data = NULL;
-       mempool_destroy(cfqd->crq_pool);
-       kfree(cfqd->crq_hash);
-       kfree(cfqd->cfq_hash);
-       kfree(cfqd);
-}
-
-static int cfq_init(request_queue_t *q, elevator_t *e)
-{
-       struct cfq_data *cfqd;
-       int i;
-
-       cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL);
-       if (!cfqd)
-               return -ENOMEM;
-
-       memset(cfqd, 0, sizeof(*cfqd));
-       INIT_LIST_HEAD(&cfqd->rr_list);
-
-       cfqd->crq_hash = kmalloc(sizeof(struct list_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL);
-       if (!cfqd->crq_hash)
-               goto out_crqhash;
-
-       cfqd->cfq_hash = kmalloc(sizeof(struct list_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL);
-       if (!cfqd->cfq_hash)
-               goto out_cfqhash;
-
-       cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, crq_pool);
-       if (!cfqd->crq_pool)
-               goto out_crqpool;
-
-       for (i = 0; i < CFQ_MHASH_ENTRIES; i++)
-               INIT_LIST_HEAD(&cfqd->crq_hash[i]);
-       for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
-               INIT_LIST_HEAD(&cfqd->cfq_hash[i]);
-
-       cfqd->dispatch = &q->queue_head;
-       e->elevator_data = cfqd;
-
-       /*
-        * just set it to some high value, we want anyone to be able to queue
-        * some requests. fairness is handled differently
-        */
-       cfqd->max_queued = q->nr_requests;
-       q->nr_requests = 8192;
-
-       return 0;
-out_crqpool:
-       kfree(cfqd->cfq_hash);
-out_cfqhash:
-       kfree(cfqd->crq_hash);
-out_crqhash:
-       kfree(cfqd);
-       return -ENOMEM;
-}
-
-static int __init cfq_slab_setup(void)
-{
-       crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0,
-                                       NULL, NULL);
-
-       if (!crq_pool)
-               panic("cfq_iosched: can't init crq pool\n");
-
-       cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0,
-                                       NULL, NULL);
-
-       if (!cfq_pool)
-               panic("cfq_iosched: can't init cfq pool\n");
-
-       cfq_mpool = mempool_create(64, mempool_alloc_slab, mempool_free_slab, cfq_pool);
-
-       if (!cfq_mpool)
-               panic("cfq_iosched: can't init cfq mpool\n");
-
-       return 0;
-}
-
-subsys_initcall(cfq_slab_setup);
-
-elevator_t iosched_cfq = {
-       .elevator_name =                "cfq",
-       .elevator_merge_fn =            cfq_merge,
-       .elevator_merged_fn =           cfq_merged_request,
-       .elevator_merge_req_fn =        cfq_merged_requests,
-       .elevator_next_req_fn =         cfq_next_request,
-       .elevator_add_req_fn =          cfq_insert_request,
-       .elevator_remove_req_fn =       cfq_remove_request,
-       .elevator_queue_empty_fn =      cfq_queue_empty,
-       .elevator_former_req_fn =       cfq_former_request,
-       .elevator_latter_req_fn =       cfq_latter_request,
-       .elevator_set_req_fn =          cfq_set_request,
-       .elevator_put_req_fn =          cfq_put_request,
-       .elevator_may_queue_fn =        cfq_may_queue,
-       .elevator_init_fn =             cfq_init,
-       .elevator_exit_fn =             cfq_exit,
-};
-
-EXPORT_SYMBOL(iosched_cfq);
index 7b45a80..70d66c5 100644 (file)
@@ -39,8 +39,6 @@
 #error Cannot support this many io priority levels
 #endif
 
-#define LIMIT_DEBUG   1
-
 /*
  * tunables
  */
@@ -52,6 +50,10 @@ static int cfq_queued = 4;
 static int cfq_grace_rt = HZ / 100 ?: 1;
 static int cfq_grace_idle = HZ / 10;
 
+#define CFQ_EPOCH              1000000000
+#define CFQ_SECTORATE          1000   
+#define CFQ_HMAX_PCT           80
+
 #define CFQ_QHASH_SHIFT                6
 #define CFQ_QHASH_ENTRIES      (1 << CFQ_QHASH_SHIFT)
 #define list_entry_qhash(entry)        hlist_entry((entry), struct cfq_queue, cfq_hash)
@@ -69,13 +71,6 @@ static int cfq_grace_idle = HZ / 10;
 #define cfq_account_io(crq)    \
        ((crq)->ioprio != IOPRIO_IDLE && (crq)->ioprio != IOPRIO_RT)
 
-/* define to be 50 ms for now; make tunable later */
-#define CFQ_EPOCH              50000
-/* Needs to be made tunable right away, in MiB/s */
-#define CFQ_DISKBW             10       
-/* Temporary global limit, as percent of available b/w, for each "class" */
-#define CFQ_TEMPLIM            10
-
 /*
  * defines how we distribute bandwidth (can be tgid, uid, etc)
  */
@@ -87,18 +82,22 @@ static int cfq_grace_idle = HZ / 10;
  */
 
 #if defined(CONFIG_CKRM_RES_BLKIO) || defined(CONFIG_CKRM_RES_BLKIO_MODULE)
-extern inline void *cki_hash_key(struct task_struct *tsk);
-extern inline int cki_ioprio(struct task_struct *tsk);
-#define cfq_hash_key(current)   ((int)cki_hash_key((current)))
-#define cfq_ioprio(current)    (cki_ioprio((current)))
+extern void *cki_hash_key(struct task_struct *tsk);
+extern int cki_ioprio(struct task_struct *tsk);
+extern void *cki_cfqpriv(struct task_struct *tsk); 
+
+#define cfq_hash_key(tsk)   ((int)cki_hash_key((tsk)))
+#define cfq_ioprio(tsk)        (cki_ioprio((tsk)))
+#define cfq_cfqpriv(cfqd,tsk)  (cki_cfqpriv((tsk)))
 
 #else
-#define cfq_hash_key(current)  ((current)->tgid)
+#define cfq_hash_key(tsk)      ((tsk)->tgid)
+#define cfq_cfqpriv(cfqd,tsk)  (&(((cfqd)->cid[(tsk)->ioprio]).cfqpriv))
 
 /*
  * move to io_context
  */
-#define cfq_ioprio(current)    ((current)->ioprio)
+#define cfq_ioprio(tsk)        ((tsk)->ioprio)
 #endif
 
 #define CFQ_WAIT_RT    0
@@ -125,16 +124,12 @@ struct io_prio_data {
        atomic_t cum_sectors_in,cum_sectors_out;    
        atomic_t cum_queues_in,cum_queues_out;
 
-#ifdef LIMIT_DEBUG
-       int nskip;
-       unsigned long navsec;
-       unsigned long csectorate;
-       unsigned long lsectorate;
-#endif
+       cfqlim_t cfqpriv;       /* data for enforcing limits */
 
        struct list_head prio_list;
        int last_rq;
        int last_sectors;
+
 };
 
 /*
@@ -179,8 +174,9 @@ struct cfq_data {
        unsigned int cfq_grace_rt;
        unsigned int cfq_grace_idle;
 
-       unsigned long cfq_epoch;        /* duration for limit enforcement */
-       unsigned long cfq_epochsectors; /* max sectors dispatchable/epoch */
+       unsigned int cfq_epoch;
+       unsigned int cfq_hmax_pct;
+       unsigned int cfq_qsectorate;
 };
 
 /*
@@ -194,14 +190,34 @@ struct cfq_queue {
        int queued[2];
        int ioprio;
 
+       /* limit related settings/stats obtained 
+          either from io_prio_data or ckrm I/O class
+       */
+       struct cfqlim *cfqpriv; 
+
+       u64 epstart;            /* current epoch's starting timestamp (ns) */
+       u64 epsector[2];        /* Total sectors dispatched in [0] previous
+                                * and [1] current epoch
+                                */
+       
        unsigned long avsec;            /* avg sectors dispatched/epoch */
-       unsigned long long lastime;     /* timestamp of last request served */
-       unsigned long sectorate;        /* limit for sectors served/epoch */
+//     unsigned long long lastime;     /* timestamp of last request served */
+//     unsigned long sectorate;        /* limit for sectors served/epoch */
        int skipped;                    /* queue skipped at last dispatch ? */
+
+       /* Per queue timer to suspend/resume queue from processing */
+       struct timer_list timer;
+       unsigned long wait_end;
+       unsigned long flags;
+       struct work_struct work;
+
+       struct cfq_data *cfqd;
 };
 
+
+
 /*
- * per-request structure
+ * Per-request structure
  */
 struct cfq_rq {
        struct cfq_queue *cfq_queue;
@@ -516,69 +532,101 @@ link:
        list_add_tail(&crq->request->queuelist, entry);
 }
 
-/*
- * remove from io scheduler core and put on dispatch list for service
- */
+struct cfq_queue *dcfqq;
+u64 dtmp;
+
+
+
+/* Over how many ns is sectorate defined */
+#define NS4SCALE  (100000000)
+
 static inline int
-__cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd,
-                       struct cfq_queue *cfqq)
+__cfq_check_limit(struct cfq_data *cfqd,struct cfq_queue *cfqq, int dontskip)
 {
        struct cfq_rq *crq;
-       unsigned long long ts, gap;
-       unsigned long newavsec;
+       unsigned long long ts, gap, epoch, tmp;
+       unsigned long newavsec, sectorate;
 
        crq = rb_entry_crq(rb_first(&cfqq->sort_list));
 
-#if 1
-       /* Determine if queue should be skipped for being overshare */
        ts = sched_clock();
-       gap = ts - cfqq->lastime;
-#ifdef LIMIT_DEBUG
-       cfqq->sectorate = (cfqd->cfq_epochsectors 
-                          * CFQ_TEMPLIM)/100;
-       
-#endif
-       if ((gap >= cfqd->cfq_epoch) || (gap < 0)) {
-               cfqq->avsec = crq->nr_sectors ; 
-               cfqq->lastime = ts;
+       gap = ts - cfqq->epstart;
+       epoch = cfqd->cfq_epoch;
+
+       sectorate = atomic_read(&cfqq->cfqpriv->sectorate);
+//     sectorate = atomic_read(&(cfqd->cid[crq->ioprio].sectorate));
+
+       dcfqq = cfqq;
+
+       if ((gap >= epoch) || (gap < 0)) {
+
+               if (gap >= (epoch << 1)) {
+                       cfqq->epsector[0] = 0;
+                       cfqq->epstart = ts ; 
+               } else {
+                       cfqq->epsector[0] = cfqq->epsector[1];
+                       cfqq->epstart += epoch;
+               } 
+               cfqq->epsector[1] = 0;
+               gap = ts - cfqq->epstart;
+
+               tmp  = (cfqq->epsector[0] + crq->nr_sectors) * NS4SCALE;
+               do_div(tmp,epoch+gap);
+
+               cfqq->avsec = (unsigned long)tmp;
+               cfqq->skipped = 0;
+               cfqq->epsector[1] += crq->nr_sectors;
+               
+               cfqq->cfqpriv->navsec = cfqq->avsec;
+               cfqq->cfqpriv->sec[0] = cfqq->epsector[0];
+               cfqq->cfqpriv->sec[1] = cfqq->epsector[1];
+               cfqq->cfqpriv->timedout++;
+               /*
+               cfqd->cid[crq->ioprio].navsec = cfqq->avsec;
+               cfqd->cid[crq->ioprio].sec[0] = cfqq->epsector[0];
+               cfqd->cid[crq->ioprio].sec[1] = cfqq->epsector[1];
+               cfqd->cid[crq->ioprio].timedout++;
+               */
+               return 0;
        } else {
-               u64 tmp;
-               /* Age old average and accumalate request to be served */
-
-//             tmp = (u64) (cfqq->avsec * gap) ;
-//             do_div(tmp, cfqd->cfq_epoch);
-               newavsec = (unsigned long)(cfqq->avsec >> 1) + crq->nr_sectors;
-//             if (crq->ioprio >= 0 && crq->ioprio <= 20)
-//                     cfqd->cid[crq->ioprio].lsectorate = newavsec; 
-//             atomic_set(&(cfqd->cid[crq->ioprio].lsectorate),
-//                        newavsec);
-
-               if ((newavsec < cfqq->sectorate) || cfqq->skipped) {
+               
+               tmp = (cfqq->epsector[0] + cfqq->epsector[1] + crq->nr_sectors)
+                       * NS4SCALE;
+               do_div(tmp,epoch+gap);
+
+               newavsec = (unsigned long)tmp;
+               if ((newavsec < sectorate) || dontskip) {
                        cfqq->avsec = newavsec ;
-                       cfqq->lastime = ts;
                        cfqq->skipped = 0;
+                       cfqq->epsector[1] += crq->nr_sectors;
+                       cfqq->cfqpriv->navsec = cfqq->avsec;
+                       cfqq->cfqpriv->sec[1] = cfqq->epsector[1];
+                       /*
+                       cfqd->cid[crq->ioprio].navsec = cfqq->avsec;
+                       cfqd->cid[crq->ioprio].sec[1] = cfqq->epsector[1];
+                       */
                } else {
-                       /* queue over share ; skip once */
                        cfqq->skipped = 1;
-#ifdef LIMIT_DEBUG     
-//                     atomic_inc(&(cfqd->cid[crq->ioprio].nskip));
-//                     if (crq->ioprio >= 0 && crq->ioprio <= 20)
-//                             cfqd->cid[crq->ioprio].nskip++;
-#endif
-                       return 0;
+                       /* pause q's processing till avsec drops to 
+                          cfq_hmax_pct % of its value */
+                       tmp = (epoch+gap) * (100-cfqd->cfq_hmax_pct);
+                       do_div(tmp,1000000*cfqd->cfq_hmax_pct);
+                       cfqq->wait_end = jiffies+msecs_to_jiffies(tmp);
                }
-       }
-#endif
+       }                       
+}
 
-#ifdef LIMIT_DEBUG
-//     if (crq->ioprio >= 0 && crq->ioprio <= 20) {
-//             cfqd->cid[crq->ioprio].navsec = cfqq->avsec;
-//             cfqd->cid[crq->ioprio].csectorate = cfqq->sectorate;
-//     }
+/*
+ * remove from io scheduler core and put on dispatch list for service
+ */
+static inline int
+__cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd,
+                       struct cfq_queue *cfqq)
+{
+       struct cfq_rq *crq;
+
+       crq = rb_entry_crq(rb_first(&cfqq->sort_list));
 
-//     atomic_set(&(cfqd->cid[crq->ioprio].navsec),cfqq->avsec);
-//     atomic_set(&(cfqd->cid[crq->ioprio].csectorate),cfqq->sectorate);
-#endif
        cfq_dispatch_sort(cfqd, cfqq, crq);
 
        /*
@@ -593,44 +641,83 @@ cfq_dispatch_requests(request_queue_t *q, int prio, int max_rq, int max_sectors)
 {
        struct cfq_data *cfqd = q->elevator.elevator_data;
        struct list_head *plist = &cfqd->cid[prio].rr_list;
+       struct cfq_queue *cfqq;
        struct list_head *entry, *nxt;
        int q_rq, q_io;
-       int ret ;
+       int first_round,busy_queues,busy_unlimited;
+
 
        /*
         * for each queue at this prio level, dispatch a request
         */
        q_rq = q_io = 0;
+       first_round=1;
+ restart:
+       busy_unlimited = 0;
+       busy_queues = 0;
        list_for_each_safe(entry, nxt, plist) {
-               struct cfq_queue *cfqq = list_entry_cfqq(entry);
+               cfqq = list_entry_cfqq(entry);
 
                BUG_ON(RB_EMPTY(&cfqq->sort_list));
+               busy_queues++;
 
-               ret = __cfq_dispatch_requests(q, cfqd, cfqq);
-               if (ret <= 0) {
-                       continue; /* skip queue */
-                       /* can optimize more by moving q to end of plist ? */
+               
+               if (first_round || busy_unlimited)
+                       __cfq_check_limit(cfqd,cfqq,0);
+               else
+                       __cfq_check_limit(cfqd,cfqq,1);
+
+               if (cfqq->skipped) {
+                       cfqq->cfqpriv->nskip++;
+                       /* cfqd->cid[prio].nskip++; */
+                       busy_queues--;
+                       if (time_before(jiffies, cfqq->wait_end)) {
+                               list_del(&cfqq->cfq_list);
+                               mod_timer(&cfqq->timer,cfqq->wait_end);
+                       }
+                       continue;
                }
-               q_io += ret ;
-               q_rq++ ;
+               busy_unlimited++;
+
+               q_io += __cfq_dispatch_requests(q, cfqd, cfqq);
+               q_rq++;
 
-               if (RB_EMPTY(&cfqq->sort_list))
+               if (RB_EMPTY(&cfqq->sort_list)) {
+                       busy_unlimited--;
+                       busy_queues--;
                        cfq_put_queue(cfqd, cfqq);
-               /*
-                * if we hit the queue limit, put the string of serviced
-                * queues at the back of the pending list
-                */
+               } 
+
                if (q_io >= max_sectors || q_rq >= max_rq) {
+#if 0
                        struct list_head *prv = nxt->prev;
 
                        if (prv != plist) {
                                list_del(plist);
                                list_add(plist, prv);
                        }
+#endif
                        break;
                }
        }
 
+       if ((q_io < max_sectors) && (q_rq < max_rq) && 
+           (busy_queues || first_round))
+       {
+               first_round = 0;
+               goto restart;
+       } else {
+               /*
+                * if we hit the queue limit, put the string of serviced
+                * queues at the back of the pending list
+                */
+               struct list_head *prv = nxt->prev;
+               if (prv != plist) {
+                       list_del(plist);
+                       list_add(plist, prv);
+               }
+       }
+
        cfqd->cid[prio].last_rq = q_rq;
        cfqd->cid[prio].last_sectors = q_io;
        return q_rq;
@@ -806,6 +893,29 @@ static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
        mempool_free(cfqq, cfq_mpool);
 }
 
+static void cfq_pauseq_timer(unsigned long data)
+{
+       struct cfq_queue *cfqq = (struct cfq_queue *) data;
+       kblockd_schedule_work(&cfqq->work);
+}
+
+static void cfq_pauseq_work(void *data)
+{
+       struct cfq_queue *cfqq = (struct cfq_queue *) data;
+       struct cfq_data *cfqd = cfqq->cfqd;
+       request_queue_t *q = cfqd->queue;
+       unsigned long flags;
+       
+       spin_lock_irqsave(q->queue_lock, flags);
+       list_add_tail(&cfqq->cfq_list,&cfqd->cid[cfqq->ioprio].rr_list);
+       cfqq->skipped = 0;
+       if (cfq_next_request(q))
+               q->request_fn(q);
+       spin_unlock_irqrestore(q->queue_lock, flags);
+
+       //del_timer(&cfqq->timer);
+}      
+
 static struct cfq_queue *__cfq_get_queue(struct cfq_data *cfqd, int hashkey,
                                         int gfp_mask)
 {
@@ -833,9 +943,22 @@ retry:
                INIT_LIST_HEAD(&cfqq->cfq_list);
                cfqq->hash_key = cfq_hash_key(current);
                cfqq->ioprio = cfq_ioprio(current);
-               cfqq->avsec = 0 ;
-               cfqq->lastime = sched_clock();
-               cfqq->sectorate = (cfqd->cfq_epochsectors * CFQ_TEMPLIM)/100;
+               
+               cfqq->cfqpriv = cfq_cfqpriv(cfqd,current);
+               if (!cfqq->cfqpriv)
+                       cfqq->cfqpriv = &((cfqd->cid[cfqq->ioprio]).cfqpriv);
+
+               cfqq->epstart = sched_clock();
+               /* epsector, avsec, skipped initialized to zero by memset */
+               
+               init_timer(&cfqq->timer);
+               cfqq->timer.function = cfq_pauseq_timer;
+               cfqq->timer.data = (unsigned long) cfqq;
+
+               INIT_WORK(&cfqq->work, cfq_pauseq_work, cfqq); 
+
+               cfqq->cfqd = cfqd ;
+
                hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
        }
 
@@ -1132,6 +1255,8 @@ static void cfq_exit(request_queue_t *q, elevator_t *e)
        kfree(cfqd);
 }
 
+       
+
 static void cfq_timer(unsigned long data)
 {
        struct cfq_data *cfqd = (struct cfq_data *) data;
@@ -1182,12 +1307,12 @@ static int cfq_init(request_queue_t *q, elevator_t *e)
                atomic_set(&cid->cum_sectors_out,0);            
                atomic_set(&cid->cum_queues_in,0);
                atomic_set(&cid->cum_queues_out,0);
-#if 0
-               atomic_set(&cid->nskip,0);
-               atomic_set(&cid->navsec,0);
-               atomic_set(&cid->csectorate,0);
-               atomic_set(&cid->lsectorate,0);
-#endif
+
+               
+               atomic_set(&((cid->cfqpriv).sectorate),CFQ_SECTORATE);
+               (cid->cfqpriv).nskip = 0;
+               (cid->cfqpriv).navsec = 0;
+               (cid->cfqpriv).timedout = 0;
        }
 
        cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES,
@@ -1217,6 +1342,9 @@ static int cfq_init(request_queue_t *q, elevator_t *e)
        cfqd->cfq_idle_quantum_io = cfq_idle_quantum_io;
        cfqd->cfq_grace_rt = cfq_grace_rt;
        cfqd->cfq_grace_idle = cfq_grace_idle;
+       
+       cfqd->cfq_epoch = CFQ_EPOCH;
+       cfqd->cfq_hmax_pct = CFQ_HMAX_PCT;
 
        q->nr_requests <<= 2;
 
@@ -1224,14 +1352,6 @@ static int cfq_init(request_queue_t *q, elevator_t *e)
        e->elevator_data = cfqd;
        cfqd->queue = q;
 
-       cfqd->cfq_epoch = CFQ_EPOCH;
-       if (q->hardsect_size)
-               cfqd->cfq_epochsectors = ((CFQ_DISKBW * 1000000)/
-                                     q->hardsect_size)* (1000000 / CFQ_EPOCH);
-       else
-               cfqd->cfq_epochsectors = ((CFQ_DISKBW * 1000000)/512)
-                       * (1000000 / CFQ_EPOCH) ;
-
        return 0;
 out_crqpool:
        kfree(cfqd->cfq_hash);
@@ -1302,6 +1422,8 @@ SHOW_FUNCTION(cfq_idle_quantum_io_show, cfqd->cfq_idle_quantum_io);
 SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued);
 SHOW_FUNCTION(cfq_grace_rt_show, cfqd->cfq_grace_rt);
 SHOW_FUNCTION(cfq_grace_idle_show, cfqd->cfq_grace_idle);
+SHOW_FUNCTION(cfq_epoch_show, cfqd->cfq_epoch);
+SHOW_FUNCTION(cfq_hmax_pct_show, cfqd->cfq_hmax_pct);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)                                \
@@ -1321,63 +1443,38 @@ STORE_FUNCTION(cfq_idle_quantum_io_store, &cfqd->cfq_idle_quantum_io, 4, INT_MAX
 STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, INT_MAX);
 STORE_FUNCTION(cfq_grace_rt_store, &cfqd->cfq_grace_rt, 0, INT_MAX);
 STORE_FUNCTION(cfq_grace_idle_store, &cfqd->cfq_grace_idle, 0, INT_MAX);
+STORE_FUNCTION(cfq_epoch_store, &cfqd->cfq_epoch, 0, INT_MAX);
+STORE_FUNCTION(cfq_hmax_pct_store, &cfqd->cfq_hmax_pct, 1, 100);
 #undef STORE_FUNCTION
 
 
-static ssize_t cfq_epoch_show(struct cfq_data *cfqd, char *page)
-{
-       return sprintf(page, "%lu\n", cfqd->cfq_epoch);
-}
-
-static ssize_t cfq_epoch_store(struct cfq_data *cfqd, const char *page, size_t count)
-{
-       char *p = (char *) page;
-       cfqd->cfq_epoch = simple_strtoul(p, &p, 10);
-       return count;
-}
-
-static ssize_t cfq_epochsectors_show(struct cfq_data *cfqd, char *page)
-{
-       return sprintf(page, "%lu\n", cfqd->cfq_epochsectors);
-}
-
-static ssize_t 
-cfq_epochsectors_store(struct cfq_data *cfqd, const char *page, size_t count)
-{
-       char *p = (char *) page;
-       cfqd->cfq_epochsectors = simple_strtoul(p, &p, 10);
-       return count;
-}
-
 /* Additional entries to get priority level data */
 static ssize_t
 cfq_prio_show(struct cfq_data *cfqd, char *page, unsigned int priolvl)
 {
-       int r1,r2,s1,s2,q1,q2;
+    //int r1,r2,s1,s2,q1,q2;
 
        if (!(priolvl >= IOPRIO_IDLE && priolvl <= IOPRIO_RT)) 
                return 0;
        
+       /*
        r1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_rq_in));
        r2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_rq_out));
        s1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_sectors_in));
        s2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_sectors_out));
        q1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_queues_in)); 
        q2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_queues_out));
-       
-       return sprintf(page,"skip %d avsec %lu rate %lu new %lu"
-                      "rq (%d,%d) sec (%d,%d) q (%d,%d)\n",
-                      cfqd->cid[priolvl].nskip,
-                      cfqd->cid[priolvl].navsec,
-                      cfqd->cid[priolvl].csectorate,
-                      cfqd->cid[priolvl].lsectorate,
-//                    atomic_read(&cfqd->cid[priolvl].nskip),
-//                    atomic_read(&cfqd->cid[priolvl].navsec),
-//                    atomic_read(&cfqd->cid[priolvl].csectorate),
-//                    atomic_read(&cfqd->cid[priolvl].lsectorate),
-                      r1,r2,
-                      s1,s2,
-                      q1,q2);
+       */
+
+       return sprintf(page,"skip %d timdout %d avsec %lu rate %ld "
+                      " sec0 %lu sec1 %lu\n",
+                      cfqd->cid[priolvl].cfqpriv.nskip,
+                      cfqd->cid[priolvl].cfqpriv.timedout,
+                      cfqd->cid[priolvl].cfqpriv.navsec,
+                      atomic_read(&(cfqd->cid[priolvl].cfqpriv.sectorate)),
+                      (unsigned long)cfqd->cid[priolvl].cfqpriv.sec[0],
+                      (unsigned long)cfqd->cid[priolvl].cfqpriv.sec[1]);
+
 }
 
 #define SHOW_PRIO_DATA(__PRIOLVL)                                               \
@@ -1411,12 +1508,25 @@ SHOW_PRIO_DATA(20);
 
 static ssize_t cfq_prio_store(struct cfq_data *cfqd, const char *page, size_t count, int priolvl)
 {      
+
+       char *p = (char *) page;
+       int val;
+
+       val = (int) simple_strtoul(p, &p, 10);
+
+       atomic_set(&(cfqd->cid[priolvl].cfqpriv.sectorate),val);
+       cfqd->cid[priolvl].cfqpriv.nskip = 0;
+       cfqd->cid[priolvl].cfqpriv.navsec = 0;
+       cfqd->cid[priolvl].cfqpriv.timedout = 0;
+
+#if 0
        atomic_set(&(cfqd->cid[priolvl].cum_rq_in),0);
        atomic_set(&(cfqd->cid[priolvl].cum_rq_out),0);
        atomic_set(&(cfqd->cid[priolvl].cum_sectors_in),0);
        atomic_set(&(cfqd->cid[priolvl].cum_sectors_out),0);
        atomic_set(&(cfqd->cid[priolvl].cum_queues_in),0);
        atomic_set(&(cfqd->cid[priolvl].cum_queues_out),0);
+#endif
 
        return count;
 }
@@ -1491,10 +1601,10 @@ static struct cfq_fs_entry cfq_epoch_entry = {
        .show = cfq_epoch_show,
        .store = cfq_epoch_store,
 };
-static struct cfq_fs_entry cfq_epochsectors_entry = {
-       .attr = {.name = "epochsectors", .mode = S_IRUGO | S_IWUSR },
-       .show = cfq_epochsectors_show,
-       .store = cfq_epochsectors_store,
+static struct cfq_fs_entry cfq_hmax_pct_entry = {
+       .attr = {.name = "hmaxpct", .mode = S_IRUGO | S_IWUSR },
+       .show = cfq_hmax_pct_show,
+       .store = cfq_hmax_pct_store,
 };
 
 #define P_0_STR   "p0"
@@ -1558,7 +1668,7 @@ static struct attribute *default_attrs[] = {
        &cfq_grace_rt_entry.attr,
        &cfq_grace_idle_entry.attr,
        &cfq_epoch_entry.attr,
-       &cfq_epochsectors_entry.attr,
+       &cfq_hmax_pct_entry.attr,
        &cfq_prio_0_entry.attr,
        &cfq_prio_1_entry.attr,
        &cfq_prio_2_entry.attr,
index 7edfce7..8991026 100644 (file)
 #include <linux/ckrm_tc.h>
 #include <linux/ckrm-io.h>
 
-/* Tie to cfq priorities */
-#define CKI_IOPRIO_NORM                IOPRIO_NORM
+/* sectorate == 512 byte sectors served in CFQ_EPOCH ns*/
 
-/* Divisor to get fraction of bandwidth represented by an IOPRIO value */
-/* FIXME: Will not work if IOPRIO_NR > 100 */
-#define CKI_IOPRIO_DIV         (IOPRIO_NR-1)
-/* Minimum ioprio value to be assigned to a class */
-#define CKI_IOPRIO_MIN         1
+/* CKI_ROOTSECTORATE needs to be made configurable from outside */
+#define CKI_ROOTSECTORATE      100000
+#define CKI_MINSECTORATE       100
 
 #define CKI_IOUSAGE_UNIT       512
 
@@ -52,7 +49,12 @@ typedef struct ckrm_io_stats{
        unsigned long        blksz;  /* size of bandwidth unit */
        atomic_t             blkrd;  /* read units submitted to DD */
        atomic_t             blkwr; /* write units submitted to DD */
-       
+
+       int nskip;                      /* # times q skipped    */
+       unsigned long navsec;           /* avg sectors serviced */
+       int timedout;                   /* # times gap > epoch  */
+       u64 sec[2];                     /* sectors serviced in 
+                                          prev & curr epochs   */
 } cki_stats_t;          /* per class I/O statistics */
 
 /* Note
@@ -75,8 +77,12 @@ typedef struct ckrm_io_class {
         * in local units. 
         */
 
+       cfqlim_t cfqpriv;       /* Data common with cfq priolvl's */    
+
+
        int cnt_guarantee; /* Allocation as parent */
        int cnt_unused;    /* Allocation to default subclass */
+       int cnt_limit;
 
        /* Statistics, for class and default subclass */
        cki_stats_t stats; 
@@ -85,19 +91,16 @@ typedef struct ckrm_io_class {
 } cki_icls_t;
 
 
-
 /* Internal functions */
 static inline void cki_reset_stats(cki_stats_t *usg);
 static inline void init_icls_one(cki_icls_t *icls);
-static inline int cki_div(int *a, int b, int c);
-//static inline int cki_recalc(cki_icls_t *icls, int rel2abs);
 static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres);
 
 /* External functions e.g. interface to ioscheduler */
 void *cki_tsk_icls (struct task_struct *tsk);
 int cki_tsk_ioprio (struct task_struct *tsk);
 
-extern void cki_cfq_set(icls_tsk_t tskicls, icls_ioprio_t tskioprio);
+extern void cki_cfq_set(icls_tsk_t tskicls, icls_ioprio_t tskioprio, icls_tsk_t tskcfqpriv);
 
 /* CKRM Resource Controller API functions */
 static void * cki_alloc(struct ckrm_core_class *this,
@@ -139,45 +142,27 @@ static inline void init_icls_stats(cki_icls_t *icls)
 
 static inline void init_icls_one(cki_icls_t *icls)
 {
-       // Assign zero as initial guarantee otherwise creations
-       // could fail due to inadequate share
-
-       //icls->shares.my_guarantee = 
-       //      (CKI_IOPRIO_MIN * CKRM_SHARE_DFLT_TOTAL_GUARANTEE) / 
-       //      CKI_IOPRIO_DIV ;
-       icls->shares.my_guarantee = 0;
-       icls->shares.my_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-       icls->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-       icls->shares.max_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
+       /* Zero initial guarantee for scalable creation of
+          multiple classes */
 
-       icls->shares.unused_guarantee = icls->shares.total_guarantee - 
-               icls->shares.my_guarantee;
-       icls->shares.cur_max_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-
-
-       icls->cnt_guarantee = icls->cnt_unused = IOPRIO_IDLE;
+       /* Try out a new set */
+       
+       icls->shares.my_guarantee = CKRM_SHARE_DONTCARE;
+       icls->shares.my_limit = CKRM_SHARE_DONTCARE;
+       icls->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
+       icls->shares.max_limit = CKRM_SHARE_DFLT_MAX_LIMIT;
+       icls->shares.unused_guarantee = icls->shares.total_guarantee;
+       icls->shares.cur_max_limit = 0;
 
-       //Same rationale icls->ioprio = CKI_IOPRIO_MIN;
-       //IOPRIO_IDLE equivalence to zero my_guarantee (set above) relies
-       //on former being zero.
+       icls->cnt_guarantee = CKRM_SHARE_DONTCARE;
+       icls->cnt_unused = CKRM_SHARE_DONTCARE;
+       icls->cnt_limit = CKRM_SHARE_DONTCARE;
        
        init_icls_stats(icls);
 }
 
-
-static inline int cki_div(int *a, int b, int c)
-{
-       u64 temp = (u64) b * c ;
-       do_div(temp,CKI_IOPRIO_DIV);
-       *a = (int) temp;
-
-       return 0;
-}
-       
-
-/* Recalculate absolute shares from relative (rel2abs=1)
- * or vice versa (rel2abs=0) 
- * Caller should have a lock on icls
+/* Recalculate absolute shares from relative
+ * Caller should hold a lock on icls
  */
 
 static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres)
@@ -186,17 +171,17 @@ static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres)
        ckrm_core_class_t *child = NULL;
        cki_icls_t *childres;
        int resid = cki_rcbs.resid;
+       u64 temp;
 
        if (parres) {
                struct ckrm_shares *par = &parres->shares;
                struct ckrm_shares *self = &res->shares;
 
 
-
                if (parres->cnt_guarantee == CKRM_SHARE_DONTCARE) {
                        res->cnt_guarantee = CKRM_SHARE_DONTCARE;
                } else if (par->total_guarantee) {
-                       u64 temp = (u64) self->my_guarantee * 
+                       temp = (u64) self->my_guarantee * 
                                parres->cnt_guarantee;
                        do_div(temp, par->total_guarantee);
                        res->cnt_guarantee = (int) temp;
@@ -204,16 +189,36 @@ static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres)
                        res->cnt_guarantee = 0;
                }
 
+
+               if (parres->cnt_limit == CKRM_SHARE_DONTCARE) {
+                       res->cnt_limit = CKRM_SHARE_DONTCARE;
+                       atomic_set(&res->cfqpriv.sectorate,CKI_MINSECTORATE);
+               } else {
+                       if (par->max_limit) {
+                               temp = (u64) self->my_limit * 
+                                       parres->cnt_limit;
+                               do_div(temp, par->max_limit);
+                               res->cnt_limit = (int) temp;
+                       } else {
+                               res->cnt_limit = 0;
+                       }
+                       atomic_set(&res->cfqpriv.sectorate,res->cnt_limit);
+               }
+               
                if (res->cnt_guarantee == CKRM_SHARE_DONTCARE) {
                        res->cnt_unused = CKRM_SHARE_DONTCARE;
-               } else if (self->total_guarantee) {
-                       u64 temp = (u64) self->unused_guarantee * 
-                               res->cnt_guarantee;
-                       do_div(temp, self->total_guarantee);
-                       res->cnt_unused = (int) temp;
                } else {
-                       res->cnt_unused = 0;
+                       if (self->total_guarantee) {
+                               temp = (u64) self->unused_guarantee * 
+                                       res->cnt_guarantee;
+                               do_div(temp, self->total_guarantee);
+                               res->cnt_unused = (int) temp;
+                       } else {
+                               res->cnt_unused = 0;
+                       }
+
                }
+               
        }
        // propagate to children
        ckrm_lock_hier(res->core);
@@ -228,50 +233,6 @@ static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres)
        ckrm_unlock_hier(res->core);
 }
 
-#if 0
-static inline int cki_recalc(cki_icls_t *icls, int rel2abs)
-{
-       u64 temp;
-
-       if (icls->parent == NULL) {
-               /* Root, as parent, always gets all */
-
-               temp = icls->shares.my_guarantee * (IOPRIO_NR-1);
-               do_div(temp, icls->shares.total_guarantee);
-
-               icls->total = IOPRIO_NR-1;
-               icls->ioprio = temp ;
-               icls->unused = icls->total - icls->ioprio;
-//             icls->unused = (IOPRIO_NR-1)-icls->ioprio;
-
-       } else {
-               cki_icls_t *parres;
-               int partot ;
-               
-               parres = ckrm_get_res_class(icls->parent,
-                                           cki_rcbs.resid,
-                                           cki_icls_t);
-               if (!parres) {
-                       printk(KERN_ERR "cki_recalc: error getting "
-                              "resclass from core \n");
-                       return -EINVAL;
-               }
-
-
-               temp = (icls->shares.my_guarantee * 
-                       parres->total);
-               do_div(temp, parres->shares.total_guarantee);
-
-               icls->ioprio = temp;
-               icls->unused = 0;
-
-       }
-       
-       return 0;
-
-}
-#endif
-
 void *cki_tsk_icls(struct task_struct *tsk)
 {
        return (void *) ckrm_get_res_class(class_core(tsk->taskclass),
@@ -279,12 +240,19 @@ void *cki_tsk_icls(struct task_struct *tsk)
 }
 
 int cki_tsk_ioprio(struct task_struct *tsk)
+{
+       /* Don't use I/O priorities for now */
+       return IOPRIO_NORM;
+}
+
+void *cki_tsk_cfqpriv(struct task_struct *tsk)
 {
        cki_icls_t *icls = ckrm_get_res_class(class_core(tsk->taskclass),
                                           cki_rcbs.resid, cki_icls_t);
-       return icls->cnt_unused;
+       return (void *)&(icls->cfqpriv);
 }
 
+
 static void *cki_alloc(struct ckrm_core_class *core,
                         struct ckrm_core_class *parent)
 {
@@ -301,43 +269,13 @@ static void *cki_alloc(struct ckrm_core_class *core,
        icls->parent = parent;
        icls->shares_lock = SPIN_LOCK_UNLOCKED;
 
-       if (parent == NULL) {
-
-               /* Root class gets same as "normal" CFQ priorities to
-                * retain compatibility of behaviour in the absence of 
-                * other classes
-                */
-
-               icls->cnt_guarantee = icls->cnt_unused = IOPRIO_NR-1; 
-
-               /* Default gets normal, not minimum */
-               //icls->unused = IOPRIO_NORM;
-               //icls->unused = icls->guarantee-icls->myguarantee;
-               //icls->limit = icls->mylimit = IOPRIO_NR;
-
-               /* Compute shares in abstract units */
-               icls->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-
-               // my_guarantee for root is meaningless. Set to default
-               icls->shares.my_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
+       init_icls_one(icls);
 
-               icls->shares.unused_guarantee = 
-                       CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-
-               //temp = (u64) icls->cnt_unused * icls->shares.total_guarantee;
-               //do_div(temp, CKI_IOPRIO_DIV); 
-               // temp now has root's default's share
-               //icls->shares.unused_guarantee = 
-               // icls->shares.total_guarantee - temp; 
-
-               icls->shares.my_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-               icls->shares.max_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-               icls->shares.cur_max_limit = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-
-       } else {
-               init_icls_one(icls);
-               /* No propagation to parent needed if icls'
-                  initial share is zero */
+       if (parent == NULL) {
+               icls->cnt_guarantee =  CKI_ROOTSECTORATE;
+               icls->cnt_unused = CKI_ROOTSECTORATE;
+               icls->cnt_limit = CKI_ROOTSECTORATE;
+               atomic_set(&(icls->cfqpriv.sectorate),icls->cnt_limit);
        }
        try_module_get(THIS_MODULE);
        return icls;
@@ -345,7 +283,10 @@ static void *cki_alloc(struct ckrm_core_class *core,
 
 static void cki_free(void *res)
 {
-       cki_icls_t *icls = res, *parres;
+       cki_icls_t *icls = res, *parres, *childres;
+       ckrm_core_class_t *child = NULL;
+       int maxlimit, resid = cki_rcbs.resid;
+
        
        if (!res)
                return;
@@ -361,9 +302,7 @@ static void cki_free(void *res)
         *
         */
 
-       parres = ckrm_get_res_class(icls->parent,
-                                   cki_rcbs.resid,
-                                   cki_icls_t);
+       parres = ckrm_get_res_class(icls->parent, resid, cki_icls_t);
        if (!parres) {
                printk(KERN_ERR "cki_free: error getting "
                       "resclass from core \n");
@@ -372,8 +311,23 @@ static void cki_free(void *res)
 
        /* Update parent's shares */
        spin_lock(&parres->shares_lock);
+
        child_guarantee_changed(&parres->shares, icls->shares.my_guarantee, 0);
        parres->cnt_unused += icls->cnt_guarantee;
+
+       // run thru parent's children and get the new max_limit of the parent
+       ckrm_lock_hier(parres->core);
+       maxlimit = 0;
+       while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
+               childres = ckrm_get_res_class(child, resid, cki_icls_t);
+               if (maxlimit < childres->shares.my_limit) {
+                       maxlimit = childres->shares.my_limit;
+               }
+       }
+       ckrm_unlock_hier(parres->core);
+       if (parres->shares.cur_max_limit < maxlimit) {
+               parres->shares.cur_max_limit = maxlimit;
+       }
        spin_unlock(&parres->shares_lock);
 
        kfree(res);
@@ -388,26 +342,15 @@ static int cki_setshare(void *res, struct ckrm_shares *new)
        struct ckrm_shares *cur, *par;
        int rc = -EINVAL, resid = cki_rcbs.resid;
 
-       if (!icls) {
-               printk(KERN_ERR "No class\n");
+       if (!icls) 
                return rc;
-       }
 
        cur = &icls->shares; 
-
-       /* limits not supported */
-       if ((new->max_limit != CKRM_SHARE_UNCHANGED)
-           || (new->my_limit != CKRM_SHARE_UNCHANGED)) {
-               printk(KERN_ERR "limits not supported\n");
-               return -EINVAL;
-       }
-
        if (icls->parent) {
                parres =
                    ckrm_get_res_class(icls->parent, resid, cki_icls_t);
                if (!parres) {
-                       printk(KERN_ERR "cki_setshare: error getting "
-                              "resclass from core \n");
+                       pr_debug("cki_setshare: invalid resclass\n");
                        return -EINVAL;
                }
                spin_lock(&parres->shares_lock);
@@ -420,10 +363,8 @@ static int cki_setshare(void *res, struct ckrm_shares *new)
        }
 
        rc = set_shares(new, cur, par);
-       printk(KERN_ERR "rc from set_shares %d\n", rc);
 
        if ((!rc) && parres) {
-               
                if (parres->cnt_guarantee == CKRM_SHARE_DONTCARE) {
                        parres->cnt_unused = CKRM_SHARE_DONTCARE;
                } else if (par->total_guarantee) {
@@ -435,17 +376,6 @@ static int cki_setshare(void *res, struct ckrm_shares *new)
                        parres->cnt_unused = 0;
                }
                cki_recalc_propagate(res, parres);
-       
-#if 0
-               int old = icls->ioprio;
-               
-               rc = cki_recalc(icls,0);
-
-               if (!rc && parres) {
-                       int raise_tot = icls->ioprio - old ;
-                       parres->unused -= raise_tot ;
-               }
-#endif
        }
        spin_unlock(&icls->shares_lock);
        if (icls->parent) {
@@ -471,15 +401,15 @@ static int cki_getstats(void *res, struct seq_file *sfile)
        if (!icls)
                return -EINVAL;
 
-/*     
-       seq_printf(sfile, "%d my_read\n",atomic_read(&icls->mystats.blkrd));
-       seq_printf(sfile, "%d my_write\n",atomic_read(&icls->mystats.blkwr));
-       seq_printf(sfile, "%d total_read\n",atomic_read(&icls->stats.blkrd));
-       seq_printf(sfile, "%d total_write\n",atomic_read(&icls->stats.blkwr));
-*/
-       
-       seq_printf(sfile, "%d total ioprio\n",icls->cnt_guarantee);
-       seq_printf(sfile, "%d unused/default ioprio\n",icls->cnt_unused);
+       seq_printf(sfile, "abs limit %d\n",icls->cnt_limit);
+       seq_printf(sfile, "skip %d timdout %d avsec %lu rate %ld "
+                  " sec0 %ld sec1 %ld\n",
+                  icls->cfqpriv.nskip,
+                  icls->cfqpriv.timedout,
+                  icls->cfqpriv.navsec,
+                  atomic_read(&(icls->cfqpriv.sectorate)),
+                  (unsigned long)icls->cfqpriv.sec[0],
+                  (unsigned long)icls->cfqpriv.sec[1]);
 
        return 0;
 }
@@ -554,7 +484,7 @@ int __init cki_init(void)
                resid = ckrm_register_res_ctlr(clstype, &cki_rcbs);
                if (resid != -1) {
                        cki_rcbs.classtype = clstype;
-                       cki_cfq_set(cki_tsk_icls,cki_tsk_ioprio);
+                       cki_cfq_set(cki_tsk_icls,cki_tsk_ioprio,cki_tsk_cfqpriv);
                }
        }
        
@@ -566,7 +496,7 @@ void __exit cki_exit(void)
        ckrm_unregister_res_ctlr(&cki_rcbs);
        cki_rcbs.resid = -1;
        cki_rcbs.classtype = NULL; 
-       cki_cfq_set(NULL,NULL);
+       cki_cfq_set(NULL,NULL,NULL);
 }
 
 module_init(cki_init)
index 63beff3..f401254 100644 (file)
@@ -25,17 +25,18 @@ static spinlock_t stub_lock = SPIN_LOCK_UNLOCKED;
 
 static icls_tsk_t tskiclstub;
 static icls_ioprio_t tskiopriostub;
+static icls_tsk_t tskcfqprivstub;
 
-
-void cki_cfq_set(icls_tsk_t tskicls, icls_ioprio_t tskioprio)
+void cki_cfq_set(icls_tsk_t tskicls, icls_ioprio_t tskioprio, icls_tsk_t tskcfqpriv)
 {
        spin_lock(&stub_lock);
        tskiclstub = tskicls;
        tskiopriostub = tskioprio;
+       tskcfqprivstub = tskcfqpriv;
        spin_unlock(&stub_lock);
 }
 
-inline void *cki_hash_key(struct task_struct *tsk)
+void *cki_hash_key(struct task_struct *tsk)
 {
        void *ret;
        spin_lock(&stub_lock);
@@ -47,7 +48,7 @@ inline void *cki_hash_key(struct task_struct *tsk)
        return ret;
 }
 
-inline int cki_ioprio(struct task_struct *tsk)
+int cki_ioprio(struct task_struct *tsk)
 {
        int ret;
        spin_lock(&stub_lock);
@@ -59,6 +60,19 @@ inline int cki_ioprio(struct task_struct *tsk)
        return ret;
 }
 
+void *cki_cfqpriv(struct task_struct *tsk)
+{
+       void *ret;
+       spin_lock(&stub_lock);
+       if (tskiclstub)
+               ret = (*tskcfqprivstub)(tsk);
+       else 
+               ret = NULL;
+       spin_unlock(&stub_lock);
+       return ret;
+}    
+
 EXPORT_SYMBOL(cki_cfq_set);
 EXPORT_SYMBOL(cki_hash_key);
 EXPORT_SYMBOL(cki_ioprio);
+EXPORT_SYMBOL(cki_cfqpriv);
index 36040b9..70277c7 100644 (file)
@@ -34,6 +34,7 @@ typedef int (*icls_ioprio_t) (struct task_struct *tsk);
 
 extern void *cki_tsk_icls (struct task_struct *tsk);
 extern int cki_tsk_ioprio (struct task_struct *tsk);
+extern void *cki_tsk_cfqpriv (struct task_struct *tsk);
 
 #endif /* CONFIG_CKRM_RES_BLKIO */
 
index ece31a7..11067b7 100644 (file)
@@ -1603,6 +1603,15 @@ static inline void free_secdata(void *secdata)
 asmlinkage int sys_ioprio_set(int ioprio);
 asmlinkage int sys_ioprio_get(void);
 
+/* common structure for cfq & ckrm I/O controller */
+typedef struct cfqlim {
+       int nskip;
+       unsigned long navsec;
+       int timedout;
+       atomic_t sectorate;
+       u64 sec[2];
+} cfqlim_t ;
+
 
 #endif /* __KERNEL__ */
 #endif /* _LINUX_FS_H */
index b325309..4956dcb 100644 (file)
@@ -8,6 +8,6 @@ endif
     obj-$(CONFIG_CKRM_TYPE_TASKCLASS)  += ckrm_tc.o
     obj-$(CONFIG_CKRM_RES_NUMTASKS)    += ckrm_numtasks.o
     obj-$(CONFIG_CKRM_TYPE_SOCKETCLASS) += ckrm_sockc.o
-    obj-$(CONFIG_CKRM_RES_LISTENAQ)    += ckrm_laq.o
+    obj-$(CONFIG_CKRM_RES_LISTENAQ)    += ckrm_listenaq.o
     obj-$(CONFIG_CKRM_CPU_SCHEDULE)     += ckrm_cpu_class.o ckrm_cpu_monitor.o
     obj-$(CONFIG_CKRM_RES_MEM)                 += ckrm_mem.o
diff --git a/kernel/ckrm/ckrm_laq.c b/kernel/ckrm/ckrm_laq.c
deleted file mode 100644 (file)
index b64205a..0000000
+++ /dev/null
@@ -1,495 +0,0 @@
-/* ckrm_socketaq.c - accept queue resource controller
- *
- * Copyright (C) Vivek Kashyap,      IBM Corp. 2004
- * 
- * Latest version, more details at http://ckrm.sf.net
- * 
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- */
-
-/* Changes
- * Initial version
- */
-
-/* Code Description: TBD
- *
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <asm/errno.h>
-#include <linux/list.h>
-#include <linux/spinlock.h>
-#include <linux/ckrm.h>
-#include <linux/ckrm_rc.h>
-#include <net/tcp.h>
-
-#include <linux/ckrm_net.h>
-
-#define hnode_2_core(ptr) \
-        ((ptr) ? container_of(ptr, struct ckrm_core_class, hnode) : NULL)
-
-#define CKRM_SAQ_MAX_DEPTH     3       // 0 => /rcfs
-                                 // 1 => socket_aq
-                                 // 2 => socket_aq/listen_class
-                                 // 3 => socket_aq/listen_class/accept_queues
-                                 // 4 => Not allowed
-
-typedef struct ckrm_laq_res {
-       spinlock_t reslock;
-       atomic_t refcnt;
-       struct ckrm_shares shares;
-       struct ckrm_core_class *core;
-       struct ckrm_core_class *pcore;
-       int my_depth;
-       int my_id;
-       unsigned int min_ratio;
-} ckrm_laq_res_t;
-
-static int my_resid = -1;
-
-extern struct ckrm_core_class *rcfs_create_under_netroot(char *, int, int);
-extern struct ckrm_core_class *rcfs_make_core(struct dentry *,
-                                             struct ckrm_core_class *);
-
-void laq_res_hold(struct ckrm_laq_res *res)
-{
-       atomic_inc(&res->refcnt);
-       return;
-}
-
-void laq_res_put(struct ckrm_laq_res *res)
-{
-       if (atomic_dec_and_test(&res->refcnt))
-               kfree(res);
-       return;
-}
-
-/* Initialize rescls values
- */
-static void laq_res_initcls(void *my_res)
-{
-       ckrm_laq_res_t *res = my_res;
-
-       res->shares.my_guarantee = CKRM_SHARE_DONTCARE;
-       res->shares.my_limit = CKRM_SHARE_DONTCARE;
-       res->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-       res->shares.max_limit = CKRM_SHARE_DFLT_MAX_LIMIT;
-       res->shares.unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-       res->shares.cur_max_limit = 0;
-}
-
-static int atoi(char *s)
-{
-       int k = 0;
-       while (*s)
-               k = *s++ - '0' + (k * 10);
-       return k;
-}
-
-static char *laq_get_name(struct ckrm_core_class *c)
-{
-       char *p = (char *)c->name;
-
-       while (*p)
-               p++;
-       while (*p != '/' && p != c->name)
-               p--;
-
-       return ++p;
-}
-
-static void *laq_res_alloc(struct ckrm_core_class *core,
-                          struct ckrm_core_class *parent)
-{
-       ckrm_laq_res_t *res, *pres;
-       int pdepth;
-
-       if (parent)
-               pres = ckrm_get_res_class(parent, my_resid, ckrm_laq_res_t);
-       else
-               pres = NULL;
-
-       if (core == core->classtype->default_class)
-               pdepth = 1;
-       else {
-               if (!parent)
-                       return NULL;
-               pdepth = 1 + pres->my_depth;
-       }
-
-       res = kmalloc(sizeof(ckrm_laq_res_t), GFP_ATOMIC);
-       if (res) {
-               memset(res, 0, sizeof(res));
-               spin_lock_init(&res->reslock);
-               laq_res_hold(res);
-               res->my_depth = pdepth;
-               if (pdepth == 2)        // listen class
-                       res->my_id = 0;
-               else if (pdepth == 3)
-                       res->my_id = atoi(laq_get_name(core));
-               res->core = core;
-               res->pcore = parent;
-
-               // rescls in place, now initialize contents other than 
-               // hierarchy pointers
-               laq_res_initcls(res);   // acts as initialising value
-       }
-
-       return res;
-}
-
-static void laq_res_free(void *my_res)
-{
-       ckrm_laq_res_t *res = (ckrm_laq_res_t *) my_res;
-       ckrm_laq_res_t *parent;
-
-       if (!res)
-               return;
-
-       if (res->my_depth != 3) {
-               kfree(res);
-               return;
-       }
-
-       parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t);
-       if (!parent)            // Should never happen
-               return;
-
-       spin_lock(&parent->reslock);
-       spin_lock(&res->reslock);
-
-       // return child's guarantee to parent node
-       // Limits have no meaning for accept queue control
-       child_guarantee_changed(&parent->shares, res->shares.my_guarantee, 0);
-
-       spin_unlock(&res->reslock);
-       laq_res_put(res);
-       spin_unlock(&parent->reslock);
-       return;
-}
-
-/**************************************************************************
- *                     SHARES                                          ***
- **************************************************************************/
-
-void laq_set_aq_value(struct ckrm_net_struct *ns, unsigned int *aq_ratio)
-{
-       int i;
-       struct tcp_opt *tp;
-
-       tp = tcp_sk(ns->ns_sk);
-       for (i = 0; i < NUM_ACCEPT_QUEUES; i++)
-               tp->acceptq[i].aq_ratio = aq_ratio[i];
-       return;
-}
-void laq_set_aq_values(ckrm_laq_res_t * parent, unsigned int *aq_ratio)
-{
-
-       struct ckrm_net_struct *ns;
-       struct ckrm_core_class *core = parent->core;
-
-       class_lock(core);
-       list_for_each_entry(ns, &core->objlist, ckrm_link) {
-               laq_set_aq_value(ns, aq_ratio);
-       }
-       class_unlock(core);
-       return;
-}
-
-static void calculate_aq_ratios(ckrm_laq_res_t * res, unsigned int *aq_ratio)
-{
-       struct ckrm_hnode *chnode;
-       ckrm_laq_res_t *child;
-       unsigned int min;
-       int i;
-
-       min = aq_ratio[0] = (unsigned int)res->shares.unused_guarantee;
-
-       list_for_each_entry(chnode, &res->core->hnode.children, siblings) {
-               child = hnode_2_core(chnode)->res_class[my_resid];
-
-               aq_ratio[child->my_id] =
-                   (unsigned int)child->shares.my_guarantee;
-               if (aq_ratio[child->my_id] == CKRM_SHARE_DONTCARE)
-                       aq_ratio[child->my_id] = 0;
-               if (aq_ratio[child->my_id] &&
-                   ((unsigned int)aq_ratio[child->my_id] < min))
-                       min = (unsigned int)child->shares.my_guarantee;
-       }
-
-       if (min == 0) {
-               min = 1;
-               // default takes all if nothing specified
-               aq_ratio[0] = 1;        
-       }
-       res->min_ratio = min;
-
-       for (i = 0; i < NUM_ACCEPT_QUEUES; i++)
-               aq_ratio[i] = aq_ratio[i] / min;
-}
-
-static int laq_set_share_values(void *my_res, struct ckrm_shares *shares)
-{
-       ckrm_laq_res_t *res = my_res;
-       ckrm_laq_res_t *parent;
-       unsigned int aq_ratio[NUM_ACCEPT_QUEUES];
-       int rc = 0;
-
-       if (!res)
-               return -EINVAL;
-
-       if (!res->pcore) {
-               // something is badly wrong
-               printk(KERN_ERR "socketaq internal inconsistency\n");
-               return -EBADF;
-       }
-
-       parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t);
-       if (!parent)            // socketclass does not have a share interface
-               return -EINVAL;
-
-       // Ensure that we ignore limit values
-       shares->my_limit = CKRM_SHARE_DONTCARE;
-       shares->max_limit = CKRM_SHARE_UNCHANGED;
-
-       if (res->my_depth == 0) {
-               printk(KERN_ERR "socketaq bad entry\n");
-               return -EBADF;
-       } else if (res->my_depth == 1) {
-               // can't be written to. This is an internal default.
-               return -EINVAL;
-       } else if (res->my_depth == 2) {
-               //nothin to inherit
-               if (!shares->total_guarantee) {
-                       return -EINVAL;
-               }
-               parent = res;
-               shares->my_guarantee = CKRM_SHARE_DONTCARE;
-       } else if (res->my_depth == 3) {
-               // accept queue itself. 
-               shares->total_guarantee = CKRM_SHARE_UNCHANGED;
-       }
-
-       ckrm_lock_hier(parent->pcore);
-       spin_lock(&parent->reslock);
-       rc = set_shares(shares, &res->shares,
-                       (parent == res) ? NULL : &parent->shares);
-       if (rc) {
-               spin_unlock(&res->reslock);
-               ckrm_unlock_hier(res->pcore);
-               return rc;
-       }
-       calculate_aq_ratios(parent, aq_ratio);
-       laq_set_aq_values(parent, aq_ratio);
-       spin_unlock(&parent->reslock);
-       ckrm_unlock_hier(parent->pcore);
-
-       return rc;
-}
-
-static int laq_get_share_values(void *my_res, struct ckrm_shares *shares)
-{
-       ckrm_laq_res_t *res = my_res;
-
-       if (!res)
-               return -EINVAL;
-       *shares = res->shares;
-       return 0;
-}
-
-/**************************************************************************
- *                     STATS                                           ***
- **************************************************************************/
-
-void
-laq_print_aq_stats(struct seq_file *sfile, struct tcp_acceptq_info *taq, int i)
-{
-       seq_printf(sfile, "Class %d connections:\n\taccepted: %u\n\t"
-                  "queued: %u\n\twait_time: %u\n",
-                  i, taq->acceptq_count, taq->acceptq_qcount,
-                  jiffies_to_msecs(taq->acceptq_wait_time));
-
-       if (i)
-               return;
-
-       for (i = 1; i < NUM_ACCEPT_QUEUES; i++) {
-               taq[0].acceptq_wait_time += taq[i].acceptq_wait_time;
-               taq[0].acceptq_qcount += taq[i].acceptq_qcount;
-               taq[0].acceptq_count += taq[i].acceptq_count;
-       }
-
-       seq_printf(sfile, "Totals :\n\taccepted: %u\n\t"
-                  "queued: %u\n\twait_time: %u\n",
-                  taq->acceptq_count, taq->acceptq_qcount,
-                  jiffies_to_msecs(taq->acceptq_wait_time));
-
-       return;
-}
-
-void
-laq_get_aq_stats(ckrm_laq_res_t * pres, ckrm_laq_res_t * mres,
-                struct tcp_acceptq_info *taq)
-{
-       struct ckrm_net_struct *ns;
-       struct ckrm_core_class *core = pres->core;
-       struct tcp_opt *tp;
-       int a = mres->my_id;
-       int z;
-
-       if (a == 0)
-               z = NUM_ACCEPT_QUEUES;
-       else
-               z = a + 1;
-
-       // XXX Instead of holding a  class_lock introduce a rw
-       // lock to be write locked by listen callbacks and read locked here.
-       // - VK
-       class_lock(pres->core);
-       list_for_each_entry(ns, &core->objlist, ckrm_link) {
-               tp = tcp_sk(ns->ns_sk);
-               for (; a < z; a++) {
-                       taq->acceptq_wait_time += tp->acceptq[a].aq_wait_time;
-                       taq->acceptq_qcount += tp->acceptq[a].aq_qcount;
-                       taq->acceptq_count += tp->acceptq[a].aq_count;
-                       taq++;
-               }
-       }
-       class_unlock(pres->core);
-}
-
-static int laq_get_stats(void *my_res, struct seq_file *sfile)
-{
-       ckrm_laq_res_t *res = my_res;
-       ckrm_laq_res_t *parent;
-       struct tcp_acceptq_info taq[NUM_ACCEPT_QUEUES];
-       int rc = 0;
-
-       if (!res)
-               return -EINVAL;
-
-       if (!res->pcore) {
-               // something is badly wrong
-               printk(KERN_ERR "socketaq internal inconsistency\n");
-               return -EBADF;
-       }
-
-       parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t);
-       if (!parent) {          // socketclass does not have a stat interface
-               printk(KERN_ERR "socketaq internal fs inconsistency\n");
-               return -EINVAL;
-       }
-
-       memset(taq, 0, sizeof(struct tcp_acceptq_info) * NUM_ACCEPT_QUEUES);
-
-       switch (res->my_depth) {
-
-       default:
-       case 0:
-               printk(KERN_ERR "socket class bad entry\n");
-               rc = -EBADF;
-               break;
-
-       case 1:         // can't be read from. this is internal default.
-               // return -EINVAL
-               rc = -EINVAL;
-               break;
-
-       case 2:         // return the default and total
-               ckrm_lock_hier(res->core);      // block any deletes
-               laq_get_aq_stats(res, res, &taq[0]);
-               laq_print_aq_stats(sfile, &taq[0], 0);
-               ckrm_unlock_hier(res->core);    // block any deletes
-               break;
-
-       case 3:
-               ckrm_lock_hier(parent->core);   // block any deletes
-               laq_get_aq_stats(parent, res, &taq[res->my_id]);
-               laq_print_aq_stats(sfile, &taq[res->my_id], res->my_id);
-               ckrm_unlock_hier(parent->core); // block any deletes
-               break;
-       }
-
-       return rc;
-}
-
-/*
- * The network connection is reclassified to this class. Update its shares.
- * The socket lock is held. 
- */
-static void laq_change_resclass(void *n, void *old, void *r)
-{
-       struct ckrm_net_struct *ns = (struct ckrm_net_struct *)n;
-       struct ckrm_laq_res *res = (struct ckrm_laq_res *)r;
-       unsigned int aq_ratio[NUM_ACCEPT_QUEUES];
-
-       if (res->my_depth != 2)
-               return;
-
-       // a change to my_depth == 3 ie. the accept classes cannot happen.
-       // there is no target file
-       if (res->my_depth == 2) {       // it is one of the socket classes
-               ckrm_lock_hier(res->pcore);
-               // share rule: hold parent resource lock. then self.
-               // However, since my_depth == 1 is a generic class it is not
-               // needed here. Self lock is enough.
-               spin_lock(&res->reslock);
-               calculate_aq_ratios(res, aq_ratio);
-               class_lock(res->pcore);
-               laq_set_aq_value(ns, aq_ratio);
-               class_unlock(res->pcore);
-               spin_unlock(&res->reslock);
-               ckrm_unlock_hier(res->pcore);
-       }
-
-       return;
-}
-
-struct ckrm_res_ctlr laq_rcbs = {
-       .res_name = "laq",
-       .resid = -1,            // dynamically assigned
-       .res_alloc = laq_res_alloc,
-       .res_free = laq_res_free,
-       .set_share_values = laq_set_share_values,
-       .get_share_values = laq_get_share_values,
-       .get_stats = laq_get_stats,
-       .change_resclass = laq_change_resclass,
-       //.res_initcls       = laq_res_initcls,  //HUBERTUS: unnecessary !!
-};
-
-int __init init_ckrm_laq_res(void)
-{
-       struct ckrm_classtype *clstype;
-       int resid;
-
-       clstype = ckrm_find_classtype_by_name("socketclass");
-       if (clstype == NULL) {
-               printk(KERN_INFO " Unknown ckrm classtype<socketclass>");
-               return -ENOENT;
-       }
-
-       if (my_resid == -1) {
-               resid = ckrm_register_res_ctlr(clstype, &laq_rcbs);
-               if (resid >= 0)
-                       my_resid = resid;
-               printk(KERN_DEBUG "........init_ckrm_listen_aq_res -> %d\n", my_resid);
-       }
-       return 0;
-
-}
-
-void __exit exit_ckrm_laq_res(void)
-{
-       ckrm_unregister_res_ctlr(&laq_rcbs);
-       my_resid = -1;
-}
-
-module_init(init_ckrm_laq_res)
-    module_exit(exit_ckrm_laq_res)
-
-    MODULE_LICENSE("GPL");
index 235ac06..103e3f9 100644 (file)
@@ -1,4 +1,4 @@
-/* ckrm_socketaq.c - accept queue resource controller
+/* ckrm_listenaq.c - accept queue resource controller
  *
  * Copyright (C) Vivek Kashyap,      IBM Corp. 2004
  * 
 #include <linux/ckrm_net.h>
 
 #define hnode_2_core(ptr) \
-                ((ptr) ? container_of(ptr, struct ckrm_core_class, hnode) : NULL)
+        ((ptr) ? container_of(ptr, struct ckrm_core_class, hnode) : NULL)
 
-
-#define CKRM_SAQ_MAX_DEPTH     3 // 0 => /rcfs
+#define CKRM_SAQ_MAX_DEPTH     3       // 0 => /rcfs
                                  // 1 => socket_aq
                                  // 2 => socket_aq/listen_class
                                  // 3 => socket_aq/listen_class/accept_queues
                                  // 4 => Not allowed
 
 typedef struct ckrm_laq_res {
-       spinlock_t              reslock;
-       atomic_t                refcnt;
-       struct ckrm_shares      shares;
+       spinlock_t reslock;
+       atomic_t refcnt;
+       struct ckrm_shares shares;
        struct ckrm_core_class *core;
        struct ckrm_core_class *pcore;
-       int                     my_depth;
-       int                     my_id;
+       int my_depth;
+       int my_id;
+       unsigned int min_ratio;
 } ckrm_laq_res_t;
 
 static int my_resid = -1;
 
-extern struct ckrm_core_class *rcfs_create_under_netroot(char *, int, int);
-extern struct ckrm_core_class *rcfs_make_core(struct dentry *, 
-                                               struct ckrm_core_class * ) ;
+extern struct ckrm_core_class *rcfs_create_under_netroot(char *, int, int);
+extern struct ckrm_core_class *rcfs_make_core(struct dentry *,
+                                             struct ckrm_core_class *);
 
-void
-laq_res_hold(struct ckrm_laq_res *res)
+void laq_res_hold(struct ckrm_laq_res *res)
 {
-        atomic_inc(&res->refcnt);
+       atomic_inc(&res->refcnt);
        return;
 }
 
-void
-laq_res_put(struct ckrm_laq_res *res)
+void laq_res_put(struct ckrm_laq_res *res)
 {
        if (atomic_dec_and_test(&res->refcnt))
                kfree(res);
@@ -74,43 +72,40 @@ laq_res_put(struct ckrm_laq_res *res)
 
 /* Initialize rescls values
  */
-static void
-laq_res_initcls(void *my_res)
+static void laq_res_initcls(void *my_res)
 {
        ckrm_laq_res_t *res = my_res;
 
-       res->shares.my_guarantee     = CKRM_SHARE_DONTCARE;
-       res->shares.my_limit         = CKRM_SHARE_DONTCARE;
-       res->shares.total_guarantee  = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-       res->shares.max_limit        = CKRM_SHARE_DFLT_MAX_LIMIT;
+       res->shares.my_guarantee = CKRM_SHARE_DONTCARE;
+       res->shares.my_limit = CKRM_SHARE_DONTCARE;
+       res->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
+       res->shares.max_limit = CKRM_SHARE_DFLT_MAX_LIMIT;
        res->shares.unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-       res->shares.cur_max_limit    = 0;
+       res->shares.cur_max_limit = 0;
 }
 
-static int 
-atoi(char *s)
+static int atoi(char *s)
 {
        int k = 0;
-       while(*s) 
+       while (*s)
                k = *s++ - '0' + (k * 10);
        return k;
 }
 
-static char *
-laq_get_name(struct ckrm_core_class *c)
+static char *laq_get_name(struct ckrm_core_class *c)
 {
-        char *p = (char *)c->name;
+       char *p = (char *)c->name;
 
-        while(*p)
-                p++;
-        while( *p != '/' && p != c->name)
-                p--;
+       while (*p)
+               p++;
+       while (*p != '/' && p != c->name)
+               p--;
 
-        return ++p;
+       return ++p;
 }
 
-static void *
-laq_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent)
+static void *laq_res_alloc(struct ckrm_core_class *core,
+                          struct ckrm_core_class *parent)
 {
        ckrm_laq_res_t *res, *pres;
        int pdepth;
@@ -120,7 +115,7 @@ laq_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent)
        else
                pres = NULL;
 
-       if (core == core->classtype->default_class)    
+       if (core == core->classtype->default_class)
                pdepth = 1;
        else {
                if (!parent)
@@ -133,7 +128,7 @@ laq_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent)
                memset(res, 0, sizeof(res));
                spin_lock_init(&res->reslock);
                laq_res_hold(res);
-               res->my_depth  = pdepth;
+               res->my_depth = pdepth;
                if (pdepth == 2)        // listen class
                        res->my_id = 0;
                else if (pdepth == 3)
@@ -143,19 +138,18 @@ laq_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent)
 
                // rescls in place, now initialize contents other than 
                // hierarchy pointers
-               laq_res_initcls(res); // acts as initialising value
+               laq_res_initcls(res);   // acts as initialising value
        }
 
        return res;
 }
 
-static void
-laq_res_free(void *my_res)
+static void laq_res_free(void *my_res)
 {
-       ckrm_laq_res_t *res = (ckrm_laq_res_t *)my_res;
+       ckrm_laq_res_t *res = (ckrm_laq_res_t *) my_res;
        ckrm_laq_res_t *parent;
 
-       if (!res) 
+       if (!res)
                return;
 
        if (res->my_depth != 3) {
@@ -164,7 +158,7 @@ laq_res_free(void *my_res)
        }
 
        parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t);
-       if (!parent)    // Should never happen
+       if (!parent)            // Should never happen
                return;
 
        spin_lock(&parent->reslock);
@@ -175,7 +169,7 @@ laq_res_free(void *my_res)
        child_guarantee_changed(&parent->shares, res->shares.my_guarantee, 0);
 
        spin_unlock(&res->reslock);
-       laq_res_put(res);       
+       laq_res_put(res);
        spin_unlock(&parent->reslock);
        return;
 }
@@ -184,112 +178,126 @@ laq_res_free(void *my_res)
  *                     SHARES                                          ***
  **************************************************************************/
 
-void
-laq_set_aq_values(ckrm_laq_res_t *my_res, ckrm_laq_res_t *parent, int updatep)
+void laq_set_aq_value(struct ckrm_net_struct *ns, unsigned int *aq_ratio)
+{
+       int i;
+       struct tcp_opt *tp;
+
+       tp = tcp_sk(ns->ns_sk);
+       for (i = 0; i < NUM_ACCEPT_QUEUES; i++)
+               tp->acceptq[i].aq_ratio = aq_ratio[i];
+       return;
+}
+void laq_set_aq_values(ckrm_laq_res_t * parent, unsigned int *aq_ratio)
 {
 
        struct ckrm_net_struct *ns;
        struct ckrm_core_class *core = parent->core;
-       struct tcp_opt *tp;
-       
-       if (my_res->my_depth < 2) 
-               return;
-       
-       // XXX Instead of holding a  class_lock introduce a rw
-       // lock to be write locked by listen callbacks and read locked here.
-       // - VK
+
        class_lock(core);
-       list_for_each_entry(ns, &core->objlist,ckrm_link) { 
-               tp = tcp_sk(ns->ns_sk);
-               if (updatep)
-                       tp->acceptq[0].aq_ratio =
-                              parent->shares.total_guarantee/
-                               parent->shares.unused_guarantee;               
-
-               tp->acceptq[my_res->my_id].aq_ratio =
-                      my_res->shares.total_guarantee/
-                       parent->shares.my_guarantee;           
+       list_for_each_entry(ns, &core->objlist, ckrm_link) {
+               laq_set_aq_value(ns, aq_ratio);
        }
        class_unlock(core);
        return;
 }
 
-static int
-laq_set_share_values(void *my_res, struct ckrm_shares *shares)
+static void calculate_aq_ratios(ckrm_laq_res_t * res, unsigned int *aq_ratio)
+{
+       struct ckrm_hnode *chnode;
+       ckrm_laq_res_t *child;
+       unsigned int min;
+       int i;
+       /* zero all slots: only [0] and existing children are set below,
+        * yet every slot is read in the final division loop */
+       memset(aq_ratio, 0, NUM_ACCEPT_QUEUES * sizeof(*aq_ratio));
+       min = aq_ratio[0] = (unsigned int)res->shares.unused_guarantee;
+       list_for_each_entry(chnode, &res->core->hnode.children, siblings) {
+               child = hnode_2_core(chnode)->res_class[my_resid];
+
+               aq_ratio[child->my_id] =
+                   (unsigned int)child->shares.my_guarantee;
+               if (aq_ratio[child->my_id] == CKRM_SHARE_DONTCARE)
+                       aq_ratio[child->my_id] = 0;
+               if (aq_ratio[child->my_id] &&
+                   ((unsigned int)aq_ratio[child->my_id] < min))
+                       min = (unsigned int)child->shares.my_guarantee;
+       }
+
+       if (min == 0) {
+               min = 1;
+               // default takes all if nothing specified
+               aq_ratio[0] = 1;
+       }
+       res->min_ratio = min;
+
+       for (i = 0; i < NUM_ACCEPT_QUEUES; i++)
+               aq_ratio[i] = aq_ratio[i] / min;
+}
+
+static int laq_set_share_values(void *my_res, struct ckrm_shares *shares)
 {
        ckrm_laq_res_t *res = my_res;
-       ckrm_laq_res_t *parent, *child;
-       struct ckrm_hnode *chnode; 
+       ckrm_laq_res_t *parent;
+       unsigned int aq_ratio[NUM_ACCEPT_QUEUES];
        int rc = 0;
 
-       if (!res) 
+       if (!res)
                return -EINVAL;
 
-       if (!res->pcore) { 
+       if (!res->pcore) {
                // something is badly wrong
                printk(KERN_ERR "socketaq internal inconsistency\n");
                return -EBADF;
        }
 
        parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t);
-       if (!parent)    // socket_class does not have a share interface
+       if (!parent)            // socketclass does not have a share interface
                return -EINVAL;
 
        // Ensure that we ignore limit values
-       shares->my_limit = shares->max_limit = CKRM_SHARE_UNCHANGED;
-
-       switch (res->my_depth) {
-
-       case 0: printk(KERN_ERR "socketaq bad entry\n");
-               rc = -EBADF;
-               break;
+       shares->my_limit = CKRM_SHARE_DONTCARE;
+       shares->max_limit = CKRM_SHARE_UNCHANGED;
 
-       case 1: // can't be written to. this is internal default.
-               // return -EINVAL
-               rc = -EINVAL;
-               break;
-
-       case 2: // nothing to inherit
+       if (res->my_depth == 0) {
+               printk(KERN_ERR "socketaq bad entry\n");
+               return -EBADF;
+       } else if (res->my_depth == 1) {
+               // can't be written to. This is an internal default.
+               return -EINVAL;
+       } else if (res->my_depth == 2) {
+               //nothin to inherit
                if (!shares->total_guarantee) {
-                       rc = -EINVAL;
-                       break;
+                       return -EINVAL;
                }
+               parent = res;
+               shares->my_guarantee = CKRM_SHARE_DONTCARE;
+       } else if (res->my_depth == 3) {
+               // accept queue itself. 
+               shares->total_guarantee = CKRM_SHARE_UNCHANGED;
+       }
 
-               ckrm_lock_hier(res->pcore);
-               spin_lock(&res->reslock);
-               rc = set_shares(shares, &res->shares, NULL);
-               if (!rc) {
-                       list_for_each_entry(chnode,
-                                       &res->core->hnode.children,siblings){
-                               child=hnode_2_core(chnode)->res_class[my_resid];
-                               laq_set_aq_values(child,res,(child->my_id==1));
-                       }
-               }
+       ckrm_lock_hier(parent->pcore);
+       spin_lock(&parent->reslock);
+       rc = set_shares(shares, &res->shares,
+                       (parent == res) ? NULL : &parent->shares);
+       if (rc) {
                spin_unlock(&res->reslock);
                ckrm_unlock_hier(res->pcore);
-               break;
-
-       case 3: // accept queue itself. Check against parent.
-               ckrm_lock_hier(parent->pcore);
-               spin_lock(&parent->reslock);
-               rc = set_shares(shares, &res->shares, &parent->shares);
-               if (!rc) {
-                       laq_set_aq_values(res,parent,1);
-               }
-               spin_unlock(&parent->reslock);
-               ckrm_unlock_hier(parent->pcore);
-               break;
+               return rc;
        }
+       calculate_aq_ratios(parent, aq_ratio);
+       laq_set_aq_values(parent, aq_ratio);
+       spin_unlock(&parent->reslock);
+       ckrm_unlock_hier(parent->pcore);
 
        return rc;
 }
 
-static int
-laq_get_share_values(void *my_res, struct ckrm_shares *shares)
+static int laq_get_share_values(void *my_res, struct ckrm_shares *shares)
 {
        ckrm_laq_res_t *res = my_res;
 
-       if (!res) 
+       if (!res)
                return -EINVAL;
        *shares = res->shares;
        return 0;
@@ -303,9 +311,9 @@ void
 laq_print_aq_stats(struct seq_file *sfile, struct tcp_acceptq_info *taq, int i)
 {
        seq_printf(sfile, "Class %d connections:\n\taccepted: %u\n\t"
-                         "queued: %u\n\twait_time: %lu\n\t",
-                         i, taq->acceptq_count, taq->acceptq_qcount,
-                         taq->acceptq_wait_time);
+                  "queued: %u\n\twait_time: %u\n",
+                  i, taq->acceptq_count, taq->acceptq_qcount,
+                  jiffies_to_msecs(taq->acceptq_wait_time));
 
        if (i)
                return;
@@ -317,16 +325,16 @@ laq_print_aq_stats(struct seq_file *sfile, struct tcp_acceptq_info *taq, int i)
        }
 
        seq_printf(sfile, "Totals :\n\taccepted: %u\n\t"
-                         "queued: %u\n\twait_time: %lu\n",
-                          taq->acceptq_count, taq->acceptq_qcount,
-                         taq->acceptq_wait_time);
+                  "queued: %u\n\twait_time: %u\n",
+                  taq->acceptq_count, taq->acceptq_qcount,
+                  jiffies_to_msecs(taq->acceptq_wait_time));
 
        return;
 }
 
 void
-laq_get_aq_stats(ckrm_laq_res_t *pres, ckrm_laq_res_t *mres, 
-                                       struct tcp_acceptq_info *taq)
+laq_get_aq_stats(ckrm_laq_res_t * pres, ckrm_laq_res_t * mres,
+                struct tcp_acceptq_info *taq)
 {
        struct ckrm_net_struct *ns;
        struct ckrm_core_class *core = pres->core;
@@ -337,15 +345,15 @@ laq_get_aq_stats(ckrm_laq_res_t *pres, ckrm_laq_res_t *mres,
        if (a == 0)
                z = NUM_ACCEPT_QUEUES;
        else
-               z = a+1;
+               z = a + 1;
 
        // XXX Instead of holding a  class_lock introduce a rw
        // lock to be write locked by listen callbacks and read locked here.
        // - VK
        class_lock(pres->core);
-       list_for_each_entry(ns, &core->objlist,ckrm_link) { 
+       list_for_each_entry(ns, &core->objlist, ckrm_link) {
                tp = tcp_sk(ns->ns_sk);
-               for (; a< z; a++) {
+               for (; a < z; a++) {
                        taq->acceptq_wait_time += tp->acceptq[a].aq_wait_time;
                        taq->acceptq_qcount += tp->acceptq[a].aq_qcount;
                        taq->acceptq_count += tp->acceptq[a].aq_count;
@@ -355,26 +363,24 @@ laq_get_aq_stats(ckrm_laq_res_t *pres, ckrm_laq_res_t *mres,
        class_unlock(pres->core);
 }
 
-
-static int  
-laq_get_stats(void *my_res, struct seq_file *sfile)
+static int laq_get_stats(void *my_res, struct seq_file *sfile)
 {
        ckrm_laq_res_t *res = my_res;
        ckrm_laq_res_t *parent;
        struct tcp_acceptq_info taq[NUM_ACCEPT_QUEUES];
        int rc = 0;
 
-       if (!res) 
+       if (!res)
                return -EINVAL;
-       
-       if (!res->pcore) { 
+
+       if (!res->pcore) {
                // something is badly wrong
                printk(KERN_ERR "socketaq internal inconsistency\n");
                return -EBADF;
        }
 
        parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t);
-       if (!parent) {  // socket_class does not have a stat interface
+       if (!parent) {          // socketclass does not have a stat interface
                printk(KERN_ERR "socketaq internal fs inconsistency\n");
                return -EINVAL;
        }
@@ -384,23 +390,24 @@ laq_get_stats(void *my_res, struct seq_file *sfile)
        switch (res->my_depth) {
 
        default:
-       case 0: printk(KERN_ERR "socket class bad entry\n");
+       case 0:
+               printk(KERN_ERR "socket class bad entry\n");
                rc = -EBADF;
                break;
 
-       case 1: // can't be read from. this is internal default.
+       case 1:         // can't be read from. this is internal default.
                // return -EINVAL
                rc = -EINVAL;
                break;
 
-       case 2: // return the default and total
+       case 2:         // return the default and total
                ckrm_lock_hier(res->core);      // block any deletes
                laq_get_aq_stats(res, res, &taq[0]);
                laq_print_aq_stats(sfile, &taq[0], 0);
                ckrm_unlock_hier(res->core);    // block any deletes
                break;
 
-       case 3: 
+       case 3:
                ckrm_lock_hier(parent->core);   // block any deletes
                laq_get_aq_stats(parent, res, &taq[res->my_id]);
                laq_print_aq_stats(sfile, &taq[res->my_id], res->my_id);
@@ -415,89 +422,74 @@ laq_get_stats(void *my_res, struct seq_file *sfile)
  * The network connection is reclassified to this class. Update its shares.
  * The socket lock is held. 
  */
-static void
-laq_change_resclass(void *n, void *old, void *r)
+static void laq_change_resclass(void *n, void *old, void *r)
 {
        struct ckrm_net_struct *ns = (struct ckrm_net_struct *)n;
        struct ckrm_laq_res *res = (struct ckrm_laq_res *)r;
-       struct ckrm_hnode  *chnode = NULL;
+       unsigned int aq_ratio[NUM_ACCEPT_QUEUES];
 
-
-       if (res->my_depth != 2) 
-               return; 
+       if (res->my_depth != 2)
+               return;
 
        // a change to my_depth == 3 ie. the accept classes cannot happen.
        // there is no target file
-       if (res->my_depth == 2) { // it is one of the socket classes
-               struct ckrm_laq_res *reschild;
-               struct sock *sk = ns->ns_sk; 
-               struct tcp_opt *tp = tcp_sk(sk);
-
+       if (res->my_depth == 2) {       // it is one of the socket classes
+               ckrm_lock_hier(res->pcore);
                // share rule: hold parent resource lock. then self.
                // However, since my_depth == 1 is a generic class it is not
                // needed here. Self lock is enough.
                spin_lock(&res->reslock);
-               tp->acceptq[0].aq_ratio = res->shares.total_guarantee/
-                               res->shares.unused_guarantee;
-               list_for_each_entry(chnode,&res->core->hnode.children,siblings){
-                       reschild = hnode_2_core(chnode)->res_class[my_resid];
-
-                       spin_lock(&reschild->reslock);
-                       tp->acceptq[reschild->my_id].aq_ratio=
-                               reschild->shares.total_guarantee/
-                                       res->shares.my_guarantee;
-                       spin_unlock(&reschild->reslock);
-               }
+               calculate_aq_ratios(res, aq_ratio);
+               class_lock(res->pcore);
+               laq_set_aq_value(ns, aq_ratio);
+               class_unlock(res->pcore);
                spin_unlock(&res->reslock);
+               ckrm_unlock_hier(res->pcore);
        }
-       
+
        return;
 }
 
 struct ckrm_res_ctlr laq_rcbs = {
-       .res_name          = "laq",
-       .resid             = -1 , // dynamically assigned
-       .res_alloc         = laq_res_alloc,
-       .res_free          = laq_res_free,
-       .set_share_values  = laq_set_share_values,
-       .get_share_values  = laq_get_share_values,
-       .get_stats         = laq_get_stats,
-       .change_resclass   = laq_change_resclass,
-       //      .res_initcls       = laq_res_initcls,         // LAQ_HUBERTUS: no need for this !!
+       .res_name = "listenaq",
+       .resid = -1,            // dynamically assigned
+       .res_alloc = laq_res_alloc,
+       .res_free = laq_res_free,
+       .set_share_values = laq_set_share_values,
+       .get_share_values = laq_get_share_values,
+       .get_stats = laq_get_stats,
+       .change_resclass = laq_change_resclass,
+       //.res_initcls       = laq_res_initcls,  //HUBERTUS: unnecessary !!
 };
 
-int __init
-init_ckrm_laq_res(void)
+int __init init_ckrm_laq_res(void)
 {
        struct ckrm_classtype *clstype;
        int resid;
 
-       clstype = ckrm_find_classtype_by_name("socket_class");
+       clstype = ckrm_find_classtype_by_name("socketclass");
        if (clstype == NULL) {
-               printk(KERN_INFO " Unknown ckrm classtype<socket_class>");
+               printk(KERN_INFO " Unknown ckrm classtype<socketclass>");
                return -ENOENT;
        }
 
        if (my_resid == -1) {
-               resid = ckrm_register_res_ctlr(clstype,&laq_rcbs);
+               resid = ckrm_register_res_ctlr(clstype, &laq_rcbs);
                if (resid >= 0)
                        my_resid = resid;
-               printk("........init_ckrm_listen_aq_res -> %d\n",my_resid);
+               printk("........init_ckrm_listen_aq_res -> %d\n", my_resid);
        }
        return 0;
 
-}      
+}
 
-void __exit
-exit_ckrm_laq_res(void)
+void __exit exit_ckrm_laq_res(void)
 {
        ckrm_unregister_res_ctlr(&laq_rcbs);
        my_resid = -1;
 }
 
-
 module_init(init_ckrm_laq_res)
-module_exit(exit_ckrm_laq_res)
-
-MODULE_LICENSE("GPL");
+    module_exit(exit_ckrm_laq_res)
 
+    MODULE_LICENSE("GPL");