From 2c55dee25ca3b7ace814e8a8dbb5b4ac1418cd28 Mon Sep 17 00:00:00 2001 From: Marc Fiuczynski Date: Fri, 24 Sep 2004 10:23:31 +0000 Subject: [PATCH] ckrm_E16rc1 mem controller --- Makefile | 2 +- arch/i386/kernel/entry.S | 2 - arch/ppc/kernel/misc.S | 2 - drivers/block/Makefile | 3 +- drivers/block/cfq-iosched.c | 992 ++++++++------------------------ drivers/block/elevator.c | 10 +- drivers/block/ll_rw_blk.c | 45 +- fs/exec.c | 13 + include/asm-i386/unistd.h | 4 +- include/asm-ppc/unistd.h | 4 +- include/asm-x86_64/unistd.h | 6 +- include/linux/ckrm_mem.h | 23 +- include/linux/ckrm_mem_inline.h | 95 ++- include/linux/elevator.h | 5 - include/linux/fs.h | 12 - include/linux/init_task.h | 1 - include/linux/mm.h | 3 + include/linux/mm_inline.h | 7 + include/linux/page-flags.h | 1 + include/linux/sched.h | 11 +- init/Kconfig | 29 +- kernel/ckrm/Makefile | 1 + kernel/ckrm/ckrm_mem.c | 193 +++++-- kernel/ckrm/ckrmutils.c | 19 - kernel/exit.c | 7 + kernel/fork.c | 21 +- mm/page_alloc.c | 7 + mm/vmscan.c | 162 +++++- 28 files changed, 710 insertions(+), 970 deletions(-) diff --git a/Makefile b/Makefile index f86ab3508..86d8af58a 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 8 -EXTRAVERSION = .1-test +EXTRAVERSION = .1 NAME=Zonked Quokka # *DOCUMENTATION* diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index d66058134..7b6856363 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -886,7 +886,5 @@ ENTRY(sys_call_table) .long sys_mq_notify .long sys_mq_getsetattr .long sys_ni_syscall /* reserved for kexec */ - .long sys_ioprio_set - .long sys_ioprio_get /* 285 */ syscall_table_size=(.-sys_call_table) diff --git a/arch/ppc/kernel/misc.S b/arch/ppc/kernel/misc.S index 32e1e4059..873199e43 100644 --- a/arch/ppc/kernel/misc.S +++ b/arch/ppc/kernel/misc.S @@ -1450,5 +1450,3 @@ _GLOBAL(sys_call_table) .long sys_mq_notify .long sys_mq_getsetattr .long sys_ni_syscall /* 268 reserved for sys_kexec_load */ - .long sys_ioprio_set - .long sys_ioprio_get diff --git a/drivers/block/Makefile b/drivers/block/Makefile index c66498bad..2654b5b76 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -13,13 +13,12 @@ # kblockd threads # -obj-y := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o ckrm-iostub.o +obj-y := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_AS) += as-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o -obj-$(CONFIG_CKRM_RES_BLKIO) += ckrm-io.o obj-$(CONFIG_MAC_FLOPPY) += swim3.o obj-$(CONFIG_BLK_DEV_FD) += floppy.o obj-$(CONFIG_BLK_DEV_FD98) += floppy98.o diff --git a/drivers/block/cfq-iosched.c b/drivers/block/cfq-iosched.c index d37911a64..068f4eae0 100644 --- a/drivers/block/cfq-iosched.c +++ b/drivers/block/cfq-iosched.c @@ -6,18 +6,6 @@ * Based on ideas from a previously unfinished io * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli. * - * IO priorities are supported, from 0% to 100% in 5% increments. Both of - * those values have special meaning - 0% class is allowed to do io if - * noone else wants to use the disk. 100% is considered real-time io, and - * always get priority. Default process io rate is 95%. In absence of other - * io, a class may consume 100% disk bandwidth regardless. Withing a class, - * bandwidth is distributed equally among the citizens. 
- * - * TODO: - * - cfq_select_requests() needs some work for 5-95% io - * - barriers not supported - * - export grace periods in ms, not jiffies - * * Copyright (C) 2003 Jens Axboe */ #include @@ -34,176 +22,99 @@ #include #include -#if IOPRIO_NR > BITS_PER_LONG -#error Cannot support this many io priority levels -#endif - /* * tunables */ -static int cfq_quantum = 6; -static int cfq_quantum_io = 256; -static int cfq_idle_quantum = 1; -static int cfq_idle_quantum_io = 64; -static int cfq_queued = 4; -static int cfq_grace_rt = HZ / 100 ?: 1; -static int cfq_grace_idle = HZ / 10; +static int cfq_quantum = 4; +static int cfq_queued = 8; #define CFQ_QHASH_SHIFT 6 #define CFQ_QHASH_ENTRIES (1 << CFQ_QHASH_SHIFT) -#define list_entry_qhash(entry) hlist_entry((entry), struct cfq_queue, cfq_hash) +#define list_entry_qhash(entry) list_entry((entry), struct cfq_queue, cfq_hash) #define CFQ_MHASH_SHIFT 8 #define CFQ_MHASH_BLOCK(sec) ((sec) >> 3) #define CFQ_MHASH_ENTRIES (1 << CFQ_MHASH_SHIFT) #define CFQ_MHASH_FN(sec) (hash_long(CFQ_MHASH_BLOCK((sec)),CFQ_MHASH_SHIFT)) +#define ON_MHASH(crq) !list_empty(&(crq)->hash) #define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors) -#define list_entry_hash(ptr) hlist_entry((ptr), struct cfq_rq, hash) +#define list_entry_hash(ptr) list_entry((ptr), struct cfq_rq, hash) #define list_entry_cfqq(ptr) list_entry((ptr), struct cfq_queue, cfq_list) -#define list_entry_prio(ptr) list_entry((ptr), struct cfq_rq, prio_list) - -#define cfq_account_io(crq) \ - ((crq)->ioprio != IOPRIO_IDLE && (crq)->ioprio != IOPRIO_RT) -/* - * defines how we distribute bandwidth (can be tgid, uid, etc) - */ - -/* FIXME: change hash_key to be sizeof(void *) rather than sizeof(int) - * otherwise the cast of cki_tsk_icls will not work reliably on 64-bit arches. - * OR, change cki_tsk_icls to return ints (will need another id space to be - * managed) - */ - -#if defined(CONFIG_CKRM_RES_BLKIO) || defined(CONFIG_CKRM_RES_BLKIO_MODULE) -extern inline void *cki_hash_key(struct task_struct *tsk); -extern inline int cki_ioprio(struct task_struct *tsk); -#define cfq_hash_key(current) ((int)cki_hash_key((current))) -#define cfq_ioprio(current) (cki_ioprio((current))) - -#else -#define cfq_hash_key(current) ((current)->tgid) -/* - * move to io_context - */ -#define cfq_ioprio(current) ((current)->ioprio) -#endif - -#define CFQ_WAIT_RT 0 -#define CFQ_WAIT_NORM 1 +#define RQ_DATA(rq) ((struct cfq_rq *) (rq)->elevator_private) static kmem_cache_t *crq_pool; static kmem_cache_t *cfq_pool; static mempool_t *cfq_mpool; -/* - * defines an io priority level - */ -struct io_prio_data { - struct list_head rr_list; - int busy_queues; - int busy_rq; - unsigned long busy_sectors; - - /* Statistics on requests, sectors and queues - * added to (in) and dispatched from (out) - * this priority level. Reinsertion of previously - * dispatched crq's into cfq's results in double counting - * which is ignored for now as in-out should - * still be accurate. 
- */ - atomic_t cum_rq_in,cum_rq_out; - atomic_t cum_sectors_in,cum_sectors_out; - atomic_t cum_queues_in,cum_queues_out; - - struct list_head prio_list; - int last_rq; - int last_sectors; -}; - -/* - * per-request queue structure - */ struct cfq_data { + struct list_head rr_list; struct list_head *dispatch; - struct hlist_head *cfq_hash; - struct hlist_head *crq_hash; - mempool_t *crq_pool; + struct list_head *cfq_hash; - struct io_prio_data cid[IOPRIO_NR]; + struct list_head *crq_hash; - /* - * total number of busy queues and requests - */ - int busy_rq; - int busy_queues; - unsigned long busy_sectors; + unsigned int busy_queues; + unsigned int max_queued; - unsigned long rq_starved_mask; + mempool_t *crq_pool; - /* - * grace period handling - */ - struct timer_list timer; - unsigned long wait_end; - unsigned long flags; - struct work_struct work; + request_queue_t *queue; /* * tunables */ unsigned int cfq_quantum; - unsigned int cfq_quantum_io; - unsigned int cfq_idle_quantum; - unsigned int cfq_idle_quantum_io; unsigned int cfq_queued; - unsigned int cfq_grace_rt; - unsigned int cfq_grace_idle; }; -/* - * per-class structure - */ struct cfq_queue { + struct list_head cfq_hash; struct list_head cfq_list; - struct hlist_node cfq_hash; - int hash_key; struct rb_root sort_list; + int pid; int queued[2]; - int ioprio; +#if 0 + /* + * with a simple addition like this, we can do io priorities. almost. + * does need a split request free list, too. + */ + int io_prio +#endif }; -/* - * per-request structure - */ struct cfq_rq { - struct cfq_queue *cfq_queue; struct rb_node rb_node; - struct hlist_node hash; sector_t rb_key; struct request *request; - struct list_head prio_list; - unsigned long nr_sectors; - int ioprio; + struct cfq_queue *cfq_queue; + + struct list_head hash; }; static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq); static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid); -static void cfq_dispatch_sort(struct list_head *head, struct cfq_rq *crq); +static void cfq_dispatch_sort(struct cfq_data *cfqd, struct cfq_queue *cfqq, + struct cfq_rq *crq); /* * lots of deadline iosched dupes, can be abstracted later... 
*/ +static inline void __cfq_del_crq_hash(struct cfq_rq *crq) +{ + list_del_init(&crq->hash); +} + static inline void cfq_del_crq_hash(struct cfq_rq *crq) { - hlist_del_init(&crq->hash); + if (ON_MHASH(crq)) + __cfq_del_crq_hash(crq); } -static inline void -cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq) +static void cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq) { cfq_del_crq_hash(crq); @@ -214,26 +125,27 @@ cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq) static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq) { struct request *rq = crq->request; - const int hash_idx = CFQ_MHASH_FN(rq_hash_key(rq)); - BUG_ON(!hlist_unhashed(&crq->hash)); + BUG_ON(ON_MHASH(crq)); - hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]); + list_add(&crq->hash, &cfqd->crq_hash[CFQ_MHASH_FN(rq_hash_key(rq))]); } static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset) { - struct hlist_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)]; - struct hlist_node *entry, *next; + struct list_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)]; + struct list_head *entry, *next = hash_list->next; - hlist_for_each_safe(entry, next, hash_list) { + while ((entry = next) != hash_list) { struct cfq_rq *crq = list_entry_hash(entry); struct request *__rq = crq->request; - BUG_ON(hlist_unhashed(&crq->hash)); + next = entry->next; + + BUG_ON(!ON_MHASH(crq)); if (!rq_mergeable(__rq)) { - cfq_del_crq_hash(crq); + __cfq_del_crq_hash(crq); continue; } @@ -247,27 +159,20 @@ static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset) /* * rb tree support functions */ -#define RB_EMPTY(node) ((node)->rb_node == NULL) +#define RB_NONE (2) +#define RB_EMPTY(node) ((node)->rb_node == NULL) +#define RB_CLEAR(node) ((node)->rb_color = RB_NONE) +#define RB_CLEAR_ROOT(root) ((root)->rb_node = NULL) +#define ON_RB(node) ((node)->rb_color != RB_NONE) #define rb_entry_crq(node) rb_entry((node), struct cfq_rq, rb_node) #define rq_rb_key(rq) (rq)->sector -static void -cfq_del_crq_rb(struct cfq_data *cfqd, struct cfq_queue *cfqq,struct cfq_rq *crq) +static inline void cfq_del_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) { - if (crq->cfq_queue) { - crq->cfq_queue = NULL; - - if (cfq_account_io(crq)) { - cfqd->busy_rq--; - cfqd->busy_sectors -= crq->nr_sectors; - cfqd->cid[crq->ioprio].busy_rq--; - atomic_inc(&(cfqd->cid[crq->ioprio].cum_rq_out)); - cfqd->cid[crq->ioprio].busy_sectors -= crq->nr_sectors; - atomic_add(crq->nr_sectors,&(cfqd->cid[crq->ioprio].cum_sectors_out)); - } - + if (ON_RB(&crq->rb_node)) { cfqq->queued[rq_data_dir(crq->request)]--; rb_erase(&crq->rb_node, &cfqq->sort_list); + crq->cfq_queue = NULL; } } @@ -291,7 +196,7 @@ __cfq_add_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq) } rb_link_node(&crq->rb_node, parent, p); - return 0; + return NULL; } static void @@ -300,33 +205,24 @@ cfq_add_crq_rb(struct cfq_data *cfqd, struct cfq_queue *cfqq,struct cfq_rq *crq) struct request *rq = crq->request; struct cfq_rq *__alias; + crq->rb_key = rq_rb_key(rq); cfqq->queued[rq_data_dir(rq)]++; - if (cfq_account_io(crq)) { - cfqd->busy_rq++; - cfqd->busy_sectors += crq->nr_sectors; - cfqd->cid[crq->ioprio].busy_rq++; - atomic_inc(&(cfqd->cid[crq->ioprio].cum_rq_in)); - cfqd->cid[crq->ioprio].busy_sectors += crq->nr_sectors; - atomic_add(crq->nr_sectors,&(cfqd->cid[crq->ioprio].cum_sectors_in)); - } retry: __alias = __cfq_add_crq_rb(cfqq, crq); if (!__alias) { rb_insert_color(&crq->rb_node, &cfqq->sort_list); - 
crq->rb_key = rq_rb_key(rq); crq->cfq_queue = cfqq; return; } - cfq_del_crq_rb(cfqd, cfqq, __alias); - cfq_dispatch_sort(cfqd->dispatch, __alias); + cfq_dispatch_sort(cfqd, cfqq, __alias); goto retry; } static struct request * cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector) { - struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(current)); + struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->tgid); struct rb_node *n; if (!cfqq) @@ -351,30 +247,16 @@ out: static void cfq_remove_request(request_queue_t *q, struct request *rq) { struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_rq *crq = RQ_ELV_DATA(rq); + struct cfq_rq *crq = RQ_DATA(rq); if (crq) { + struct cfq_queue *cfqq = crq->cfq_queue; + cfq_remove_merge_hints(q, crq); - list_del_init(&crq->prio_list); list_del_init(&rq->queuelist); - /* - * set a grace period timer to allow realtime io to make real - * progress, if we release an rt request. for normal request, - * set timer so idle io doesn't interfere with other io - */ - if (crq->ioprio == IOPRIO_RT) { - set_bit(CFQ_WAIT_RT, &cfqd->flags); - cfqd->wait_end = jiffies + cfqd->cfq_grace_rt; - } else if (crq->ioprio != IOPRIO_IDLE) { - set_bit(CFQ_WAIT_NORM, &cfqd->flags); - cfqd->wait_end = jiffies + cfqd->cfq_grace_idle; - } - - if (crq->cfq_queue) { - struct cfq_queue *cfqq = crq->cfq_queue; - - cfq_del_crq_rb(cfqd, cfqq, crq); + if (cfqq) { + cfq_del_crq_rb(cfqq, crq); if (RB_EMPTY(&cfqq->sort_list)) cfq_put_queue(cfqd, cfqq); @@ -424,22 +306,18 @@ out_insert: static void cfq_merged_request(request_queue_t *q, struct request *req) { struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_rq *crq = RQ_ELV_DATA(req); + struct cfq_rq *crq = RQ_DATA(req); cfq_del_crq_hash(crq); cfq_add_crq_hash(cfqd, crq); - if (crq->cfq_queue && (rq_rb_key(req) != crq->rb_key)) { + if (ON_RB(&crq->rb_node) && (rq_rb_key(req) != crq->rb_key)) { struct cfq_queue *cfqq = crq->cfq_queue; - cfq_del_crq_rb(cfqd, cfqq, crq); + cfq_del_crq_rb(cfqq, crq); cfq_add_crq_rb(cfqd, cfqq, crq); } - cfqd->busy_sectors += req->hard_nr_sectors - crq->nr_sectors; - cfqd->cid[crq->ioprio].busy_sectors += req->hard_nr_sectors - crq->nr_sectors; - crq->nr_sectors = req->hard_nr_sectors; - q->last_merge = req; } @@ -451,14 +329,16 @@ cfq_merged_requests(request_queue_t *q, struct request *req, cfq_remove_request(q, next); } -/* - * sort into dispatch list, in optimal ascending order - */ -static void cfq_dispatch_sort(struct list_head *head, struct cfq_rq *crq) +static void +cfq_dispatch_sort(struct cfq_data *cfqd, struct cfq_queue *cfqq, + struct cfq_rq *crq) { - struct list_head *entry = head; + struct list_head *head = cfqd->dispatch, *entry = head; struct request *__rq; + cfq_del_crq_rb(cfqq, crq); + cfq_remove_merge_hints(cfqd->queue, crq); + if (!list_empty(head)) { __rq = list_entry_rq(head->next); @@ -479,164 +359,47 @@ link: list_add_tail(&crq->request->queuelist, entry); } -/* - * remove from io scheduler core and put on dispatch list for service - */ -static inline int +static inline void __cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd, struct cfq_queue *cfqq) { - struct cfq_rq *crq; - - crq = rb_entry_crq(rb_first(&cfqq->sort_list)); + struct cfq_rq *crq = rb_entry_crq(rb_first(&cfqq->sort_list)); - cfq_del_crq_rb(cfqd, cfqq, crq); - cfq_remove_merge_hints(q, crq); - cfq_dispatch_sort(cfqd->dispatch, crq); - - /* - * technically, for IOPRIO_RT we don't need to add it to the list. 
- */ - list_add_tail(&crq->prio_list, &cfqd->cid[cfqq->ioprio].prio_list); - return crq->nr_sectors; + cfq_dispatch_sort(cfqd, cfqq, crq); } -static int -cfq_dispatch_requests(request_queue_t *q, int prio, int max_rq, int max_sectors) +static int cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd) { - struct cfq_data *cfqd = q->elevator.elevator_data; - struct list_head *plist = &cfqd->cid[prio].rr_list; - struct list_head *entry, *nxt; - int q_rq, q_io; + struct cfq_queue *cfqq; + struct list_head *entry, *tmp; + int ret, queued, good_queues; - /* - * for each queue at this prio level, dispatch a request - */ - q_rq = q_io = 0; - list_for_each_safe(entry, nxt, plist) { - struct cfq_queue *cfqq = list_entry_cfqq(entry); + if (list_empty(&cfqd->rr_list)) + return 0; + + queued = ret = 0; +restart: + good_queues = 0; + list_for_each_safe(entry, tmp, &cfqd->rr_list) { + cfqq = list_entry_cfqq(cfqd->rr_list.next); BUG_ON(RB_EMPTY(&cfqq->sort_list)); - q_io += __cfq_dispatch_requests(q, cfqd, cfqq); - q_rq++; + __cfq_dispatch_requests(q, cfqd, cfqq); if (RB_EMPTY(&cfqq->sort_list)) cfq_put_queue(cfqd, cfqq); + else + good_queues++; - /* - * if we hit the queue limit, put the string of serviced - * queues at the back of the pending list - */ - if (q_io >= max_sectors || q_rq >= max_rq) { - struct list_head *prv = nxt->prev; - - if (prv != plist) { - list_del(plist); - list_add(plist, prv); - } - break; - } - } - - cfqd->cid[prio].last_rq = q_rq; - cfqd->cid[prio].last_sectors = q_io; - return q_rq; -} - -/* - * try to move some requests to the dispatch list. return 0 on success - */ -static int cfq_select_requests(request_queue_t *q, struct cfq_data *cfqd) -{ - int queued, busy_rq, busy_sectors, i; - - /* - * if there's any realtime io, only schedule that - */ - if (cfq_dispatch_requests(q, IOPRIO_RT, cfqd->cfq_quantum, cfqd->cfq_quantum_io)) - return 1; - - /* - * if RT io was last serviced and grace time hasn't expired, - * arm the timer to restart queueing if no other RT io has been - * submitted in the mean time - */ - if (test_bit(CFQ_WAIT_RT, &cfqd->flags)) { - if (time_before(jiffies, cfqd->wait_end)) { - mod_timer(&cfqd->timer, cfqd->wait_end); - return 0; - } - clear_bit(CFQ_WAIT_RT, &cfqd->flags); - } - - /* - * for each priority level, calculate number of requests we - * are allowed to put into service. 
- */ - queued = 0; - busy_rq = cfqd->busy_rq; - busy_sectors = cfqd->busy_sectors; - for (i = IOPRIO_RT - 1; i > IOPRIO_IDLE; i--) { - const int o_rq = busy_rq - cfqd->cid[i].busy_rq; - const int o_sectors = busy_sectors - cfqd->cid[i].busy_sectors; - int q_rq = cfqd->cfq_quantum * (i + 1) / IOPRIO_NR; - int q_io = cfqd->cfq_quantum_io * (i + 1) / IOPRIO_NR; - - /* - * no need to keep iterating the list, if there are no - * requests pending anymore - */ - if (!cfqd->busy_rq) - break; - - /* - * find out how many requests and sectors we are allowed to - * service - */ - if (o_rq) - q_rq = o_sectors * (i + 1) / IOPRIO_NR; - if (q_rq > cfqd->cfq_quantum) - q_rq = cfqd->cfq_quantum; - - if (o_sectors) - q_io = o_sectors * (i + 1) / IOPRIO_NR; - if (q_io > cfqd->cfq_quantum_io) - q_io = cfqd->cfq_quantum_io; - - /* - * average with last dispatched for fairness - */ - if (cfqd->cid[i].last_rq != -1) - q_rq = (cfqd->cid[i].last_rq + q_rq) / 2; - if (cfqd->cid[i].last_sectors != -1) - q_io = (cfqd->cid[i].last_sectors + q_io) / 2; - - queued += cfq_dispatch_requests(q, i, q_rq, q_io); - } - - if (queued) - return 1; - - /* - * only allow dispatch of idle io, if the queue has been idle from - * servicing RT or normal io for the grace period - */ - if (test_bit(CFQ_WAIT_NORM, &cfqd->flags)) { - if (time_before(jiffies, cfqd->wait_end)) { - mod_timer(&cfqd->timer, cfqd->wait_end); - return 0; - } - clear_bit(CFQ_WAIT_NORM, &cfqd->flags); + queued++; + ret = 1; } - /* - * if we found nothing to do, allow idle io to be serviced - */ - if (cfq_dispatch_requests(q, IOPRIO_IDLE, cfqd->cfq_idle_quantum, cfqd->cfq_idle_quantum_io)) - return 1; + if ((queued < cfqd->cfq_quantum) && good_queues) + goto restart; - return 0; + return ret; } static struct request *cfq_next_request(request_queue_t *q) @@ -647,170 +410,121 @@ static struct request *cfq_next_request(request_queue_t *q) if (!list_empty(cfqd->dispatch)) { struct cfq_rq *crq; dispatch: - /* - * end grace period, we are servicing a request - */ - del_timer(&cfqd->timer); - clear_bit(CFQ_WAIT_RT, &cfqd->flags); - clear_bit(CFQ_WAIT_NORM, &cfqd->flags); - - BUG_ON(list_empty(cfqd->dispatch)); rq = list_entry_rq(cfqd->dispatch->next); - BUG_ON(q->last_merge == rq); - crq = RQ_ELV_DATA(rq); - if (crq) { - BUG_ON(!hlist_unhashed(&crq->hash)); - list_del_init(&crq->prio_list); - } + crq = RQ_DATA(rq); + if (crq) + cfq_remove_merge_hints(q, crq); return rq; } - /* - * we moved requests to dispatch list, go back end serve one - */ - if (cfq_select_requests(q, cfqd)) + if (cfq_dispatch_requests(q, cfqd)) goto dispatch; return NULL; } static inline struct cfq_queue * -__cfq_find_cfq_hash(struct cfq_data *cfqd, int hashkey, const int hashval) +__cfq_find_cfq_hash(struct cfq_data *cfqd, int pid, const int hashval) { - struct hlist_head *hash_list = &cfqd->cfq_hash[hashval]; - struct hlist_node *entry; + struct list_head *hash_list = &cfqd->cfq_hash[hashval]; + struct list_head *entry; - hlist_for_each(entry, hash_list) { + list_for_each(entry, hash_list) { struct cfq_queue *__cfqq = list_entry_qhash(entry); - if (__cfqq->hash_key == hashkey) + if (__cfqq->pid == pid) return __cfqq; } return NULL; } -static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int hashkey) +static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid) { - const int hashval = hash_long(hashkey, CFQ_QHASH_SHIFT); + const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT); - return __cfq_find_cfq_hash(cfqd, hashkey, hashval); + return __cfq_find_cfq_hash(cfqd, 
pid, hashval); } static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { cfqd->busy_queues--; - WARN_ON(cfqd->busy_queues < 0); - - cfqd->cid[cfqq->ioprio].busy_queues--; - WARN_ON(cfqd->cid[cfqq->ioprio].busy_queues < 0); - atomic_inc(&(cfqd->cid[cfqq->ioprio].cum_queues_out)); - list_del(&cfqq->cfq_list); - hlist_del(&cfqq->cfq_hash); + list_del(&cfqq->cfq_hash); mempool_free(cfqq, cfq_mpool); } -static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, int hashkey) +static struct cfq_queue *__cfq_get_queue(struct cfq_data *cfqd, int pid, + int gfp_mask) { - const int hashval = hash_long(hashkey, CFQ_QHASH_SHIFT); - struct cfq_queue *cfqq = __cfq_find_cfq_hash(cfqd, hashkey, hashval); + const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT); + struct cfq_queue *cfqq, *new_cfqq = NULL; + request_queue_t *q = cfqd->queue; - if (!cfqq) { - cfqq = mempool_alloc(cfq_mpool, GFP_NOIO); +retry: + cfqq = __cfq_find_cfq_hash(cfqd, pid, hashval); - memset(cfqq, 0, sizeof(*cfqq)); - INIT_HLIST_NODE(&cfqq->cfq_hash); + if (!cfqq) { + if (new_cfqq) { + cfqq = new_cfqq; + new_cfqq = NULL; + } else if (gfp_mask & __GFP_WAIT) { + spin_unlock_irq(q->queue_lock); + new_cfqq = mempool_alloc(cfq_mpool, gfp_mask); + spin_lock_irq(q->queue_lock); + goto retry; + } else + return NULL; + + INIT_LIST_HEAD(&cfqq->cfq_hash); INIT_LIST_HEAD(&cfqq->cfq_list); + RB_CLEAR_ROOT(&cfqq->sort_list); - cfqq->hash_key = cfq_hash_key(current); - cfqq->ioprio = cfq_ioprio(current); - hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); + cfqq->pid = pid; + cfqq->queued[0] = cfqq->queued[1] = 0; + list_add(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); } + if (new_cfqq) + mempool_free(new_cfqq, cfq_mpool); + return cfqq; } -static void -__cfq_enqueue(request_queue_t *q, struct cfq_data *cfqd, struct cfq_rq *crq) +static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, int pid, + int gfp_mask) { - const int prio = crq->ioprio; + request_queue_t *q = cfqd->queue; struct cfq_queue *cfqq; - cfqq = cfq_get_queue(cfqd, cfq_hash_key(current)); + spin_lock_irq(q->queue_lock); + cfqq = __cfq_get_queue(cfqd, pid, gfp_mask); + spin_unlock_irq(q->queue_lock); - /* - * not too good... 
- */ - if (prio > cfqq->ioprio) { - printk("prio hash collision %d %d\n", prio, cfqq->ioprio); - if (!list_empty(&cfqq->cfq_list)) { - cfqd->cid[cfqq->ioprio].busy_queues--; - WARN_ON(cfqd->cid[cfqq->ioprio].busy_queues < 0); - atomic_inc(&(cfqd->cid[cfqq->ioprio].cum_queues_out)); - cfqd->cid[prio].busy_queues++; - atomic_inc(&(cfqd->cid[prio].cum_queues_in)); - list_move_tail(&cfqq->cfq_list, &cfqd->cid[prio].rr_list); - } - cfqq->ioprio = prio; - } - - cfq_add_crq_rb(cfqd, cfqq, crq); - - if (list_empty(&cfqq->cfq_list)) { - list_add_tail(&cfqq->cfq_list, &cfqd->cid[prio].rr_list); - cfqd->cid[prio].busy_queues++; - atomic_inc(&(cfqd->cid[prio].cum_queues_in)); - cfqd->busy_queues++; - } - - if (rq_mergeable(crq->request)) { - cfq_add_crq_hash(cfqd, crq); - - if (!q->last_merge) - q->last_merge = crq->request; - } - -} - -static void cfq_reenqueue(request_queue_t *q, struct cfq_data *cfqd, int prio) -{ - struct list_head *prio_list = &cfqd->cid[prio].prio_list; - struct list_head *entry, *tmp; - - list_for_each_safe(entry, tmp, prio_list) { - struct cfq_rq *crq = list_entry_prio(entry); - - list_del_init(entry); - list_del_init(&crq->request->queuelist); - __cfq_enqueue(q, cfqd, crq); - } + return cfqq; } -static void -cfq_enqueue(request_queue_t *q, struct cfq_data *cfqd, struct cfq_rq *crq) +static void cfq_enqueue(struct cfq_data *cfqd, struct cfq_rq *crq) { - const int prio = cfq_ioprio(current); - - crq->ioprio = prio; - crq->nr_sectors = crq->request->hard_nr_sectors; - __cfq_enqueue(q, cfqd, crq); + struct cfq_queue *cfqq; - if (prio == IOPRIO_RT) { - int i; + cfqq = __cfq_get_queue(cfqd, current->tgid, GFP_ATOMIC); + if (cfqq) { + cfq_add_crq_rb(cfqd, cfqq, crq); + if (list_empty(&cfqq->cfq_list)) { + list_add(&cfqq->cfq_list, &cfqd->rr_list); + cfqd->busy_queues++; + } + } else { /* - * realtime io gets priority, move all other io back - */ - for (i = IOPRIO_IDLE; i < IOPRIO_RT; i++) - cfq_reenqueue(q, cfqd, i); - } else if (prio != IOPRIO_IDLE) { - /* - * check if we need to move idle io back into queue + * should can only happen if the request wasn't allocated + * through blk_alloc_request(), eg stack requests from ide-cd + * (those should be removed) _and_ we are in OOM. 
*/ - cfq_reenqueue(q, cfqd, IOPRIO_IDLE); + list_add_tail(&crq->request->queuelist, cfqd->dispatch); } } @@ -818,14 +532,12 @@ static void cfq_insert_request(request_queue_t *q, struct request *rq, int where) { struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_rq *crq = RQ_ELV_DATA(rq); + struct cfq_rq *crq = RQ_DATA(rq); switch (where) { case ELEVATOR_INSERT_BACK: -#if 0 while (cfq_dispatch_requests(q, cfqd)) ; -#endif list_add_tail(&rq->queuelist, cfqd->dispatch); break; case ELEVATOR_INSERT_FRONT: @@ -833,19 +545,26 @@ cfq_insert_request(request_queue_t *q, struct request *rq, int where) break; case ELEVATOR_INSERT_SORT: BUG_ON(!blk_fs_request(rq)); - cfq_enqueue(q, cfqd, crq); + cfq_enqueue(cfqd, crq); break; default: printk("%s: bad insert point %d\n", __FUNCTION__,where); return; } + + if (rq_mergeable(rq)) { + cfq_add_crq_hash(cfqd, crq); + + if (!q->last_merge) + q->last_merge = rq; + } } static int cfq_queue_empty(request_queue_t *q) { struct cfq_data *cfqd = q->elevator.elevator_data; - if (list_empty(cfqd->dispatch) && !cfqd->busy_queues) + if (list_empty(cfqd->dispatch) && list_empty(&cfqd->rr_list)) return 1; return 0; @@ -854,7 +573,7 @@ static int cfq_queue_empty(request_queue_t *q) static struct request * cfq_former_request(request_queue_t *q, struct request *rq) { - struct cfq_rq *crq = RQ_ELV_DATA(rq); + struct cfq_rq *crq = RQ_DATA(rq); struct rb_node *rbprev = rb_prev(&crq->rb_node); if (rbprev) @@ -866,7 +585,7 @@ cfq_former_request(request_queue_t *q, struct request *rq) static struct request * cfq_latter_request(request_queue_t *q, struct request *rq) { - struct cfq_rq *crq = RQ_ELV_DATA(rq); + struct cfq_rq *crq = RQ_DATA(rq); struct rb_node *rbnext = rb_next(&crq->rb_node); if (rbnext) @@ -875,47 +594,27 @@ cfq_latter_request(request_queue_t *q, struct request *rq) return NULL; } -static void cfq_queue_congested(request_queue_t *q) -{ - struct cfq_data *cfqd = q->elevator.elevator_data; - - set_bit(cfq_ioprio(current), &cfqd->rq_starved_mask); -} - static int cfq_may_queue(request_queue_t *q, int rw) { struct cfq_data *cfqd = q->elevator.elevator_data; struct cfq_queue *cfqq; - const int prio = cfq_ioprio(current); - int limit, ret = 1; + int ret = 1; if (!cfqd->busy_queues) goto out; - cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(current)); - if (!cfqq) - goto out; - - cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(current)); - if (!cfqq) - goto out; - - /* - * if higher or equal prio io is sleeping waiting for a request, don't - * allow this one to allocate one. as long as ll_rw_blk does fifo - * waitqueue wakeups this should work... 
- */ - if (cfqd->rq_starved_mask & ~((1 << prio) - 1)) - goto out; - - if (cfqq->queued[rw] < cfqd->cfq_queued || !cfqd->cid[prio].busy_queues) - goto out; + cfqq = cfq_find_cfq_hash(cfqd, current->tgid); + if (cfqq) { + int limit = (q->nr_requests - cfqd->cfq_queued) / cfqd->busy_queues; - limit = q->nr_requests * (prio + 1) / IOPRIO_NR; - limit /= cfqd->cid[prio].busy_queues; - if (cfqq->queued[rw] > limit) - ret = 0; + if (limit < 3) + limit = 3; + else if (limit > cfqd->max_queued) + limit = cfqd->max_queued; + if (cfqq->queued[rw] > limit) + ret = 0; + } out: return ret; } @@ -923,32 +622,56 @@ out: static void cfq_put_request(request_queue_t *q, struct request *rq) { struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_rq *crq = RQ_ELV_DATA(rq); + struct cfq_rq *crq = RQ_DATA(rq); + struct request_list *rl; + int other_rw; if (crq) { BUG_ON(q->last_merge == rq); - BUG_ON(!hlist_unhashed(&crq->hash)); + BUG_ON(ON_MHASH(crq)); mempool_free(crq, cfqd->crq_pool); rq->elevator_private = NULL; } + + /* + * work-around for may_queue "bug": if a read gets issued and refused + * to queue because writes ate all the allowed slots and no other + * reads are pending for this queue, it could get stuck infinitely + * since freed_request() only checks the waitqueue for writes when + * freeing them. or vice versa for a single write vs many reads. + * so check here whether "the other" data direction might be able + * to queue and wake them + */ + rl = &q->rq; + other_rw = rq_data_dir(rq) ^ 1; + if (rl->count[other_rw] <= q->nr_requests) { + smp_mb(); + if (waitqueue_active(&rl->wait[other_rw])) + wake_up(&rl->wait[other_rw]); + } } static int cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask) { struct cfq_data *cfqd = q->elevator.elevator_data; - struct cfq_rq *crq = mempool_alloc(cfqd->crq_pool, gfp_mask); + struct cfq_queue *cfqq; + struct cfq_rq *crq; - if (crq) { - /* - * process now has one request - */ - clear_bit(cfq_ioprio(current), &cfqd->rq_starved_mask); + /* + * prepare a queue up front, so cfq_enqueue() doesn't have to + */ + cfqq = cfq_get_queue(cfqd, current->tgid, gfp_mask); + if (!cfqq) + return 1; + crq = mempool_alloc(cfqd->crq_pool, gfp_mask); + if (crq) { memset(crq, 0, sizeof(*crq)); + RB_CLEAR(&crq->rb_node); crq->request = rq; - INIT_HLIST_NODE(&crq->hash); - INIT_LIST_HEAD(&crq->prio_list); + crq->cfq_queue = NULL; + INIT_LIST_HEAD(&crq->hash); rq->elevator_private = crq; return 0; } @@ -967,26 +690,6 @@ static void cfq_exit(request_queue_t *q, elevator_t *e) kfree(cfqd); } -static void cfq_timer(unsigned long data) -{ - struct cfq_data *cfqd = (struct cfq_data *) data; - - clear_bit(CFQ_WAIT_RT, &cfqd->flags); - clear_bit(CFQ_WAIT_NORM, &cfqd->flags); - kblockd_schedule_work(&cfqd->work); -} - -static void cfq_work(void *data) -{ - request_queue_t *q = data; - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - if (cfq_next_request(q)) - q->request_fn(q); - spin_unlock_irqrestore(q->queue_lock, flags); -} - static int cfq_init(request_queue_t *q, elevator_t *e) { struct cfq_data *cfqd; @@ -997,34 +700,13 @@ static int cfq_init(request_queue_t *q, elevator_t *e) return -ENOMEM; memset(cfqd, 0, sizeof(*cfqd)); + INIT_LIST_HEAD(&cfqd->rr_list); - init_timer(&cfqd->timer); - cfqd->timer.function = cfq_timer; - cfqd->timer.data = (unsigned long) cfqd; - - INIT_WORK(&cfqd->work, cfq_work, q); - - for (i = 0; i < IOPRIO_NR; i++) { - struct io_prio_data *cid = &cfqd->cid[i]; - - INIT_LIST_HEAD(&cid->rr_list); - 
INIT_LIST_HEAD(&cid->prio_list); - cid->last_rq = -1; - cid->last_sectors = -1; - - atomic_set(&cid->cum_rq_in,0); - atomic_set(&cid->cum_rq_out,0); - atomic_set(&cid->cum_sectors_in,0); - atomic_set(&cid->cum_sectors_out,0); - atomic_set(&cid->cum_queues_in,0); - atomic_set(&cid->cum_queues_out,0); - } - - cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL); + cfqd->crq_hash = kmalloc(sizeof(struct list_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL); if (!cfqd->crq_hash) goto out_crqhash; - cfqd->cfq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL); + cfqd->cfq_hash = kmalloc(sizeof(struct list_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL); if (!cfqd->cfq_hash) goto out_cfqhash; @@ -1033,22 +715,23 @@ static int cfq_init(request_queue_t *q, elevator_t *e) goto out_crqpool; for (i = 0; i < CFQ_MHASH_ENTRIES; i++) - INIT_HLIST_HEAD(&cfqd->crq_hash[i]); + INIT_LIST_HEAD(&cfqd->crq_hash[i]); for (i = 0; i < CFQ_QHASH_ENTRIES; i++) - INIT_HLIST_HEAD(&cfqd->cfq_hash[i]); - - cfqd->cfq_queued = cfq_queued; - cfqd->cfq_quantum = cfq_quantum; - cfqd->cfq_quantum_io = cfq_quantum_io; - cfqd->cfq_idle_quantum = cfq_idle_quantum; - cfqd->cfq_idle_quantum_io = cfq_idle_quantum_io; - cfqd->cfq_grace_rt = cfq_grace_rt; - cfqd->cfq_grace_idle = cfq_grace_idle; - - q->nr_requests <<= 2; + INIT_LIST_HEAD(&cfqd->cfq_hash[i]); cfqd->dispatch = &q->queue_head; e->elevator_data = cfqd; + cfqd->queue = q; + + /* + * just set it to some high value, we want anyone to be able to queue + * some requests. fairness is handled differently + */ + cfqd->max_queued = q->nr_requests; + q->nr_requests = 8192; + + cfqd->cfq_queued = cfq_queued; + cfqd->cfq_quantum = cfq_quantum; return 0; out_crqpool: @@ -1114,12 +797,7 @@ static ssize_t __FUNC(struct cfq_data *cfqd, char *page) \ return cfq_var_show(__VAR, (page)); \ } SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum); -SHOW_FUNCTION(cfq_quantum_io_show, cfqd->cfq_quantum_io); -SHOW_FUNCTION(cfq_idle_quantum_show, cfqd->cfq_idle_quantum); -SHOW_FUNCTION(cfq_idle_quantum_io_show, cfqd->cfq_idle_quantum_io); SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued); -SHOW_FUNCTION(cfq_grace_rt_show, cfqd->cfq_grace_rt); -SHOW_FUNCTION(cfq_grace_idle_show, cfqd->cfq_grace_idle); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ @@ -1133,236 +811,23 @@ static ssize_t __FUNC(struct cfq_data *cfqd, const char *page, size_t count) \ return ret; \ } STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, INT_MAX); -STORE_FUNCTION(cfq_quantum_io_store, &cfqd->cfq_quantum_io, 4, INT_MAX); -STORE_FUNCTION(cfq_idle_quantum_store, &cfqd->cfq_idle_quantum, 1, INT_MAX); -STORE_FUNCTION(cfq_idle_quantum_io_store, &cfqd->cfq_idle_quantum_io, 4, INT_MAX); STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, INT_MAX); -STORE_FUNCTION(cfq_grace_rt_store, &cfqd->cfq_grace_rt, 0, INT_MAX); -STORE_FUNCTION(cfq_grace_idle_store, &cfqd->cfq_grace_idle, 0, INT_MAX); #undef STORE_FUNCTION - -/* Additional entries to get priority level data */ -static ssize_t -cfq_prio_show(struct cfq_data *cfqd, char *page, unsigned int priolvl) -{ - int r1,r2,s1,s2,q1,q2; - - if (!(priolvl >= IOPRIO_IDLE && priolvl <= IOPRIO_RT)) - return 0; - - r1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_rq_in)); - r2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_rq_out)); - s1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_sectors_in)); - s2 = (int)atomic_read(&(cfqd->cid[priolvl].cum_sectors_out)); - q1 = (int)atomic_read(&(cfqd->cid[priolvl].cum_queues_in)); - q2 = 
(int)atomic_read(&(cfqd->cid[priolvl].cum_queues_out)); - - - /* - return sprintf(page,"rq %d (%d,%d) sec %d (%d,%d) q %d (%d,%d)\n", - r1-r2,r1,r2, - s1-s2,s1,s2, - q1-q2,q1,q2); - */ - - return sprintf(page,"rq (%d,%d) sec (%d,%d) q (%d,%d)\n", - r1,r2, - s1,s2, - q1,q2); - -} - -#define SHOW_PRIO_DATA(__PRIOLVL) \ -static ssize_t cfq_prio_##__PRIOLVL##_show(struct cfq_data *cfqd, char *page) \ -{ \ - return cfq_prio_show(cfqd,page,__PRIOLVL); \ -} -SHOW_PRIO_DATA(0); -SHOW_PRIO_DATA(1); -SHOW_PRIO_DATA(2); -SHOW_PRIO_DATA(3); -SHOW_PRIO_DATA(4); -SHOW_PRIO_DATA(5); -SHOW_PRIO_DATA(6); -SHOW_PRIO_DATA(7); -SHOW_PRIO_DATA(8); -SHOW_PRIO_DATA(9); -SHOW_PRIO_DATA(10); -SHOW_PRIO_DATA(11); -SHOW_PRIO_DATA(12); -SHOW_PRIO_DATA(13); -SHOW_PRIO_DATA(14); -SHOW_PRIO_DATA(15); -SHOW_PRIO_DATA(16); -SHOW_PRIO_DATA(17); -SHOW_PRIO_DATA(18); -SHOW_PRIO_DATA(19); -SHOW_PRIO_DATA(20); -#undef SHOW_PRIO_DATA - - -static ssize_t cfq_prio_store(struct cfq_data *cfqd, const char *page, size_t count, int priolvl) -{ - atomic_set(&(cfqd->cid[priolvl].cum_rq_in),0); - atomic_set(&(cfqd->cid[priolvl].cum_rq_out),0); - atomic_set(&(cfqd->cid[priolvl].cum_sectors_in),0); - atomic_set(&(cfqd->cid[priolvl].cum_sectors_out),0); - atomic_set(&(cfqd->cid[priolvl].cum_queues_in),0); - atomic_set(&(cfqd->cid[priolvl].cum_queues_out),0); - - return count; -} - - -#define STORE_PRIO_DATA(__PRIOLVL) \ -static ssize_t cfq_prio_##__PRIOLVL##_store(struct cfq_data *cfqd, const char *page, size_t count) \ -{ \ - return cfq_prio_store(cfqd,page,count,__PRIOLVL); \ -} -STORE_PRIO_DATA(0); -STORE_PRIO_DATA(1); -STORE_PRIO_DATA(2); -STORE_PRIO_DATA(3); -STORE_PRIO_DATA(4); -STORE_PRIO_DATA(5); -STORE_PRIO_DATA(6); -STORE_PRIO_DATA(7); -STORE_PRIO_DATA(8); -STORE_PRIO_DATA(9); -STORE_PRIO_DATA(10); -STORE_PRIO_DATA(11); -STORE_PRIO_DATA(12); -STORE_PRIO_DATA(13); -STORE_PRIO_DATA(14); -STORE_PRIO_DATA(15); -STORE_PRIO_DATA(16); -STORE_PRIO_DATA(17); -STORE_PRIO_DATA(18); -STORE_PRIO_DATA(19); -STORE_PRIO_DATA(20); -#undef STORE_PRIO_DATA - - - static struct cfq_fs_entry cfq_quantum_entry = { .attr = {.name = "quantum", .mode = S_IRUGO | S_IWUSR }, .show = cfq_quantum_show, .store = cfq_quantum_store, }; -static struct cfq_fs_entry cfq_quantum_io_entry = { - .attr = {.name = "quantum_io", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_quantum_io_show, - .store = cfq_quantum_io_store, -}; -static struct cfq_fs_entry cfq_idle_quantum_entry = { - .attr = {.name = "idle_quantum", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_idle_quantum_show, - .store = cfq_idle_quantum_store, -}; -static struct cfq_fs_entry cfq_idle_quantum_io_entry = { - .attr = {.name = "idle_quantum_io", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_idle_quantum_io_show, - .store = cfq_idle_quantum_io_store, -}; static struct cfq_fs_entry cfq_queued_entry = { .attr = {.name = "queued", .mode = S_IRUGO | S_IWUSR }, .show = cfq_queued_show, .store = cfq_queued_store, }; -static struct cfq_fs_entry cfq_grace_rt_entry = { - .attr = {.name = "grace_rt", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_grace_rt_show, - .store = cfq_grace_rt_store, -}; -static struct cfq_fs_entry cfq_grace_idle_entry = { - .attr = {.name = "grace_idle", .mode = S_IRUGO | S_IWUSR }, - .show = cfq_grace_idle_show, - .store = cfq_grace_idle_store, -}; - -#define P_0_STR "p0" -#define P_1_STR "p1" -#define P_2_STR "p2" -#define P_3_STR "p3" -#define P_4_STR "p4" -#define P_5_STR "p5" -#define P_6_STR "p6" -#define P_7_STR "p7" -#define P_8_STR "p8" -#define P_9_STR "p9" -#define P_10_STR "p10" -#define 
P_11_STR "p11" -#define P_12_STR "p12" -#define P_13_STR "p13" -#define P_14_STR "p14" -#define P_15_STR "p15" -#define P_16_STR "p16" -#define P_17_STR "p17" -#define P_18_STR "p18" -#define P_19_STR "p19" -#define P_20_STR "p20" - - -#define CFQ_PRIO_SYSFS_ENTRY(__PRIOLVL) \ -static struct cfq_fs_entry cfq_prio_##__PRIOLVL##_entry = { \ - .attr = {.name = P_##__PRIOLVL##_STR, .mode = S_IRUGO | S_IWUSR }, \ - .show = cfq_prio_##__PRIOLVL##_show, \ - .store = cfq_prio_##__PRIOLVL##_store, \ -}; -CFQ_PRIO_SYSFS_ENTRY(0); -CFQ_PRIO_SYSFS_ENTRY(1); -CFQ_PRIO_SYSFS_ENTRY(2); -CFQ_PRIO_SYSFS_ENTRY(3); -CFQ_PRIO_SYSFS_ENTRY(4); -CFQ_PRIO_SYSFS_ENTRY(5); -CFQ_PRIO_SYSFS_ENTRY(6); -CFQ_PRIO_SYSFS_ENTRY(7); -CFQ_PRIO_SYSFS_ENTRY(8); -CFQ_PRIO_SYSFS_ENTRY(9); -CFQ_PRIO_SYSFS_ENTRY(10); -CFQ_PRIO_SYSFS_ENTRY(11); -CFQ_PRIO_SYSFS_ENTRY(12); -CFQ_PRIO_SYSFS_ENTRY(13); -CFQ_PRIO_SYSFS_ENTRY(14); -CFQ_PRIO_SYSFS_ENTRY(15); -CFQ_PRIO_SYSFS_ENTRY(16); -CFQ_PRIO_SYSFS_ENTRY(17); -CFQ_PRIO_SYSFS_ENTRY(18); -CFQ_PRIO_SYSFS_ENTRY(19); -CFQ_PRIO_SYSFS_ENTRY(20); -#undef CFQ_PRIO_SYSFS_ENTRY - static struct attribute *default_attrs[] = { &cfq_quantum_entry.attr, - &cfq_quantum_io_entry.attr, - &cfq_idle_quantum_entry.attr, - &cfq_idle_quantum_io_entry.attr, &cfq_queued_entry.attr, - &cfq_grace_rt_entry.attr, - &cfq_grace_idle_entry.attr, - &cfq_prio_0_entry.attr, - &cfq_prio_1_entry.attr, - &cfq_prio_2_entry.attr, - &cfq_prio_3_entry.attr, - &cfq_prio_4_entry.attr, - &cfq_prio_5_entry.attr, - &cfq_prio_6_entry.attr, - &cfq_prio_7_entry.attr, - &cfq_prio_8_entry.attr, - &cfq_prio_9_entry.attr, - &cfq_prio_10_entry.attr, - &cfq_prio_11_entry.attr, - &cfq_prio_12_entry.attr, - &cfq_prio_13_entry.attr, - &cfq_prio_14_entry.attr, - &cfq_prio_15_entry.attr, - &cfq_prio_16_entry.attr, - &cfq_prio_17_entry.attr, - &cfq_prio_18_entry.attr, - &cfq_prio_19_entry.attr, - &cfq_prio_20_entry.attr, NULL, }; @@ -1418,7 +883,6 @@ elevator_t iosched_cfq = { .elevator_set_req_fn = cfq_set_request, .elevator_put_req_fn = cfq_put_request, .elevator_may_queue_fn = cfq_may_queue, - .elevator_set_congested_fn = cfq_queue_congested, .elevator_init_fn = cfq_init, .elevator_exit_fn = cfq_exit, }; diff --git a/drivers/block/elevator.c b/drivers/block/elevator.c index 950eb9923..35c9385ac 100644 --- a/drivers/block/elevator.c +++ b/drivers/block/elevator.c @@ -339,14 +339,6 @@ void elv_put_request(request_queue_t *q, struct request *rq) e->elevator_put_req_fn(q, rq); } -void elv_set_congested(request_queue_t *q) -{ - elevator_t *e = &q->elevator; - - if (e->elevator_set_congested_fn) - e->elevator_set_congested_fn(q); -} - int elv_may_queue(request_queue_t *q, int rw) { elevator_t *e = &q->elevator; @@ -354,7 +346,7 @@ int elv_may_queue(request_queue_t *q, int rw) if (e->elevator_may_queue_fn) return e->elevator_may_queue_fn(q, rw); - return 1; + return 0; } void elv_completed_request(request_queue_t *q, struct request *rq) diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index b6ff3448b..17c403ebd 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -1594,10 +1594,6 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) struct io_context *ioc = get_io_context(gfp_mask); spin_lock_irq(q->queue_lock); - - if (!elv_may_queue(q, rw)) - goto out_lock; - if (rl->count[rw]+1 >= q->nr_requests) { /* * The queue will fill after this allocation, so set it as @@ -1611,12 +1607,15 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) } } - /* - * The queue is 
full and the allocating process is not a - * "batcher", and not exempted by the IO scheduler - */ - if (blk_queue_full(q, rw) && !ioc_batching(ioc)) - goto out_lock; + if (blk_queue_full(q, rw) + && !ioc_batching(ioc) && !elv_may_queue(q, rw)) { + /* + * The queue is full and the allocating process is not a + * "batcher", and not exempted by the IO scheduler + */ + spin_unlock_irq(q->queue_lock); + goto out; + } rl->count[rw]++; if (rl->count[rw] >= queue_congestion_on_threshold(q)) @@ -1634,7 +1633,8 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) */ spin_lock_irq(q->queue_lock); freed_request(q, rw); - goto out_lock; + spin_unlock_irq(q->queue_lock); + goto out; } if (ioc_batching(ioc)) @@ -1664,11 +1664,6 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) out: put_io_context(ioc); return rq; -out_lock: - if (!rq) - elv_set_congested(q); - spin_unlock_irq(q->queue_lock); - goto out; } /* @@ -3172,21 +3167,3 @@ void blk_unregister_queue(struct gendisk *disk) kobject_put(&disk->kobj); } } - -asmlinkage int sys_ioprio_set(int ioprio) -{ - if (ioprio < IOPRIO_IDLE || ioprio > IOPRIO_RT) - return -EINVAL; - if (ioprio == IOPRIO_RT && !capable(CAP_SYS_ADMIN)) - return -EACCES; - - printk("%s: set ioprio %d\n", current->comm, ioprio); - current->ioprio = ioprio; - return 0; -} - -asmlinkage int sys_ioprio_get(void) -{ - return current->ioprio; -} - diff --git a/fs/exec.c b/fs/exec.c index b0acd4297..b0a98b43f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -547,6 +548,18 @@ static int exec_mmap(struct mm_struct *mm) tsk->active_mm = mm; activate_mm(active_mm, mm); task_unlock(tsk); +#ifdef CONFIG_CKRM_RES_MEM + if (old_mm) { + spin_lock(&old_mm->peertask_lock); + list_del(&tsk->mm_peers); + ckrm_mem_evaluate_mm(old_mm); + spin_unlock(&old_mm->peertask_lock); + } + spin_lock(&mm->peertask_lock); + list_add_tail(&tsk->mm_peers, &mm->tasklist); + ckrm_mem_evaluate_mm(mm); + spin_unlock(&mm->peertask_lock); +#endif if (old_mm) { if (active_mm != old_mm) BUG(); mmput(old_mm); diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index 30bbe7fcd..ef936b861 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -289,10 +289,8 @@ #define __NR_mq_notify (__NR_mq_open+4) #define __NR_mq_getsetattr (__NR_mq_open+5) #define __NR_sys_kexec_load 283 -#define __NR_ioprio_set 284 -#define __NR_ioprio_get 285 -#define NR_syscalls 286 +#define NR_syscalls 284 /* user-visible error numbers are in the range -1 - -124: see */ diff --git a/include/asm-ppc/unistd.h b/include/asm-ppc/unistd.h index bdf4ebe9b..57fb02c6c 100644 --- a/include/asm-ppc/unistd.h +++ b/include/asm-ppc/unistd.h @@ -273,10 +273,8 @@ #define __NR_mq_notify 266 #define __NR_mq_getsetattr 267 #define __NR_kexec_load 268 -#define __NR_ioprio_set 269 -#define __NR_ioprio_get 270 -#define __NR_syscalls 271 +#define __NR_syscalls 269 #define __NR(n) #n diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h index 0b0a6a10d..26e0aa30b 100644 --- a/include/asm-x86_64/unistd.h +++ b/include/asm-x86_64/unistd.h @@ -554,12 +554,8 @@ __SYSCALL(__NR_mq_notify, sys_mq_notify) __SYSCALL(__NR_mq_getsetattr, sys_mq_getsetattr) #define __NR_kexec_load 246 __SYSCALL(__NR_kexec_load, sys_ni_syscall) -#define __NR_ioprio_set 247 -__SYSCALL(__NR_ioprio_set, sys_ioprio_set); -#define __NR_ioprio_get 248 -__SYSCALL(__NR_ioprio_get, sys_ioprio_get); -#define __NR_syscall_max __NR_ioprio_get 
+#define __NR_syscall_max __NR_kexec_load #ifndef __NO_STUBS /* user-visible error numbers are in the range -1 - -4095 */ diff --git a/include/linux/ckrm_mem.h b/include/linux/ckrm_mem.h index 52dc949ec..4efebb993 100644 --- a/include/linux/ckrm_mem.h +++ b/include/linux/ckrm_mem.h @@ -49,6 +49,7 @@ typedef struct ckrm_mem_res { // more than this is needed. int nr_active[MAX_NR_ZONES]; int nr_inactive[MAX_NR_ZONES]; + int tmp_cnt; int shrink_count; unsigned long last_shrink; int over_limit_failures; @@ -66,17 +67,19 @@ extern struct ckrm_res_ctlr mem_rcbs; // used to fill reclaim_flags, used only when memory is low in the system #define CLS_CLEAR (0) // class under its guarantee #define CLS_OVER_GUAR (1 << 0) // class is over its guarantee -#define CLS_PARENT_OVER (1 << 1) // parent is over 120% mark over limit -#define CLS_OVER_75 (1 << 2) // class over 75% mark bet guar(0) & limit(100) -#define CLS_OVER_100 (1 << 3) // class over its limit -#define CLS_OVER_110 (1 << 4) // class over 110% mark over limit -#define CLS_FLAGS_ALL ( CLS_OVER_GUAR | CLS_PARENT_OVER | CLS_OVER_75 | \ - CLS_OVER_100 | CLS_OVER_110 ) +#define CLS_PARENT_OVER (1 << 1) // parent is over 110% mark over limit +#define CLS_OVER_25 (1 << 2) // class over 25% mark bet guar(0) & limit(100) +#define CLS_OVER_50 (1 << 3) // class over 50% mark bet guar(0) & limit(100) +#define CLS_OVER_75 (1 << 4) // class over 75% mark bet guar(0) & limit(100) +#define CLS_OVER_100 (1 << 5) // class over its limit +#define CLS_OVER_110 (1 << 6) // class over 110% mark over limit +#define CLS_FLAGS_ALL ( CLS_OVER_GUAR | CLS_PARENT_OVER | CLS_OVER_25 | \ + CLS_OVER_50 | CLS_OVER_75 | CLS_OVER_100 | CLS_OVER_110 ) #define CLS_SHRINK_BIT (31) // used to both lock and set the bit #define CLS_SHRINK (1 << CLS_SHRINK_BIT) // shrink the given class // used in flags. 
set when a class is more than 90% of its maxlimit -#define MEM_NEAR_LIMIT 1 +#define MEM_AT_LIMIT 1 extern void ckrm_set_aggressive(ckrm_mem_res_t *); extern unsigned int ckrm_setup_reclamation(void); @@ -84,16 +87,14 @@ extern void ckrm_teardown_reclamation(void); extern void ckrm_get_reclaim_bits(unsigned int *, unsigned int *); extern void ckrm_init_mm_to_task(struct mm_struct *, struct task_struct *); extern void ckrm_mem_evaluate_mm(struct mm_struct *); -extern void ckrm_mem_evaluate_page_byadd(struct page *, struct mm_struct *); -extern void ckrm_near_limit(ckrm_mem_res_t *); +extern void ckrm_at_limit(ckrm_mem_res_t *); +extern int ckrm_memclass_valid(ckrm_mem_res_t *); #define ckrm_get_reclaim_flags(cls) ((cls)->reclaim_flags) #else #define ckrm_init_mm_to_current(a) do {} while (0) #define ckrm_mem_evaluate_mm(a) do {} while (0) -#define ckrm_mem_evaluate_page_byadd(a,b) do {} while (0) -#define page_class(page) (NULL) #define ckrm_get_reclaim_flags(a) (0) #define ckrm_setup_reclamation() (0) #define ckrm_teardown_reclamation() do {} while (0) diff --git a/include/linux/ckrm_mem_inline.h b/include/linux/ckrm_mem_inline.h index 0eb4e49c0..3a9dd55e7 100644 --- a/include/linux/ckrm_mem_inline.h +++ b/include/linux/ckrm_mem_inline.h @@ -56,6 +56,10 @@ ckrm_mem_share_compare(ckrm_mem_res_t *a, ckrm_mem_res_t *b) return -(b != NULL) ; if (b == NULL) return 0; + if (a->pg_guar == CKRM_SHARE_DONTCARE) + return 1; + if (b->pg_guar == CKRM_SHARE_DONTCARE) + return -1; return (a->pg_unused - b->pg_unused); } @@ -69,34 +73,38 @@ mem_class_get(ckrm_mem_res_t *cls) static inline void mem_class_put(ckrm_mem_res_t *cls) { + if (cls && atomic_dec_and_test(&(cls->nr_users)) ) { printk("freeing memclass %p of \n", cls, cls->core->name); + BUG_ON(ckrm_memclass_valid(cls)); //kfree(cls); } } -static inline int +static inline void incr_use_count(ckrm_mem_res_t *cls, int borrow) { - int over_limit; - atomic_inc(&cls->pg_total); - over_limit = (atomic_read(&cls->pg_total) > ((9 * cls->pg_limit) / 10)); if (borrow) cls->pg_lent++; - if ((cls->pg_guar != CKRM_SHARE_DONTCARE) && + if ((cls->pg_guar == CKRM_SHARE_DONTCARE) || (atomic_read(&cls->pg_total) > cls->pg_unused)) { ckrm_mem_res_t *parcls = ckrm_get_res_class(cls->parent, mem_rcbs.resid, ckrm_mem_res_t); if (parcls) { - over_limit |= incr_use_count(parcls, 1); + incr_use_count(parcls, 1); cls->pg_borrowed++; - return over_limit; } + } else { + atomic_inc(&ckrm_mem_real_count); + } + if ((cls->pg_limit != CKRM_SHARE_DONTCARE) && + (atomic_read(&cls->pg_total) >= cls->pg_limit) && + ((cls->flags & MEM_AT_LIMIT) != MEM_AT_LIMIT)) { + ckrm_at_limit(cls); } - atomic_inc(&ckrm_mem_real_count); - return over_limit; + return; } static inline void @@ -159,10 +167,26 @@ ckrm_clear_pages_class(struct page *pages, int numpages) } static inline void -ckrm_change_page_class(struct page *page, ckrm_mem_res_t *cls) +ckrm_change_page_class(struct page *page, ckrm_mem_res_t *newcls) { + ckrm_mem_res_t *oldcls = page_class(page); + + if (!newcls || oldcls == newcls) + return; + ckrm_clear_page_class(page); - ckrm_set_page_class(page, cls); + ckrm_set_page_class(page, newcls); + if (test_bit(PG_ckrm_account, &page->flags)) { + decr_use_count(oldcls, 0); + incr_use_count(newcls, 0); + if (PageActive(page)) { + oldcls->nr_active[page_zonenum(page)]--; + newcls->nr_active[page_zonenum(page)]++; + } else { + oldcls->nr_inactive[page_zonenum(page)]--; + newcls->nr_inactive[page_zonenum(page)]++; + } + } } static inline void @@ -178,42 +202,65 @@ 
ckrm_change_pages_class(struct page *pages, int numpages, static inline void ckrm_mem_inc_active(struct page *page) { - ckrm_mem_res_t *cls = page_class(page); + ckrm_mem_res_t *cls = page_class(page), *curcls; + if (mem_rcbs.resid == -1) { + return; + } BUG_ON(cls == NULL); - cls->nr_active[page_zonenum(page)]++; - if (incr_use_count(cls, 0)) { - ckrm_near_limit(cls); + BUG_ON(test_bit(PG_ckrm_account, &page->flags)); + if (unlikely(cls != (curcls = GET_MEM_CLASS(current)))) { + cls = curcls; + ckrm_change_page_class(page, cls); } + cls->nr_active[page_zonenum(page)]++; + incr_use_count(cls, 0); + set_bit(PG_ckrm_account, &page->flags); } static inline void ckrm_mem_dec_active(struct page *page) { ckrm_mem_res_t *cls = page_class(page); + if (mem_rcbs.resid == -1) { + return; + } BUG_ON(cls == NULL); + BUG_ON(!test_bit(PG_ckrm_account, &page->flags)); cls->nr_active[page_zonenum(page)]--; decr_use_count(cls, 0); + clear_bit(PG_ckrm_account, &page->flags); } static inline void ckrm_mem_inc_inactive(struct page *page) { - ckrm_mem_res_t *cls = page_class(page); + ckrm_mem_res_t *cls = page_class(page), *curcls; + if (mem_rcbs.resid == -1) { + return; + } BUG_ON(cls == NULL); - cls->nr_inactive[page_zonenum(page)]++; - if (incr_use_count(cls, 0) && - ((cls->flags & MEM_NEAR_LIMIT) != MEM_NEAR_LIMIT)) { - ckrm_near_limit(cls); + BUG_ON(test_bit(PG_ckrm_account, &page->flags)); + if (unlikely(cls != (curcls = GET_MEM_CLASS(current)))) { + cls = curcls; + ckrm_change_page_class(page, cls); } + cls->nr_inactive[page_zonenum(page)]++; + incr_use_count(cls, 0); + set_bit(PG_ckrm_account, &page->flags); } static inline void ckrm_mem_dec_inactive(struct page *page) { ckrm_mem_res_t *cls = page_class(page); + if (mem_rcbs.resid == -1) { + return; + } BUG_ON(cls == NULL); + BUG_ON(!test_bit(PG_ckrm_account, &page->flags)); cls->nr_inactive[page_zonenum(page)]--; decr_use_count(cls, 0); + clear_bit(PG_ckrm_account, &page->flags); } static inline int @@ -232,7 +279,13 @@ ckrm_class_limit_ok(ckrm_mem_res_t *cls) if ((mem_rcbs.resid == -1) || !cls) { return 1; } - return (atomic_read(&cls->pg_total) <= (11 * cls->pg_limit) / 10); + if (cls->pg_limit == CKRM_SHARE_DONTCARE) { + ckrm_mem_res_t *parcls = ckrm_get_res_class(cls->parent, + mem_rcbs.resid, ckrm_mem_res_t); + return (!parcls ?: ckrm_class_limit_ok(parcls)); + } else { + return (atomic_read(&cls->pg_total) <= (11 * cls->pg_limit) / 10); + } } #else // !CONFIG_CKRM_RES_MEM diff --git a/include/linux/elevator.h b/include/linux/elevator.h index b42a9c4e2..27e8183f4 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -17,7 +17,6 @@ typedef void (elevator_requeue_req_fn) (request_queue_t *, struct request *); typedef struct request *(elevator_request_list_fn) (request_queue_t *, struct request *); typedef void (elevator_completed_req_fn) (request_queue_t *, struct request *); typedef int (elevator_may_queue_fn) (request_queue_t *, int); -typedef void (elevator_set_congested_fn) (request_queue_t *); typedef int (elevator_set_req_fn) (request_queue_t *, struct request *, int); typedef void (elevator_put_req_fn) (request_queue_t *, struct request *); @@ -46,7 +45,6 @@ struct elevator_s elevator_put_req_fn *elevator_put_req_fn; elevator_may_queue_fn *elevator_may_queue_fn; - elevator_set_congested_fn *elevator_set_congested_fn; elevator_init_fn *elevator_init_fn; elevator_exit_fn *elevator_exit_fn; @@ -76,7 +74,6 @@ extern struct request *elv_latter_request(request_queue_t *, struct request *); extern int 
elv_register_queue(request_queue_t *q); extern void elv_unregister_queue(request_queue_t *q); extern int elv_may_queue(request_queue_t *, int); -extern void elv_set_congested(request_queue_t *); extern void elv_completed_request(request_queue_t *, struct request *); extern int elv_set_request(request_queue_t *, struct request *, int); extern void elv_put_request(request_queue_t *, struct request *); @@ -122,6 +119,4 @@ extern int elv_try_last_merge(request_queue_t *, struct bio *); #define ELEVATOR_INSERT_BACK 2 #define ELEVATOR_INSERT_SORT 3 -#define RQ_ELV_DATA(rq) (rq)->elevator_private - #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 0b4e2114a..7e10a252a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1570,17 +1570,5 @@ static inline void free_secdata(void *secdata) { } #endif /* CONFIG_SECURITY */ -/* io priorities */ - -#define IOPRIO_NR 21 - -#define IOPRIO_IDLE 0 -#define IOPRIO_NORM 10 -#define IOPRIO_RT 20 - -asmlinkage int sys_ioprio_set(int ioprio); -asmlinkage int sys_ioprio_get(void); - - #endif /* __KERNEL__ */ #endif /* _LINUX_FS_H */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 5d6206327..9937c8df8 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -112,7 +112,6 @@ extern struct group_info init_groups; .proc_lock = SPIN_LOCK_UNLOCKED, \ .switch_lock = SPIN_LOCK_UNLOCKED, \ .journal_info = NULL, \ - .ioprio = IOPRIO_NORM, \ } diff --git a/include/linux/mm.h b/include/linux/mm.h index 5c584cced..0e7989075 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -229,6 +229,9 @@ struct page { void *virtual; /* Kernel virtual address (NULL if not kmapped, ie. highmem) */ #endif /* WANT_PAGE_VIRTUAL */ +#ifdef CONFIG_CKRM_RES_MEM + void *memclass; +#endif // CONFIG_CKRM_RES_MEM }; /* diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 47762ca69..5edb739b4 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -1,9 +1,11 @@ +#include static inline void add_page_to_active_list(struct zone *zone, struct page *page) { list_add(&page->lru, &zone->active_list); zone->nr_active++; + ckrm_mem_inc_active(page); } static inline void @@ -11,6 +13,7 @@ add_page_to_inactive_list(struct zone *zone, struct page *page) { list_add(&page->lru, &zone->inactive_list); zone->nr_inactive++; + ckrm_mem_inc_inactive(page); } static inline void @@ -18,6 +21,7 @@ del_page_from_active_list(struct zone *zone, struct page *page) { list_del(&page->lru); zone->nr_active--; + ckrm_mem_dec_active(page); } static inline void @@ -25,6 +29,7 @@ del_page_from_inactive_list(struct zone *zone, struct page *page) { list_del(&page->lru); zone->nr_inactive--; + ckrm_mem_dec_inactive(page); } static inline void @@ -34,7 +39,9 @@ del_page_from_lru(struct zone *zone, struct page *page) if (PageActive(page)) { ClearPageActive(page); zone->nr_active--; + ckrm_mem_dec_active(page); } else { zone->nr_inactive--; + ckrm_mem_dec_inactive(page); } } diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index c6f5063f0..c70f46a4e 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -77,6 +77,7 @@ #define PG_compound 19 /* Part of a compound page */ #define PG_anon 20 /* Anonymous: anon_vma in mapping */ +#define PG_ckrm_account 21 /* This page is accounted by CKRM */ /* diff --git a/include/linux/sched.h b/include/linux/sched.h index 4dd9fbded..f975c7693 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -230,6 +230,11 @@ struct mm_struct { struct kioctx 
*ioctx_list; struct kioctx default_kioctx; +#ifdef CONFIG_CKRM_RES_MEM + struct ckrm_mem_res *memclass; + struct list_head tasklist; /* list of all tasks sharing this address space */ + spinlock_t peertask_lock; /* protect above tasklist */ +#endif }; extern int mmlist_nr; @@ -521,8 +526,6 @@ struct task_struct { struct io_context *io_context; - int ioprio; - unsigned long ptrace_message; siginfo_t *last_siginfo; /* For ptrace use. */ @@ -539,8 +542,10 @@ struct task_struct { struct ckrm_task_class *taskclass; struct list_head taskclass_link; #endif // CONFIG_CKRM_TYPE_TASKCLASS +#ifdef CONFIG_CKRM_RES_MEM + struct list_head mm_peers; // list of tasks using same mm_struct +#endif // CONFIG_CKRM_RES_MEM #endif // CONFIG_CKRM - struct task_delay_info delays; }; diff --git a/init/Kconfig b/init/Kconfig index 45a39b1ad..4fdce31f9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -172,18 +172,25 @@ config CKRM_RES_NUMTASKS Say N if unsure, Y to use the feature. -config CKRM_RES_BLKIO - tristate " Disk I/O Resource Controller" - depends on CKRM_TYPE_TASKCLASS && IOSCHED_CFQ - default m +config CKRM_RES_MEM + bool "Class-based physical memory controller" + default y + depends on CKRM help - Provides a resource controller for best-effort block I/O - bandwidth control. The controller attempts this by proportional - servicing of requests in the I/O scheduler. However, seek - optimizations and reordering by device drivers/disk controllers may - alter the actual bandwidth delivered to a class. - - Say N if unsure, Y to use the feature. + Provides basic support for collecting physical memory usage information + per class. Say Y if you want to know the memory usage of each class. + +config CKRM_MEM_LRUORDER_CHANGE + bool "Change the LRU ordering of scanned pages" + default n + depends on CKRM_RES_MEM + help + While trying to free pages, by default (n), scanned pages are left where they + are found if they belong to a relatively under-used class. In this case the + LRU ordering of the memory subsystem is left intact. If this option is chosen, + the scanned pages are instead moved to the tail of the list (active or inactive). + Saying Y reduces the checking overhead but violates the approximate + LRU order that the paging subsystem maintains.
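For reference, a minimal userspace sketch (not part of the patch) of the per-class LRU bookkeeping that the ckrm_mem_inc_active()/ckrm_mem_dec_inactive() hooks added to mm_inline.h above maintain. The type and function names here (mem_class, move_to_active, move_to_inactive) are illustrative stand-ins, not the kernel's:

/*
 * Simplified model: each page carries a memclass pointer, and moving a
 * page between the active and inactive lists adjusts that class's counters.
 */
#include <stdio.h>

struct mem_class {
	const char *name;
	long nr_active;
	long nr_inactive;
};

struct page {
	struct mem_class *memclass;	/* mirrors page->memclass in the patch */
	int active;			/* 1 = on active list, 0 = on inactive list */
};

/* rough counterpart of del_page_from_inactive_list() + add_page_to_active_list() */
static void move_to_active(struct page *p)
{
	if (!p->active) {
		p->memclass->nr_inactive--;	/* ckrm_mem_dec_inactive() */
		p->memclass->nr_active++;	/* ckrm_mem_inc_active() */
		p->active = 1;
	}
}

/* rough counterpart of del_page_from_active_list() + add_page_to_inactive_list() */
static void move_to_inactive(struct page *p)
{
	if (p->active) {
		p->memclass->nr_active--;	/* ckrm_mem_dec_active() */
		p->memclass->nr_inactive++;	/* ckrm_mem_inc_inactive() */
		p->active = 0;
	}
}

int main(void)
{
	struct mem_class cls = { "/rcfs/taskclass/c1", 0, 0 };	/* illustrative name */
	struct page pg = { &cls, 0 };

	cls.nr_inactive = 1;		/* page starts on the inactive list */
	move_to_active(&pg);
	move_to_inactive(&pg);
	printf("%s: active=%ld inactive=%ld\n",
	       cls.name, cls.nr_active, cls.nr_inactive);
	return 0;
}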
config CKRM_TYPE_SOCKETCLASS bool "Class Manager for socket groups" diff --git a/kernel/ckrm/Makefile b/kernel/ckrm/Makefile index 008b6c6e0..da0055430 100644 --- a/kernel/ckrm/Makefile +++ b/kernel/ckrm/Makefile @@ -9,3 +9,4 @@ endif obj-$(CONFIG_CKRM_RES_NUMTASKS) += ckrm_numtasks.o obj-$(CONFIG_CKRM_TYPE_SOCKETCLASS) += ckrm_sockc.o obj-$(CONFIG_CKRM_RES_LISTENAQ) += ckrm_laq.o + obj-$(CONFIG_CKRM_RES_MEM) += ckrm_mem.o diff --git a/kernel/ckrm/ckrm_mem.c b/kernel/ckrm/ckrm_mem.c index 667ac9c67..34bbe623c 100644 --- a/kernel/ckrm/ckrm_mem.c +++ b/kernel/ckrm/ckrm_mem.c @@ -52,6 +52,7 @@ EXPORT_SYMBOL(ckrm_tot_lru_pages); static ckrm_mem_res_t *ckrm_mem_root_class; atomic_t ckrm_mem_real_count = ATOMIC_INIT(0); EXPORT_SYMBOL(ckrm_mem_real_count); +static void ckrm_mem_evaluate_all_pages(void); /* Initialize rescls values * May be called on each rcfs unmount or as part of error recovery @@ -89,7 +90,7 @@ mem_res_initcls_one(void *my_res) res->pg_guar = CKRM_SHARE_DONTCARE; res->pg_limit = CKRM_SHARE_DONTCARE; - res->pg_unused = CKRM_SHARE_DONTCARE; + res->pg_unused = 0; } static void * @@ -179,20 +180,23 @@ mem_res_free(void *my_res) if (!res) return; - parres = ckrm_get_res_class(res->parent, mem_rcbs.resid, ckrm_mem_res_t); + res->shares.my_guarantee = 0; + res->shares.my_limit = 0; + res->pg_guar = 0; + res->pg_limit = 0; + res->pg_unused = 0; + parres = ckrm_get_res_class(res->parent, mem_rcbs.resid, ckrm_mem_res_t); // return child's limit/guarantee to parent node if (parres) { child_guarantee_changed(&parres->shares, res->shares.my_guarantee, 0); child_maxlimit_changed_local(parres); } - res->shares.my_guarantee = 0; - res->shares.my_limit = 0; spin_lock(&ckrm_mem_lock); list_del(&res->mcls_list); spin_unlock(&ckrm_mem_lock); mem_class_put(res); - + ckrm_mem_evaluate_all_pages(); return; } @@ -355,8 +359,14 @@ mem_change_resclass(void *tsk, void *old, void *new) } } - ckrm_mem_evaluate_mm(mm); spin_unlock(&mm->peertask_lock); + ckrm_mem_evaluate_mm(mm); + /* + printk("chg_cls: task <%s:%d> mm %p oldmm %s newmm %s o %s n %s\n", + task->comm, task->pid, mm, prev_mmcls ? prev_mmcls->core->name: + "NULL", mm->memclass ? mm->memclass->core->name : "NULL", + o ? o->core->name: "NULL", n ? n->core->name: "NULL"); + */ return; } @@ -485,7 +495,7 @@ set_usage_flags(ckrm_mem_res_t *res) guar = (res->pg_guar > 0) ? res->pg_guar : 0; range = res->pg_limit - guar; - if ((tot_usage > (guar + ((120 * range) / 100))) && + if ((tot_usage > (guar + ((110 * range) / 100))) && (res->pg_lent > (guar + ((25 * range) / 100)))) { set_flags_of_children(res, CLS_PARENT_OVER); } @@ -496,6 +506,10 @@ set_usage_flags(ckrm_mem_res_t *res) res->reclaim_flags |= CLS_OVER_100; } else if (cls_usage > (guar + ((3 * range) / 4))) { res->reclaim_flags |= CLS_OVER_75; + } else if (cls_usage > (guar + (range / 2))) { + res->reclaim_flags |= CLS_OVER_50; + } else if (cls_usage > (guar + (range / 4))) { + res->reclaim_flags |= CLS_OVER_25; } else if (cls_usage > guar) { res->reclaim_flags |= CLS_OVER_GUAR; } else { @@ -566,12 +580,13 @@ ckrm_get_reclaim_bits(unsigned int *flags, unsigned int *extract) } void -ckrm_near_limit(ckrm_mem_res_t *cls) +ckrm_at_limit(ckrm_mem_res_t *cls) { struct zone *zone; unsigned long now = jiffies; - if (!cls || ((cls->flags & MEM_NEAR_LIMIT) == MEM_NEAR_LIMIT)) { + if (!cls || (cls->pg_limit == CKRM_SHARE_DONTCARE) || + ((cls->flags & MEM_AT_LIMIT) == MEM_AT_LIMIT)) { return; } if ((cls->last_shrink + (10 * HZ)) < now) { // 10 seconds since last ? 
@@ -585,14 +600,16 @@ ckrm_near_limit(ckrm_mem_res_t *cls) spin_lock(&ckrm_mem_lock); list_add(&cls->shrink_list, &ckrm_shrink_list); spin_unlock(&ckrm_mem_lock); - cls->flags |= MEM_NEAR_LIMIT; + cls->flags |= MEM_AT_LIMIT; for_each_zone(zone) { wakeup_kswapd(zone); break; // only once is enough } } -static int +static int unmapped = 0, changed = 0, unchanged = 0, maxnull = 0, +anovma = 0, fnovma = 0; +static void ckrm_mem_evaluate_page_anon(struct page* page) { ckrm_mem_res_t* pgcls = page_class(page); @@ -600,10 +617,12 @@ ckrm_mem_evaluate_page_anon(struct page* page) struct anon_vma *anon_vma = (struct anon_vma *) page->mapping; struct vm_area_struct *vma; struct mm_struct* mm; + int v = 0; spin_lock(&anon_vma->lock); BUG_ON(list_empty(&anon_vma->head)); list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + v++; mm = vma->vm_mm; if (!maxshareclass || ckrm_mem_share_compare(maxshareclass, mm->memclass) < 0) { @@ -611,15 +630,20 @@ ckrm_mem_evaluate_page_anon(struct page* page) } } spin_unlock(&anon_vma->lock); + if (!v) + anovma++; + if (!maxshareclass) + maxnull++; if (maxshareclass && (pgcls != maxshareclass)) { ckrm_change_page_class(page, maxshareclass); - return 1; - } - return 0; + changed++; + } else + unchanged++; + return; } -static int +static void ckrm_mem_evaluate_page_file(struct page* page) { ckrm_mem_res_t* pgcls = page_class(page); @@ -629,69 +653,132 @@ ckrm_mem_evaluate_page_file(struct page* page) pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct prio_tree_iter iter; struct mm_struct* mm; + int v = 0; if (!mapping) - return 0; + return; if (!spin_trylock(&mapping->i_mmap_lock)) - return 0; + return; while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap, &iter, pgoff, pgoff)) != NULL) { + v++; mm = vma->vm_mm; if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,mm->memclass)<0) maxshareclass = mm->memclass; } spin_unlock(&mapping->i_mmap_lock); + if (!v) + fnovma++; + if (!maxshareclass) + maxnull++; + if (maxshareclass && pgcls != maxshareclass) { ckrm_change_page_class(page, maxshareclass); - return 1; - } - return 0; + changed++; + } else + unchanged++; + return; } -static int +static void ckrm_mem_evaluate_page(struct page* page) { - int changed = 0; - if (page->mapping) { if (PageAnon(page)) - changed = ckrm_mem_evaluate_page_anon(page); + ckrm_mem_evaluate_page_anon(page); else - changed = ckrm_mem_evaluate_page_file(page); + ckrm_mem_evaluate_page_file(page); + } else + unmapped++; + return; +} + +static void +ckrm_mem_evaluate_all_pages() +{ + struct page *page; + struct zone *zone; + int active = 0, inactive = 0, cleared = 0; + int act_cnt, inact_cnt, idx; + ckrm_mem_res_t *res; + + spin_lock(&ckrm_mem_lock); + list_for_each_entry(res, &ckrm_memclass_list, mcls_list) { + res->tmp_cnt = 0; } - return changed; + spin_unlock(&ckrm_mem_lock); + + for_each_zone(zone) { + spin_lock_irq(&zone->lru_lock); + list_for_each_entry(page, &zone->inactive_list, lru) { + ckrm_mem_evaluate_page(page); + active++; + page_class(page)->tmp_cnt++; + if (!test_bit(PG_ckrm_account, &page->flags)) + cleared++; + } + list_for_each_entry(page, &zone->active_list, lru) { + ckrm_mem_evaluate_page(page); + inactive++; + page_class(page)->tmp_cnt++; + if (!test_bit(PG_ckrm_account, &page->flags)) + cleared++; + } + spin_unlock_irq(&zone->lru_lock); + } + printk("all_pages: active %d inactive %d cleared %d\n", + active, inactive, cleared); + spin_lock(&ckrm_mem_lock); + list_for_each_entry(res, &ckrm_memclass_list, mcls_list) { + act_cnt = 0; inact_cnt = 
0; idx = 0; + for_each_zone(zone) { + act_cnt += res->nr_active[idx]; + inact_cnt += res->nr_inactive[idx]; + idx++; + } + printk("all_pages: %s: tmp_cnt %d; act_cnt %d inact_cnt %d\n", + res->core->name, res->tmp_cnt, act_cnt, inact_cnt); + } + spin_unlock(&ckrm_mem_lock); + + // check all mm's in the system to see which memclass they are attached + // to. + return; } -static inline int +static /*inline*/ int class_migrate_pmd(struct mm_struct* mm, struct vm_area_struct* vma, pmd_t* pmdir, unsigned long address, unsigned long end) { - pte_t* pte; + pte_t *pte, *orig_pte; unsigned long pmd_end; if (pmd_none(*pmdir)) return 0; BUG_ON(pmd_bad(*pmdir)); - pte = pte_offset_map(pmdir,address); + orig_pte = pte = pte_offset_map(pmdir,address); pmd_end = (address+PMD_SIZE)&PMD_MASK; if (end>pmd_end) end = pmd_end; do { if (pte_present(*pte)) { - ckrm_mem_evaluate_page(pte_page(*pte)); + BUG_ON(mm->memclass == NULL); + ckrm_change_page_class(pte_page(*pte), mm->memclass); + // ckrm_mem_evaluate_page(pte_page(*pte)); } address += PAGE_SIZE; pte++; } while(address && (addressmemclass != (void *)maxshareclass) { - mem_class_get(maxshareclass); + if (maxshareclass && (mm->memclass != (void *)maxshareclass)) { if (mm->memclass) mem_class_put(mm->memclass); mm->memclass = maxshareclass; + mem_class_get(maxshareclass); /* Go through all VMA to migrate pages */ down_read(&mm->mmap_sem); @@ -776,26 +863,6 @@ ckrm_mem_evaluate_mm(struct mm_struct* mm) return; } -void -ckrm_mem_evaluate_page_byadd(struct page* page, struct mm_struct* mm) -{ - ckrm_mem_res_t *pgcls = page_class(page); - ckrm_mem_res_t *chgcls = mm->memclass ? mm->memclass : GET_MEM_CLASS(current); - - if (!chgcls || pgcls == chgcls) - return; - - if (!page->mapcount) { - ckrm_change_page_class(page, chgcls); - return; - } - if (ckrm_mem_share_compare(pgcls, chgcls) < 0) { - ckrm_change_page_class(page, chgcls); - return; - } - return; -} - void ckrm_init_mm_to_task(struct mm_struct * mm, struct task_struct *task) { @@ -805,10 +872,26 @@ ckrm_init_mm_to_task(struct mm_struct * mm, struct task_struct *task) list_del_init(&task->mm_peers); } list_add_tail(&task->mm_peers, &mm->tasklist); + spin_unlock(&mm->peertask_lock); if (mm->memclass != GET_MEM_CLASS(task)) ckrm_mem_evaluate_mm(mm); - spin_unlock(&mm->peertask_lock); return; } +int +ckrm_memclass_valid(ckrm_mem_res_t *cls) +{ + ckrm_mem_res_t *tmp; + + spin_lock(&ckrm_mem_lock); + list_for_each_entry(tmp, &ckrm_memclass_list, mcls_list) { + if (tmp == cls) { + spin_unlock(&ckrm_mem_lock); + return 1; + } + } + spin_unlock(&ckrm_mem_lock); + return 0; +} + MODULE_LICENSE("GPL"); diff --git a/kernel/ckrm/ckrmutils.c b/kernel/ckrm/ckrmutils.c index c56a2ae1c..d54e7b563 100644 --- a/kernel/ckrm/ckrmutils.c +++ b/kernel/ckrm/ckrmutils.c @@ -96,7 +96,6 @@ void child_maxlimit_changed(struct ckrm_shares *parent, int new_limit) return; } - /* * Caller is responsible for holding any lock to protect the data * structures passed to this function @@ -111,26 +110,18 @@ set_shares(struct ckrm_shares *new, struct ckrm_shares *cur, // Check total_guarantee for correctness if (new->total_guarantee <= CKRM_SHARE_DONTCARE) { - printk(KERN_ERR "new->total_guarantee %d <= CKRM_SHARE_DONTCARE\n", - new->total_guarantee); goto set_share_err; } else if (new->total_guarantee == CKRM_SHARE_UNCHANGED) { ; // do nothing } else if (cur_usage_guar > new->total_guarantee) { - printk(KERN_ERR "cur_usage_guar %d > new->total_guarantee %d\n", - cur_usage_guar,new->total_guarantee); goto set_share_err; } // Check max_limit 
for correctness if (new->max_limit <= CKRM_SHARE_DONTCARE) { - printk(KERN_ERR "new->max_limit %d <= CKRM_SHARE_DONTCARE\n", - new->max_limit); goto set_share_err; } else if (new->max_limit == CKRM_SHARE_UNCHANGED) { ; // do nothing } else if (cur->cur_max_limit > new->max_limit) { - printk(KERN_ERR "cur->cur_max_limit %d > new->max_limit %d\n", - cur->cur_max_limit, new->max_limit); goto set_share_err; } // Check my_guarantee for correctness @@ -139,8 +130,6 @@ set_shares(struct ckrm_shares *new, struct ckrm_shares *cur, } else if (new->my_guarantee == CKRM_SHARE_DONTCARE) { ; // do nothing } else if (par && increase_by > par->unused_guarantee) { - printk(KERN_ERR "increase_by %d > par->unused_guarantee %d\n", - increase_by, par->unused_guarantee); goto set_share_err; } // Check my_limit for correctness @@ -150,8 +139,6 @@ set_shares(struct ckrm_shares *new, struct ckrm_shares *cur, ; // do nothing } else if (par && new->my_limit > par->max_limit) { // I can't get more limit than my parent's limit - printk(KERN_ERR "new->my_limit %d > par->max_limit %d\n", - new->my_limit,par->max_limit); goto set_share_err; } @@ -165,8 +152,6 @@ set_shares(struct ckrm_shares *new, struct ckrm_shares *cur, ; // do nothing earlier setting would've // taken care of it } else if (new->my_guarantee > cur->my_limit) { - printk(KERN_ERR "new->my_guarantee %d > cur->my_limit %d\n", - new->my_guarantee,par->max_limit); goto set_share_err; } } else { // new->my_limit has a valid value @@ -174,13 +159,9 @@ set_shares(struct ckrm_shares *new, struct ckrm_shares *cur, ; // do nothing } else if (new->my_guarantee == CKRM_SHARE_UNCHANGED) { if (cur->my_guarantee > new->my_limit) { - printk(KERN_ERR "cur->my_guarantee %d > new->my_limit %d\n", - cur->my_guarantee,new->my_limit); goto set_share_err; } } else if (new->my_guarantee > new->my_limit) { - printk(KERN_ERR "new->my_guarantee %d > new->my_limit %d\n", - new->my_guarantee,new->my_limit); goto set_share_err; } } diff --git a/kernel/exit.c b/kernel/exit.c index ca75e5ea5..70c92e58b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -513,6 +514,12 @@ static inline void __exit_mm(struct task_struct * tsk) task_lock(tsk); tsk->mm = NULL; up_read(&mm->mmap_sem); +#ifdef CONFIG_CKRM_RES_MEM + spin_lock(&mm->peertask_lock); + list_del_init(&tsk->mm_peers); + ckrm_mem_evaluate_mm(mm); + spin_unlock(&mm->peertask_lock); +#endif enter_lazy_tlb(mm, current); task_unlock(tsk); mmput(mm); diff --git a/kernel/fork.c b/kernel/fork.c index d665090c8..e639ce1c8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -265,6 +266,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) ckrm_cb_newtask(tsk); /* One for us, one for whoever does the "release_task()" (usually parent) */ atomic_set(&tsk->usage,2); +#ifdef CONFIG_CKRM_RES_MEM + INIT_LIST_HEAD(&tsk->mm_peers); +#endif return tsk; } @@ -417,6 +421,10 @@ static struct mm_struct * mm_init(struct mm_struct * mm) mm->ioctx_list = NULL; mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm); mm->free_area_cache = TASK_UNMAPPED_BASE; +#ifdef CONFIG_CKRM_RES_MEM + INIT_LIST_HEAD(&mm->tasklist); + mm->peertask_lock = SPIN_LOCK_UNLOCKED; +#endif if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -437,6 +445,10 @@ struct mm_struct * mm_alloc(void) if (mm) { memset(mm, 0, sizeof(*mm)); mm = mm_init(mm); +#ifdef CONFIG_CKRM_RES_MEM + mm->memclass = 
GET_MEM_CLASS(current); + mem_class_get(mm->memclass); +#endif } return mm; } @@ -451,6 +463,13 @@ void fastcall __mmdrop(struct mm_struct *mm) BUG_ON(mm == &init_mm); mm_free_pgd(mm); destroy_context(mm); +#ifdef CONFIG_CKRM_RES_MEM + /* class can be null and mm's tasklist can be empty here */ + if (mm->memclass) { + mem_class_put(mm->memclass); + mm->memclass = NULL; + } +#endif free_mm(mm); } @@ -578,6 +597,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) good_mm: tsk->mm = mm; tsk->active_mm = mm; + ckrm_init_mm_to_task(mm, tsk); return 0; free_pt: @@ -1096,7 +1116,6 @@ struct task_struct *copy_process(unsigned long clone_flags, } else link_pid(p, p->pids + PIDTYPE_TGID, &p->group_leader->pids[PIDTYPE_TGID].pid); - p->ioprio = current->ioprio; nr_threads++; write_unlock_irq(&tasklist_lock); retval = 0; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6708f4f80..0ccf1ee0a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -268,6 +269,7 @@ free_pages_bulk(struct zone *zone, int count, /* have to delete it as __free_pages_bulk list manipulates */ list_del(&page->lru); __free_pages_bulk(page, base, zone, area, order); + ckrm_clear_page_class(page); ret++; } spin_unlock_irqrestore(&zone->lock, flags); @@ -610,6 +612,10 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, might_sleep_if(wait); + if (!ckrm_class_limit_ok((GET_MEM_CLASS(current)))) { + return NULL; + } + zones = zonelist->zones; /* the list of zones suitable for gfp_mask */ if (zones[0] == NULL) /* no zones in the zonelist */ return NULL; @@ -739,6 +745,7 @@ nopage: return NULL; got_pg: kernel_map_pages(page, 1 << order, 1); + ckrm_set_pages_class(page, 1 << order, GET_MEM_CLASS(current)); return page; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 8e3b69342..4911729ce 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -37,6 +37,7 @@ #include #include +#include /* possible outcome of pageout() */ typedef enum { @@ -71,6 +72,9 @@ struct scan_control { /* This context's GFP mask */ unsigned int gfp_mask; + /* Flag used by CKRM */ + unsigned int ckrm_flags; + int may_writepage; }; @@ -542,19 +546,23 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) { LIST_HEAD(page_list); struct pagevec pvec; - int max_scan = sc->nr_to_scan; + int max_scan = sc->nr_to_scan, nr_pass; + unsigned int ckrm_flags = sc->ckrm_flags, bit_flag; pagevec_init(&pvec, 1); lru_add_drain(); spin_lock_irq(&zone->lru_lock); +redo: + ckrm_get_reclaim_bits(&ckrm_flags, &bit_flag); + nr_pass = zone->nr_inactive; while (max_scan > 0) { struct page *page; int nr_taken = 0; int nr_scan = 0; int nr_freed; - while (nr_scan++ < SWAP_CLUSTER_MAX && + while (nr_pass-- && nr_scan++ < SWAP_CLUSTER_MAX && !list_empty(&zone->inactive_list)) { page = lru_to_page(&zone->inactive_list); @@ -572,15 +580,25 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) SetPageLRU(page); list_add(&page->lru, &zone->inactive_list); continue; + } else if (bit_flag && !ckrm_kick_page(page, bit_flag)) { + __put_page(page); + SetPageLRU(page); +#ifdef CONFIG_CKRM_MEM_LRUORDER_CHANGE + list_add_tail(&page->lru, &zone->inactive_list); +#else + list_add(&page->lru, &zone->inactive_list); +#endif + continue; } list_add(&page->lru, &page_list); + ckrm_mem_dec_inactive(page); nr_taken++; } zone->nr_inactive -= nr_taken; zone->pages_scanned += nr_taken; spin_unlock_irq(&zone->lru_lock); - if (nr_taken == 0) + if ((bit_flag == 0) && (nr_taken == 0)) goto done; 
max_scan -= nr_scan; @@ -613,6 +631,9 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) spin_lock_irq(&zone->lru_lock); } } + if (ckrm_flags && (nr_pass <= 0)) { + goto redo; + } } spin_unlock_irq(&zone->lru_lock); done: @@ -652,11 +673,17 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) long mapped_ratio; long distress; long swap_tendency; + unsigned int ckrm_flags = sc->ckrm_flags, bit_flag; + int nr_pass; lru_add_drain(); pgmoved = 0; spin_lock_irq(&zone->lru_lock); - while (pgscanned < nr_pages && !list_empty(&zone->active_list)) { +redo: + ckrm_get_reclaim_bits(&ckrm_flags, &bit_flag); + nr_pass = zone->nr_active; + while (pgscanned < nr_pages && !list_empty(&zone->active_list) && + nr_pass) { page = lru_to_page(&zone->active_list); prefetchw_prev_lru_page(page, &zone->active_list, flags); if (!TestClearPageLRU(page)) @@ -672,11 +699,24 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) __put_page(page); SetPageLRU(page); list_add(&page->lru, &zone->active_list); + pgscanned++; + } else if (bit_flag && !ckrm_kick_page(page, bit_flag)) { + __put_page(page); + SetPageLRU(page); +#ifdef CONFIG_CKRM_MEM_LRUORDER_CHANGE + list_add_tail(&page->lru, &zone->active_list); +#else + list_add(&page->lru, &zone->active_list); +#endif } else { list_add(&page->lru, &l_hold); + ckrm_mem_dec_active(page); pgmoved++; - } pgscanned++; + } + if (!--nr_pass && ckrm_flags) { + goto redo; + } } zone->nr_active -= pgmoved; spin_unlock_irq(&zone->lru_lock); @@ -750,6 +790,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) if (!TestClearPageActive(page)) BUG(); list_move(&page->lru, &zone->inactive_list); + ckrm_mem_inc_inactive(page); pgmoved++; if (!pagevec_add(&pvec, page)) { zone->nr_inactive += pgmoved; @@ -778,6 +819,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) BUG(); BUG_ON(!PageActive(page)); list_move(&page->lru, &zone->active_list); + ckrm_mem_inc_active(page); pgmoved++; if (!pagevec_add(&pvec, page)) { zone->nr_active += pgmoved; @@ -825,6 +867,7 @@ shrink_zone(struct zone *zone, struct scan_control *sc) sc->nr_to_reclaim = SWAP_CLUSTER_MAX; while (nr_active || nr_inactive) { + sc->ckrm_flags = ckrm_setup_reclamation(); if (nr_active) { sc->nr_to_scan = min(nr_active, (unsigned long)SWAP_CLUSTER_MAX); @@ -840,9 +883,113 @@ shrink_zone(struct zone *zone, struct scan_control *sc) if (sc->nr_to_reclaim <= 0) break; } + ckrm_teardown_reclamation(); + } +} + +#ifdef CONFIG_CKRM_RES_MEM +// This function needs to be given more thought. +// Shrink the class to be at 90% of its limit +static void +ckrm_shrink_class(ckrm_mem_res_t *cls) +{ + struct scan_control sc; + struct zone *zone; + int zindex = 0, active_credit = 0, inactive_credit = 0; + + if (ckrm_test_set_shrink(cls)) { // set the SHRINK bit atomically + // if it is already set somebody is working on it. so... 
leave + return; + } + sc.nr_mapped = read_page_state(nr_mapped); + sc.nr_scanned = 0; + sc.ckrm_flags = ckrm_get_reclaim_flags(cls); + sc.nr_reclaimed = 0; + sc.priority = 0; // always very high priority + + for_each_zone(zone) { + int zone_total, zone_limit, active_limit, inactive_limit; + int active_over, inactive_over; + unsigned long nr_active, nr_inactive; + u64 temp; + + zone->temp_priority = zone->prev_priority; + zone->prev_priority = sc.priority; + + zone_total = zone->nr_active + zone->nr_inactive + zone->free_pages; + + temp = (u64) cls->pg_limit * zone_total; + do_div(temp, ckrm_tot_lru_pages); + zone_limit = (int) temp; + active_limit = (6 * zone_limit) / 10; // 2/3rd in active list + inactive_limit = (3 * zone_limit) / 10; // 1/3rd in inactive list + + active_over = cls->nr_active[zindex] - active_limit + active_credit; + inactive_over = active_over + + (cls->nr_inactive[zindex] - inactive_limit) + inactive_credit; + + if (active_over > 0) { + zone->nr_scan_active += active_over + 1; + nr_active = zone->nr_scan_active; + active_credit = 0; + } else { + active_credit += active_over; + nr_active = 0; + } + + if (inactive_over > 0) { + zone->nr_scan_inactive += inactive_over; + nr_inactive = zone->nr_scan_inactive; + inactive_credit = 0; + } else { + inactive_credit += inactive_over; + nr_inactive = 0; + } + while (nr_active || nr_inactive) { + if (nr_active) { + sc.nr_to_scan = min(nr_active, + (unsigned long)SWAP_CLUSTER_MAX); + nr_active -= sc.nr_to_scan; + refill_inactive_zone(zone, &sc); + } + + if (nr_inactive) { + sc.nr_to_scan = min(nr_inactive, + (unsigned long)SWAP_CLUSTER_MAX); + nr_inactive -= sc.nr_to_scan; + shrink_cache(zone, &sc); + if (sc.nr_to_reclaim <= 0) + break; + } + } + zone->prev_priority = zone->temp_priority; + zindex++; } + ckrm_clear_shrink(cls); } +static void +ckrm_shrink_classes(void) +{ + ckrm_mem_res_t *cls; + + spin_lock(&ckrm_mem_lock); + while (!ckrm_shrink_list_empty()) { + cls = list_entry(ckrm_shrink_list.next, ckrm_mem_res_t, + shrink_list); + spin_unlock(&ckrm_mem_lock); + ckrm_shrink_class(cls); + spin_lock(&ckrm_mem_lock); + list_del(&cls->shrink_list); + cls->flags &= ~MEM_AT_LIMIT; + } + spin_unlock(&ckrm_mem_lock); +} + +#else +#define ckrm_shrink_classes() do { } while(0) +#endif + /* * This is the direct reclaim path, for page-allocating processes. We only * try to reclaim pages from zones which will satisfy the caller's allocation @@ -1148,6 +1295,9 @@ static int kswapd(void *p) schedule(); finish_wait(&pgdat->kswapd_wait, &wait); + if (!ckrm_shrink_list_empty()) + ckrm_shrink_classes(); + else balance_pgdat(pgdat, 0); } return 0; @@ -1158,7 +1308,7 @@ static int kswapd(void *p) */ void wakeup_kswapd(struct zone *zone) { - if (zone->free_pages > zone->pages_low) + if ((zone->free_pages > zone->pages_low) && ckrm_shrink_list_empty()) return; if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait)) return; -- 2.47.0
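For reference, a small userspace sketch of the usage-band classification that set_usage_flags() performs with the new CLS_OVER_25/CLS_OVER_50 bands. The flag values, the classify_usage() helper, and the numbers in main() are made up for illustration; the thresholds follow the patch, and usage above the limit is assumed to map to CLS_OVER_100:

#include <stdio.h>

#define CLS_OVER_GUAR	0x02	/* placeholder values, not the kernel's */
#define CLS_OVER_25	0x04
#define CLS_OVER_50	0x08
#define CLS_OVER_75	0x10
#define CLS_OVER_100	0x20

/* classify a class's usage relative to its guarantee..limit range */
static int classify_usage(long cls_usage, long pg_guar, long pg_limit)
{
	long guar = (pg_guar > 0) ? pg_guar : 0;
	long range = pg_limit - guar;

	if (cls_usage > guar + range)
		return CLS_OVER_100;		/* assumed: above the limit */
	if (cls_usage > guar + (3 * range) / 4)
		return CLS_OVER_75;
	if (cls_usage > guar + range / 2)
		return CLS_OVER_50;
	if (cls_usage > guar + range / 4)
		return CLS_OVER_25;
	if (cls_usage > guar)
		return CLS_OVER_GUAR;
	return 0;				/* under guarantee: no reclaim flag in this sketch */
}

int main(void)
{
	/* guarantee 1000 pages, limit 2000 pages */
	long usage[] = { 900, 1100, 1300, 1600, 1900, 2100 };
	for (int i = 0; i < 6; i++)
		printf("usage %ld -> flag 0x%02x\n", usage[i],
		       classify_usage(usage[i], 1000, 2000));
	return 0;
}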
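Similarly, a userspace sketch of the per-zone scan-target arithmetic in ckrm_shrink_class(): the class's page limit is scaled by each zone's share of the LRU, split roughly 60/30 between the active and inactive lists, and any shortfall below the target is carried as credit into the next zone. The zone_stat structure and the sample numbers are hypothetical; the formulas mirror the patch:

#include <stdio.h>

struct zone_stat {
	long nr_active, nr_inactive, free_pages;	/* zone-wide totals */
	long cls_active, cls_inactive;			/* this class's pages in the zone */
};

int main(void)
{
	long pg_limit = 20000;		/* class page limit */
	long tot_lru_pages = 100000;	/* stand-in for ckrm_tot_lru_pages */
	long active_credit = 0, inactive_credit = 0;

	struct zone_stat zones[2] = {
		{ 30000, 20000, 10000, 9000, 5000 },
		{ 20000, 15000,  5000, 2000, 1000 },
	};

	for (int i = 0; i < 2; i++) {
		struct zone_stat *z = &zones[i];
		long zone_total = z->nr_active + z->nr_inactive + z->free_pages;
		long zone_limit = pg_limit * zone_total / tot_lru_pages;
		long active_limit = (6 * zone_limit) / 10;	/* ~60% of the zone share */
		long inactive_limit = (3 * zone_limit) / 10;	/* ~30% of the zone share */

		long active_over = z->cls_active - active_limit + active_credit;
		long inactive_over = active_over +
			(z->cls_inactive - inactive_limit) + inactive_credit;

		long scan_active = 0, scan_inactive = 0;
		if (active_over > 0) {
			scan_active = active_over + 1;
			active_credit = 0;
		} else {
			active_credit += active_over;	/* carry surplus to the next zone */
		}
		if (inactive_over > 0) {
			scan_inactive = inactive_over;
			inactive_credit = 0;
		} else {
			inactive_credit += inactive_over;
		}
		printf("zone %d: scan_active=%ld scan_inactive=%ld\n",
		       i, scan_active, scan_inactive);
	}
	return 0;
}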
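Finally, the fork/exit hooks above keep every task sharing an mm on mm->tasklist, and ckrm_mem_evaluate_mm() rebinds the mm to the "largest" class among those peers. A simplified userspace sketch of that selection follows; the share_compare() rule, data layout, and names are made-up stand-ins, only the rebind-with-refcount flow mirrors the patch:

#include <stdio.h>
#include <stddef.h>

struct mem_class {
	const char *name;
	int pg_guar;	/* stand-in comparison key */
	int refcnt;	/* mem_class_get()/mem_class_put() */
};

struct task {
	struct mem_class *taskclass;
	struct task *next_peer;		/* models mm->tasklist / task->mm_peers */
};

struct mm {
	struct mem_class *memclass;
	struct task *tasklist;
};

/* crude stand-in for ckrm_mem_share_compare() */
static int share_compare(struct mem_class *a, struct mem_class *b)
{
	if (!a)
		return -1;
	if (!b)
		return 1;
	return a->pg_guar - b->pg_guar;
}

/* rough counterpart of ckrm_mem_evaluate_mm() */
static void evaluate_mm(struct mm *mm)
{
	struct mem_class *max = NULL;

	for (struct task *t = mm->tasklist; t; t = t->next_peer)
		if (share_compare(max, t->taskclass) < 0)
			max = t->taskclass;
	if (max && mm->memclass != max) {
		if (mm->memclass)
			mm->memclass->refcnt--;	/* mem_class_put() */
		mm->memclass = max;
		max->refcnt++;			/* mem_class_get() */
	}
}

int main(void)
{
	struct mem_class small = { "c-small", 100, 1 };
	struct mem_class big   = { "c-big",   500, 0 };
	struct task t1 = { &small, NULL };
	struct task t2 = { &big, &t1 };		/* two tasks share the mm */
	struct mm mm = { &small, &t2 };

	evaluate_mm(&mm);
	printf("mm charged to %s (refcnt %d)\n", mm.memclass->name, mm.memclass->refcnt);
	return 0;
}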