1 /* linux/drivers/block/ckrm_io.c : Block I/O Resource Controller for CKRM
3 * Copyright (C) Shailabh Nagar, IBM Corp. 2004
6 * Provides best-effort block I/O bandwidth control for CKRM
7 * This file provides the CKRM API. The underlying scheduler is a
8 * modified Complete-Fair Queueing (CFQ) iosched.
10 * Latest version, more details at http://ckrm.sf.net
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
22 * Third complete rewrite for CKRM's current API
26 #include <linux/module.h>
27 #include <linux/slab.h>
28 #include <linux/string.h>
29 #include <asm/errno.h>
30 #include <asm/div64.h>
31 #include <linux/list.h>
32 #include <linux/spinlock.h>
35 #include <linux/ckrm_tc.h>
36 #include <linux/ckrm-io.h>
38 /* sectorate == 512 byte sectors served in CFQ_EPOCH ns*/
40 /* CKI_ROOTSECTORATE needs to be made configurable from outside */
41 #define CKI_ROOTSECTORATE 100000
42 #define CKI_MINSECTORATE 100
44 #define CKI_IOUSAGE_UNIT 512
/* Per-class I/O accounting, kept once for the class subtree ("stats")
 * and once for the class alone ("mystats" — see cki_icls_t users below).
 * All counters are expressed in units of 'blksz' bytes.
 * NOTE(review): this listing is truncated — some comment terminators and
 * fields appear to be missing from the visible text. */
46 typedef struct ckrm_io_stats{
47 struct timeval epochstart ; /* all measurements relative to this
49 unsigned long blksz; /* size of bandwidth unit */
50 atomic_t blkrd; /* read units submitted to DD */
51 atomic_t blkwr; /* write units submitted to DD */
53 int nskip; /* # times q skipped */
54 unsigned long navsec; /* avg sectors serviced */
55 int timedout; /* # times gap > epoch */
56 u64 sec[2]; /* sectors serviced in
58 } cki_stats_t; /* per class I/O statistics */
61 * Currently local unit == CFQ I/O priority directly.
62 * CFQ ionice values have an implied bandwidth share so they
63 * can be added, subdivided etc. as long as the initial allocation
64 * of the systemwide default's total is set to the highest CFQ ionice
65 * value (== 100% of disk bandwidth)
/* Per-class controller state: position in the CKRM class hierarchy,
 * relative shares (protected by shares_lock), the absolute allocations
 * derived from them (cnt_*), and the cfqlim_t handed to the CFQ
 * iosched via cki_tsk_cfqpriv().
 * NOTE(review): the closing of this typedef (and some fields, e.g.
 * cnt_limit used elsewhere in this file) lies outside the visible text. */
68 typedef struct ckrm_io_class {
70 struct ckrm_core_class *core;
71 struct ckrm_core_class *parent;
73 struct ckrm_shares shares;
74 spinlock_t shares_lock; /* protect share changes */
76 /* Absolute shares of this class
80 cfqlim_t cfqpriv; /* Data common with cfq priolvl's */
83 int cnt_guarantee; /* Allocation as parent */
84 int cnt_unused; /* Allocation to default subclass */
87 /* Statistics, for class and default subclass */
94 /* Internal functions */
95 static inline void cki_reset_stats(cki_stats_t *usg);
96 static inline void init_icls_one(cki_icls_t *icls);
97 static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres);
99 /* External functions e.g. interface to ioscheduler */
100 void *cki_tsk_icls (struct task_struct *tsk);
101 int cki_tsk_ioprio (struct task_struct *tsk);
/* Implemented by the CFQ iosched: installs the three callbacks through
 * which CFQ queries a task's class, priority and cfqlim_t. */
103 extern void cki_cfq_set(icls_tsk_t tskicls, icls_ioprio_t tskioprio, icls_tsk_t tskcfqpriv);
105 /* CKRM Resource Controller API functions */
106 static void * cki_alloc(struct ckrm_core_class *this,
107 struct ckrm_core_class * parent);
108 static void cki_free(void *res);
109 static int cki_setshare(void *res, struct ckrm_shares * shares);
110 static int cki_getshare(void *res, struct ckrm_shares * shares);
111 static int cki_getstats(void *res, struct seq_file *);
112 static int cki_resetstats(void *res);
113 static int cki_showconfig(void *res, struct seq_file *sfile);
114 static int cki_setconfig(void *res, const char *cfgstr);
115 static void cki_chgcls(void *tsk, void *oldres, void *newres);
/* Controller descriptor registered with the CKRM core (defined below). */
118 struct ckrm_res_ctlr cki_rcbs;
/* Zero the read/write submission counters of one stats block.
 * epochstart and blksz are not touched here (see init_icls_stats). */
120 static inline void cki_reset_stats(cki_stats_t *stats)
123 atomic_set(&stats->blkrd,0);
124 atomic_set(&stats->blkwr,0);
/* (Re)start statistics for a class: stamp both stats blocks with the
 * current time as the new epoch, set the accounting unit, and clear
 * the submission counters for the subtree and class-only views. */
128 static inline void init_icls_stats(cki_icls_t *icls)
132 do_gettimeofday(&tv);
133 icls->stats.epochstart = icls->mystats.epochstart = tv;
134 icls->stats.blksz = icls->mystats.blksz = CKI_IOUSAGE_UNIT;
135 cki_reset_stats(&icls->stats);
136 cki_reset_stats(&icls->mystats);
139 /* Initialize icls to default values
140 * No other classes touched, locks not reinitialized.
143 static inline void init_icls_one(cki_icls_t *icls)
145 /* Zero initial guarantee for scalable creation of
148 /* Try out a new set */
/* Relative shares start as "don't care"; the child pool defaults to the
 * standard total with everything still unused. */
150 icls->shares.my_guarantee = CKRM_SHARE_DONTCARE;
151 icls->shares.my_limit = CKRM_SHARE_DONTCARE;
152 icls->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
153 icls->shares.max_limit = CKRM_SHARE_DFLT_MAX_LIMIT;
154 icls->shares.unused_guarantee = icls->shares.total_guarantee;
155 icls->shares.cur_max_limit = 0;
/* Absolute allocations are unknown until cki_recalc_propagate runs. */
157 icls->cnt_guarantee = CKRM_SHARE_DONTCARE;
158 icls->cnt_unused = CKRM_SHARE_DONTCARE;
159 icls->cnt_limit = CKRM_SHARE_DONTCARE;
161 init_icls_stats(icls);
164 /* Recalculate absolute shares from relative
165 * Caller should hold a lock on icls
/*
 * Derives res's absolute allocations (cnt_guarantee, cnt_limit,
 * cnt_unused) by scaling its relative shares against the parent's
 * absolute values, updates the sectorate fed to CFQ, then recurses
 * depth-first over the children with each child's shares_lock held.
 * 64-bit intermediates with do_div() avoid overflow in share * count.
 */
168 static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres)
171 ckrm_core_class_t *child = NULL;
172 cki_icls_t *childres;
173 int resid = cki_rcbs.resid;
177 struct ckrm_shares *par = &parres->shares;
178 struct ckrm_shares *self = &res->shares;
/* Guarantee: DONTCARE propagates down; otherwise scale my_guarantee
 * by the parent's absolute guarantee over its total pool. */
181 if (parres->cnt_guarantee == CKRM_SHARE_DONTCARE) {
182 res->cnt_guarantee = CKRM_SHARE_DONTCARE;
183 } else if (par->total_guarantee) {
184 temp = (u64) self->my_guarantee *
185 parres->cnt_guarantee;
186 do_div(temp, par->total_guarantee);
187 res->cnt_guarantee = (int) temp;
189 res->cnt_guarantee = 0;
/* Limit: an unlimited parent clamps the child's sectorate to the
 * floor; otherwise scale my_limit against the parent's max pool. */
193 if (parres->cnt_limit == CKRM_SHARE_DONTCARE) {
194 res->cnt_limit = CKRM_SHARE_DONTCARE;
195 atomic_set(&res->cfqpriv.sectorate,CKI_MINSECTORATE);
197 if (par->max_limit) {
198 temp = (u64) self->my_limit *
200 do_div(temp, par->max_limit);
201 res->cnt_limit = (int) temp;
205 atomic_set(&res->cfqpriv.sectorate,res->cnt_limit);
/* Unused: the portion of this class's guarantee not handed out to
 * named children, i.e. what the default subclass gets. */
208 if (res->cnt_guarantee == CKRM_SHARE_DONTCARE) {
209 res->cnt_unused = CKRM_SHARE_DONTCARE;
211 if (self->total_guarantee) {
212 temp = (u64) self->unused_guarantee *
214 do_div(temp, self->total_guarantee);
215 res->cnt_unused = (int) temp;
223 // propagate to children
224 ckrm_lock_hier(res->core);
225 while ((child = ckrm_get_next_child(res->core,child)) != NULL){
226 childres = ckrm_get_res_class(child, resid,
229 spin_lock(&childres->shares_lock);
230 cki_recalc_propagate(childres, res);
231 spin_unlock(&childres->shares_lock);
233 ckrm_unlock_hier(res->core);
/* CFQ callback: map a task to its I/O resource class via the task's
 * taskclass core. Returned as void* per the icls_tsk_t interface. */
236 void *cki_tsk_icls(struct task_struct *tsk)
238 return (void *) ckrm_get_res_class(class_core(tsk->taskclass),
239 cki_rcbs.resid, cki_icls_t);
/* CFQ callback: per-task I/O priority.
 * NOTE(review): the return statement is outside the visible text;
 * per the comment below, I/O priorities are currently unused. */
242 int cki_tsk_ioprio(struct task_struct *tsk)
244 /* Don't use I/O priorities for now */
/* CFQ callback: return the cfqlim_t shared between this controller and
 * the CFQ iosched for the task's class (carries e.g. sectorate). */
248 void *cki_tsk_cfqpriv(struct task_struct *tsk)
250 cki_icls_t *icls = ckrm_get_res_class(class_core(tsk->taskclass),
251 cki_rcbs.resid, cki_icls_t);
252 return (void *)&(icls->cfqpriv);
/*
 * CKRM res_alloc callback: create controller state for a new class.
 * GFP_ATOMIC because this can be called from a non-sleepable context.
 * The root class (parent == NULL) is seeded with the full configured
 * sectorate; child allocations are derived later via share propagation.
 * Takes a module reference so the controller can't unload while classes
 * exist. Returns the new cki_icls_t (NULL return path on allocation
 * failure lies in lines outside the visible text).
 */
256 static void *cki_alloc(struct ckrm_core_class *core,
257 struct ckrm_core_class *parent)
261 icls = kmalloc(sizeof(cki_icls_t), GFP_ATOMIC);
263 printk(KERN_ERR "cki_res_alloc failed GFP_ATOMIC\n");
267 memset(icls, 0, sizeof(cki_icls_t));
269 icls->parent = parent;
270 icls->shares_lock = SPIN_LOCK_UNLOCKED;
274 if (parent == NULL) {
275 icls->cnt_guarantee = CKI_ROOTSECTORATE;
276 icls->cnt_unused = CKI_ROOTSECTORATE;
277 icls->cnt_limit = CKI_ROOTSECTORATE;
278 atomic_set(&(icls->cfqpriv.sectorate),icls->cnt_limit);
280 try_module_get(THIS_MODULE);
/*
 * CKRM res_free callback: tear down a class's controller state.
 * Returns the class's guarantee to the parent's unused pool, rescans
 * the remaining siblings to recompute the parent's cur_max_limit, and
 * drops the module reference taken in cki_alloc. The kfree of icls
 * itself lies outside the visible text.
 */
284 static void cki_free(void *res)
286 cki_icls_t *icls = res, *parres, *childres;
287 ckrm_core_class_t *child = NULL;
288 int maxlimit, resid = cki_rcbs.resid;
294 /* Deallocate CFQ queues */
296 /* Currently CFQ queues are deallocated when empty. Since no task
297 * should belong to this icls, no new requests will get added to the
300 * When CFQ switches to persistent queues, call its "put" function
301 * so it gets deallocated after the last pending request is serviced.
305 parres = ckrm_get_res_class(icls->parent, resid, cki_icls_t);
307 printk(KERN_ERR "cki_free: error getting "
308 "resclass from core \n");
312 /* Update parent's shares */
313 spin_lock(&parres->shares_lock);
/* Give back our guarantee and fold it into the parent's default pool. */
315 child_guarantee_changed(&parres->shares, icls->shares.my_guarantee, 0);
316 parres->cnt_unused += icls->cnt_guarantee;
318 // run thru parent's children and get the new max_limit of the parent
319 ckrm_lock_hier(parres->core);
321 while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
322 childres = ckrm_get_res_class(child, resid, cki_icls_t);
323 if (maxlimit < childres->shares.my_limit) {
324 maxlimit = childres->shares.my_limit;
327 ckrm_unlock_hier(parres->core);
328 if (parres->shares.cur_max_limit < maxlimit) {
329 parres->shares.cur_max_limit = maxlimit;
331 spin_unlock(&parres->shares_lock);
334 module_put(THIS_MODULE);
/*
 * CKRM set_share_values callback: install new relative shares on a
 * class, then refresh the parent's unused pool and recursively
 * recompute absolute allocations down the subtree.
 * Lock order is parent->shares_lock before child->shares_lock; the
 * second spin_lock of icls->shares_lock below belongs to the
 * parentless branch (the intervening else lies outside the visible
 * text), not to a double acquisition.
 * Returns 0 on success or a negative errno (-EINVAL on bad input).
 */
339 static int cki_setshare(void *res, struct ckrm_shares *new)
341 cki_icls_t *icls = res, *parres;
342 struct ckrm_shares *cur, *par;
343 int rc = -EINVAL, resid = cki_rcbs.resid;
351 ckrm_get_res_class(icls->parent, resid, cki_icls_t);
353 pr_debug("cki_setshare: invalid resclass\n");
356 spin_lock(&parres->shares_lock);
357 spin_lock(&icls->shares_lock);
358 par = &parres->shares;
360 spin_lock(&icls->shares_lock);
365 rc = set_shares(new, cur, par);
/* On success, recompute how much of the parent's guarantee remains
 * for its default subclass, then propagate absolutes downward. */
367 if ((!rc) && parres) {
368 if (parres->cnt_guarantee == CKRM_SHARE_DONTCARE) {
369 parres->cnt_unused = CKRM_SHARE_DONTCARE;
370 } else if (par->total_guarantee) {
371 u64 temp = (u64) par->unused_guarantee *
372 parres->cnt_guarantee;
373 do_div(temp, par->total_guarantee);
374 parres->cnt_unused = (int) temp;
376 parres->cnt_unused = 0;
378 cki_recalc_propagate(res, parres);
380 spin_unlock(&icls->shares_lock);
382 spin_unlock(&parres->shares_lock);
/* CKRM get_share_values callback: copy out the class's current
 * relative shares by struct assignment. */
387 static int cki_getshare(void *res, struct ckrm_shares * shares)
389 cki_icls_t *icls = res;
393 *shares = icls->shares;
/* CKRM get_stats callback: dump the class's absolute limit and the
 * CFQ-side counters (queue skips, epoch timeouts, average sectors,
 * current sectorate, per-epoch sector tallies) to the seq_file. */
397 static int cki_getstats(void *res, struct seq_file *sfile)
399 cki_icls_t *icls = res;
404 seq_printf(sfile, "abs limit %d\n",icls->cnt_limit);
405 seq_printf(sfile, "skip %d timdout %d avsec %lu rate %ld "
406 " sec0 %ld sec1 %ld\n",
408 icls->cfqpriv.timedout,
409 icls->cfqpriv.navsec,
410 atomic_read(&(icls->cfqpriv.sectorate)),
411 (unsigned long)icls->cfqpriv.sec[0],
412 (unsigned long)icls->cfqpriv.sec[1]);
/* CKRM reset_stats callback: restart accounting from a fresh epoch. */
417 static int cki_resetstats(void *res)
419 cki_icls_t *icls = res;
424 init_icls_stats(icls);
/* CKRM show_config callback.
 * NOTE(review): body outside the visible text; presumably a stub since
 * this controller exposes no per-class config — confirm. */
428 static int cki_showconfig(void *res, struct seq_file *sfile)
/* CKRM set_config callback.
 * NOTE(review): body outside the visible text; presumably a stub since
 * this controller exposes no per-class config — confirm. */
433 static int cki_setconfig(void *res, const char *cfgstr)
/* CKRM change_resclass callback: invoked when a task moves between
 * classes. Intentionally empty — see the rationale below. */
438 static void cki_chgcls(void *tsk, void *oldres, void *newres)
440 /* cki_icls_t *oldicls = oldres, *newicls = newres; */
442 /* Nothing needs to be done
443 * Future requests from task will go to the new class's CFQ q
444 * Old ones will continue to get satisfied from the original q
446 * Once CFQ moves to a persistent queue model and if refcounts on
447 * icls's CFQ queues are used, a decrement op would be needed here
/* Controller descriptor handed to ckrm_register_res_ctlr() in
 * cki_init(); wires the CKRM core to the callbacks defined above. */
455 struct ckrm_res_ctlr cki_rcbs = {
459 .res_alloc = cki_alloc,
460 .res_free = cki_free,
461 .set_share_values = cki_setshare,
462 .get_share_values = cki_getshare,
463 .get_stats = cki_getstats,
464 .reset_stats = cki_resetstats,
465 .show_config = cki_showconfig,
466 .set_config = cki_setconfig,
467 .change_resclass = cki_chgcls,
/*
 * Module init: locate the "taskclass" classtype, register this
 * controller with the CKRM core, and install the task->class lookup
 * callbacks into the CFQ iosched via cki_cfq_set().
 * Error-return paths lie partly outside the visible text.
 */
472 int __init cki_init(void)
474 struct ckrm_classtype *clstype;
475 int resid = cki_rcbs.resid;
477 clstype = ckrm_find_classtype_by_name("taskclass");
478 if (clstype == NULL) {
479 printk(KERN_INFO "init_cki: classtype<taskclass> not found\n");
484 resid = ckrm_register_res_ctlr(clstype, &cki_rcbs);
486 cki_rcbs.classtype = clstype;
487 cki_cfq_set(cki_tsk_icls,cki_tsk_ioprio,cki_tsk_cfqpriv);
/* Module exit: undo cki_init — deregister from the CKRM core and
 * detach the CFQ callbacks by passing NULLs. */
494 void __exit cki_exit(void)
496 ckrm_unregister_res_ctlr(&cki_rcbs);
498 cki_rcbs.classtype = NULL;
499 cki_cfq_set(NULL,NULL,NULL);
502 module_init(cki_init)
503 module_exit(cki_exit)
505 MODULE_AUTHOR("Shailabh Nagar <nagar@watson.ibm.com>");
506 MODULE_DESCRIPTION("CKRM Disk I/O Resource Controller");
507 MODULE_LICENSE("GPL");