/* ckrm_tc.c - Class-based Kernel Resource Management (CKRM)
 *
 * Copyright (C) Hubertus Franke, IBM Corp. 2003,2004
 * (C) Shailabh Nagar, IBM Corp. 2003
 * (C) Chandra Seetharaman, IBM Corp. 2003
 * (C) Vivek Kashyap, IBM Corp. 2004
 *
 * Provides the kernel API of CKRM for in-kernel, per-resource controllers
 * (one each for cpu, memory, io, network) and callbacks for
 * classification modules.
 *
 * Latest version, more details at http://ckrm.sf.net
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * Changes
 *
 * Made modifications to suit the new RBCE module.
 * Fixed a bug in the fork and exit callbacks. Added callbacks_active and
 * surrounding logic. Added a task parameter to all CE callbacks.
 * Moved to reference-counted class objects and correct locking.
 * Introduced and adopted the emerging classtype interface.
 */
#include <linux/config.h>
#include <linux/init.h>
#include <linux/linkage.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <asm/uaccess.h>
#include <asm/errno.h>
#include <linux/string.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/module.h>

#include <linux/ckrm_rc.h>
#include <linux/ckrm_tc.h>
#define TC_DEBUG(fmt, args...) do { \
	/* printk("%s: " fmt, __FUNCTION__ , ## args); */ } while (0)
static struct ckrm_task_class taskclass_dflt_class = {
};

const char *dflt_taskclass_name = TASK_CLASS_TYPE_NAME;
static struct ckrm_core_class *ckrm_alloc_task_class(struct ckrm_core_class
						     *parent, const char *name);
static int ckrm_free_task_class(struct ckrm_core_class *core);

static int tc_forced_reclassify(ckrm_core_class_t *target, const char *obj);
static int tc_show_members(struct ckrm_core_class *core, struct seq_file *seq);
static void tc_add_resctrl(struct ckrm_core_class *core, int resid);
struct ckrm_classtype CT_taskclass = {
	.name = TASK_CLASS_TYPE_NAME,
	.typeID = CKRM_CLASSTYPE_TASK_CLASS,
	.maxdepth = 3,			// Hubertus .. just to start
	.resid_reserved = 4,		// Hubertus .. reservation
	.max_res_ctlrs = CKRM_MAX_RES_CTLRS,

	.res_ctlrs_lock = SPIN_LOCK_UNLOCKED,
	.classes = LIST_HEAD_INIT(CT_taskclass.classes),

	.default_class = &taskclass_dflt_class.core,

	// private version of functions
	.alloc = &ckrm_alloc_task_class,
	.free = &ckrm_free_task_class,
	.show_members = &tc_show_members,
	.forced_reclassify = &tc_forced_reclassify,

	// use of default functions
	.show_shares = &ckrm_class_show_shares,
	.show_stats = &ckrm_class_show_stats,
	.show_config = &ckrm_class_show_config,
	.set_config = &ckrm_class_set_config,
	.set_shares = &ckrm_class_set_shares,
	.reset_stats = &ckrm_class_reset_stats,

	// mandatory private version .. no dflt available
	.add_resctrl = &tc_add_resctrl,
};
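/*
 * Illustrative sketch (compiled out): how a per-resource controller might
 * attach to this classtype.  Only the ->change_resclass() hook is taken from
 * the code in this file (it is what ckrm_set_taskclass()/tc_add_resctrl()
 * invoke); the exact field names of struct ckrm_res_ctlr, the callback
 * prototype, and the registration helper ckrm_register_res_ctlr() are
 * assumptions based on ckrm_rc.h and may differ.
 */
#if 0
static void my_change_resclass(void *obj, void *old_res, void *new_res)
{
	/* migrate the task's accounting from old_res to new_res */
}

static struct ckrm_res_ctlr my_rcbs = {
	.res_name        = "myres",		/* assumed field name */
	.change_resclass = my_change_resclass,
};

static int __init my_ctlr_init(void)
{
	/* registration helper name/signature assumed, see ckrm_rc.h */
	return ckrm_register_res_ctlr(&CT_taskclass, &my_rcbs);
}
#endif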
/**************************************************************************
 **************************************************************************/
static inline void ckrm_init_task_lock(struct task_struct *tsk)
{
	tsk->ckrm_tsklock = SPIN_LOCK_UNLOCKED;
}

// Hubertus .. the following functions should move to ckrm_rc.h

static inline void ckrm_task_lock(struct task_struct *tsk)
{
	spin_lock(&tsk->ckrm_tsklock);
}

static inline void ckrm_task_unlock(struct task_struct *tsk)
{
	spin_unlock(&tsk->ckrm_tsklock);
}
/*
 * Change the task class of the given task.
 *
 * Change the task's task class to "newcls" if the task's current
 * class (task->taskclass) is the same as the given "oldcls", if oldcls is
 * non-NULL.
 *
 * The caller is responsible for making sure the task structure stays put
 * throughout this call.
 *
 * This function should be called with the following locks NOT held:
 * - tsk->ckrm_task_lock
 * - core->ckrm_lock, if core is NULL then ckrm_dflt_class.ckrm_lock
 * - tsk->taskclass->ckrm_lock
 *
 * The function is also called with a ckrm_core_grab on the new core, hence
 * that grab needs to be dropped if no assignment takes place.
 */
static void
ckrm_set_taskclass(struct task_struct *tsk, ckrm_task_class_t *newcls,
		   ckrm_task_class_t *oldcls, enum ckrm_event event)
{
	int i;
	ckrm_classtype_t *clstype;
	ckrm_res_ctlr_t *rcbs;
	ckrm_task_class_t *curcls;
	void *old_res_class, *new_res_class;
	int drop_old_cls;

	ckrm_task_lock(tsk);
	curcls = tsk->taskclass;

	if ((void *)-1 == curcls) {
		// task is disassociated from ckrm... don't bother it.
		ckrm_task_unlock(tsk);
		ckrm_core_drop(class_core(newcls));
		return;
	}

	if ((curcls == NULL) && (newcls == (void *)-1)) {
		// task needs to be disassociated from ckrm and has no curcls;
		// just disassociate and return.
		tsk->taskclass = newcls;
		ckrm_task_unlock(tsk);
		return;
	}
	// check whether the compare_and_exchange should succeed
	if (oldcls && (oldcls != curcls)) {
		ckrm_task_unlock(tsk);
		/* compensate for previous grab */
		TC_DEBUG("(%s:%d): Race-condition caught <%s> %d\n",
			 tsk->comm, tsk->pid, class_core(newcls)->name,
			 event);
		ckrm_core_drop(class_core(newcls));
		return;
	}

	// make sure we have a real destination core
	if (newcls == NULL) {
		newcls = &taskclass_dflt_class;
		ckrm_core_grab(class_core(newcls));
	}
	// take the task out of the old class and
	// remember that we need to drop the old core
	if ((drop_old_cls = (curcls != NULL))) {
		class_lock(class_core(curcls));
		if (newcls == curcls) {
			// we are already in the destination class.
			// we still need to drop the old core
			class_unlock(class_core(curcls));
			ckrm_task_unlock(tsk);

		list_del(&tsk->taskclass_link);
		INIT_LIST_HEAD(&tsk->taskclass_link);
		tsk->taskclass = NULL;
		class_unlock(class_core(curcls));
		if (newcls == (void *)-1) {
			tsk->taskclass = newcls;
			ckrm_task_unlock(tsk);
			// still need to get out of the old class

	// put the task into the new class
	class_lock(class_core(newcls));
	tsk->taskclass = newcls;
	list_add(&tsk->taskclass_link, &class_core(newcls)->objlist);
	class_unlock(class_core(newcls));

	if (newcls == curcls) {
		ckrm_task_unlock(tsk);

	CE_NOTIFY(&CT_taskclass, event, newcls, tsk);

	ckrm_task_unlock(tsk);
	clstype = &CT_taskclass;
	if (clstype->bit_res_ctlrs) {
		// avoid running through the entire list if none is registered
		for (i = 0; i < clstype->max_resid; i++) {
			if (clstype->res_ctlrs[i] == NULL)
				continue;
			atomic_inc(&clstype->nr_resusers[i]);
			old_res_class =
			    curcls ? class_core(curcls)->res_class[i] : NULL;
			new_res_class =
			    newcls ? class_core(newcls)->res_class[i] : NULL;
			rcbs = clstype->res_ctlrs[i];
			if (rcbs && rcbs->change_resclass
			    && (old_res_class != new_res_class))
				(*rcbs->change_resclass) (tsk, old_res_class,
							  new_res_class);
			atomic_dec(&clstype->nr_resusers[i]);
		}
	}

	if (drop_old_cls)
		ckrm_core_drop(class_core(curcls));
}
// HF SUGGEST: we could macro-tize this for other types
//    DEF_FUNC_ADD_RESCTRL(funcname, link)
// which would yield DEF_FUNC_ADD_RESCTRL(tc_add_resctrl, taskclass_link)
static void tc_add_resctrl(struct ckrm_core_class *core, int resid)
{
	struct task_struct *tsk;
	struct ckrm_res_ctlr *rcbs;

	if ((resid < 0) || (resid >= CKRM_MAX_RES_CTLRS)
	    || ((rcbs = core->classtype->res_ctlrs[resid]) == NULL))
		return;

	class_lock(core);
	list_for_each_entry(tsk, &core->objlist, taskclass_link) {
		if (rcbs->change_resclass)
			(*rcbs->change_resclass) (tsk, (void *)-1,
						  core->res_class[resid]);
	}
	class_unlock(core);
}
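/*
 * Sketch of the DEF_FUNC_ADD_RESCTRL() macro suggested in the comment above
 * (hypothetical, compiled out, not used anywhere): it would generate a
 * per-classtype variant of tc_add_resctrl() that differs only in the list
 * link member being walked.  A fully generic version would also have to
 * parameterize the object type; struct task_struct is hard-coded here.
 */
#if 0
#define DEF_FUNC_ADD_RESCTRL(funcname, link)				\
static void funcname(struct ckrm_core_class *core, int resid)		\
{									\
	struct task_struct *obj;					\
	struct ckrm_res_ctlr *rcbs;					\
									\
	if ((resid < 0) || (resid >= CKRM_MAX_RES_CTLRS)		\
	    || ((rcbs = core->classtype->res_ctlrs[resid]) == NULL))	\
		return;							\
									\
	class_lock(core);						\
	list_for_each_entry(obj, &core->objlist, link) {		\
		if (rcbs->change_resclass)				\
			(*rcbs->change_resclass) (obj, (void *)-1,	\
						  core->res_class[resid]); \
	}								\
	class_unlock(core);						\
}

/* the function above would then simply be: */
DEF_FUNC_ADD_RESCTRL(tc_add_resctrl, taskclass_link)
#endif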
/**************************************************************************
 *              Functions called from classification points              *
 **************************************************************************/

#define ECB_PRINTK(fmt, args...) do { \
	/* if (CT_taskclass.ce_regd) \
		printk("%s: " fmt, __FUNCTION__ , ## args); */ } while (0)
#define CE_CLASSIFY_TASK(event, tsk) \
do { \
	struct ckrm_task_class *newcls = NULL; \
	struct ckrm_task_class *oldcls = tsk->taskclass; \
 \
	CE_CLASSIFY_RET(newcls, &CT_taskclass, event, tsk); \
	if (newcls) { \
		/* called synchronously. no need to get task struct */ \
		ckrm_set_taskclass(tsk, newcls, oldcls, event); \
	} \
} while (0)

#define CE_CLASSIFY_TASK_PROTECT(event, tsk) \
do { \
	ce_protect(&CT_taskclass); \
	CE_CLASSIFY_TASK(event, tsk); \
	ce_release(&CT_taskclass); \
} while (0)
static void cb_taskclass_newtask(struct task_struct *tsk)
{
	tsk->taskclass = NULL;
	INIT_LIST_HEAD(&tsk->taskclass_link);
}

static void cb_taskclass_fork(struct task_struct *tsk)
{
	struct ckrm_task_class *cls = NULL;

	ECB_PRINTK("%p:%d:%s\n", tsk, tsk->pid, tsk->comm);

	ce_protect(&CT_taskclass);
	CE_CLASSIFY_RET(cls, &CT_taskclass, CKRM_EVENT_FORK, tsk);
	if (cls == NULL) {
		// no class from the classification engine: inherit the parent's
		ckrm_task_lock(tsk->parent);
		cls = tsk->parent->taskclass;
		ckrm_core_grab(class_core(cls));
		ckrm_task_unlock(tsk->parent);
	}
	if (!list_empty(&tsk->taskclass_link))
		printk(KERN_WARNING "BUG in cb_fork.. tsk (%s:%d) already linked\n",
		       tsk->comm, tsk->pid);

	ckrm_set_taskclass(tsk, cls, NULL, CKRM_EVENT_FORK);
	ce_release(&CT_taskclass);
}
static void cb_taskclass_exit(struct task_struct *tsk)
{
	CE_CLASSIFY_NORET(&CT_taskclass, CKRM_EVENT_EXIT, tsk);
	ckrm_set_taskclass(tsk, (void *)-1, NULL, CKRM_EVENT_EXIT);
}

static void cb_taskclass_exec(const char *filename)
{
	ECB_PRINTK("%p:%d:%s <%s>\n", current, current->pid, current->comm,
		   filename);
	CE_CLASSIFY_TASK_PROTECT(CKRM_EVENT_EXEC, current);
}

static void cb_taskclass_uid(void)
{
	ECB_PRINTK("%p:%d:%s\n", current, current->pid, current->comm);
	CE_CLASSIFY_TASK_PROTECT(CKRM_EVENT_UID, current);
}

static void cb_taskclass_gid(void)
{
	ECB_PRINTK("%p:%d:%s\n", current, current->pid, current->comm);
	CE_CLASSIFY_TASK_PROTECT(CKRM_EVENT_GID, current);
}

static void cb_taskclass_xid(struct task_struct *tsk)
{
	ECB_PRINTK("%p:%d:%s\n", current, current->pid, current->comm);
	CE_CLASSIFY_TASK_PROTECT(CKRM_EVENT_XID, tsk);
}
static struct ckrm_event_spec taskclass_events_callbacks[] = {
	CKRM_EVENT_SPEC(NEWTASK, cb_taskclass_newtask),
	CKRM_EVENT_SPEC(EXEC, cb_taskclass_exec),
	CKRM_EVENT_SPEC(FORK, cb_taskclass_fork),
	CKRM_EVENT_SPEC(EXIT, cb_taskclass_exit),
	CKRM_EVENT_SPEC(UID, cb_taskclass_uid),
	CKRM_EVENT_SPEC(GID, cb_taskclass_gid),
	CKRM_EVENT_SPEC(XID, cb_taskclass_xid),
};
/***********************************************************************
 *
 * Asynchronous callback functions (driven by RCFS)
 *
 * Async functions force a setting of the task structure;
 * synchronous callbacks are protected against race conditions
 * by using a cmpxchg on the core before setting it.
 * Async calls need to be serialized to ensure they can't
 * race against each other.
 *
 ***********************************************************************/

DECLARE_MUTEX(async_serializer);	// serialize all async functions
/*
 * Go through the task list and reclassify all tasks according to the current
 * classification rules.
 *
 * We have the problem that we can not hold any lock (including the
 * tasklist_lock) while classifying. Two methods are possible:
 *
 * (a) go through the entire pid range (0..pid_max) and, if a task exists at
 *     that pid, reclassify it
 * (b) go several times through the task list and build a bitmap for a
 *     particular subrange of pids; otherwise the memory requirements might
 *     be too large.
 *
 * We use a hybrid of the two, chosen by comparing the ratio
 * pid_max / nr_threads.
 */
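/*
 * Worked example of the hybrid decision below (illustrative numbers only,
 * ignoring the PID_MAX_DEFAULT adjustment): with pid_max = 32768 and
 * nr_threads = 300, ratio = 32768 / 300 = 109 >= 2, so the bitmap walk is
 * used; with nr_threads = 20000, ratio = 1 < 2 and every pid in
 * 0..pid_max is simply probed with find_task_by_pid().
 */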
static int ckrm_reclassify_all_tasks(void)
{
	struct task_struct *proc, *thread;
	int i;
	int curpidmax = pid_max;
	int ratio;
	int use_bitmap;

	/* Check permissions */
	if ((!capable(CAP_SYS_NICE)) && (!capable(CAP_SYS_RESOURCE))) {
		return -EPERM;
	}

	ratio = curpidmax / nr_threads;
	if (curpidmax <= PID_MAX_DEFAULT) {
	use_bitmap = (ratio >= 2);

	ce_protect(&CT_taskclass);
	if (use_bitmap == 0) {
		// go through it in one walk
		read_lock(&tasklist_lock);
		for (i = 0; i < curpidmax; i++) {
			if ((thread = find_task_by_pid(i)) == NULL)
				continue;
			get_task_struct(thread);
			read_unlock(&tasklist_lock);
			CE_CLASSIFY_TASK(CKRM_EVENT_RECLASSIFY, thread);
			put_task_struct(thread);
			read_lock(&tasklist_lock);
		}
		read_unlock(&tasklist_lock);
	} else {
		unsigned long *bitmap;
		int order, bitmapsize, num_loops;
		int pid, pos, num_found, do_next;

		bitmap = (unsigned long *)__get_free_pages(GFP_KERNEL, order);
		if (bitmap == NULL) {
			ce_release(&CT_taskclass);
			return -ENOMEM;
		}

		bitmapsize = 8 * (1 << (order + PAGE_SHIFT));
		num_loops = (curpidmax + bitmapsize - 1) / bitmapsize;
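		/*
		 * Example of the sizing arithmetic above (assuming order = 0
		 * and PAGE_SHIFT = 12): one page holds 8 * 4096 = 32768 bits,
		 * so bitmapsize = 32768 and a single pass of the loop below
		 * covers pids 0..32767; larger pid_max values simply raise
		 * num_loops.
		 */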
		do_next = 1;
		for (i = 0; i < num_loops && do_next; i++) {
			int pid_start = i * bitmapsize;
			int pid_end = pid_start + bitmapsize;

			num_found = 0;
			do_next = 0;
			memset(bitmap, 0, bitmapsize / 8);	// start afresh

			read_lock(&tasklist_lock);
			do_each_thread(proc, thread) {
				pid = thread->pid;
				if ((pid < pid_start) || (pid >= pid_end)) {
					if (pid >= pid_end) {
						// pids beyond this window
						// exist; do another pass
						do_next = 1;
					}
					continue;
				}
				set_bit(pid, bitmap);
				num_found++;
			} while_each_thread(proc, thread);
			read_unlock(&tasklist_lock);

			pos = 0;
			for (; num_found--;) {
				pos = find_next_bit(bitmap, bitmapsize, pos);
				pid = pos + pid_start;

				read_lock(&tasklist_lock);
				if ((thread = find_task_by_pid(pid)) != NULL) {
					get_task_struct(thread);
					read_unlock(&tasklist_lock);
					CE_CLASSIFY_TASK(CKRM_EVENT_RECLASSIFY,
							 thread);
					put_task_struct(thread);
				} else {
					read_unlock(&tasklist_lock);
				}
				pos++;
			}
		}
		free_pages((unsigned long)bitmap, order);
	}
	ce_release(&CT_taskclass);
/*
 * Reclassify all tasks in the given core class.
 */
static void ckrm_reclassify_class_tasks(struct ckrm_task_class *cls)
{
	int ce_regd;
	struct ckrm_hnode *cnode;
	struct ckrm_task_class *parcls;
	int num = 0;

	if (!ckrm_validate_and_grab_core(&cls->core))
		return;

	down(&async_serializer);	// protect against race conditions
	TC_DEBUG("start %p:%s:%d:%d\n", cls, cls->core.name,
		 atomic_read(&cls->core.refcnt),
		 atomic_read(&cls->core.hnode.parent->refcnt));
	// If no CE is registered for this classtype, the following will be needed
	ce_regd = atomic_read(&class_core(cls)->classtype->ce_regd);
	cnode = &(class_core(cls)->hnode);
	parcls = class_type(ckrm_task_class_t, cnode->parent);

	class_lock(class_core(cls));
	if (!list_empty(&class_core(cls)->objlist)) {
		struct ckrm_task_class *newcls = NULL;
		struct task_struct *tsk =
		    list_entry(class_core(cls)->objlist.next,
			       struct task_struct, taskclass_link);

		get_task_struct(tsk);
		class_unlock(class_core(cls));

		if (ce_regd) {
			CE_CLASSIFY_RET(newcls, &CT_taskclass,
					CKRM_EVENT_RECLASSIFY, tsk);
			// don't allow reclassifying to the same class
			// as we are in the process of cleaning up
			if (newcls == cls) {
				// compensate for CE's grab
				ckrm_core_drop(class_core(newcls));
				newcls = NULL;
			}
		}
		if (newcls == NULL) {
			newcls = parcls;
			ckrm_core_grab(class_core(newcls));
		}
		ckrm_set_taskclass(tsk, newcls, cls, CKRM_EVENT_RECLASSIFY);
		put_task_struct(tsk);
	}
	TC_DEBUG("stop %p:%s:%d:%d %d\n", cls, cls->core.name,
		 atomic_read(&cls->core.refcnt),
		 atomic_read(&cls->core.hnode.parent->refcnt), num);
	class_unlock(class_core(cls));
	ckrm_core_drop(class_core(cls));

	up(&async_serializer);
}
/*
 * Change the core class of the given task.
 */
int ckrm_forced_reclassify_pid(pid_t pid, struct ckrm_task_class *cls)
{
	struct task_struct *tsk;

	if (cls && !ckrm_validate_and_grab_core(class_core(cls)))
		return -EINVAL;

	read_lock(&tasklist_lock);
	if ((tsk = find_task_by_pid(pid)) == NULL) {
		read_unlock(&tasklist_lock);
		if (cls)
			ckrm_core_drop(class_core(cls));
		return -ESRCH;
	}
	get_task_struct(tsk);
	read_unlock(&tasklist_lock);

	/* Check permissions */
	if ((!capable(CAP_SYS_NICE)) &&
	    (!capable(CAP_SYS_RESOURCE)) && (current->user != tsk->user)) {
		if (cls)
			ckrm_core_drop(class_core(cls));
		put_task_struct(tsk);
		return -EPERM;
	}

	ce_protect(&CT_taskclass);
	if (cls == NULL)
		CE_CLASSIFY_TASK(CKRM_EVENT_RECLASSIFY, tsk);
	else
		ckrm_set_taskclass(tsk, cls, NULL, CKRM_EVENT_MANUAL);

	ce_release(&CT_taskclass);
	put_task_struct(tsk);
	return 0;
}
static struct ckrm_core_class *ckrm_alloc_task_class(struct ckrm_core_class
						     *parent, const char *name)
{
	struct ckrm_task_class *taskcls;

	taskcls = kmalloc(sizeof(struct ckrm_task_class), GFP_KERNEL);
	if (taskcls == NULL)
		return NULL;
	memset(taskcls, 0, sizeof(struct ckrm_task_class));

	ckrm_init_core_class(&CT_taskclass, class_core(taskcls), parent, name);

	ce_protect(&CT_taskclass);
	if (CT_taskclass.ce_cb_active && CT_taskclass.ce_callbacks.class_add)
		(*CT_taskclass.ce_callbacks.class_add) (name, taskcls,
							CT_taskclass.typeID);
	ce_release(&CT_taskclass);

	return class_core(taskcls);
}
static int ckrm_free_task_class(struct ckrm_core_class *core)
{
	struct ckrm_task_class *taskcls;

	if (!ckrm_is_core_valid(core)) {
		return -EINVAL;
	}
	if (core == core->classtype->default_class) {
		// reset the name tag
		core->name = dflt_taskclass_name;
		return 0;
	}

	TC_DEBUG("%p:%s:%d\n", core, core->name, atomic_read(&core->refcnt));

	taskcls = class_type(struct ckrm_task_class, core);

	ce_protect(&CT_taskclass);

	if (CT_taskclass.ce_cb_active && CT_taskclass.ce_callbacks.class_delete)
		(*CT_taskclass.ce_callbacks.class_delete) (core->name, taskcls,
							   CT_taskclass.typeID);
	ckrm_reclassify_class_tasks(taskcls);

	ce_release(&CT_taskclass);

	ckrm_release_core_class(core);
	// Hubertus .... could just drop the class .. error message
	return 0;
}
void __init ckrm_meta_init_taskclass(void)
{
	printk(KERN_DEBUG "...... Initializing ClassType<%s> ........\n",
	       CT_taskclass.name);

	// initialize the default class
	ckrm_init_core_class(&CT_taskclass, class_core(&taskclass_dflt_class),
			     NULL, dflt_taskclass_name);

	// register classtype and initialize default task class
	ckrm_register_classtype(&CT_taskclass);
	ckrm_register_event_set(taskclass_events_callbacks);

	// note: registration of all resource controllers will be done
	// later dynamically, as these are specified as modules
}
static int tc_show_members(struct ckrm_core_class *core, struct seq_file *seq)
{
	struct list_head *lh;
	struct task_struct *tsk;

	class_lock(core);
	list_for_each(lh, &core->objlist) {
		tsk = container_of(lh, struct task_struct, taskclass_link);
		seq_printf(seq, "%ld\n", (long)tsk->pid);
	}
	class_unlock(core);

	return 0;
}
static int tc_forced_reclassify(struct ckrm_core_class *target, const char *obj)
{
	pid_t pid;
	int rc = -EINVAL;

	pid = (pid_t) simple_strtol(obj, NULL, 0);

	down(&async_serializer);	// protect against races with reclassify_class
	if (pid < 0) {
		// do we want to treat this as a process group .. TBD
		rc = -EINVAL;
	} else if (pid == 0) {
		rc = (target == NULL) ? ckrm_reclassify_all_tasks() : -EINVAL;
	} else {
		struct ckrm_task_class *cls = NULL;

		if (target)
			cls = class_type(ckrm_task_class_t, target);
		rc = ckrm_forced_reclassify_pid(pid, cls);
	}
	up(&async_serializer);

	return rc;
}
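/*
 * Illustrative use from userspace (hypothetical paths; the rcfs mount point
 * and the magic filename are assumptions, only the pid parsing and dispatch
 * above are what this file actually implements):
 *
 *	echo 1423 > /rcfs/taskclass/myclass/target	(move pid 1423)
 *	echo 0    > /rcfs/taskclass/target		(reclassify all tasks)
 */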
/******************************************************************************
 * Debugging Task Classes:  Utility functions
 ******************************************************************************/
void check_tasklist_sanity(struct ckrm_task_class *cls)
{
	struct ckrm_core_class *core = class_core(cls);
	struct list_head *lh1, *lh2;
	int count = 0;

	if (list_empty(&core->objlist)) {
		printk(KERN_DEBUG "check_tasklist_sanity: class %s empty list\n",
		       core->name);
		return;
	}
	list_for_each_safe(lh1, lh2, &core->objlist) {
		struct task_struct *tsk =
		    container_of(lh1, struct task_struct, taskclass_link);
		if (count++ > 20000) {
			printk(KERN_WARNING "list is CORRUPTED\n");
			break;
		}
		if (tsk->taskclass != cls) {
			const char *tclsname;
			tclsname = (tsk->taskclass) ?
			    class_core(tsk->taskclass)->name : "NULL";
			printk(KERN_WARNING "sanity: task %s:%d has ckrm_core "
			       "|%s| but in list |%s|\n", tsk->comm,
			       tsk->pid, tclsname, core->name);
		}
	}
}
void ckrm_debug_free_task_class(struct ckrm_task_class *tskcls)
{
	struct task_struct *proc, *thread;
	int count = 0;

	printk(KERN_DEBUG "Analyze Error <%s> %d\n",
	       class_core(tskcls)->name,
	       atomic_read(&(class_core(tskcls)->refcnt)));

	read_lock(&tasklist_lock);
	class_lock(class_core(tskcls));
	do_each_thread(proc, thread) {
		count += (tskcls == thread->taskclass);
		if ((thread->taskclass == tskcls) || (tskcls == NULL)) {
			const char *tclsname;
			tclsname = (thread->taskclass) ?
			    class_core(thread->taskclass)->name : "NULL";
			printk(KERN_DEBUG "%d thread=<%s:%d> -> <%s> <%lx>\n",
			       count, thread->comm, thread->pid, tclsname,
			       thread->flags & PF_EXITING);
		}
	} while_each_thread(proc, thread);
	class_unlock(class_core(tskcls));
	read_unlock(&tasklist_lock);

	printk(KERN_DEBUG "End Analyze Error <%s> %d\n",
	       class_core(tskcls)->name,
	       atomic_read(&(class_core(tskcls)->refcnt)));
}