1 /* ckrm_tc.c - Class-based Kernel Resource Management (CKRM)
3 * Copyright (C) Hubertus Franke, IBM Corp. 2003,2004
4 * (C) Shailabh Nagar, IBM Corp. 2003
5 * (C) Chandra Seetharaman, IBM Corp. 2003
6 * (C) Vivek Kashyap, IBM Corp. 2004
9 * Provides kernel API of CKRM for in-kernel,per-resource controllers
10 * (one each for cpu, memory, io, network) and callbacks for
11 * classification modules.
13 * Latest version, more details at http://ckrm.sf.net
15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2 of the License, or
18 * (at your option) any later version.
27 * Made modifications to suit the new RBCE module.
29 * Fixed a bug in fork and exit callbacks. Added callbacks_active and
30 surrounding logic. Added task parameter for all CE callbacks.
32 * moved to referenced counted class objects and correct locking
34 adopted the emerging classtype interface
37 #include <linux/config.h>
38 #include <linux/init.h>
39 #include <linux/linkage.h>
40 #include <linux/kernel.h>
41 #include <linux/errno.h>
42 #include <asm/uaccess.h>
44 #include <asm/errno.h>
45 #include <linux/string.h>
46 #include <linux/list.h>
47 #include <linux/spinlock.h>
48 #include <linux/module.h>
49 #include <linux/ckrm_rc.h>
51 #include <linux/ckrm_tc.h>
/* TC_DEBUG: per-file debug trace macro.  The printk is commented out, so
 * the macro currently expands to an empty do/while(0) no-op while still
 * consuming its arguments safely as a single statement. */
53 #define TC_DEBUG(fmt, args...) do { \
54 /* printk("%s: " fmt, __FUNCTION__ , ## args); */ } while (0)
/* Default (root) task class: tasks not classified elsewhere belong here.
 * NOTE(review): the initializer body is not visible in this extract --
 * confirm against the full source. */
56 static struct ckrm_task_class taskclass_dflt_class = {
/* Name under which the task classtype registers with the CKRM core. */
59 const char *dflt_taskclass_name = TASK_CLASS_TYPE_NAME;
/* Forward declarations for the CT_taskclass operations table below. */
61 static struct ckrm_core_class *ckrm_alloc_task_class(struct ckrm_core_class
62 *parent, const char *name);
63 static int ckrm_free_task_class(struct ckrm_core_class *core);
65 static int tc_forced_reclassify(ckrm_core_class_t * target,
67 static int tc_show_members(struct ckrm_core_class *core, struct seq_file *seq);
68 static void tc_add_resctrl(struct ckrm_core_class *core, int resid);
/* Classtype descriptor for task classes: static configuration plus the
 * operations table the generic CKRM core calls for this classtype.
 * NOTE(review): some initializer lines are elided in this extract. */
70 struct ckrm_classtype CT_taskclass = {
72 .name = TASK_CLASS_TYPE_NAME,
73 .typeID = CKRM_CLASSTYPE_TASK_CLASS,
74 .maxdepth = 3, // Hubertus .. just to start
75 .resid_reserved = 4, // Hubertus .. reservation
76 .max_res_ctlrs = CKRM_MAX_RES_CTLRS,
79 .res_ctlrs_lock = SPIN_LOCK_UNLOCKED,
80 .classes = LIST_HEAD_INIT(CT_taskclass.classes),
82 .default_class = &taskclass_dflt_class.core,
84 // private version of functions
85 .alloc = &ckrm_alloc_task_class,
86 .free = &ckrm_free_task_class,
87 .show_members = &tc_show_members,
88 .forced_reclassify = &tc_forced_reclassify,
90 // use of default functions
91 .show_shares = &ckrm_class_show_shares,
92 .show_stats = &ckrm_class_show_stats,
93 .show_config = &ckrm_class_show_config,
94 .set_config = &ckrm_class_set_config,
95 .set_shares = &ckrm_class_set_shares,
96 .reset_stats = &ckrm_class_reset_stats,
98 // mandatory private version .. no dflt available
99 .add_resctrl = &tc_add_resctrl,
102 /**************************************************************************
104 **************************************************************************/
/* Initialize the per-task CKRM spinlock to the unlocked state. */
106 static inline void ckrm_init_task_lock(struct task_struct *tsk)
108 tsk->ckrm_tsklock = SPIN_LOCK_UNLOCKED;
111 // Hubertus .. following functions should move to ckrm_rc.h
/* Acquire the per-task CKRM lock protecting tsk->taskclass. */
113 static inline void ckrm_task_lock(struct task_struct *tsk)
115 spin_lock(&tsk->ckrm_tsklock);
/* Release the per-task CKRM lock taken by ckrm_task_lock(). */
118 static inline void ckrm_task_unlock(struct task_struct *tsk)
120 spin_unlock(&tsk->ckrm_tsklock);
124 * Change the task class of the given task.
126 * Change the task's task class to "newcls" if the task's current
127 * class (task->taskclass) is same as given "oldcls", if it is non-NULL.
129 * Caller is responsible to make sure the task structure stays put through
132 * This function should be called with the following locks NOT held
133 * - tsk->ckrm_task_lock
134 * - core->ckrm_lock, if core is NULL then ckrm_dflt_class.ckrm_lock
135 * - tsk->taskclass->ckrm_lock
137 * Function is also called with a ckrm_core_grab on the new core, hence
138 * it needs to be dropped if no assignment takes place.
/* Move tsk from its current task class to newcls (see the contract in the
 * block comment above).  (void *)-1 is used as a sentinel meaning "task is
 * disassociated from CKRM".  When oldcls is non-NULL this behaves like a
 * compare-and-exchange: the move only happens if tsk is still in oldcls.
 * The caller holds a grab on newcls' core, which is dropped on every path
 * that does not keep the reference.
 * NOTE(review): several lines of this function are elided in this extract. */
141 ckrm_set_taskclass(struct task_struct *tsk, ckrm_task_class_t * newcls,
142 ckrm_task_class_t * oldcls, enum ckrm_event event)
145 ckrm_classtype_t *clstype;
146 ckrm_res_ctlr_t *rcbs;
147 ckrm_task_class_t *curcls;
148 void *old_res_class, *new_res_class;
152 curcls = tsk->taskclass;
154 if ((void *)-1 == curcls) {
155 // task is disassociated from ckrm... don't bother it.
156 ckrm_task_unlock(tsk);
157 ckrm_core_drop(class_core(newcls));
161 if ((curcls == NULL) && (newcls == (void *)-1)) {
162 // task needs to be disassociated from ckrm and has no curcls
163 // just disassociate and return.
164 tsk->taskclass = newcls;
165 ckrm_task_unlock(tsk);
168 // check whether compare_and_exchange should
169 if (oldcls && (oldcls != curcls)) {
170 ckrm_task_unlock(tsk);
172 /* compensate for previous grab */
173 TC_DEBUG("(%s:%d): Race-condition caught <%s> %d\n",
174 tsk->comm, tsk->pid, class_core(newcls)->name,
176 ckrm_core_drop(class_core(newcls));
180 // make sure we have a real destination core
182 newcls = &taskclass_dflt_class;
183 ckrm_core_grab(class_core(newcls));
/* Unlink from the old class (if any); remember to drop its core ref
 * at the end of the function. */
185 // take out of old class
186 // remember that we need to drop the oldcore
187 if ((drop_old_cls = (curcls != NULL))) {
188 class_lock(class_core(curcls));
189 if (newcls == curcls) {
190 // we are already in the destination class.
191 // we still need to drop oldcore
192 class_unlock(class_core(curcls));
193 ckrm_task_unlock(tsk);
196 list_del(&tsk->taskclass_link);
197 INIT_LIST_HEAD(&tsk->taskclass_link);
198 tsk->taskclass = NULL;
199 class_unlock(class_core(curcls));
200 if (newcls == (void *)-1) {
201 tsk->taskclass = newcls;
202 ckrm_task_unlock(tsk);
203 // still need to get out of old class
/* Link the task into the destination class' member list. */
208 // put into new class
209 class_lock(class_core(newcls));
210 tsk->taskclass = newcls;
211 list_add(&tsk->taskclass_link, &class_core(newcls)->objlist);
212 class_unlock(class_core(newcls));
214 if (newcls == curcls) {
215 ckrm_task_unlock(tsk);
219 CE_NOTIFY(&CT_taskclass, event, newcls, tsk);
221 ckrm_task_unlock(tsk);
/* Tell every registered resource controller about the class change so it
 * can migrate its per-class state for this task. */
224 clstype = &CT_taskclass;
225 if (clstype->bit_res_ctlrs) {
226 // avoid running through the entire list if none is registered
227 for (i = 0; i < clstype->max_resid; i++) {
228 if (clstype->res_ctlrs[i] == NULL)
230 atomic_inc(&clstype->nr_resusers[i]);
232 curcls ? class_core(curcls)->res_class[i] : NULL;
234 newcls ? class_core(newcls)->res_class[i] : NULL;
235 rcbs = clstype->res_ctlrs[i];
236 if (rcbs && rcbs->change_resclass
237 && (old_res_class != new_res_class))
238 (*rcbs->change_resclass) (tsk, old_res_class,
240 atomic_dec(&clstype->nr_resusers[i]);
/* Compensate for the reference held on the old class' core. */
246 ckrm_core_drop(class_core(curcls));
250 // HF SUGGEST: we could macro-tize this for other types
251 // DEF_FUNC_ADD_RESCTRL(funcname,link)
252 // would DEF_FUNC_ADD_RESCTRL(tc_add_resctrl,taskclass_link)
/* Notify a newly registered resource controller (resid) of every task
 * already in this class, by invoking its change_resclass callback with
 * (void *)-1 as the "old" resource class. */
254 static void tc_add_resctrl(struct ckrm_core_class *core, int resid)
256 struct task_struct *tsk;
257 struct ckrm_res_ctlr *rcbs;
/* Bail out on an invalid resid or if no controller is registered there. */
259 if ((resid < 0) || (resid >= CKRM_MAX_RES_CTLRS)
260 || ((rcbs = core->classtype->res_ctlrs[resid]) == NULL))
264 list_for_each_entry(tsk, &core->objlist, taskclass_link) {
265 if (rcbs->change_resclass)
266 (*rcbs->change_resclass) (tsk, (void *)-1,
267 core->res_class[resid]);
272 /**************************************************************************
273 * Functions called from classification points *
274 **************************************************************************/
/* ECB_PRINTK: event-callback trace macro; the body is commented out, so
 * it currently expands to nothing. */
276 #define ECB_PRINTK(fmt, args...) \
277 // do { if (CT_taskclass.ce_regd)
278 // printk("%s: " fmt, __FUNCTION__ , ## args); } while (0)
/* CE_CLASSIFY_TASK: ask the classification engine for tsk's class for the
 * given event and apply the result via ckrm_set_taskclass(), using the
 * task's current class as the compare-and-exchange "old" value. */
280 #define CE_CLASSIFY_TASK(event, tsk) \
282 struct ckrm_task_class *newcls = NULL; \
283 struct ckrm_task_class *oldcls = tsk->taskclass; \
285 CE_CLASSIFY_RET(newcls,&CT_taskclass,event,tsk); \
287 /* called synchronously. no need to get task struct */ \
288 ckrm_set_taskclass(tsk, newcls, oldcls, event); \
/* CE_CLASSIFY_TASK_PROTECT: same as CE_CLASSIFY_TASK but bracketed by
 * ce_protect()/ce_release() so the classification engine cannot be
 * unregistered while the callback runs. */
293 #define CE_CLASSIFY_TASK_PROTECT(event, tsk) \
295 ce_protect(&CT_taskclass); \
296 CE_CLASSIFY_TASK(event,tsk); \
297 ce_release(&CT_taskclass); \
/* NEWTASK event: reset the CKRM fields of a freshly created task struct
 * (no class assigned yet, empty membership link). */
300 static void cb_taskclass_newtask(struct task_struct *tsk)
302 tsk->taskclass = NULL;
303 INIT_LIST_HEAD(&tsk->taskclass_link);
/* FORK event: classify the child.  If the classification engine does not
 * assign a class, the child inherits its parent's class (grabbed under
 * the parent's task lock).  NOTE(review): some lines appear elided in
 * this extract. */
306 static void cb_taskclass_fork(struct task_struct *tsk)
308 struct ckrm_task_class *cls = NULL;
310 ECB_PRINTK("%p:%d:%s\n", tsk, tsk->pid, tsk->comm);
312 ce_protect(&CT_taskclass);
313 CE_CLASSIFY_RET(cls, &CT_taskclass, CKRM_EVENT_FORK, tsk);
315 ckrm_task_lock(tsk->parent);
316 cls = tsk->parent->taskclass;
317 ckrm_core_grab(class_core(cls));
318 ckrm_task_unlock(tsk->parent);
/* Sanity check: a new task must not already be on a class member list. */
320 if (!list_empty(&tsk->taskclass_link))
321 printk("BUG in cb_fork.. tsk (%s:%d> already linked\n",
322 tsk->comm, tsk->pid);
324 ckrm_set_taskclass(tsk, cls, NULL, CKRM_EVENT_FORK);
325 ce_release(&CT_taskclass);
/* EXIT event: notify the engine, then disassociate the task from CKRM
 * using the (void *)-1 sentinel class. */
328 static void cb_taskclass_exit(struct task_struct *tsk)
330 CE_CLASSIFY_NORET(&CT_taskclass, CKRM_EVENT_EXIT, tsk);
331 ckrm_set_taskclass(tsk, (void *)-1, NULL, CKRM_EVENT_EXIT);
/* EXEC event: reclassify the current task after it executes a new image. */
334 static void cb_taskclass_exec(const char *filename)
336 ECB_PRINTK("%p:%d:%s <%s>\n", current, current->pid, current->comm,
338 CE_CLASSIFY_TASK_PROTECT(CKRM_EVENT_EXEC, current);
/* UID-change event: reclassify the current task. */
341 static void cb_taskclass_uid(void)
343 ECB_PRINTK("%p:%d:%s\n", current, current->pid, current->comm);
344 CE_CLASSIFY_TASK_PROTECT(CKRM_EVENT_UID, current);
/* GID-change event: reclassify the current task. */
347 static void cb_taskclass_gid(void)
349 ECB_PRINTK("%p:%d:%s\n", current, current->pid, current->comm);
350 CE_CLASSIFY_TASK_PROTECT(CKRM_EVENT_GID, current);
/* Event-to-callback table registered with the CKRM core; maps each
 * classification point to the handler above. */
353 static struct ckrm_event_spec taskclass_events_callbacks[] = {
354 CKRM_EVENT_SPEC(NEWTASK, cb_taskclass_newtask),
355 CKRM_EVENT_SPEC(EXEC, cb_taskclass_exec),
356 CKRM_EVENT_SPEC(FORK, cb_taskclass_fork),
357 CKRM_EVENT_SPEC(EXIT, cb_taskclass_exit),
358 CKRM_EVENT_SPEC(UID, cb_taskclass_uid),
359 CKRM_EVENT_SPEC(GID, cb_taskclass_gid),
363 /***********************************************************************
365 * Asynchronous callback functions (driven by RCFS)
367 * Async functions force a setting of the task structure
368 * synchronous callbacks are protected against race conditions
369 * by using a cmpxchg on the core before setting it.
370 * Async calls need to be serialized to ensure they can't
371 * race against each other
373 ***********************************************************************/
/* Serializes all asynchronous (RCFS-driven) reclassification paths; see
 * the block comment above for why async calls must not race each other. */
375 DECLARE_MUTEX(async_serializer); // serialize all async functions
378 * Go through the task list and reclassify all tasks according to the current
379 * classification rules.
381 * We have the problem that we can not hold any lock (including the
382 * tasklist_lock) while classifying. Two methods possible
384 * (a) go through entire pidrange (0..pidmax) and if a task exists at
385 * that pid then reclassify it
386 * (b) go several time through task list and build a bitmap for a particular
387 subrange of pid otherwise the memory requirements might be too much.
389 * We use a hybrid by comparing ratio nr_threads/pidmax
/* Reclassify every task in the system without holding tasklist_lock while
 * classifying.  Chooses between (a) one walk over the whole pid range and
 * (b) a bitmap built in pid sub-ranges, based on the pid_max/nr_threads
 * ratio (see the block comment above).
 * NOTE(review): many lines of this function are elided in this extract. */
392 static void ckrm_reclassify_all_tasks(void)
396 struct task_struct *proc, *thread;
398 int curpidmax = pid_max;
402 ratio = curpidmax / nr_threads;
403 if (curpidmax <= PID_MAX_DEFAULT) {
406 use_bitmap = (ratio >= 2);
409 ce_protect(&CT_taskclass);
/* Method (a): probe every pid; drop tasklist_lock around each
 * classification, pinning the task with get_task_struct(). */
412 if (use_bitmap == 0) {
413 // go through it in one walk
414 read_lock(&tasklist_lock);
415 for (i = 0; i < curpidmax; i++) {
416 if ((thread = find_task_by_pid(i)) == NULL)
418 get_task_struct(thread);
419 read_unlock(&tasklist_lock);
420 CE_CLASSIFY_TASK(CKRM_EVENT_RECLASSIFY, thread);
421 put_task_struct(thread);
422 read_lock(&tasklist_lock);
424 read_unlock(&tasklist_lock);
/* Method (b): mark live pids of one sub-range in a bitmap under the
 * lock, then reclassify the marked pids without holding it. */
426 unsigned long *bitmap;
432 bitmap = (unsigned long *)__get_free_pages(GFP_KERNEL, order);
433 if (bitmap == NULL) {
438 bitmapsize = 8 * (1 << (order + PAGE_SHIFT));
439 num_loops = (curpidmax + bitmapsize - 1) / bitmapsize;
442 for (i = 0; i < num_loops && do_next; i++) {
443 int pid_start = i * bitmapsize;
444 int pid_end = pid_start + bitmapsize;
448 memset(bitmap, 0, bitmapsize / 8); // start afresh
451 read_lock(&tasklist_lock);
452 do_each_thread(proc, thread) {
454 if ((pid < pid_start) || (pid >= pid_end)) {
455 if (pid >= pid_end) {
461 set_bit(pid, bitmap);
464 while_each_thread(proc, thread);
465 read_unlock(&tasklist_lock);
471 for (; num_found--;) {
472 pos = find_next_bit(bitmap, bitmapsize, pos);
473 pid = pos + pid_start;
475 read_lock(&tasklist_lock);
476 if ((thread = find_task_by_pid(pid)) != NULL) {
477 get_task_struct(thread);
478 read_unlock(&tasklist_lock);
479 CE_CLASSIFY_TASK(CKRM_EVENT_RECLASSIFY,
481 put_task_struct(thread);
483 read_unlock(&tasklist_lock);
489 ce_release(&CT_taskclass);
/* Reclassify by pid: pid == 0 reclassifies every task in the system,
 * pid > 0 reclassifies that single task.  Serialized with the other
 * async reclassification paths via async_serializer.
 * NOTE(review): the pid < 0 branch body is elided in this extract. */
492 int ckrm_reclassify(int pid)
494 struct task_struct *tsk;
497 down(&async_serializer); // protect against race condition
499 // do we want to treat this as process group .. should YES ToDo
501 } else if (pid == 0) {
502 // reclassify all tasks in the system
503 ckrm_reclassify_all_tasks();
505 // reclassify particular pid
506 read_lock(&tasklist_lock);
507 if ((tsk = find_task_by_pid(pid)) != NULL) {
508 get_task_struct(tsk);
509 read_unlock(&tasklist_lock);
510 CE_CLASSIFY_TASK_PROTECT(CKRM_EVENT_RECLASSIFY, tsk);
511 put_task_struct(tsk);
513 read_unlock(&tasklist_lock);
517 up(&async_serializer);
522 * Reclassify all tasks in the given core class.
/* Reclassify every task currently in cls (used while tearing the class
 * down): repeatedly take the first member off the class' object list and
 * reclassify it, refusing a reclassification back into cls itself.
 * Tasks the engine does not place are moved to the parent class.
 * NOTE(review): several lines (loop header, exits) are elided here. */
525 static void ckrm_reclassify_class_tasks(struct ckrm_task_class *cls)
528 struct ckrm_hnode *cnode;
529 struct ckrm_task_class *parcls;
532 if (!ckrm_validate_and_grab_core(&cls->core))
535 down(&async_serializer); // protect against race condition
536 TC_DEBUG("start %p:%s:%d:%d\n", cls, cls->core.name,
537 atomic_read(&cls->core.refcnt),
538 atomic_read(&cls->core.hnode.parent->refcnt));
539 // If no CE registered for this classtype, following will be needed
541 ce_regd = class_core(cls)->classtype->ce_regd;
542 cnode = &(class_core(cls)->hnode);
543 parcls = class_type(ckrm_task_class_t, cnode->parent);
546 class_lock(class_core(cls));
547 if (!list_empty(&class_core(cls)->objlist)) {
548 struct ckrm_task_class *newcls = NULL;
549 struct task_struct *tsk =
550 list_entry(class_core(cls)->objlist.next,
551 struct task_struct, taskclass_link);
/* Pin the task and drop the class lock before calling out to the
 * classification engine. */
553 get_task_struct(tsk);
554 class_unlock(class_core(cls));
557 CE_CLASSIFY_RET(newcls, &CT_taskclass,
558 CKRM_EVENT_RECLASSIFY, tsk);
560 // don't allow reclassifying to the same class
561 // as we are in the process of cleaning up
564 // compensate CE's grab
565 ckrm_core_drop(class_core(newcls));
569 if (newcls == NULL) {
571 ckrm_core_grab(class_core(newcls));
573 ckrm_set_taskclass(tsk, newcls, cls, CKRM_EVENT_RECLASSIFY);
574 put_task_struct(tsk);
578 TC_DEBUG("stop %p:%s:%d:%d %d\n", cls, cls->core.name,
579 atomic_read(&cls->core.refcnt),
580 atomic_read(&cls->core.hnode.parent->refcnt), num);
581 class_unlock(class_core(cls));
582 ckrm_core_drop(class_core(cls));
584 up(&async_serializer);
590 * Change the core class of the given task.
/* Forcibly move the task with the given pid into cls.  The caller must be
 * privileged (CAP_SYS_NICE or CAP_SYS_RESOURCE) or own the target task.
 * Grabs the class core up front and drops it on every failure path. */
593 int ckrm_forced_reclassify_pid(pid_t pid, struct ckrm_task_class *cls)
595 struct task_struct *tsk;
597 if (!ckrm_validate_and_grab_core(class_core(cls)))
600 read_lock(&tasklist_lock);
601 if ((tsk = find_task_by_pid(pid)) == NULL) {
602 read_unlock(&tasklist_lock);
603 ckrm_core_drop(class_core(cls));
606 get_task_struct(tsk);
607 read_unlock(&tasklist_lock);
609 /* Check permissions */
610 if ((!capable(CAP_SYS_NICE)) &&
611 (!capable(CAP_SYS_RESOURCE)) && (current->user != tsk->user)) {
612 ckrm_core_drop(class_core(cls));
613 put_task_struct(tsk);
617 down(&async_serializer); // protect against race condition
619 ce_protect(&CT_taskclass);
620 ckrm_set_taskclass(tsk, cls, NULL, CKRM_EVENT_MANUAL);
621 ce_release(&CT_taskclass);
622 put_task_struct(tsk);
624 up(&async_serializer);
/* Classtype .alloc hook: allocate and zero a new task class under parent,
 * initialize its core, and notify the classification engine's class_add
 * callback (under ce_protect).  Returns the new class' core.
 * NOTE(review): the allocation-failure branch is elided in this extract. */
628 static struct ckrm_core_class *ckrm_alloc_task_class(struct ckrm_core_class
629 *parent, const char *name)
631 struct ckrm_task_class *taskcls;
632 taskcls = kmalloc(sizeof(struct ckrm_task_class), GFP_KERNEL);
635 memset(taskcls, 0, sizeof(struct ckrm_task_class));
637 ckrm_init_core_class(&CT_taskclass, class_core(taskcls), parent, name);
639 ce_protect(&CT_taskclass);
640 if (CT_taskclass.ce_cb_active && CT_taskclass.ce_callbacks.class_add)
641 (*CT_taskclass.ce_callbacks.class_add) (name, taskcls,
642 CT_taskclass.typeID);
643 ce_release(&CT_taskclass);
645 return class_core(taskcls);
/* Classtype .free hook: tear down a task class.  The default class cannot
 * be freed (its name is merely reset); otherwise the engine's class_delete
 * callback fires, member tasks are reclassified out, and the core class is
 * released.  NOTE(review): return statements are elided in this extract. */
648 static int ckrm_free_task_class(struct ckrm_core_class *core)
650 struct ckrm_task_class *taskcls;
652 if (!ckrm_is_core_valid(core)) {
656 if (core == core->classtype->default_class) {
657 // reset the name tag
658 core->name = dflt_taskclass_name;
662 TC_DEBUG("%p:%s:%d\n", core, core->name, atomic_read(&core->refcnt));
664 taskcls = class_type(struct ckrm_task_class, core);
666 ce_protect(&CT_taskclass);
668 if (CT_taskclass.ce_cb_active && CT_taskclass.ce_callbacks.class_delete)
669 (*CT_taskclass.ce_callbacks.class_delete) (core->name, taskcls,
670 CT_taskclass.typeID);
671 ckrm_reclassify_class_tasks(taskcls);
673 ce_release(&CT_taskclass);
675 ckrm_release_core_class(core);
676 // Hubertus .... could just drop the class .. error message
/* Boot-time init: set up the default task class, then register the
 * classtype and its event callbacks with the CKRM core.  Resource
 * controllers register themselves later as modules. */
680 void __init ckrm_meta_init_taskclass(void)
682 printk("...... Initializing ClassType<%s> ........\n",
684 // initialize the default class
685 ckrm_init_core_class(&CT_taskclass, class_core(&taskclass_dflt_class),
686 NULL, dflt_taskclass_name);
688 // register classtype and initialize default task class
689 ckrm_register_classtype(&CT_taskclass);
690 ckrm_register_event_set(taskclass_events_callbacks);
692 // note registration of all resource controllers will be done
693 // later dynamically as these are specified as modules
/* Classtype .show_members hook: print the pid of every task in the class,
 * one per line, to the seq_file. */
696 static int tc_show_members(struct ckrm_core_class *core, struct seq_file *seq)
698 struct list_head *lh;
699 struct task_struct *tsk;
702 list_for_each(lh, &core->objlist) {
703 tsk = container_of(lh, struct task_struct, taskclass_link);
704 seq_printf(seq, "%ld\n", (long)tsk->pid);
/* Classtype .forced_reclassify hook: parse a pid from the string obj and
 * force-move that task into the target class.
 * NOTE(review): the tail of this function is elided in this extract. */
711 static int tc_forced_reclassify(struct ckrm_core_class *target, const char *obj)
716 pid = (pid_t) simple_strtoul(obj, NULL, 10);
718 rc = ckrm_forced_reclassify_pid(pid,
719 class_type(ckrm_task_class_t,
727 /******************************************************************************
728 * Debugging Task Classes: Utility functions
729 ******************************************************************************/
/* Debug helper: walk the class' member list and complain about tasks
 * whose taskclass pointer disagrees with the list they are on; bails out
 * with a "CORRUPTED" message after 20000 entries to avoid looping on a
 * broken list. */
731 void check_tasklist_sanity(struct ckrm_task_class *cls)
733 struct ckrm_core_class *core = class_core(cls);
734 struct list_head *lh1, *lh2;
739 if (list_empty(&core->objlist)) {
741 printk("check_tasklist_sanity: class %s empty list\n",
745 list_for_each_safe(lh1, lh2, &core->objlist) {
746 struct task_struct *tsk =
747 container_of(lh1, struct task_struct,
749 if (count++ > 20000) {
750 printk("list is CORRUPTED\n");
753 if (tsk->taskclass != cls) {
754 const char *tclsname;
755 tclsname = (tsk->taskclass) ?
756 class_core(tsk->taskclass)->name:"NULL";
757 printk("sanity: task %s:%d has ckrm_core "
758 "|%s| but in list |%s|\n", tsk->comm,
759 tsk->pid, tclsname, core->name);
766 void ckrm_debug_free_task_class(struct ckrm_task_class *tskcls)
768 struct task_struct *proc, *thread;
771 printk("Analyze Error <%s> %d\n",
772 class_core(tskcls)->name,
773 atomic_read(&(class_core(tskcls)->refcnt)));
775 read_lock(&tasklist_lock);
776 class_lock(class_core(tskcls));
777 do_each_thread(proc, thread) {
778 count += (tskcls == thread->taskclass);
779 if ((thread->taskclass == tskcls) || (tskcls == NULL)) {
780 const char *tclsname;
781 tclsname = (thread->taskclass) ?
782 class_core(thread->taskclass)->name :"NULL";
783 printk("%d thread=<%s:%d> -> <%s> <%lx>\n", count,
784 thread->comm, thread->pid, tclsname,
785 thread->flags & PF_EXITING);
787 } while_each_thread(proc, thread);
788 class_unlock(class_core(tskcls));
789 read_unlock(&tasklist_lock);
791 printk("End Analyze Error <%s> %d\n",
792 class_core(tskcls)->name,
793 atomic_read(&(class_core(tskcls)->refcnt)));