/* ckrm_tc.c - Class-based Kernel Resource Management (CKRM)
 *
 * Copyright (C) Hubertus Franke, IBM Corp. 2003,2004
 * (C) Shailabh Nagar, IBM Corp. 2003
 * (C) Chandra Seetharaman, IBM Corp. 2003
 * (C) Vivek Kashyap, IBM Corp. 2004
 *
 * Provides the kernel API of CKRM for in-kernel, per-resource controllers
 * (one each for cpu, memory, io, network) and callbacks for
 * classification modules.
 *
 * Latest version, more details at http://ckrm.sf.net
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * Made modifications to suit the new RBCE module.
 *
 * Fixed a bug in the fork and exit callbacks. Added callbacks_active and
 * surrounding logic. Added the task parameter for all CE callbacks.
 *
 * Moved to reference-counted class objects and correct locking.
 *
 * Adapted to the emerging classtype interface.
 */

#include <linux/config.h>
#include <linux/init.h>
#include <linux/linkage.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <asm/uaccess.h>
#include <asm/errno.h>
#include <linux/string.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/ckrm_rc.h>
#include <linux/ckrm_tc.h>

#define TC_DEBUG(fmt, args...) do { /* printk("%s: " fmt, __FUNCTION__ , ## args); */ } while (0)
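
/*
 * The default task class: tasks that have not been assigned a class by the
 * classification engine or by an explicit request end up here (see
 * ckrm_set_taskclass() below, which falls back to it when no destination
 * class is supplied).
 */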
static struct ckrm_task_class taskclass_dflt_class = {

const char *dflt_taskclass_name = TASK_CLASS_TYPE_NAME;

static struct ckrm_core_class *ckrm_alloc_task_class(struct ckrm_core_class *parent, const char *name);
static int ckrm_free_task_class(struct ckrm_core_class *core);

static int tc_forced_reclassify(ckrm_core_class_t *target, const char *resname);
static int tc_show_members(struct ckrm_core_class *core, struct seq_file *seq);
static void tc_add_resctrl(struct ckrm_core_class *core, int resid);

struct ckrm_classtype CT_taskclass = {
        .name = TASK_CLASS_TYPE_NAME,
        .typeID = CKRM_CLASSTYPE_TASK_CLASS,
        .maxdepth = 3,                          // Hubertus .. just to start
        .resid_reserved = 4,                    // Hubertus .. reservation
        .max_res_ctlrs = CKRM_MAX_RES_CTLRS,
        .res_ctlrs_lock = SPIN_LOCK_UNLOCKED,
        .classes = LIST_HEAD_INIT(CT_taskclass.classes),
        .default_class = &taskclass_dflt_class.core,

        // private versions of functions
        .alloc = &ckrm_alloc_task_class,
        .free = &ckrm_free_task_class,
        .show_members = &tc_show_members,
        .forced_reclassify = &tc_forced_reclassify,

        // use the default functions
        .show_shares = &ckrm_class_show_shares,
        .show_stats = &ckrm_class_show_stats,
        .show_config = &ckrm_class_show_config,
        .set_config = &ckrm_class_set_config,
        .set_shares = &ckrm_class_set_shares,
        .reset_stats = &ckrm_class_reset_stats,

        // mandatory private version .. no dflt available
        .add_resctrl = &tc_add_resctrl,

/**************************************************************************
 *                            Helper Functions                            *
 **************************************************************************/
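
/*
 * Every task carries a ckrm_tsklock spinlock; it protects the task's class
 * assignment (tsk->taskclass) while the task is being moved between classes.
 */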
ckrm_init_task_lock(struct task_struct *tsk)
        tsk->ckrm_tsklock = SPIN_LOCK_UNLOCKED;

// Hubertus .. following functions should move to ckrm_rc.h

ckrm_task_lock(struct task_struct *tsk)
        spin_lock(&tsk->ckrm_tsklock);

ckrm_task_unlock(struct task_struct *tsk)
        spin_unlock(&tsk->ckrm_tsklock);

/*
 * Change the task class of the given task.
 *
 * Change the task's task class to "newcls" if the task's current
 * class (task->taskclass) is the same as the given "oldcls", when that is
 * non-NULL.
 *
 * The caller is responsible for making sure the task structure stays put
 * throughout this function call.
 *
 * This function should be called with the following locks NOT held:
 * - tsk->ckrm_task_lock
 * - core->ckrm_lock, if core is NULL then ckrm_dflt_class.ckrm_lock
 * - tsk->taskclass->ckrm_lock
 *
 * The function is also called with a ckrm_core_grab on the new core, hence
 * it needs to be dropped if no assignment takes place.
 */
ckrm_set_taskclass(struct task_struct *tsk, ckrm_task_class_t *newcls,
                   ckrm_task_class_t *oldcls, enum ckrm_event event)
        ckrm_classtype_t *clstype;
        ckrm_res_ctlr_t *rcbs;
        ckrm_task_class_t *curcls;
        void *old_res_class, *new_res_class;

        curcls = tsk->taskclass;

        // check whether compare_and_exchange should
        if (oldcls && (oldcls != curcls)) {
                ckrm_task_unlock(tsk);
                /* compensate for previous grab */
                TC_DEBUG("(%s:%d): Race condition caught <%s> %d\n",
                         tsk->comm, tsk->pid, class_core(newcls)->name, event);
                ckrm_core_drop(class_core(newcls));

        // make sure we have a real destination core
        newcls = &taskclass_dflt_class;
        ckrm_core_grab(class_core(newcls));

        // take the task out of its old class
        // remember that we need to drop the old core
        if ((drop_old_cls = (curcls != NULL))) {
                class_lock(class_core(curcls));
                if (newcls == curcls) {
                        // we are already in the destination class;
                        // we still need to drop the old core
                        class_unlock(class_core(curcls));
                        ckrm_task_unlock(tsk);
                list_del(&tsk->taskclass_link);
                INIT_LIST_HEAD(&tsk->taskclass_link);
                tsk->taskclass = NULL;
                class_unlock(class_core(curcls));

        // put the task into the new class
        class_lock(class_core(newcls));
        tsk->taskclass = newcls;
        list_add(&tsk->taskclass_link, &class_core(newcls)->objlist);
        class_unlock(class_core(newcls));

        if (newcls == curcls) {
                ckrm_task_unlock(tsk);

        CE_NOTIFY(&CT_taskclass, event, newcls, tsk);

        ckrm_task_unlock(tsk);

        clstype = class_isa(newcls);    // Hubertus .. can hardcode ckrm_CT_taskclass
        if (clstype->bit_res_ctlrs) {   // avoid running through the entire list if none is registered
                for (i = 0; i < clstype->max_resid; i++) {
                        if (clstype->res_ctlrs[i] == NULL)
                        atomic_inc(&clstype->nr_resusers[i]);
                        old_res_class = curcls ? class_core(curcls)->res_class[i] : NULL;
                        new_res_class = newcls ? class_core(newcls)->res_class[i] : NULL;
                        rcbs = clstype->res_ctlrs[i];
                        if (rcbs && rcbs->change_resclass && (old_res_class != new_res_class))
                                (*rcbs->change_resclass)(tsk, old_res_class, new_res_class);
                        atomic_dec(&clstype->nr_resusers[i]);

        ckrm_core_drop(class_core(curcls));

// HF SUGGEST: we could macro-ize this for other types as
//   DEF_FUNC_ADD_RESCTRL(funcname, link),
// which would give DEF_FUNC_ADD_RESCTRL(tc_add_resctrl, taskclass_link)

tc_add_resctrl(struct ckrm_core_class *core, int resid)
        struct task_struct *tsk;
        struct ckrm_res_ctlr *rcbs;

        if ((resid < 0) || (resid >= CKRM_MAX_RES_CTLRS) ||
            ((rcbs = core->classtype->res_ctlrs[resid]) == NULL))
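
        // Walk every task already linked into this class and hand it to the
        // newly registered controller; the old resource class is passed as
        // (void *)-1, which the controller presumably treats as "previous
        // class unknown".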
        list_for_each_entry(tsk, &core->objlist, taskclass_link) {
                if (rcbs->change_resclass)
                        (*rcbs->change_resclass)(tsk, (void *)-1, core->res_class[resid]);

/**************************************************************************
 *            Functions called from classification points                *
 **************************************************************************/

#define ECB_PRINTK(fmt, args...)  // do { if (CT_taskclass.ce_regd) printk("%s: " fmt, __FUNCTION__ , ## args); } while (0)
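
/*
 * CE_CLASSIFY_TASK asks the classification engine for a new class for the
 * task and, if one is returned, applies it synchronously through
 * ckrm_set_taskclass(). The _PROTECT variant brackets that with
 * ce_protect()/ce_release() so the classification engine callbacks stay
 * valid for the duration of the call.
 */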

#define CE_CLASSIFY_TASK(event, tsk)                                          \
        struct ckrm_task_class *newcls = NULL, *oldcls = tsk->taskclass;      \
        CE_CLASSIFY_RET(newcls, &CT_taskclass, event, tsk);                   \
        /* called synchronously. no need to get task struct */               \
        ckrm_set_taskclass(tsk, newcls, oldcls, event);                       \

#define CE_CLASSIFY_TASK_PROTECT(event, tsk)    \
        ce_protect(&CT_taskclass);              \
        CE_CLASSIFY_TASK(event, tsk);           \
        ce_release(&CT_taskclass);              \

cb_taskclass_newtask(struct task_struct *tsk)
        tsk->taskclass = NULL;
        INIT_LIST_HEAD(&tsk->taskclass_link);

cb_taskclass_fork(struct task_struct *tsk)
        struct ckrm_task_class *cls = NULL;

        ECB_PRINTK("%p:%d:%s\n", tsk, tsk->pid, tsk->comm);

        ce_protect(&CT_taskclass);
        CE_CLASSIFY_RET(cls, &CT_taskclass, CKRM_EVENT_FORK, tsk);
        ckrm_task_lock(tsk->parent);
        cls = tsk->parent->taskclass;
        ckrm_core_grab(class_core(cls));
        ckrm_task_unlock(tsk->parent);
        if (!list_empty(&tsk->taskclass_link))
                printk("BUG in cb_fork.. tsk <%s:%d> already linked\n",
        ckrm_set_taskclass(tsk, cls, NULL, CKRM_EVENT_FORK);
        ce_release(&CT_taskclass);

cb_taskclass_exit(struct task_struct *tsk)
        ckrm_task_class_t *cls;

        // Remove the task from its current core class
        ECB_PRINTK("%p:%d:%s\n", tsk, tsk->pid, tsk->comm);

        CE_CLASSIFY_NORET(&CT_taskclass, CKRM_EVENT_EXIT, tsk);

        if ((cls = tsk->taskclass) != NULL) {
                class_lock(class_core(cls));
                tsk->taskclass = NULL;
                list_del(&tsk->taskclass_link);
                class_unlock(class_core(cls));
                ckrm_core_drop(class_core(cls));
        INIT_LIST_HEAD(&tsk->taskclass_link);
        ckrm_task_unlock(tsk);

cb_taskclass_exec(const char *filename)
        ECB_PRINTK("%p:%d:%s <%s>\n", current, current->pid, current->comm, filename);
        CE_CLASSIFY_TASK_PROTECT(CKRM_EVENT_EXEC, current);

cb_taskclass_uid(void)
        ECB_PRINTK("%p:%d:%s\n", current, current->pid, current->comm);
        CE_CLASSIFY_TASK_PROTECT(CKRM_EVENT_UID, current);

cb_taskclass_gid(void)
        ECB_PRINTK("%p:%d:%s\n", current, current->pid, current->comm);
        CE_CLASSIFY_TASK_PROTECT(CKRM_EVENT_GID, current);

static struct ckrm_event_spec taskclass_events_callbacks[] = {
        CKRM_EVENT_SPEC(NEWTASK, cb_taskclass_newtask),
        CKRM_EVENT_SPEC(EXEC,    cb_taskclass_exec),
        CKRM_EVENT_SPEC(FORK,    cb_taskclass_fork),
        CKRM_EVENT_SPEC(EXIT,    cb_taskclass_exit),
        CKRM_EVENT_SPEC(UID,     cb_taskclass_uid),
        CKRM_EVENT_SPEC(GID,     cb_taskclass_gid),

/***********************************************************************
 *
 * Asynchronous callback functions (driven by RCFS)
 *
 * Async functions force a setting of the task structure;
 * synchronous callbacks are protected against race conditions
 * by using a cmpxchg on the core before setting it.
 * Async calls need to be serialized to ensure they can't
 * race against each other.
 *
 ***********************************************************************/

DECLARE_MUTEX(async_serializer);        // serialize all async functions

/*
 * Go through the task list and reclassify all tasks according to the current
 * classification rules.
 *
 * We have the problem that we can not hold any lock (including the
 * tasklist_lock) while classifying. Two methods are possible:
 *
 * (a) go through the entire pid range (0..pid_max) and, if a task exists at
 *     that pid, reclassify it
 * (b) go several times through the task list and build a bitmap for a
 *     particular subrange of pids; otherwise the memory requirements might
 *     be too much
 *
 * We use a hybrid, choosing between the two based on the ratio
 * pid_max / nr_threads.
 */
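
/*
 * The idea behind the ratio check below: if the pid space is sparse relative
 * to the number of live threads (ratio >= 2), most find_task_by_pid() probes
 * of a straight 0..pid_max walk would miss, so building a bitmap of live pids
 * per subrange is the cheaper option; with a densely used pid space the
 * direct walk wins.
 */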
ckrm_reclassify_all_tasks(void)
        struct task_struct *proc, *thread;
        int curpidmax = pid_max;

        ratio = curpidmax / nr_threads;
        if (curpidmax <= PID_MAX_DEFAULT) {
        use_bitmap = (ratio >= 2);

        ce_protect(&CT_taskclass);

        if (use_bitmap == 0) {
                // go through it in one walk
                read_lock(&tasklist_lock);
                for (i = 0; i < curpidmax; i++) {
                        if ((thread = find_task_by_pid(i)) == NULL)
                        get_task_struct(thread);
                        read_unlock(&tasklist_lock);
                        CE_CLASSIFY_TASK(CKRM_EVENT_RECLASSIFY, thread);
                        put_task_struct(thread);
                        read_lock(&tasklist_lock);
                read_unlock(&tasklist_lock);
                unsigned long *bitmap;

                bitmap = (unsigned long *)__get_free_pages(GFP_KERNEL, order);
                if (bitmap == NULL) {

                bitmapsize = 8 * (1 << (order + PAGE_SHIFT));
                num_loops = (curpidmax + bitmapsize - 1) / bitmapsize;

                for (i = 0; i < num_loops && do_next; i++) {
                        int pid_start = i * bitmapsize;
                        int pid_end = pid_start + bitmapsize;

                        memset(bitmap, 0, bitmapsize / 8);      // start afresh

                        read_lock(&tasklist_lock);
                        do_each_thread(proc, thread) {
                                if ((pid < pid_start) || (pid >= pid_end)) {
                                        if (pid >= pid_end) {
                                set_bit(pid, bitmap);
                        } while_each_thread(proc, thread);
                        read_unlock(&tasklist_lock);

                        for (; num_found--;) {
                                pos = find_next_bit(bitmap, bitmapsize, pos);
                                pid = pos + pid_start;

                                read_lock(&tasklist_lock);
                                if ((thread = find_task_by_pid(pid)) != NULL) {
                                        get_task_struct(thread);
                                        read_unlock(&tasklist_lock);
                                        CE_CLASSIFY_TASK(CKRM_EVENT_RECLASSIFY, thread);
                                        put_task_struct(thread);
                                read_unlock(&tasklist_lock);

        ce_release(&CT_taskclass);
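
/*
 * RCFS entry point for reclassification requests: a negative pid is meant to
 * address a whole process group (still a ToDo below), pid == 0 reclassifies
 * every task in the system, and a positive pid reclassifies just that task.
 */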
ckrm_reclassify(int pid)
        struct task_struct *tsk;

        down(&async_serializer);        // protect against race conditions
                // do we want to treat this as a process group? should be YES .. ToDo
        } else if (pid == 0) {
                // reclassify all tasks in the system
                ckrm_reclassify_all_tasks();
                // reclassify a particular pid
                read_lock(&tasklist_lock);
                if ((tsk = find_task_by_pid(pid)) != NULL) {
                        get_task_struct(tsk);
                        read_unlock(&tasklist_lock);
                        CE_CLASSIFY_TASK_PROTECT(CKRM_EVENT_RECLASSIFY, tsk);
                        put_task_struct(tsk);
                read_unlock(&tasklist_lock);

        up(&async_serializer);

/*
 * Reclassify all tasks in the given core class.
 */

ckrm_reclassify_class_tasks(struct ckrm_task_class *cls)
        struct ckrm_hnode *cnode;
        struct ckrm_task_class *parcls;

        if (!ckrm_validate_and_grab_core(&cls->core))

        down(&async_serializer);        // protect against race conditions

        TC_DEBUG("start %p:%s:%d:%d\n", cls, cls->core.name,
                 atomic_read(&cls->core.refcnt),
                 atomic_read(&cls->core.hnode.parent->refcnt));
        // If no CE is registered for this classtype, the following will be
        // needed repeatedly.
        ce_regd = class_core(cls)->classtype->ce_regd;
        cnode = &(class_core(cls)->hnode);
        parcls = class_type(ckrm_task_class_t, cnode->parent);

        class_lock(class_core(cls));
        if (!list_empty(&class_core(cls)->objlist)) {
                struct ckrm_task_class *newcls = NULL;
                struct task_struct *tsk =
                    list_entry(class_core(cls)->objlist.next,
                               struct task_struct, taskclass_link);

                get_task_struct(tsk);
                class_unlock(class_core(cls));

                CE_CLASSIFY_RET(newcls, &CT_taskclass, CKRM_EVENT_RECLASSIFY, tsk);

                // don't allow reclassifying to the same class,
                // as we are in the process of cleaning up this class
                ckrm_core_drop(class_core(newcls)); // to compensate for the CE's grab

                if (newcls == NULL) {
                        ckrm_core_grab(class_core(newcls));

                ckrm_set_taskclass(tsk, newcls, cls, CKRM_EVENT_RECLASSIFY);
                put_task_struct(tsk);

        TC_DEBUG("stop %p:%s:%d:%d %d\n", cls, cls->core.name,
                 atomic_read(&cls->core.refcnt),
                 atomic_read(&cls->core.hnode.parent->refcnt), num);
        class_unlock(class_core(cls));
        ckrm_core_drop(class_core(cls));

        up(&async_serializer);

/*
 * Change the core class of the given task.
 */

ckrm_forced_reclassify_pid(pid_t pid, struct ckrm_task_class *cls)
        struct task_struct *tsk;

        if (!ckrm_validate_and_grab_core(class_core(cls)))

        read_lock(&tasklist_lock);
        if ((tsk = find_task_by_pid(pid)) == NULL) {
                read_unlock(&tasklist_lock);
                ckrm_core_drop(class_core(cls));
        get_task_struct(tsk);
        read_unlock(&tasklist_lock);

        down(&async_serializer);        // protect against race conditions

        ce_protect(&CT_taskclass);
        ckrm_set_taskclass(tsk, cls, NULL, CKRM_EVENT_MANUAL);
        ce_release(&CT_taskclass);
        put_task_struct(tsk);

        up(&async_serializer);
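
/*
 * Class lifecycle: ckrm_alloc_task_class()/ckrm_free_task_class() are the
 * alloc/free hooks wired into CT_taskclass above. Both notify a registered
 * classification engine through its class_add/class_delete callbacks, and
 * freeing a class first reclassifies any tasks still in it.
 */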
static struct ckrm_core_class *
ckrm_alloc_task_class(struct ckrm_core_class *parent, const char *name)
        struct ckrm_task_class *taskcls;

        taskcls = kmalloc(sizeof(struct ckrm_task_class), GFP_KERNEL);

        ckrm_init_core_class(&CT_taskclass,
                             class_core(taskcls), parent, name);

        ce_protect(&CT_taskclass);
        if (CT_taskclass.ce_cb_active && CT_taskclass.ce_callbacks.class_add)
                (*CT_taskclass.ce_callbacks.class_add)(name, taskcls);
        ce_release(&CT_taskclass);

        return class_core(taskcls);

ckrm_free_task_class(struct ckrm_core_class *core)
        struct ckrm_task_class *taskcls;

        if (!ckrm_is_core_valid(core)) {

        if (core == core->classtype->default_class) {
                // reset the name tag
                core->name = dflt_taskclass_name;

        TC_DEBUG("%p:%s:%d\n", core, core->name, atomic_read(&core->refcnt));

        taskcls = class_type(struct ckrm_task_class, core);

        ce_protect(&CT_taskclass);

        if (CT_taskclass.ce_cb_active && CT_taskclass.ce_callbacks.class_delete)
                (*CT_taskclass.ce_callbacks.class_delete)(core->name, taskcls);
        ckrm_reclassify_class_tasks(taskcls);

        ce_release(&CT_taskclass);

        ckrm_release_core_class(core);  // Hubertus .... could just drop the class .. error message

ckrm_meta_init_taskclass(void)
        printk("...... Initializing ClassType<%s> ........\n", CT_taskclass.name);

        // initialize the default class
        ckrm_init_core_class(&CT_taskclass, class_core(&taskclass_dflt_class),
                             NULL, dflt_taskclass_name);

        // register the classtype and initialize the default task class
        ckrm_register_classtype(&CT_taskclass);
        ckrm_register_event_set(taskclass_events_callbacks);

        // note: registration of all resource controllers will be done later
        // dynamically, as these are specified as modules
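
/*
 * tc_show_members() backs the classtype's show_members hook: it writes the
 * pid of every task currently linked into the class's object list, one per
 * line, to the given seq_file (presumably surfaced through RCFS).
 */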
tc_show_members(struct ckrm_core_class *core, struct seq_file *seq)
        struct list_head *lh;
        struct task_struct *tsk;

        list_for_each(lh, &core->objlist) {
                tsk = container_of(lh, struct task_struct, taskclass_link);
                seq_printf(seq, "%ld\n", (long)tsk->pid);

tc_forced_reclassify(struct ckrm_core_class *target, const char *obj)
        pid = (pid_t)simple_strtoul(obj, NULL, 10);

        rc = ckrm_forced_reclassify_pid(pid,
                                        class_type(ckrm_task_class_t, target));

/**************************************************************************
 *              Debugging Task Classes:  Utility functions               *
 **************************************************************************/
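
/*
 * check_tasklist_sanity() walks a class's object list and complains when a
 * linked task's tsk->taskclass does not point back at that class, or when
 * the list looks corrupted (unreasonably long).
 */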
check_tasklist_sanity(struct ckrm_task_class *cls)
        struct ckrm_core_class *core = class_core(cls);
        struct list_head *lh1, *lh2;

        if (list_empty(&core->objlist)) {
                printk("check_tasklist_sanity: class %s empty list\n",

        list_for_each_safe(lh1, lh2, &core->objlist) {
                struct task_struct *tsk = container_of(lh1, struct task_struct, taskclass_link);
                if (count++ > 20000) {
                        printk("list is CORRUPTED\n");
                if (tsk->taskclass != cls) {
                        const char *tclsname;
                        tclsname = (tsk->taskclass) ? class_core(tsk->taskclass)->name
                        printk("sanity: task %s:%d has ckrm_core |%s| but in list |%s|\n",
                               tsk->comm, tsk->pid, tclsname, core->name);
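
/*
 * ckrm_debug_free_task_class() dumps every thread still pointing at the given
 * class (or every thread when tskcls is NULL), together with its PF_EXITING
 * flag, to help track down reference-count problems when a class is freed.
 */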
ckrm_debug_free_task_class(struct ckrm_task_class *tskcls)
        struct task_struct *proc, *thread;

        printk("Analyze Error <%s> %d\n",
               class_core(tskcls)->name,
               atomic_read(&(class_core(tskcls)->refcnt)));

        read_lock(&tasklist_lock);
        class_lock(class_core(tskcls));
        do_each_thread(proc, thread) {
                count += (tskcls == thread->taskclass);
                if ((thread->taskclass == tskcls) || (tskcls == NULL)) {
                        const char *tclsname;
                        tclsname = (thread->taskclass) ?
                            class_core(thread->taskclass)->name : "NULL";
                        printk("%d thread=<%s:%d> -> <%s> <%lx>\n",
                               count, thread->comm, thread->pid, tclsname,
                               thread->flags & PF_EXITING);
        } while_each_thread(proc, thread);
        class_unlock(class_core(tskcls));
        read_unlock(&tasklist_lock);

        printk("End Analyze Error <%s> %d\n",
               class_core(tskcls)->name,
               atomic_read(&(class_core(tskcls)->refcnt)));