/* ckrm_tc.c - Class-based Kernel Resource Management (CKRM)
 *
 * Copyright (C) Hubertus Franke, IBM Corp. 2003,2004
 * (C) Shailabh Nagar, IBM Corp. 2003
 * (C) Chandra Seetharaman, IBM Corp. 2003
 * (C) Vivek Kashyap, IBM Corp. 2004
 *
 * Provides the kernel API of CKRM for in-kernel, per-resource controllers
 * (one each for cpu, memory, io, network) and callbacks for
 * classification modules.
 *
 * Latest version, more details at http://ckrm.sf.net
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * Changes:
 * - Made modifications to suit the new RBCE module.
 * - Fixed a bug in the fork and exit callbacks. Added callbacks_active and
 *   surrounding logic. Added a task parameter to all CE callbacks.
 * - Moved to reference-counted class objects and corrected locking.
 * - Adapted to the emerging classtype interface.
 */
#include <linux/config.h>
#include <linux/init.h>
#include <linux/linkage.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <asm/uaccess.h>
#include <asm/errno.h>
#include <linux/string.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/ckrm_rc.h>
#include <linux/ckrm_tc.h>

#warning MEF I cannot believe that vserver changes force the following include statement: FIX THIS!
#include <linux/vs_cvirt.h>
#define TC_DEBUG(fmt, args...) do { \
	/* printk("%s: " fmt, __FUNCTION__ , ## args); */ } while (0)
static struct ckrm_task_class taskclass_dflt_class = {
};

const char *dflt_taskclass_name = TASK_CLASS_TYPE_NAME;
static struct ckrm_core_class *ckrm_alloc_task_class(struct ckrm_core_class
						     *parent, const char *name);
static int ckrm_free_task_class(struct ckrm_core_class *core);

static int tc_forced_reclassify(ckrm_core_class_t *target, const char *obj);
static int tc_show_members(struct ckrm_core_class *core, struct seq_file *seq);
static void tc_add_resctrl(struct ckrm_core_class *core, int resid);
struct ckrm_classtype CT_taskclass = {
	.name = TASK_CLASS_TYPE_NAME,
	.typeID = CKRM_CLASSTYPE_TASK_CLASS,
	.maxdepth = 3,			// Hubertus .. just to start
	.resid_reserved = 4,		// Hubertus .. reservation
	.max_res_ctlrs = CKRM_MAX_RES_CTLRS,
	.res_ctlrs_lock = SPIN_LOCK_UNLOCKED,
	.classes = LIST_HEAD_INIT(CT_taskclass.classes),

	.default_class = &taskclass_dflt_class.core,

	// private versions of the classtype functions
	.alloc = &ckrm_alloc_task_class,
	.free = &ckrm_free_task_class,
	.show_members = &tc_show_members,
	.forced_reclassify = &tc_forced_reclassify,

	// the default implementations are sufficient for these
	.show_shares = &ckrm_class_show_shares,
	.show_stats = &ckrm_class_show_stats,
	.show_config = &ckrm_class_show_config,
	.set_config = &ckrm_class_set_config,
	.set_shares = &ckrm_class_set_shares,
	.reset_stats = &ckrm_class_reset_stats,

	// mandatory private version .. no default available
	.add_resctrl = &tc_add_resctrl,
};
/**************************************************************************
 **************************************************************************/
static inline void ckrm_init_task_lock(struct task_struct *tsk)
{
	tsk->ckrm_tsklock = SPIN_LOCK_UNLOCKED;
}

// Hubertus .. the following functions should move to ckrm_rc.h
static inline void ckrm_task_lock(struct task_struct *tsk)
{
	spin_lock(&tsk->ckrm_tsklock);
}

static inline void ckrm_task_unlock(struct task_struct *tsk)
{
	spin_unlock(&tsk->ckrm_tsklock);
}
/*
 * Change the task class of the given task.
 *
 * Change the task's task class to "newcls" if the task's current
 * class (task->taskclass) is the same as the given "oldcls", when
 * "oldcls" is non-NULL.
 *
 * The caller is responsible for making sure the task structure stays
 * put for the duration of this call.
 *
 * This function should be called with the following locks NOT held:
 * - tsk->ckrm_task_lock
 * - core->ckrm_lock; if core is NULL then ckrm_dflt_class.ckrm_lock
 * - tsk->taskclass->ckrm_lock
 *
 * The function is also called with a ckrm_core_grab on the new core, hence
 * that reference needs to be dropped if no assignment takes place.
 */
static void
ckrm_set_taskclass(struct task_struct *tsk, ckrm_task_class_t * newcls,
		   ckrm_task_class_t * oldcls, enum ckrm_event event)
{
	int i;
	ckrm_classtype_t *clstype;
	ckrm_res_ctlr_t *rcbs;
	ckrm_task_class_t *curcls;
	void *old_res_class, *new_res_class;
	int drop_old_cls;

	ckrm_task_lock(tsk);
	curcls = tsk->taskclass;

	if ((void *)-1 == curcls) {
		// task is disassociated from ckrm... don't bother it.
		ckrm_task_unlock(tsk);
		ckrm_core_drop(class_core(newcls));
		return;
	}

	if ((curcls == NULL) && (newcls == (void *)-1)) {
		// task needs to be disassociated from ckrm and has no curcls;
		// just disassociate and return.
		tsk->taskclass = newcls;
		ckrm_task_unlock(tsk);
		return;
	}
	// check whether the compare_and_exchange precondition still holds
	if (oldcls && (oldcls != curcls)) {
		ckrm_task_unlock(tsk);
		/* compensate for previous grab */
		TC_DEBUG("(%s:%d): Race-condition caught <%s> %d\n",
			 tsk->comm, tsk->pid, class_core(newcls)->name,
			 event);
		ckrm_core_drop(class_core(newcls));
		return;
	}
	// make sure we have a real destination core
	if (newcls == NULL) {
		newcls = &taskclass_dflt_class;
		ckrm_core_grab(class_core(newcls));
	}
	// take the task out of its old class;
	// remember that we need to drop the old core
	if ((drop_old_cls = (curcls != NULL))) {
		class_lock(class_core(curcls));
		if (newcls == curcls) {
			// we are already in the destination class;
			// we still need to drop the old core
			class_unlock(class_core(curcls));
			ckrm_task_unlock(tsk);
			goto out;
		}
		list_del(&tsk->taskclass_link);
		INIT_LIST_HEAD(&tsk->taskclass_link);
		tsk->taskclass = NULL;
		class_unlock(class_core(curcls));
		if (newcls == (void *)-1) {
			tsk->taskclass = newcls;
			ckrm_task_unlock(tsk);
			// still need to get out of the old class
			newcls = NULL;
			goto rc_handling;
		}
	}
	// put the task into the new class
	class_lock(class_core(newcls));
	tsk->taskclass = newcls;
	list_add(&tsk->taskclass_link, &class_core(newcls)->objlist);
	class_unlock(class_core(newcls));

	if (newcls == curcls) {
		ckrm_task_unlock(tsk);
		goto out;
	}

	CE_NOTIFY(&CT_taskclass, event, newcls, tsk);

	ckrm_task_unlock(tsk);

      rc_handling:
	clstype = &CT_taskclass;
	if (clstype->bit_res_ctlrs) {
		// avoid running through the entire list if none is registered
		for (i = 0; i < clstype->max_resid; i++) {
			if (clstype->res_ctlrs[i] == NULL)
				continue;
			atomic_inc(&clstype->nr_resusers[i]);
			old_res_class =
			    curcls ? class_core(curcls)->res_class[i] : NULL;
			new_res_class =
			    newcls ? class_core(newcls)->res_class[i] : NULL;
			rcbs = clstype->res_ctlrs[i];
			if (rcbs && rcbs->change_resclass
			    && (old_res_class != new_res_class))
				(*rcbs->change_resclass) (tsk, old_res_class,
							  new_res_class);
			atomic_dec(&clstype->nr_resusers[i]);
		}
	}

      out:
	if (drop_old_cls)
		ckrm_core_drop(class_core(curcls));
}
// HF SUGGEST: we could macro-tize this for other types
// DEF_FUNC_ADD_RESCTRL(funcname,link)
// would be DEF_FUNC_ADD_RESCTRL(tc_add_resctrl,taskclass_link)

static void tc_add_resctrl(struct ckrm_core_class *core, int resid)
{
	struct task_struct *tsk;
	struct ckrm_res_ctlr *rcbs;

	if ((resid < 0) || (resid >= CKRM_MAX_RES_CTLRS)
	    || ((rcbs = core->classtype->res_ctlrs[resid]) == NULL))
		return;

	class_lock(core);
	list_for_each_entry(tsk, &core->objlist, taskclass_link) {
		if (rcbs->change_resclass)
			(*rcbs->change_resclass) (tsk, (void *)-1,
						  core->res_class[resid]);
	}
	class_unlock(core);
}
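/*
 * Illustrative sketch: how a per-resource controller would typically hook
 * into this classtype and end up being driven by tc_add_resctrl() above.
 * This assumes the ckrm_register_res_ctlr() interface from linux/ckrm_rc.h;
 * the controller name, field names and callback body below are hypothetical
 * and only meant to show the shape of the change_resclass hook used here.
 *
 *	static void my_change_resclass(void *obj, void *old, void *new)
 *	{
 *		// migrate the object's per-resource accounting from the
 *		// old class's res_class[] slot to the new one
 *	}
 *
 *	static struct ckrm_res_ctlr my_rcbs = {
 *		.res_name = "myres",
 *		.change_resclass = my_change_resclass,
 *	};
 *
 *	// from the controller's module init; on success, tc_add_resctrl()
 *	// walks the existing members of each class as shown above:
 *	//	int resid = ckrm_register_res_ctlr(&CT_taskclass, &my_rcbs);
 */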
/**************************************************************************
 *               Functions called from classification points              *
 **************************************************************************/
#define ECB_PRINTK(fmt, args...)	\
// do { if (CT_taskclass.ce_regd)
//		printk("%s: " fmt, __FUNCTION__ , ## args); } while (0)

#define CE_CLASSIFY_TASK(event, tsk)					\
do {									\
	struct ckrm_task_class *newcls = NULL;				\
	struct ckrm_task_class *oldcls = tsk->taskclass;		\
									\
	CE_CLASSIFY_RET(newcls, &CT_taskclass, event, tsk);		\
	if (newcls) {							\
		/* called synchronously. no need to get task struct */	\
		ckrm_set_taskclass(tsk, newcls, oldcls, event);		\
	}								\
} while (0)

#define CE_CLASSIFY_TASK_PROTECT(event, tsk)	\
do {						\
	ce_protect(&CT_taskclass);		\
	CE_CLASSIFY_TASK(event, tsk);		\
	ce_release(&CT_taskclass);		\
} while (0)
static void cb_taskclass_newtask(struct task_struct *tsk)
{
	tsk->taskclass = NULL;
	INIT_LIST_HEAD(&tsk->taskclass_link);
}

static void cb_taskclass_fork(struct task_struct *tsk)
{
	struct ckrm_task_class *cls = NULL;

	ECB_PRINTK("%p:%d:%s\n", tsk, tsk->pid, tsk->comm);

	ce_protect(&CT_taskclass);
	CE_CLASSIFY_RET(cls, &CT_taskclass, CKRM_EVENT_FORK, tsk);
	if (cls == NULL) {
		ckrm_task_lock(tsk->parent);
		cls = tsk->parent->taskclass;
		ckrm_core_grab(class_core(cls));
		ckrm_task_unlock(tsk->parent);
	}
	if (!list_empty(&tsk->taskclass_link))
		printk(KERN_WARNING "BUG in cb_fork.. tsk <%s:%d> already linked\n",
		       tsk->comm, tsk->pid);

	ckrm_set_taskclass(tsk, cls, NULL, CKRM_EVENT_FORK);
	ce_release(&CT_taskclass);
}

static void cb_taskclass_exit(struct task_struct *tsk)
{
	CE_CLASSIFY_NORET(&CT_taskclass, CKRM_EVENT_EXIT, tsk);
	ckrm_set_taskclass(tsk, (void *)-1, NULL, CKRM_EVENT_EXIT);
}

static void cb_taskclass_exec(const char *filename)
{
	ECB_PRINTK("%p:%d:%s <%s>\n", current, current->pid, current->comm,
		   filename);
	CE_CLASSIFY_TASK_PROTECT(CKRM_EVENT_EXEC, current);
}

static void cb_taskclass_uid(void)
{
	ECB_PRINTK("%p:%d:%s\n", current, current->pid, current->comm);
	CE_CLASSIFY_TASK_PROTECT(CKRM_EVENT_UID, current);
}

static void cb_taskclass_gid(void)
{
	ECB_PRINTK("%p:%d:%s\n", current, current->pid, current->comm);
	CE_CLASSIFY_TASK_PROTECT(CKRM_EVENT_GID, current);
}

static void
cb_taskclass_xid(struct task_struct *tsk)
{
	ECB_PRINTK("%p:%d:%s\n", current, current->pid, current->comm);
	CE_CLASSIFY_TASK_PROTECT(CKRM_EVENT_XID, tsk);
}

static struct ckrm_event_spec taskclass_events_callbacks[] = {
	CKRM_EVENT_SPEC(NEWTASK, cb_taskclass_newtask),
	CKRM_EVENT_SPEC(EXEC, cb_taskclass_exec),
	CKRM_EVENT_SPEC(FORK, cb_taskclass_fork),
	CKRM_EVENT_SPEC(EXIT, cb_taskclass_exit),
	CKRM_EVENT_SPEC(UID, cb_taskclass_uid),
	CKRM_EVENT_SPEC(GID, cb_taskclass_gid),
	CKRM_EVENT_SPEC(XID, cb_taskclass_xid),
	{-1}
};
/***********************************************************************
 *
 * Asynchronous callback functions (driven by RCFS)
 *
 * Async functions force a setting of the task structure;
 * synchronous callbacks are protected against race conditions
 * by using a cmpxchg on the core before setting it.
 * Async calls need to be serialized to ensure they can't
 * race against each other.
 *
 ***********************************************************************/
DECLARE_MUTEX(async_serializer);	// serialize all async functions
/*
 * Go through the task list and reclassify all tasks according to the current
 * classification rules.
 *
 * We have the problem that we can not hold any lock (including the
 * tasklist_lock) while classifying. Two methods are possible:
 *
 * (a) go through the entire pid range (0..pid_max) and, if a task exists at
 *     that pid, reclassify it
 * (b) go several times through the task list and build a bitmap for a
 *     particular subrange of pids; otherwise the memory requirements might
 *     be too much
 *
 * We use a hybrid, chosen by comparing the ratio nr_threads/pidmax.
 */
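/*
 * For scale, a worked example of the bitmap sizing used below: with order 0
 * and 4 KB pages (PAGE_SHIFT == 12), bitmapsize = 8 * (1 << (0 + 12)) =
 * 32768 bits, i.e. one page covers 32768 pids per pass.  With pid_max at
 * PID_MAX_DEFAULT (32768) that is a single pass; a pid_max of 131072 needs
 * num_loops = (131072 + 32767) / 32768 = 4 passes.
 */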
static int ckrm_reclassify_all_tasks(void)
{
	struct task_struct *proc, *thread;
	int i;
	int curpidmax = pid_max;
	int ratio;
	int use_bitmap;

	/* Check permissions */
	if ((!capable(CAP_SYS_NICE)) && (!capable(CAP_SYS_RESOURCE))) {
		return -EPERM;
	}

	ratio = curpidmax / nr_threads;
	if (curpidmax <= PID_MAX_DEFAULT) {
		use_bitmap = 0;
	} else {
		use_bitmap = (ratio >= 2);
	}
	ce_protect(&CT_taskclass);

	if (use_bitmap == 0) {
		// go through it in one walk
		read_lock(&tasklist_lock);
		for (i = 0; i < curpidmax; i++) {
			if ((thread = find_task_by_pid(i)) == NULL)
				continue;
			get_task_struct(thread);
			read_unlock(&tasklist_lock);
			CE_CLASSIFY_TASK(CKRM_EVENT_RECLASSIFY, thread);
			put_task_struct(thread);
			read_lock(&tasklist_lock);
		}
		read_unlock(&tasklist_lock);
	} else {
		unsigned long *bitmap;
		int bitmapsize;
		int order = 0;
		int num_loops;
		int pid, pos, num_found, do_next;

		bitmap = (unsigned long *)__get_free_pages(GFP_KERNEL, order);
		if (bitmap == NULL) {
			ce_release(&CT_taskclass);
			return -ENOMEM;
		}

		bitmapsize = 8 * (1 << (order + PAGE_SHIFT));
		num_loops = (curpidmax + bitmapsize - 1) / bitmapsize;

		do_next = 1;
		for (i = 0; i < num_loops && do_next; i++) {
			int pid_start = i * bitmapsize;
			int pid_end = pid_start + bitmapsize;

			memset(bitmap, 0, bitmapsize / 8);	// start afresh
			do_next = 0;
			num_found = 0;
			read_lock(&tasklist_lock);
			do_each_thread(proc, thread) {
				pid = thread->pid;
				if ((pid < pid_start) || (pid >= pid_end)) {
					if (pid >= pid_end) {
						do_next = 1;
					}
					continue;
				}
				set_bit(pid - pid_start, bitmap);
				num_found++;
			} while_each_thread(proc, thread);
			read_unlock(&tasklist_lock);

			pos = 0;
			for (; num_found--;) {
				pos = find_next_bit(bitmap, bitmapsize, pos);
				pid = pos + pid_start;
				read_lock(&tasklist_lock);
				if ((thread = find_task_by_pid(pid)) != NULL) {
					get_task_struct(thread);
					read_unlock(&tasklist_lock);
					CE_CLASSIFY_TASK(CKRM_EVENT_RECLASSIFY,
							 thread);
					put_task_struct(thread);
				} else {
					read_unlock(&tasklist_lock);
				}
				pos++;
			}
		}
		free_pages((unsigned long)bitmap, order);
	}
	ce_release(&CT_taskclass);
	return 0;
}
/* Reclassify all tasks in the given core class. */
static void ckrm_reclassify_class_tasks(struct ckrm_task_class *cls)
{
	int ce_regd;
	struct ckrm_hnode *cnode;
	struct ckrm_task_class *parcls;
	int num = 0;

	if (!ckrm_validate_and_grab_core(&cls->core))
		return;

	down(&async_serializer);	// protect against race conditions
	TC_DEBUG("start %p:%s:%d:%d\n", cls, cls->core.name,
		 atomic_read(&cls->core.refcnt),
		 atomic_read(&cls->core.hnode.parent->refcnt));
	// If no CE is registered for this classtype, the following will be
	// needed repeatedly.
	ce_regd = atomic_read(&class_core(cls)->classtype->ce_regd);
	cnode = &(class_core(cls)->hnode);
	parcls = class_type(ckrm_task_class_t, cnode->parent);

      next_task:
	class_lock(class_core(cls));
	if (!list_empty(&class_core(cls)->objlist)) {
		struct ckrm_task_class *newcls = NULL;
		struct task_struct *tsk =
		    list_entry(class_core(cls)->objlist.next,
			       struct task_struct, taskclass_link);

		get_task_struct(tsk);
		class_unlock(class_core(cls));

		if (ce_regd) {
			CE_CLASSIFY_RET(newcls, &CT_taskclass,
					CKRM_EVENT_RECLASSIFY, tsk);
			// don't allow reclassifying to the same class,
			// as we are in the process of cleaning up
			if (newcls == cls) {
				// compensate CE's grab
				ckrm_core_drop(class_core(newcls));
				newcls = NULL;
			}
		}
		if (newcls == NULL) {
			newcls = parcls;
			ckrm_core_grab(class_core(newcls));
		}
		ckrm_set_taskclass(tsk, newcls, cls, CKRM_EVENT_RECLASSIFY);
		put_task_struct(tsk);
		num++;
		goto next_task;
	}
	TC_DEBUG("stop %p:%s:%d:%d %d\n", cls, cls->core.name,
		 atomic_read(&cls->core.refcnt),
		 atomic_read(&cls->core.hnode.parent->refcnt), num);
	class_unlock(class_core(cls));
	ckrm_core_drop(class_core(cls));

	up(&async_serializer);
}
/* Change the core class of the given task. */
int ckrm_forced_reclassify_pid(pid_t pid, struct ckrm_task_class *cls)
{
	struct task_struct *tsk;

	if (cls && !ckrm_validate_and_grab_core(class_core(cls)))
		return -EINVAL;

	read_lock(&tasklist_lock);
	if ((tsk = find_task_by_pid(pid)) == NULL) {
		read_unlock(&tasklist_lock);
		if (cls)
			ckrm_core_drop(class_core(cls));
		return -EINVAL;
	}
	get_task_struct(tsk);
	read_unlock(&tasklist_lock);

	/* Check permissions */
	if ((!capable(CAP_SYS_NICE)) &&
	    (!capable(CAP_SYS_RESOURCE)) && (current->user != tsk->user)) {
		if (cls)
			ckrm_core_drop(class_core(cls));
		put_task_struct(tsk);
		return -EPERM;
	}

	ce_protect(&CT_taskclass);
	if (cls == NULL)
		CE_CLASSIFY_TASK(CKRM_EVENT_RECLASSIFY, tsk);
	else
		ckrm_set_taskclass(tsk, cls, NULL, CKRM_EVENT_MANUAL);
	ce_release(&CT_taskclass);
	put_task_struct(tsk);
	return 0;
}
static struct ckrm_core_class *ckrm_alloc_task_class(struct ckrm_core_class
						     *parent, const char *name)
{
	struct ckrm_task_class *taskcls;

	taskcls = kmalloc(sizeof(struct ckrm_task_class), GFP_KERNEL);
	if (taskcls == NULL)
		return NULL;
	memset(taskcls, 0, sizeof(struct ckrm_task_class));

	ckrm_init_core_class(&CT_taskclass, class_core(taskcls), parent, name);

	ce_protect(&CT_taskclass);
	if (CT_taskclass.ce_cb_active && CT_taskclass.ce_callbacks.class_add)
		(*CT_taskclass.ce_callbacks.class_add) (name, taskcls,
							CT_taskclass.typeID);
	ce_release(&CT_taskclass);

	return class_core(taskcls);
}
static int ckrm_free_task_class(struct ckrm_core_class *core)
{
	struct ckrm_task_class *taskcls;

	if (!ckrm_is_core_valid(core)) {
		return -EINVAL;
	}
	if (core == core->classtype->default_class) {
		// reset the name tag
		core->name = dflt_taskclass_name;
		return 0;
	}

	TC_DEBUG("%p:%s:%d\n", core, core->name, atomic_read(&core->refcnt));

	taskcls = class_type(struct ckrm_task_class, core);

	ce_protect(&CT_taskclass);
	if (CT_taskclass.ce_cb_active && CT_taskclass.ce_callbacks.class_delete)
		(*CT_taskclass.ce_callbacks.class_delete) (core->name, taskcls,
							   CT_taskclass.typeID);
	ckrm_reclassify_class_tasks(taskcls);
	ce_release(&CT_taskclass);

	ckrm_release_core_class(core);
	// Hubertus .... could just drop the class .. error message
	return 0;
}
void __init ckrm_meta_init_taskclass(void)
{
	printk(KERN_DEBUG "...... Initializing ClassType<%s> ........\n",
	       TASK_CLASS_TYPE_NAME);
	// initialize the default class
	ckrm_init_core_class(&CT_taskclass, class_core(&taskclass_dflt_class),
			     NULL, dflt_taskclass_name);

	// register the classtype and the default task class
	ckrm_register_classtype(&CT_taskclass);
	ckrm_register_event_set(taskclass_events_callbacks);

	// note: registration of all resource controllers will be done
	// later, dynamically, as these are specified as modules
}
static int tc_show_members(struct ckrm_core_class *core, struct seq_file *seq)
{
	struct list_head *lh;
	struct task_struct *tsk;

	class_lock(core);
	list_for_each(lh, &core->objlist) {
		tsk = container_of(lh, struct task_struct, taskclass_link);
		seq_printf(seq, "%ld\n", (long)tsk->pid);
	}
	class_unlock(core);
	return 0;
}
static int tc_forced_reclassify(struct ckrm_core_class *target, const char *obj)
{
	pid_t pid;
	int rc = -EINVAL;

	pid = (pid_t) simple_strtol(obj, NULL, 0);
	down(&async_serializer);	// protect against races with reclassify_class
	if (pid < 0) {
		// do we want to treat this as a process group? .. TBD
	} else if (pid == 0) {
		rc = (target == NULL) ? ckrm_reclassify_all_tasks() : -EINVAL;
	} else {
		struct ckrm_task_class *cls = NULL;

		if (target)
			cls = class_type(ckrm_task_class_t, target);
		rc = ckrm_forced_reclassify_pid(pid, cls);
	}
	up(&async_serializer);
	return rc;
}
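/*
 * For reference, a sketch of how this path is typically exercised from user
 * space through RCFS.  Assuming the usual /rcfs mount point and a per-class
 * reclassification magic file named "target" (the mount point and file name
 * may differ in a given tree):
 *
 *	# move pid 1234 into the class cls1
 *	echo 1234 > /rcfs/taskclass/cls1/target
 *
 *	# pid 0 written at the classtype root reclassifies every task
 *	echo 0 > /rcfs/taskclass/target
 */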
/******************************************************************************
 * Debugging Task Classes: Utility functions
 ******************************************************************************/
void check_tasklist_sanity(struct ckrm_task_class *cls)
{
	struct ckrm_core_class *core = class_core(cls);
	struct list_head *lh1, *lh2;
	int count = 0;

	if (core) {
		class_lock(core);
		if (list_empty(&core->objlist)) {
			printk(KERN_DEBUG "check_tasklist_sanity: class %s empty list\n",
			       core->name);
			class_unlock(core);
			return;
		}
		list_for_each_safe(lh1, lh2, &core->objlist) {
			struct task_struct *tsk =
			    container_of(lh1, struct task_struct,
					 taskclass_link);
			if (count++ > 20000) {
				printk(KERN_WARNING "list is CORRUPTED\n");
				break;
			}
			if (tsk->taskclass != cls) {
				const char *tclsname;
				tclsname = (tsk->taskclass) ?
				    class_core(tsk->taskclass)->name : "NULL";
				printk(KERN_WARNING "sanity: task %s:%d has ckrm_core "
				       "|%s| but in list |%s|\n", tsk->comm,
				       tsk->pid, tclsname, core->name);
			}
		}
		class_unlock(core);
	}
}
void ckrm_debug_free_task_class(struct ckrm_task_class *tskcls)
{
	struct task_struct *proc, *thread;
	int count = 0;

	printk(KERN_DEBUG "Analyze Error <%s> %d\n",
	       class_core(tskcls)->name,
	       atomic_read(&(class_core(tskcls)->refcnt)));

	read_lock(&tasklist_lock);
	class_lock(class_core(tskcls));
	do_each_thread(proc, thread) {
		count += (tskcls == thread->taskclass);
		if ((thread->taskclass == tskcls) || (tskcls == NULL)) {
			const char *tclsname;
			tclsname = (thread->taskclass) ?
			    class_core(thread->taskclass)->name : "NULL";
			printk(KERN_DEBUG "%d thread=<%s:%d> -> <%s> <%lx>\n",
			       count, thread->comm, thread->pid, tclsname,
			       thread->flags & PF_EXITING);
		}
	} while_each_thread(proc, thread);
	class_unlock(class_core(tskcls));
	read_unlock(&tasklist_lock);

	printk(KERN_DEBUG "End Analyze Error <%s> %d\n",
	       class_core(tskcls)->name,
	       atomic_read(&(class_core(tskcls)->refcnt)));
}