From 7ae96b0cada94a9d7b46881e11cc926250dc6b2a Mon Sep 17 00:00:00 2001 From: Marc Fiuczynski Date: Mon, 8 Aug 2005 21:12:46 +0000 Subject: [PATCH] Removed CKRM code base from kernel tree. --- Documentation/ckrm/block_io | 154 -- Documentation/ckrm/ckrm_basics | 66 - Documentation/ckrm/core_usage | 72 - Documentation/ckrm/cpusched | 86 - Documentation/ckrm/crbce | 33 - Documentation/ckrm/installation | 70 - Documentation/ckrm/mem_rc.design | 178 -- Documentation/ckrm/mem_rc.usage | 112 - Documentation/ckrm/numtasks | 122 - Documentation/ckrm/rbce_basics | 67 - Documentation/ckrm/rbce_usage | 98 - configs/kernel-2.6.10-i686-planetlab.config | 39 +- drivers/block/Makefile | 3 +- drivers/block/ckrm-io.c | 508 ---- drivers/block/ckrm-iostub.c | 78 - fs/Makefile | 1 - fs/exec.c | 4 - fs/proc/array.c | 18 - fs/proc/base.c | 7 - fs/rcfs/Makefile | 10 - fs/rcfs/dir.c | 336 --- fs/rcfs/inode.c | 204 -- fs/rcfs/magic.c | 530 ---- fs/rcfs/rootdir.c | 227 -- fs/rcfs/socket_fs.c | 338 --- fs/rcfs/super.c | 304 --- fs/rcfs/tc_magic.c | 94 - include/asm-x86_64/unistd.h | 10 - include/linux/ckrm-io.h | 41 - include/linux/ckrm_ce.h | 108 - include/linux/ckrm_classqueue.h | 130 - include/linux/ckrm_events.h | 192 -- include/linux/ckrm_mem.h | 105 - include/linux/ckrm_mem_inline.h | 403 --- include/linux/ckrm_net.h | 41 - include/linux/ckrm_rc.h | 355 --- include/linux/ckrm_sched.h | 562 ---- include/linux/ckrm_tc.h | 50 - include/linux/ckrm_tsk.h | 35 - include/linux/crbce.h | 175 -- include/linux/fs.h | 22 - include/linux/init_task.h | 1 - include/linux/mm.h | 4 - include/linux/mm_inline.h | 17 - include/linux/mmzone.h | 6 +- include/linux/page-flags.h | 11 - include/linux/rbce.h | 127 - include/linux/rcfs.h | 96 - include/linux/sched.h | 47 - include/linux/taskdelays.h | 20 - include/linux/tcp.h | 33 - include/net/sock.h | 3 - include/net/tcp.h | 107 - init/Kconfig | 181 -- init/main.c | 6 - kernel/Makefile | 3 +- kernel/ckrm/Makefile | 15 - kernel/ckrm/ckrm.c | 927 ------- kernel/ckrm/ckrm_cpu_class.c | 388 --- kernel/ckrm/ckrm_cpu_monitor.c | 1023 -------- kernel/ckrm/ckrm_events.c | 97 - kernel/ckrm/ckrm_listenaq.c | 495 ---- kernel/ckrm/ckrm_mem.c | 981 ------- kernel/ckrm/ckrm_memcore.c | 628 ----- kernel/ckrm/ckrm_memctlr.c | 439 ---- kernel/ckrm/ckrm_null_class.c | 308 --- kernel/ckrm/ckrm_numtasks.c | 496 ---- kernel/ckrm/ckrm_numtasks_stub.c | 53 - kernel/ckrm/ckrm_sockc.c | 576 ---- kernel/ckrm/ckrm_tc.c | 802 ------ kernel/ckrm/ckrmutils.c | 200 -- kernel/ckrm/rbce/Makefile | 13 - kernel/ckrm/rbce/bitvector.h | 152 -- kernel/ckrm/rbce/crbce.h | 152 -- kernel/ckrm/rbce/crbcemod.c | 2 - kernel/ckrm/rbce/info.h | 58 - kernel/ckrm/rbce/rbce.h | 122 - kernel/ckrm/rbce/rbce_fs.c | 490 ---- kernel/ckrm/rbce/rbcemod.c | 2611 ------------------- kernel/ckrm/rbce/rbcemod_ext.c | 622 ----- kernel/ckrm/rbce/token.c | 301 --- kernel/ckrm_classqueue.c | 211 -- kernel/ckrm_sched.c | 217 -- kernel/exit.c | 6 - kernel/fork.c | 17 - kernel/sched.c | 793 +----- kernel/sys.c | 7 - kernel/vserver/context.c | 4 - mm/memory.c | 9 +- mm/page_alloc.c | 14 - mm/swap.c | 10 - mm/vmscan.c | 329 +-- net/ipv4/Kconfig | 23 - net/ipv4/tcp.c | 166 -- net/ipv4/tcp_ipv4.c | 34 - net/ipv4/tcp_minisocks.c | 9 +- net/ipv4/tcp_timer.c | 17 - net/ipv6/tcp_ipv6.c | 32 - 98 files changed, 87 insertions(+), 20412 deletions(-) delete mode 100644 Documentation/ckrm/block_io delete mode 100644 Documentation/ckrm/ckrm_basics delete mode 100644 Documentation/ckrm/core_usage delete mode 100644 Documentation/ckrm/cpusched delete mode 100644 
Documentation/ckrm/crbce delete mode 100644 Documentation/ckrm/installation delete mode 100644 Documentation/ckrm/mem_rc.design delete mode 100644 Documentation/ckrm/mem_rc.usage delete mode 100644 Documentation/ckrm/numtasks delete mode 100644 Documentation/ckrm/rbce_basics delete mode 100644 Documentation/ckrm/rbce_usage delete mode 100644 drivers/block/ckrm-io.c delete mode 100644 drivers/block/ckrm-iostub.c delete mode 100644 fs/rcfs/Makefile delete mode 100644 fs/rcfs/dir.c delete mode 100644 fs/rcfs/inode.c delete mode 100644 fs/rcfs/magic.c delete mode 100644 fs/rcfs/rootdir.c delete mode 100644 fs/rcfs/socket_fs.c delete mode 100644 fs/rcfs/super.c delete mode 100644 fs/rcfs/tc_magic.c delete mode 100644 include/linux/ckrm-io.h delete mode 100644 include/linux/ckrm_ce.h delete mode 100644 include/linux/ckrm_classqueue.h delete mode 100644 include/linux/ckrm_events.h delete mode 100644 include/linux/ckrm_mem.h delete mode 100644 include/linux/ckrm_mem_inline.h delete mode 100644 include/linux/ckrm_net.h delete mode 100644 include/linux/ckrm_rc.h delete mode 100644 include/linux/ckrm_sched.h delete mode 100644 include/linux/ckrm_tc.h delete mode 100644 include/linux/ckrm_tsk.h delete mode 100644 include/linux/crbce.h delete mode 100644 include/linux/rbce.h delete mode 100644 include/linux/rcfs.h delete mode 100644 include/linux/taskdelays.h delete mode 100644 kernel/ckrm/Makefile delete mode 100644 kernel/ckrm/ckrm.c delete mode 100644 kernel/ckrm/ckrm_cpu_class.c delete mode 100644 kernel/ckrm/ckrm_cpu_monitor.c delete mode 100644 kernel/ckrm/ckrm_events.c delete mode 100644 kernel/ckrm/ckrm_listenaq.c delete mode 100644 kernel/ckrm/ckrm_mem.c delete mode 100644 kernel/ckrm/ckrm_memcore.c delete mode 100644 kernel/ckrm/ckrm_memctlr.c delete mode 100644 kernel/ckrm/ckrm_null_class.c delete mode 100644 kernel/ckrm/ckrm_numtasks.c delete mode 100644 kernel/ckrm/ckrm_numtasks_stub.c delete mode 100644 kernel/ckrm/ckrm_sockc.c delete mode 100644 kernel/ckrm/ckrm_tc.c delete mode 100644 kernel/ckrm/ckrmutils.c delete mode 100644 kernel/ckrm/rbce/Makefile delete mode 100644 kernel/ckrm/rbce/bitvector.h delete mode 100644 kernel/ckrm/rbce/crbce.h delete mode 100644 kernel/ckrm/rbce/crbcemod.c delete mode 100644 kernel/ckrm/rbce/info.h delete mode 100644 kernel/ckrm/rbce/rbce.h delete mode 100644 kernel/ckrm/rbce/rbce_fs.c delete mode 100644 kernel/ckrm/rbce/rbcemod.c delete mode 100644 kernel/ckrm/rbce/rbcemod_ext.c delete mode 100644 kernel/ckrm/rbce/token.c delete mode 100644 kernel/ckrm_classqueue.c delete mode 100644 kernel/ckrm_sched.c diff --git a/Documentation/ckrm/block_io b/Documentation/ckrm/block_io deleted file mode 100644 index e4a0b8b95..000000000 --- a/Documentation/ckrm/block_io +++ /dev/null @@ -1,154 +0,0 @@ -CKRM I/O controller - -Last updated: Sep 21, 2004 - - -Intro ----- - -CKRM's I/O scheduler is developed as a delta over a modified version of -the Complete Fair Queueing scheduler (CFQ) that implements I/O priorities. -The latter's original posting can be found at: - http://www.ussg.iu.edu/hypermail/linux/kernel/0311.1/0019.html - -Please note that this is not the CFQ version currently in the mainline (Linus) kernel -(2.6.8.1 at time of writing), which provides equal, not prioritized, -bandwidth allocation amongst processes. Since the CFQ in the kernel is likely -to eventually move towards an I/O priority implementation, CKRM has not renamed -the underlying I/O scheduler and simply replaces drivers/block/cfq-iosched.c -with the modified version. - -Installation ------------ - -1. 
Configure "Disk I/O Resource Controller" under CKRM (see -Documentation/ckrm/installation) - -2. After booting into the new kernel, load ckrm-io - # modprobe ckrm-io - -3. Verify that reading /rcfs/taskclass/shares displays values for the -I/O controller (res=cki). - -4. Mount sysfs for monitoring bandwidth received (temporary solution till -a userlevel tool is developed) - # mount -t sysfs none /sys - - -Usage ------ - -For brevity, we assume we are in the /rcfs/taskclass directory for all the -code snippets below. - -Initially, the systemwide default class gets 100% of the I/O bandwidth. - - $ cat stats - - - 20 total ioprio - 20 unused/default ioprio - -The first value is the share of a class, as a parent. The second is the share -of its default subclass. Initially the two are equal. As named subclasses get -created and assigned shares, the default subclass' share (which equals the -"unused" portion of the parent's allocation) dwindles. - - -CFQ assigns one of 20 I/O priorities to all I/O requests. Each priority level -gets a fixed proportion of the total bandwidth in increments of 5%. e.g. - ioprio=1 gets 5%, - ioprio=2 gets 10%..... - all the way through ioprio=19 getting 95% - -ioprio=0 gets bandwidth only if no other priority level submits I/O i.e. it can -get starved. -ioprio=20 is considered realtime I/O and always gets priority. - -CKRM's I/O scheduler distributes these 20 priority levels amongst the hierarchy -of classes according to the relative share of each class. Thus, root starts out -with the total allocation of 20 initially. As children get created and shares -assigned to them, root's allocation reduces. At any time, the sum of absolute -share values of all classes equals 20. - - - -Class creation --------------- - - $ mkdir a - -Its initial share is zero. The parent's share values will be unchanged. Note -that even classes with zero share get unused bandwidth under CFQ. - -Setting a new class share -------------------------- - - $ echo "res=cki,guarantee=20" > /rcfs/taskclass/a/shares - Set cki shares to 20 -1 -1 -1 - - $ echo a/shares - - res=cki,guarantee=20,limit=100,total_guarantee=100,max_limit=100 - -The limit and max_limit fields can be ignored as they are not implemented. -The absolute share of a is 20% of parent's absolute total (20) and can be seen -through - $ echo a/stats - - - 4 total ioprio - 4 unused/default ioprio - -Since a gets 4, parent's default's share diminishes accordingly. Thus - - $ echo stats - - - 20 total ioprio - 16 unused/default ioprio - - -Monitoring ----------- - -Each priority level's request service rate can be viewed through sysfs (mounted -during installation). To view the servicing of priority 4's requests, - - $ while : ; echo /sys/block//queue/iosched/p4 ; sleep 1 ; done - rq (10,15) sec (20,30) q (40,50) - - - -where - rq = cumulative I/O requests received (10) and serviced (15) - sec = cumulative sectors requested (20) and served (30) - q = cumulative number of times the queue was created(40)/destroyed (50) - -The rate at which requests or sectors are serviced should differ for different -priority levels. The difference in received and serviced values indicates queue -depth - with insufficient depth, differentiation between I/O priority levels -will not be observed. - -The rate of q creation is not significant for CKRM. - - -Caveats -------- - -CFQ's I/O differentiation is still being worked upon so its better to choose -widely separated share values to observe differences in delivered I/O -bandwidth. 
-Caveats ------- - -CFQ's I/O differentiation is still being worked on, so it's better to choose -widely separated share values to observe differences in delivered I/O -bandwidth. - -CFQ, and consequently CKRM, does not provide limits yet. So it is not possible -to completely limit an I/O hog process by putting it in a class with a low I/O -share. Only if the competing classes maintain sufficient queue depth (i.e., a -high I/O issue rate) will they get preferential treatment. However, they may -still see latency degradation due to seeks caused by servicing of the low -priority class. - -When limits are implemented, this behaviour will be rectified. - -Please post questions on the CKRM I/O scheduler on ckrm-tech@lists.sf.net. - - diff --git a/Documentation/ckrm/ckrm_basics b/Documentation/ckrm/ckrm_basics deleted file mode 100644 index cfd9a9256..000000000 --- a/Documentation/ckrm/ckrm_basics +++ /dev/null @@ -1,66 +0,0 @@ -CKRM Basics ------------- -A brief review of CKRM concepts and terminology will help make installation -and testing easier. For more details, please visit http://ckrm.sf.net. - -Currently there are two class types, taskclass and socketclass, for grouping, -regulating and monitoring tasks and sockets respectively. - -To avoid repeating instructions for each classtype, this document assumes a -task to be the kernel object being grouped. By and large, one can replace task -with socket and taskclass with socketclass. - -RCFS depicts a CKRM class as a directory. A hierarchy of classes can be -created in which the children of a class share the resources allotted to -the parent. Tasks can be classified to any class, at any level. -There is no correlation between the parent-child relationship of tasks and -the parent-child relationship of the classes they belong to. - -Without a Classification Engine, a task inherits the class of its parent. A privileged -user can reassign a task to a class as described below, after which all -the child tasks spawned by that task will be assigned to that class, unless the -user reassigns any of them. - -A Classification Engine, if one exists, will be used by CKRM to -classify a task to a class. The Rule-based Classification Engine uses some -of the attributes of the task to classify it. When a CE is present, -a task does not inherit its parent's class. - -Characteristics of a class can be accessed/changed through the following magic -files under the directory representing the class: - -shares: allows changing the shares of the different resources managed by the - class -stats: allows viewing the statistics associated with each resource managed - by the class -target: allows assigning a task to a class. If a CE is present, assigning - a task to a class through this interface will prevent the CE from - reassigning the task to any class during reclassification. -members: allows viewing which tasks have been assigned to a class -config: allows viewing and modifying configuration information of the different - resources in a class. - -Resource allocations for a class are controlled by the parameters: - -guarantee: specifies how much of a resource is guaranteed to a class. A - special value, DONT_CARE (-2), means that no specific - guarantee is specified; this class may not get - any resource if the system is running short of resources. -limit: specifies the maximum amount of the resource that is allowed to be - allocated to a class. A special value, DONT_CARE (-2), means that - no specific limit is specified; this class can get all - the resources available. -total_guarantee: total guarantee that is allowed among the children of this - class. In other words, the sum of the "guarantee"s of all children - of this class cannot exceed this number. -max_limit: maximum "limit" allowed for any of this class's children. In - other words, the "limit" of any child of this class cannot exceed - this value. - -None of these parameters is absolute or has any units associated with -it. They are just numbers (relative to the parent's) that are -used to calculate the absolute amount of a resource available to a specific -class. - -Note: The root class has an absolute number of resource units associated with it. - 
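Since all share values are relative until the root, the conversion to absolute units can be sketched in a few lines. This is an illustration with invented names; the same arithmetic appears in the controllers this patch removes (e.g. cki_recalc_propagate() in drivers/block/ckrm-io.c):

    #define CKRM_SHARE_DONTCARE (-2)

    /* Illustrative: absolute allocation of a child class, given its
     * relative guarantee, the parent's absolute allocation and the
     * parent's total_guarantee. */
    int absolute_guarantee(int my_guarantee, int parent_absolute,
                           int parent_total_guarantee)
    {
            if (my_guarantee == CKRM_SHARE_DONTCARE ||
                parent_absolute == CKRM_SHARE_DONTCARE)
                    return CKRM_SHARE_DONTCARE;  /* nothing is promised */
            if (parent_total_guarantee == 0)
                    return 0;
            return my_guarantee * parent_absolute / parent_total_guarantee;
    }

A child with guarantee=20 under a root that owns 20 absolute units with total_guarantee=100 thus gets 20 * 20 / 100 = 4 absolute units, matching the block_io example above.
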
diff --git a/Documentation/ckrm/core_usage b/Documentation/ckrm/core_usage deleted file mode 100644 index 6b5d808c3..000000000 --- a/Documentation/ckrm/core_usage +++ /dev/null @@ -1,72 +0,0 @@ -Usage of CKRM without a classification engine ----------------------------------------------- - -1. Create a class - - # mkdir /rcfs/taskclass/c1 - creates a taskclass named c1, while - # mkdir /rcfs/socket_class/s1 - creates a socketclass named s1 - -The newly created class directory is automatically populated with the magic files -shares, stats, members, target and config. - -2. View default shares - - # cat /rcfs/taskclass/c1/shares - - "guarantee=-2,limit=-2,total_guarantee=100,max_limit=100" is the default - value set for resources that have controllers registered with CKRM. - -3. Change shares of a class - - One or more of the following fields can/must be specified - res=<controller name> #mandatory - guarantee=<number> - limit=<number> - total_guarantee=<number> - max_limit=<number> - e.g. - # echo "res=numtasks,limit=20" > /rcfs/taskclass/c1/shares - - If any of these parameters are not specified, the current value will be - retained. - -4. Reclassify a task (listening socket) - - write the pid of the process to the destination class' target file - # echo 1004 > /rcfs/taskclass/c1/target - - write the "<ipaddress>\<port>" string to the destination class' target file - # echo "0.0.0.0\32770" > /rcfs/taskclass/c1/target - -5. Get a list of tasks (sockets) assigned to a taskclass (socketclass) - - # cat /rcfs/taskclass/c1/members - lists pids of tasks belonging to c1 - - # cat /rcfs/socket_class/s1/members - lists the ipaddress\port of all listening sockets in s1 - -6. Get the statistics of the different resources of a class - - # cat /rcfs/taskclass/c1/stats - shows c1's statistics for each resource with a registered resource - controller. - - # cat /rcfs/socket_class/s1/stats - shows s1's stats for the listenaq controller. - -7. View the configuration values of the resources associated with a class - - # cat /rcfs/taskclass/c1/config - shows per-controller config values for c1. - -8. Change the configuration values of resources associated with a class - Configuration values are different for different resources. The common - field "res=" must always be specified. - - # echo "res=numtasks,parameter=value" > /rcfs/taskclass/c1/config - to change the value associated with <parameter> (the generic pair - shown above has no actual effect). - 
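Reclassification (step 4 above) can also be driven from a program rather than the shell. A hedged sketch only: it assumes a class c1 exists and simply mirrors `echo <pid> > /rcfs/taskclass/c1/target`; the path and error handling are illustrative:

    #include <stdio.h>
    #include <stdlib.h>

    /* Reclassify a task: write its pid into the target class's
     * "target" magic file, exactly as the shell example does. */
    int main(int argc, char *argv[])
    {
            const char *target = "/rcfs/taskclass/c1/target";
            FILE *f;

            if (argc != 2) {
                    fprintf(stderr, "usage: %s <pid>\n", argv[0]);
                    return 1;
            }
            f = fopen(target, "w");
            if (!f) {
                    perror(target);
                    return 1;
            }
            fprintf(f, "%ld", strtol(argv[1], NULL, 10));
            return fclose(f) ? 1 : 0;
    }
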
diff --git a/Documentation/ckrm/cpusched b/Documentation/ckrm/cpusched deleted file mode 100644 index 01f7f232a..000000000 --- a/Documentation/ckrm/cpusched +++ /dev/null @@ -1,86 +0,0 @@ -CKRM CPU Scheduling =================== - -Overview ------- - -In CKRM, cpu scheduling is based on a two-level scheduling decision. -Every time a new task is to be selected, the scheduler first determines -which class to run next and then schedules the next task in the selected -class. - -The scheduling within a class is performed using the default Linux -O(1) scheduler. - -The class scheduler also follows the O(1) principle and works as -follows: - -Each class maintains a local runqueue per cpu, or lrq for short. The existing O(1) scheduler is used to -schedule within an lrq. - -Weights are assigned to each lrq that mirror the effective shares of -that class. Every time a task executes, its weighted cycles are -charged against its class. Thus classes progress in a virtual time, called -cumulative virtual time (CVT). In essence, the class with the smallest -CVT is selected next (see the sketch at the end of this document). Provisions are made to preserve interactivity and -to avoid starvation of longer-sleeping classes. - -Load balancing across an SMP is performed by balancing the load of -each class across CPUs such that the CPUs carry equal load and thus, -on the whole system, classes maintain their share. - -Because CKRM uses a class hierarchy, cycles that are unused -by a class are redistributed among its busy siblings. - -Enabling the CKRM CPU scheduler ------------------------------- - -The scheduler is integrated into the Linux scheduler and therefore -cannot be loaded dynamically like the other CKRM controllers. - -However, it can be selected at boot time or dynamically at run time. - -The boot options "ckrmcpu" or "nockrmcpu" enable/disable the CKRM -cpu scheduler at boot time. Currently, by default the scheduler is -disabled. - -# cat /rcfs/taskclass/config - -"res=cpu,mode=enabled" indicates that the CKRM cpu scheduler is -enabled - -"res=cpu,mode=disabled" indicates that the CKRM cpu scheduler is -disabled - -The same strings can also be used to change the scheduling mode -dynamically at runtime. For example, to dynamically activate the scheduler: - -# echo "res=cpu,mode=enabled" > /rcfs/taskclass/config - -# cat /rcfs/taskclass/*/stats - -The cpu portion of the stats output looks like - - "cpu-usage(2,10,60)= 290 340 510" - -The 3 numbers represent the load over the last 2 seconds, 10 seconds -and 60 seconds, with a base of 1000. -Hence the class above used 29.0%, 34.0% and 51.0% of the cpu respectively. - -For debugging purposes additional information can be printed out, but -that format should not be relied upon. - -Use `echo "res=cpu,usage_detail=3" > /rcfs/taskclass/config` for the highest detail on usage. -Please consult the source code for the specifics. - -Assigning shares --------------- - -Follows the general approach described under ckrm_basics. - -# echo "res=cpu,guarantee=<val>" > shares - -sets the minimum guarantee of a class. - 
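The class-level pick described in the Overview can be sketched as follows. This is an illustration under simplifying assumptions (a flat array of per-class runqueues; interactivity and starvation provisions omitted); the real implementation removed by this patch lived in kernel/ckrm_sched.c and kernel/ckrm_classqueue.c:

    /* Illustrative data layout, not the kernel's. */
    struct lrq {
            unsigned long long cvt;     /* cumulative virtual time */
            int nr_running;             /* runnable tasks in this class */
    };

    /* Among runnable classes, the one with the smallest CVT runs next;
     * the O(1) scheduler then picks a task inside the chosen lrq. */
    struct lrq *pick_next_lrq(struct lrq *lrqs, int n)
    {
            struct lrq *best = 0;
            int i;

            for (i = 0; i < n; i++) {
                    if (!lrqs[i].nr_running)
                            continue;
                    if (!best || lrqs[i].cvt < best->cvt)
                            best = &lrqs[i];
            }
            return best;
    }
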
diff --git a/Documentation/ckrm/crbce b/Documentation/ckrm/crbce deleted file mode 100644 index dfb4b1e96..000000000 --- a/Documentation/ckrm/crbce +++ /dev/null @@ -1,33 +0,0 @@ -CRBCE ---------- - -crbce is a superset of rbce. In addition to providing automatic -classification, the crbce module -- monitors per-process delay data that is collected by the delay -accounting patch, -- collects data on significant kernel events where reclassification -could occur, e.g. fork/exec/setuid/setgid etc., and -- uses relayfs to supply both these datapoints to userspace - -To illustrate the utility of the data gathered by crbce, we provide a -userspace daemon called crbcedmn that prints the header info received -from the records sent by the crbce module. - -0. Ensure that a CKRM-enabled kernel with the following options configured - has been compiled. At a minimum: core, rcfs, at least one classtype, - the delay-accounting patch and relayfs. For testing, it is recommended - that all classtypes and resource controllers be compiled as modules. - -1. Ensure that the Makefile sets BUILD_CRBCE=1 and that KDIR points to the - kernel tree from step 0, then call make. - This also builds the userspace daemon, crbcedmn. - -2..9 Same as the rbce installation and testing instructions, - except replacing rbce.ko with crbce.ko - -10. Read the pseudo daemon's help output - # ./crbcedmn -h - -11. Run crbcedmn to display all records being processed - # ./crbcedmn - diff --git a/Documentation/ckrm/installation b/Documentation/ckrm/installation deleted file mode 100644 index 0c9033891..000000000 --- a/Documentation/ckrm/installation +++ /dev/null @@ -1,70 +0,0 @@ -Kernel installation ------------------------------ - -<linuxver> = version of the mainline Linux kernel -<ckrmver> = version of CKRM - -Note: It is expected that CKRM versions will change fairly rapidly. Hence, once -a CKRM version has been released for some <linuxver>, it will only be made -available for future <linuxver>s until the next CKRM version is released. - -1. Patch - - Apply ckrm/kernel/<linuxver>/ckrm-<ckrmver>.patch to a mainline kernel - tree with version <linuxver>. - - If CRBCE will be used, additionally apply the following patches, in order: - delayacctg-<ckrmver>.patch - relayfs-<ckrmver>.patch - - -2. Configure - -Select the appropriate configuration options: - -a. For taskclasses - - General Setup-->Class Based Kernel Resource Management - - [*] Class Based Kernel Resource Management - Resource Class File System (User API) - [*] Class Manager for Task Groups - Number of Tasks Resource Manager - -b. To test socketclasses and the multiple accept queue controller - - General Setup-->Class Based Kernel Resource Management - [*] Class Based Kernel Resource Management - Resource Class File System (User API) - [*] Class Manager for socket groups - Multiple Accept Queues Resource Manager - - Device Drivers-->Networking Support-->Networking options--> - [*] Network packet filtering (replaces ipchains) - [*] IP: TCP Multiple accept queues support - -c. To test CRBCE later (requires 2a.) - - File Systems-->Pseudo filesystems--> - Relayfs filesystem support - (enable all sub fields) - - General Setup--> - [*] Enable delay accounting - - -3. Build, and boot into the kernel - -4. Enable rcfs - - # insmod <kernel tree>/fs/rcfs/rcfs.ko - # mount -t rcfs rcfs /rcfs - - This will create the directories /rcfs/taskclass and - /rcfs/socketclass, which are the "roots" of the subtrees for creating - taskclasses and socketclasses respectively. - -5. Load the numtasks and listenaq controllers - - # insmod <kernel tree>/kernel/ckrm/ckrm_tasks.ko - # insmod <kernel tree>/kernel/ckrm/ckrm_listenaq.ko diff --git a/Documentation/ckrm/mem_rc.design b/Documentation/ckrm/mem_rc.design deleted file mode 100644 index 1c020ff5a..000000000 --- a/Documentation/ckrm/mem_rc.design +++ /dev/null @@ -1,178 +0,0 @@ -0. Lifecycle of an LRU Page: ---------------------------- -These are the events in a page's lifecycle: - - allocation of the page - there are multiple high-level page alloc functions; __alloc_pages() - is the lowest level function that does the real allocation. - - getting onto an LRU list (active list or inactive list) - - getting off an LRU list - - freeing the page - there are multiple high-level page free functions; free_pages_bulk() - is the lowest level function that does the real free. - -When the memory subsystem runs low on LRU pages, pages are reclaimed by - - moving pages from the active list to the inactive list (refill_inactive_zone()) - - freeing pages from the inactive list (shrink_zone()) -depending on the recent usage of the page (approximately). - -In the course of this life cycle a page can move from the lru list to swap -and back. For this document's purpose, we treat that the same as freeing and -allocating the page, respectively. - -1. Introduction --------------- -The memory resource controller controls the number of lru physical pages -(active and inactive lists) a class uses.
 It does not restrict any -other physical pages (slabs, etc.). - -For simplicity, this document will always refer to lru physical pages as -physical pages or simply pages. - -There are two parameters (set by the user) that affect the number -of pages a class is allowed to have in the active/inactive lists. -They are - - guarantee - specifies the number of pages a class is - guaranteed to get. In other words, if a class is using less than - 'guarantee' number of pages, its pages will not be freed when the - memory subsystem tries to free some pages. - - limit - specifies the maximum number of pages a class can get; - 'limit' in essence can be considered the 'hard limit'. - -The rest of this document details how these two parameters are used in the -memory allocation logic. - -Note that the numbers specified in the shares file do not -directly correspond to the number of pages. But the user can make -it so by setting the total_guarantee and max_limit of the default class -(/rcfs/taskclass) to the total number of pages (given in the stats file) -available in the system. - - for example: - # cd /rcfs/taskclass - # grep System stats - System: tot_pages=257512,active=5897,inactive=2931,free=243991 - # cat shares - res=mem,guarantee=-2,limit=-2,total_guarantee=100,max_limit=100 - - "tot_pages=257512" above means there are 257512 lru pages in - the system. - - By making total_guarantee and max_limit the same as this number at - this level (/rcfs/taskclass), one can make guarantee and limit in all - classes refer to the number of pages. - - # echo 'res=mem,total_guarantee=257512,max_limit=257512' > shares - # cat shares - res=mem,guarantee=-2,limit=-2,total_guarantee=257512,max_limit=257512 - - -The number of pages a class can use can be anywhere between its guarantee and -limit. The CKRM memory controller springs into action when the system needs -to choose a victim page to swap out. While the number of pages a class has -allocated may be anywhere between its guarantee and limit, victim -pages will be chosen from classes that are above their guarantee. - -The victim class is chosen by the number of pages it is using over its -guarantee, i.e. a class that is using 10000 pages over its guarantee will be -chosen ahead of a class that is using 1000 pages over its guarantee. -Pages belonging to classes that are below their guarantee will not be -chosen as victims. - -2. Configuration parameters --------------------------- - -The memory controller provides the following configuration parameters. Usage of -these parameters will be made clear in the following section. - -fail_over: When pages are being allocated, if the class is over fail_over % of - its limit, then fail the memory allocation. Default is 110. - ex: If the limit of a class is 30000 and fail_over is 110, then memory - allocations would start failing once the class is using more than 33000 - pages. - -shrink_at: When a class is using shrink_at % of its limit, then start - shrinking the class, i.e. start freeing pages to make more free pages - available for this class. Default is 90. - ex: If the limit of a class is 30000 and shrink_at is 90, then pages from this - class will start to get freed when the class's usage is above 27000. - -shrink_to: When a class reaches shrink_at % of its limit, ckrm will try to - shrink the class's usage to shrink_to %. Default is 80. - ex: If the limit of a class is 30000 with shrink_at being 90 and shrink_to - being 80, then ckrm will try to free pages from the class when its - usage reaches 27000 and will try to bring it down to 24000. - -num_shrinks: Number of shrink attempts ckrm will make within shrink_interval - seconds. After this many attempts in a period, ckrm will not attempt a - shrink even if the class's usage goes over shrink_at %. Default is 10. - -shrink_interval: Number of seconds in a shrink period. Default is 10. - 
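Putting the three percentage thresholds together, the allocation-time policy can be sketched as below. The enum and function names are invented for illustration; the numbers in the comment are the config defaults and examples above:

    enum mem_action { MEM_OK, MEM_SHRINK, MEM_FAIL };

    /* Illustrative decision on the allocation path (see 3.1 below). */
    enum mem_action page_alloc_action(long usage, long limit,
                                      int fail_over, int shrink_at)
    {
            if (usage > limit * fail_over / 100)
                    return MEM_FAIL;        /* __alloc_pages() fails */
            if (usage > limit * shrink_at / 100)
                    return MEM_SHRINK;      /* free down to shrink_to% */
            return MEM_OK;
    }

    /* With limit=30000, fail_over=110, shrink_at=90: allocations fail
     * above 33000 pages, and shrinking starts above 27000. */
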
3. Design -------------------------- - -The CKRM memory resource controller taps into the appropriate low-level memory -management functions to associate a page with a class and to charge -the class that brings the page onto the LRU list. - -CKRM maintains lru lists per class instead of keeping them system-wide, so -that reducing a class's usage doesn't involve going through the system-wide -lru lists. - -3.1 Changes in the page allocation function (__alloc_pages()) ------------------------------------------------------- -- If the class that the current task belongs to is over 'fail_over' % of its - 'limit', the allocation of the page(s) fails. Otherwise, the page allocation - proceeds as before. -- Note that the class is _not_ charged for the page(s) here. - -3.2 Changes in page free (free_pages_bulk()) ------------------------------------------ -- If the page still belongs to a class, the class will be credited for this - page. - -3.3 Adding/deleting a page to/from the active/inactive lists ------------------------------------------------ -When a page is added to the active or inactive list, the class that the -task belongs to is charged for the page usage. - -When a page is deleted from the active or inactive list, the class that the -page belongs to is credited back. - -If a class uses 'shrink_at' % of its limit, an attempt is made to shrink -the class's usage to 'shrink_to' % of its limit, in order to help the class -stay within its limit. -But if the class is aggressive and keeps going over its limit -often (more than 'num_shrinks' such events in 'shrink_interval' seconds), -then the memory resource controller gives up on the class and doesn't try -to shrink it, which will eventually lead the class to reach -fail_over %, at which point its page allocations will start failing. - -3.4 Changes in the page reclamation path (refill_inactive_zone and shrink_zone) ------------------------------------------------------------------------------- -Pages are moved from the active to the inactive list (refill_inactive_zone) and -freed from the inactive list (shrink_zone) by choosing victim classes. Victim classes are -chosen depending on their usage over their guarantee. - -Classes with a DONT_CARE guarantee are given an implicit guarantee, which is -based on the number of children (with DONT_CARE guarantees) their parent has -(including the default class) and the unused pages the parent still has. -ex1: If a default root class /rcfs/taskclass has 3 children c1, c2 and c3 -and has 200000 pages, and all the classes have DONT_CARE guarantees, then -all the classes (c1, c2, c3 and the default class of /rcfs/taskclass) will -get 50000 (200000 / 4) pages each. -ex2: If, in the above example, c1 is set with a guarantee of 80000 pages, -then the other classes (c2, c3 and the default class of /rcfs/taskclass) -will get 40000 ((200000 - 80000) / 3) pages each. - 
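A sketch of the victim selection section 3.4 describes, assuming implicit guarantees have already been computed for DONT_CARE classes; the structure and function names are illustrative, not the controller's:

    struct mem_cls {
            long usage;          /* lru pages charged to the class */
            long guarantee;      /* effective guarantee, in pages */
    };

    /* Implicit guarantee for DONT_CARE classes: split what the parent
     * has left after explicit guarantees among the DONT_CARE children,
     * default class included (ex1/ex2 above: 200000/4 = 50000;
     * (200000-80000)/3 = 40000). */
    long effective_guarantee(long parent_pages, long explicit_sum,
                             int dontcare_children)
    {
            if (dontcare_children <= 0)
                    return 0;
            return (parent_pages - explicit_sum) / dontcare_children;
    }

    /* Classes below their guarantee are never victimized; otherwise
     * the class furthest over its guarantee is picked. */
    struct mem_cls *pick_victim(struct mem_cls *cls, int n)
    {
            struct mem_cls *victim = 0;
            long worst = 0;
            int i;

            for (i = 0; i < n; i++) {
                    long over = cls[i].usage - cls[i].guarantee;
                    if (over > worst) {
                            worst = over;
                            victim = &cls[i];
                    }
            }
            return victim;
    }
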
3.5 Handling of Shared pages ---------------------------- -Even if an mm is shared by several tasks, the pages that belong to the mm will be -charged against the individual tasks that bring the pages into the LRU. - -But when any task that is using an mm moves to a different class or exits, -all pages that belong to the mm will be charged against the richest -class among the tasks that are using the mm. - -Note: Shared page handling needs to be improved with a better policy. - diff --git a/Documentation/ckrm/mem_rc.usage b/Documentation/ckrm/mem_rc.usage deleted file mode 100644 index 3d2f2f04f..000000000 --- a/Documentation/ckrm/mem_rc.usage +++ /dev/null @@ -1,112 +0,0 @@ -Installation ------------ - -1. Configure "Class based physical memory controller" under CKRM (see - Documentation/ckrm/installation) - -2. Reboot the system with the new kernel. - -3. Verify that the memory controller is present by reading the file - /rcfs/taskclass/config (it should show a line with res=mem) - -Usage ----- - -For brevity, unless otherwise specified all the following commands are -executed in the default class (/rcfs/taskclass). - -Initially, the systemwide default class gets 100% of the LRU pages, and the -stats file at the /rcfs/taskclass level displays the total number of -physical pages. - - # cd /rcfs/taskclass - # grep System stats - System: tot_pages=239778,active=60473,inactive=135285,free=44555 - # cat shares - res=mem,guarantee=-2,limit=-2,total_guarantee=100,max_limit=100 - - tot_pages - total number of pages - active - number of pages in the active list (sum of all zones) - inactive - number of pages in the inactive list (sum of all zones) - free - number of free pages (sum of all zones) - - By making total_guarantee and max_limit the same as tot_pages, one can - make the numbers in the shares file be the same as the number of pages for a - class. - - # echo 'res=mem,total_guarantee=239778,max_limit=239778' > shares - # cat shares - res=mem,guarantee=-2,limit=-2,total_guarantee=239778,max_limit=239778 - -Changing configuration parameters: --------------------------------- -For a description of the parameters, read the file mem_rc.design in this same directory. - -The following are the default values of the configuration parameters: - - localhost:~ # cd /rcfs/taskclass - localhost:/rcfs/taskclass # cat config - res=mem,fail_over=110,shrink_at=90,shrink_to=80,num_shrinks=10,shrink_interval=10 - -Here is how to change a specific configuration parameter. Note that more than one -configuration parameter can be changed in a single echo command, though for simplicity -we show one per echo. 
- -ex: Changing fail_over: - localhost:/rcfs/taskclass # echo "res=mem,fail_over=120" > config - localhost:/rcfs/taskclass # cat config - res=mem,fail_over=120,shrink_at=90,shrink_to=80,num_shrinks=10,shrink_interval=10 - -ex: Changing shrink_at: - localhost:/rcfs/taskclass # echo "res=mem,shrink_at=85" > config - localhost:/rcfs/taskclass # cat config - res=mem,fail_over=120,shrink_at=85,shrink_to=80,num_shrinks=10,shrink_interval=10 - -ex: Changing shrink_to: - localhost:/rcfs/taskclass # echo "res=mem,shrink_to=75" > config - localhost:/rcfs/taskclass # cat config - res=mem,fail_over=120,shrink_at=85,shrink_to=75,num_shrinks=10,shrink_interval=10 - -ex: Changing num_shrinks: - localhost:/rcfs/taskclass # echo "res=mem,num_shrinks=20" > config - localhost:/rcfs/taskclass # cat config - res=mem,fail_over=120,shrink_at=85,shrink_to=75,num_shrinks=20,shrink_interval=10 - -ex: Changing shrink_interval: - localhost:/rcfs/taskclass # echo "res=mem,shrink_interval=15" > config - localhost:/rcfs/taskclass # cat config - res=mem,fail_over=120,shrink_at=85,shrink_to=75,num_shrinks=20,shrink_interval=15 - -Class creation -------------- - - # mkdir c1 - -Its initial share is DONT_CARE. The parent's share values will be unchanged. - -Setting a new class share ------------------------ - - # echo 'res=mem,guarantee=25000,limit=50000' > c1/shares - - # cat c1/shares - res=mem,guarantee=25000,limit=50000,total_guarantee=100,max_limit=100 - - 'guarantee' specifies the number of pages this class is entitled to get - 'limit' is the maximum number of pages this class can get. - -Monitoring ---------- - -The stats file shows statistics of a class's page usage: - # cat stats - ----------- Memory Resource stats start ----------- - System: tot_pages=239778,active=60473,inactive=135285,free=44555 - Number of pages used (including pages lent to children): 196654 - Number of pages guaranteed: 239778 - Maximum limit of pages: 239778 - Total number of pages available (after serving guarantees to children): 214778 - Number of pages lent to children: 0 - Number of pages borrowed from the parent: 0 - ----------- Memory Resource stats end ----------- - diff --git a/Documentation/ckrm/numtasks b/Documentation/ckrm/numtasks deleted file mode 100644 index 94b4b09ef..000000000 --- a/Documentation/ckrm/numtasks +++ /dev/null @@ -1,122 +0,0 @@ -Introduction ------------- - -Numtasks is a resource controller under the CKRM framework that allows the -user/sysadmin to manage the number of tasks a class can create. It also allows -one to limit the fork rate across the system. - -As with any other resource under the CKRM framework, numtasks assigns -all the resources to the default class (/rcfs/taskclass). Since the number -of tasks in a system is otherwise not limited, this resource controller provides a -way to set the total number of tasks available in the system through the config -file. By default this value is 128k (131072). In other words, if not changed, -the total number of tasks allowed in a system is 131072. - -The config variable that affects this is sys_total_tasks. - -This resource controller also allows the sysadmin to limit the number of forks -that are allowed in the system within a specified number of seconds. This -can be achieved by changing the attributes forkrate and forkrate_interval in -the config file. Through this feature one can protect the system from being -attacked by fork bomb type applications. - 
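The system-wide cap can be sketched as a simple counter check. This is an illustration only, not the controller's code (the real implementation, kernel/ckrm/ckrm_numtasks.c, is removed by this patch); locking and the class hierarchy are omitted:

    /* Illustrative: a fork is admitted only while the total task
     * count stays below sys_total_tasks. */
    static long sys_total_tasks = 131072;   /* default, 128k */
    static long nr_tasks;                   /* tasks currently in the system */

    int numtasks_may_fork(void)
    {
            if (nr_tasks >= sys_total_tasks)
                    return 0;               /* fork() fails */
            nr_tasks++;                     /* charge the new task */
            return 1;
    }
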
Configure "Number of Tasks Resource Manager" under CKRM (see - Documentation/ckrm/installation). This can be configured as a module - also. But, when inserted as a module it cannot be removed. - -2. Reboot the system with the new kernel. Insert the module, if compiled - as a module. - -3. Verify that the memory controller is present by reading the file - /rcfs/taskclass/config (should show a line with res=numtasks) - -Usage ------ - -For brevity, unless otherwise specified all the following commands are -executed in the default class (/rcfs/taskclass). - -As explained above the config file shows sys_total_tasks and forkrate -info. - - # cd /rcfs/taskclass - # cat config - res=numtasks,sys_total_tasks=131072,forkrate=1000000,forkrate_interval=3600 - -By default, the sys_total_tasks is set to 131072(128k), and forkrate is set -to 1 million and forkrate_interval is set to 3600 seconds. Which means the -total number of tasks in a system is limited to 131072 and the forks are -limited to 1 million per hour. - -sysadmin can change these values by just writing the attribute/value pair -to the config file. - - # echo res=numtasks,forkrate=100,forkrate_interval=10 > config - # cat config - res=numtasks,sys_total_tasks=1000,forkrate=100,forkrate_interval=10 - - # echo res=numtasks,forkrate=100,forkrate_interval=10 > config - # cat config - res=numtasks,sys_total_tasks=1000,forkrate=100,forkrate_interval=10 - -By making total_guarantee and max_limit to be same as sys_total_tasks, -sysadmin can make the numbers in shares file be same as the number of tasks -for a class. - - # echo res=numtasks,total_guarantee=131072,max_limit=131072 > shares - # cat shares - res=numtasks,guarantee=-2,limit=-2,total_guarantee=131072,max_limit=131072 - - -Class creation --------------- - - # mkdir c1 - -Its initial share is don't care. The parent's share values will be unchanged. - -Setting a new class share -------------------------- - -'guarantee' specifies the number of tasks this class is entitled to get -'limit' is the maximum number of tasks this class can get. - -Following command will set the guarantee of class c1 to be 25000 and the limit -to be 50000 - - # echo 'res=numtasks,guarantee=25000,limit=50000' > c1/shares - # cat c1/shares - res=numtasks,guarantee=25000,limit=50000,total_guarantee=100,max_limit=100 - -Limiting forks in a time period -------------------------------- -By default, this resource controller allows forking of 1 million tasks in -an hour. - -Folowing command would change it to allow only 100 forks per 10 seconds - - # echo res=numtasks,forkrate=100,forkrate_interval=10 > config - # cat config - res=numtasks,sys_total_tasks=1000,forkrate=100,forkrate_interval=10 - -Note that the same set of values is used across the system. In other words, -each individual class will be allowed 'forkrate' forks in 'forkrate_interval' -seconds. 
- -Monitoring ---------- - -The stats file shows statistics of a class's task usage: -[root@localhost taskclass]# cat stats -Number of tasks resource: -Total Over limit failures: 0 -Total Over guarantee sucesses: 0 -Total Over guarantee failures: 0 -Maximum Over limit failures: 0 -Maximum Over guarantee sucesses: 0 -Maximum Over guarantee failures: 0 -cur_alloc 38; borrowed 0; cnt_guar 131072; cnt_limit 131072 cnt_unused 131072, unused_guarantee 100, cur_max_limit 0 - diff --git a/Documentation/ckrm/rbce_basics b/Documentation/ckrm/rbce_basics deleted file mode 100644 index fd66ef2fb..000000000 --- a/Documentation/ckrm/rbce_basics +++ /dev/null @@ -1,67 +0,0 @@ -Rule-based Classification Engine (RBCE) ------------------------------------------- -The ckrm/rbce directory contains the sources for two classification engines -called rbce and crbce. Both are optional, built as kernel modules and share much -of their codebase. Only one classification engine (CE) can be loaded at a time -in CKRM. - - -With RBCE, a user can specify rules for how tasks are classified to a -class. Rules are specified by one or more attribute-value pairs and -an associated class. The tasks that match all the attr-value pairs -will get classified to the class attached to the rule. - -The file rbce_info under the /rcfs/ce directory details the functionality -of the different files available under the directory and also gives details -about the attributes that can be used to define rules. - -order: When multiple rules are defined, they are evaluated - according to their order. Order can be specified - while defining a rule. If order is not specified, the - highest order will be assigned to the rule (i.e., the new - rule will be evaluated after all the previously defined rules - evaluate false). So the order of rules is important, as it - decides which class a task will get assigned to. For - example, if we have the two following rules: r1: - uid=1004,order=10,class=/rcfs/taskclass/c1 r2: - uid=1004,cmd=grep,order=20,class=/rcfs/taskclass/c2 then - the task "grep" executed by user 1004 will always be - assigned to class /rcfs/taskclass/c1, as rule r1 will be - evaluated before r2 and the task successfully matches all of the - rule's attr-value pairs. Rule r2 will never be consulted - for the command. Note: The order in which the rules are - displayed (by ls) has no correlation with the order of the - rules. - -dependency: Rules can be defined to depend on another rule, i.e., a - rule can depend on one other rule and have its own - additional attr-value pairs. The dependent rule will - evaluate true only if all the attr-value pairs of both - rules are satisfied. ex: r1: gid=502,class=/rcfs/taskclass - r2: depend=r1,cmd=grep,class=/rcfs/taskclass/c1 r2 is a - dependent rule that depends on r1; a task will be assigned - to /rcfs/taskclass/c1 if its gid is 502 and the executable - command name is "grep". If a task's gid is 502 but the - command name is _not_ "grep", then it will be assigned to - /rcfs/taskclass - - Note: The order of a dependent rule must be _lower_ than that of the - rule it depends on, so that it is evaluated _before_ the - base rule is evaluated. Otherwise the base rule will - evaluate true and the task will be assigned to the class of - that rule without the dependent rule ever getting - evaluated. In the example above, the order of r2 must be lower - than the order of r1. - -app_tag: a task can have a tag (an ascii string) attached to it; the tag - becomes an attribute of that task, and rules can be defined using the - tag value. - -state: state exists at two levels in RBCE. The entire RBCE can be - enabled or disabled by writing 1 or 0 to the file - rbce_state under /rcfs/ce. Disabling RBCE means that - the rules defined in RBCE will not be used for - classifying tasks to classes. A specific rule can be - enabled/disabled by changing the state of that rule. Once - it is disabled, the rule will not be evaluated. 
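The order and dependency semantics above can be summarized in a small sketch. The data layout is invented for illustration (the real engine, removed by this patch, lived in kernel/ckrm/rbce/rbcemod.c); rules are assumed pre-sorted by ascending order:

    struct task;                     /* opaque here */

    struct rule {
            int order;
            int state;               /* 0 = disabled */
            struct rule *depend;     /* base rule, or NULL */
            int (*attrs_match)(const struct task *tsk); /* own pairs */
            const char *class_path;
    };

    /* A rule matches when it is enabled, its own attr-value pairs
     * match, and (if dependent) the base rule's pairs match too. */
    static int rule_matches(const struct rule *r, const struct task *tsk)
    {
            if (!r->state || !r->attrs_match(tsk))
                    return 0;
            return r->depend ? rule_matches(r->depend, tsk) : 1;
    }

    /* Rules are tried in ascending order; the first match decides. */
    const char *classify(struct rule **rules, int n,
                         const struct task *tsk)
    {
            int i;
            for (i = 0; i < n; i++)
                    if (rule_matches(rules[i], tsk))
                            return rules[i]->class_path;
            return "/rcfs/taskclass";    /* default class */
    }
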
diff --git a/Documentation/ckrm/rbce_usage b/Documentation/ckrm/rbce_usage deleted file mode 100644 index 6d1592646..000000000 --- a/Documentation/ckrm/rbce_usage +++ /dev/null @@ -1,98 +0,0 @@ -Usage of CKRM with RBCE -------------------------- - -0. Ensure that a CKRM-enabled kernel with the following options configured - has been compiled. At a minimum: core, rcfs and at least one - classtype. For testing, it is recommended that all classtypes and - resource controllers be compiled as modules. - -1. Change ckrm/rbce/Makefile's KDIR to point to this compiled kernel's source - tree and call make. - -2. Load the rbce module. - # insmod ckrm/rbce/rbce.ko - Note that /rcfs has to be mounted before this. - Note: this command should populate the directory /rcfs/ce with the files - rbce_reclassify, rbce_tag, rbce_info, rbce_state and a directory - rules. - - Note2: If these are not created automatically, just create them by - using the commands touch and mkdir. (This is a bug that needs to be fixed.) - -3. Defining a rule - Rules are defined by writing to a file under the - /rcfs/ce/rules directory, concatenating multiple attribute-value - pairs. - - Note that the classes must be defined before defining rules that - use the classes. e.g., the command # echo - "uid=1004,class=/rcfs/taskclass/c1" > /rcfs/ce/rules/r1 will define - a rule r1 that classifies all tasks belonging to user id 1004 to class - /rcfs/taskclass/c1 - -4. Viewing a rule - Read the corresponding file. - To read rule r1, issue the command: - # cat /rcfs/ce/rules/r1 - -5. Changing a rule - - Changing a rule is done the same way as defining a rule; the new - rule will consist of the old set of attr-value pairs merged with the new - attr-value pairs. e.g., if the current r2 is - uid=1004,depend=r1,class=/rcfs/taskclass/c1 - (with r1 as defined in step 3), then each of the following commands, - applied independently to that rule, behaves as shown: - - the command: - # echo gid=502 > /rcfs/ce/rules/r2 - will change the rule to - r2: uid=1004,gid=502,depend=r1,class=/rcfs/taskclass/c1 - - the command: - # echo uid=1005 > /rcfs/ce/rules/r2 - will change the rule to - r2: uid=1005,depend=r1,class=/rcfs/taskclass/c1 - - the command: - # echo class=/rcfs/taskclass/c2 > /rcfs/ce/rules/r2 - will change the rule to - r2: uid=1004,depend=r1,class=/rcfs/taskclass/c2 - - the command: - # echo depend=r4 > /rcfs/ce/rules/r2 - will change the rule to - r2: uid=1004,depend=r4,class=/rcfs/taskclass/c1 - - the command: - # echo +depend=r4 > /rcfs/ce/rules/r2 - will change the rule to - r2: uid=1004,depend=r1,depend=r4,class=/rcfs/taskclass/c1 - - the command: - # echo -depend=r1 > /rcfs/ce/rules/r2 - will change the rule to - r2: uid=1004,class=/rcfs/taskclass/c1 - -6. Checking the state of RBCE - The state (enabled/disabled) of RBCE can be checked by reading the file - /rcfs/ce/rbce_state; it will show 1 (enabled) or 0 (disabled). - By default, RBCE is enabled (1). - ex: # cat /rcfs/ce/rbce_state - -7. Changing the state of RBCE - The state of RBCE can be changed by writing 1 (enable) or 0 (disable). - ex: # echo 1 > /rcfs/ce/rbce_state - -8. Checking the state of a rule - The state of a rule is displayed in the rule, which can be viewed by - reading the rule file. ex: # cat /rcfs/ce/rules/r1 - -9. Changing the state of a rule - - The state of a rule can be changed by writing "state=1" (enable) or - "state=0" (disable) to the corresponding rule file. By default, a - rule is enabled when defined. ex: to disable an existing rule r1, - issue the command - # echo "state=0" > /rcfs/ce/rules/r1 - 
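The merge semantics of step 5 can be modeled in userspace as follows. A hedged sketch only: the parsing, the fixed-size arrays and the resulting pair order are illustrative assumptions, not the module's behavior:

    #include <stdio.h>
    #include <string.h>

    /* Apply one edit token to a list of "key=value" pairs:
     * "key=value" replaces the first pair with that key,
     * "+key=value" adds a pair, "-key=value" removes the exact pair.
     * Demo only: no bounds checking. */
    static void apply(char attrs[][64], int *n, const char *tok)
    {
            size_t klen = strcspn(tok, "=");
            int i;

            if (tok[0] == '+') {                    /* "+depend=r4" */
                    snprintf(attrs[(*n)++], 64, "%s", tok + 1);
                    return;
            }
            if (tok[0] == '-') {                    /* "-depend=r1" */
                    for (i = 0; i < *n; i++)
                            if (strcmp(attrs[i], tok + 1) == 0) {
                                    memmove(attrs[i], attrs[i + 1],
                                            (size_t)(*n - i - 1) * 64);
                                    (*n)--;
                                    return;
                            }
                    return;
            }
            for (i = 0; i < *n; i++)                /* "uid=1005" */
                    if (strncmp(attrs[i], tok, klen) == 0 &&
                        attrs[i][klen] == '=') {
                            snprintf(attrs[i], 64, "%s", tok);
                            return;
                    }
            snprintf(attrs[(*n)++], 64, "%s", tok); /* new key: append */
    }

    int main(void)
    {
            char attrs[16][64] = { "uid=1004", "depend=r1",
                                   "class=/rcfs/taskclass/c1" };
            int n = 3, i;

            apply(attrs, &n, "gid=502");            /* as in step 5 */
            for (i = 0; i < n; i++)
                    printf("%s%s", attrs[i], i == n - 1 ? "\n" : ",");
            return 0;
    }
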
diff --git a/configs/kernel-2.6.10-i686-planetlab.config b/configs/kernel-2.6.10-i686-planetlab.config index 741d59781..36c57e02f 100644 --- a/configs/kernel-2.6.10-i686-planetlab.config +++ b/configs/kernel-2.6.10-i686-planetlab.config @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.10-1.771_FC2.2.planetlab -# Mon Jun 6 12:56:10 2005 +# Linux kernel version: 2.6.10-1.771_FC2.2.planetlab.2005.07.21 +# Thu Jul 21 17:47:33 2005 # CONFIG_X86=y CONFIG_MMU=y @@ -25,22 +25,6 @@ CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_BSD_PROCESS_ACCT=y # CONFIG_BSD_PROCESS_ACCT_V3 is not set - -# -# Class Based Kernel Resource Management -# -CONFIG_CKRM=y -CONFIG_RCFS_FS=y -CONFIG_CKRM_TYPE_TASKCLASS=y -CONFIG_CKRM_RES_NULL=m -# CONFIG_CKRM_RES_MEM is not set -# CONFIG_CKRM_TYPE_SOCKETCLASS is not set -# CONFIG_CKRM_RES_NUMTASKS is not set -CONFIG_CKRM_CPU_SCHEDULE=y -# CONFIG_CKRM_RES_BLKIO is not set -CONFIG_CKRM_CPU_SCHEDULE_AT_BOOT=y -CONFIG_CKRM_RBCE=y -# CONFIG_CKRM_CRBCE is not set CONFIG_SYSCTL=y # CONFIG_AUDIT is not set CONFIG_LOG_BUF_SHIFT=17 @@ -160,7 +144,9 @@ CONFIG_KEXEC=y # # Power management options (ACPI, APM) # -# CONFIG_PM is not set +CONFIG_PM=y +# CONFIG_PM_DEBUG is not set +# CONFIG_SOFTWARE_SUSPEND is not set # # ACPI (Advanced Configuration and Power Interface) Support # @@ -168,6 +154,8 @@ CONFIG_KEXEC=y CONFIG_ACPI=y CONFIG_ACPI_BOOT=y CONFIG_ACPI_INTERPRETER=y +CONFIG_ACPI_SLEEP=y +CONFIG_ACPI_SLEEP_PROC_FS=y # CONFIG_ACPI_AC is not set # CONFIG_ACPI_BATTERY is not set # CONFIG_ACPI_BUTTON is not set @@ -186,6 +174,11 @@ CONFIG_ACPI_PCI=y CONFIG_ACPI_SYSTEM=y # CONFIG_X86_PM_TIMER is not set +# +# APM (Advanced Power Management) BIOS Support +# +# CONFIG_APM is not set + # # CPU Frequency scaling # @@ -524,7 +517,6 @@ CONFIG_INET=y # CONFIG_INET_ESP is not set # CONFIG_INET_IPCOMP is not set # CONFIG_INET_TUNNEL is not set -# CONFIG_ACCEPT_QUEUES is not set # CONFIG_IP_TCPDIAG is not set # CONFIG_IP_TCPDIAG_IPV6 is not set @@ -994,6 +986,7 @@ CONFIG_USB=y CONFIG_USB_DEVICEFS=y # CONFIG_USB_BANDWIDTH is not set # CONFIG_USB_DYNAMIC_MINORS is not set +# CONFIG_USB_SUSPEND is not set # CONFIG_USB_OTG is not set CONFIG_USB_ARCH_HAS_HCD=y CONFIG_USB_ARCH_HAS_OHCI=y @@ -1333,7 +1326,8 @@ CONFIG_NLS_UTF8=m # # Profiling support # -# CONFIG_PROFILING is not set +CONFIG_PROFILING=y +CONFIG_OPROFILE=y # # Kernel hacking # @@ -1364,7 +1358,8 @@ CONFIG_STACK_WARN=4096 CONFIG_VSERVER_FILESHARING=y CONFIG_VSERVER_LEGACY=y # CONFIG_VSERVER_PROC_SECURE is not set -# CONFIG_VSERVER_HARDCPU is not set +CONFIG_VSERVER_HARDCPU=y +# CONFIG_VSERVER_HARDCPU_IDLE is not set # CONFIG_INOXID_NONE is not set # CONFIG_INOXID_UID16 is not set # CONFIG_INOXID_GID16 is not set diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 9438a961f..f31551024 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -13,13 +13,12 @@ # kblockd threads # -obj-y := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o ckrm-iostub.o +obj-y := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_AS) += as-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += 
cfq-iosched.o -obj-$(CONFIG_CKRM_RES_BLKIO) += ckrm-io.o obj-$(CONFIG_MAC_FLOPPY) += swim3.o obj-$(CONFIG_BLK_DEV_FD) += floppy.o obj-$(CONFIG_BLK_DEV_FD98) += floppy98.o diff --git a/drivers/block/ckrm-io.c b/drivers/block/ckrm-io.c deleted file mode 100644 index 89910268f..000000000 --- a/drivers/block/ckrm-io.c +++ /dev/null @@ -1,508 +0,0 @@ -/* linux/drivers/block/ckrm_io.c : Block I/O Resource Controller for CKRM - * - * Copyright (C) Shailabh Nagar, IBM Corp. 2004 - * - * - * Provides best-effort block I/O bandwidth control for CKRM - * This file provides the CKRM API. The underlying scheduler is a - * modified Complete-Fair Queueing (CFQ) iosched. - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -/* Changes - * - * 29 July 2004 - * Third complete rewrite for CKRM's current API - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -/* sectorate == 512 byte sectors served in CFQ_EPOCH ns*/ - -/* CKI_ROOTSECTORATE needs to be made configurable from outside */ -#define CKI_ROOTSECTORATE 100000 -#define CKI_MINSECTORATE 100 - -#define CKI_IOUSAGE_UNIT 512 - -typedef struct ckrm_io_stats{ - struct timeval epochstart ; /* all measurements relative to this - start time */ - unsigned long blksz; /* size of bandwidth unit */ - atomic_t blkrd; /* read units submitted to DD */ - atomic_t blkwr; /* write units submitted to DD */ - - int nskip; /* # times q skipped */ - unsigned long navsec; /* avg sectors serviced */ - int timedout; /* # times gap > epoch */ - u64 sec[2]; /* sectors serviced in - prev & curr epochs */ -} cki_stats_t; /* per class I/O statistics */ - -/* Note - * Currently local unit == CFQ I/O priority directly. - * CFQ ionice values have an implied bandwidth share so they - * can be added, subdivided etc. as long as the initial allocation - * of the systemwide default's total is set to the highest CFQ ionice - * value (== 100% of disk bandwidth) - */ - -typedef struct ckrm_io_class { - - struct ckrm_core_class *core; - struct ckrm_core_class *parent; - - struct ckrm_shares shares; - spinlock_t shares_lock; /* protect share changes */ - - /* Absolute shares of this class - * in local units. - */ - - cfqlim_t cfqpriv; /* Data common with cfq priolvl's */ - - - int cnt_guarantee; /* Allocation as parent */ - int cnt_unused; /* Allocation to default subclass */ - int cnt_limit; - - /* Statistics, for class and default subclass */ - cki_stats_t stats; - cki_stats_t mystats; - -} cki_icls_t; - - -/* Internal functions */ -static inline void cki_reset_stats(cki_stats_t *usg); -static inline void init_icls_one(cki_icls_t *icls); -static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres); - -/* External functions e.g. 
interface to ioscheduler */ -void *cki_tsk_icls (struct task_struct *tsk); -int cki_tsk_ioprio (struct task_struct *tsk); - -extern void cki_cfq_set(icls_tsk_t tskicls, icls_ioprio_t tskioprio, icls_tsk_t tskcfqpriv); - -/* CKRM Resource Controller API functions */ -static void * cki_alloc(struct ckrm_core_class *this, - struct ckrm_core_class * parent); -static void cki_free(void *res); -static int cki_setshare(void *res, struct ckrm_shares * shares); -static int cki_getshare(void *res, struct ckrm_shares * shares); -static int cki_getstats(void *res, struct seq_file *); -static int cki_resetstats(void *res); -static int cki_showconfig(void *res, struct seq_file *sfile); -static int cki_setconfig(void *res, const char *cfgstr); -static void cki_chgcls(void *tsk, void *oldres, void *newres); - - -struct ckrm_res_ctlr cki_rcbs; - -static inline void cki_reset_stats(cki_stats_t *stats) -{ - if (stats) { - atomic_set(&stats->blkrd,0); - atomic_set(&stats->blkwr,0); - } -} - -static inline void init_icls_stats(cki_icls_t *icls) -{ - struct timeval tv; - - do_gettimeofday(&tv); - icls->stats.epochstart = icls->mystats.epochstart = tv; - icls->stats.blksz = icls->mystats.blksz = CKI_IOUSAGE_UNIT; - cki_reset_stats(&icls->stats); - cki_reset_stats(&icls->mystats); -} - -/* Initialize icls to default values - * No other classes touched, locks not reinitialized. - */ - -static inline void init_icls_one(cki_icls_t *icls) -{ - /* Zero initial guarantee for scalable creation of - multiple classes */ - - /* Try out a new set */ - - icls->shares.my_guarantee = CKRM_SHARE_DONTCARE; - icls->shares.my_limit = CKRM_SHARE_DONTCARE; - icls->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - icls->shares.max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; - icls->shares.unused_guarantee = icls->shares.total_guarantee; - icls->shares.cur_max_limit = 0; - - icls->cnt_guarantee = CKRM_SHARE_DONTCARE; - icls->cnt_unused = CKRM_SHARE_DONTCARE; - icls->cnt_limit = CKRM_SHARE_DONTCARE; - - init_icls_stats(icls); -} - -/* Recalculate absolute shares from relative - * Caller should hold a lock on icls - */ - -static void cki_recalc_propagate(cki_icls_t *res, cki_icls_t *parres) -{ - - ckrm_core_class_t *child = NULL; - cki_icls_t *childres; - int resid = cki_rcbs.resid; - u64 temp; - - if (parres) { - struct ckrm_shares *par = &parres->shares; - struct ckrm_shares *self = &res->shares; - - - if (parres->cnt_guarantee == CKRM_SHARE_DONTCARE) { - res->cnt_guarantee = CKRM_SHARE_DONTCARE; - } else if (par->total_guarantee) { - temp = (u64) self->my_guarantee * - parres->cnt_guarantee; - do_div(temp, par->total_guarantee); - res->cnt_guarantee = (int) temp; - } else { - res->cnt_guarantee = 0; - } - - - if (parres->cnt_limit == CKRM_SHARE_DONTCARE) { - res->cnt_limit = CKRM_SHARE_DONTCARE; - atomic_set(&res->cfqpriv.sectorate,CKI_MINSECTORATE); - } else { - if (par->max_limit) { - temp = (u64) self->my_limit * - parres->cnt_limit; - do_div(temp, par->max_limit); - res->cnt_limit = (int) temp; - } else { - res->cnt_limit = 0; - } - atomic_set(&res->cfqpriv.sectorate,res->cnt_limit); - } - - if (res->cnt_guarantee == CKRM_SHARE_DONTCARE) { - res->cnt_unused = CKRM_SHARE_DONTCARE; - } else { - if (self->total_guarantee) { - temp = (u64) self->unused_guarantee * - res->cnt_guarantee; - do_div(temp, self->total_guarantee); - res->cnt_unused = (int) temp; - } else { - res->cnt_unused = 0; - } - - } - - } - // propagate to children - ckrm_lock_hier(res->core); - while ((child = ckrm_get_next_child(res->core,child)) != NULL){ - 
childres = ckrm_get_res_class(child, resid, - cki_icls_t); - - spin_lock(&childres->shares_lock); - cki_recalc_propagate(childres, res); - spin_unlock(&childres->shares_lock); - } - ckrm_unlock_hier(res->core); -} - -void *cki_tsk_icls(struct task_struct *tsk) -{ - return (void *) ckrm_get_res_class(class_core(tsk->taskclass), - cki_rcbs.resid, cki_icls_t); -} - -int cki_tsk_ioprio(struct task_struct *tsk) -{ - /* Don't use I/O priorities for now */ - return IOPRIO_NORM; -} - -void *cki_tsk_cfqpriv(struct task_struct *tsk) -{ - cki_icls_t *icls = ckrm_get_res_class(class_core(tsk->taskclass), - cki_rcbs.resid, cki_icls_t); - return (void *)&(icls->cfqpriv); -} - - -static void *cki_alloc(struct ckrm_core_class *core, - struct ckrm_core_class *parent) -{ - cki_icls_t *icls; - - icls = kmalloc(sizeof(cki_icls_t), GFP_ATOMIC); - if (!icls) { - printk(KERN_ERR "cki_res_alloc failed GFP_ATOMIC\n"); - return NULL; - } - - memset(icls, 0, sizeof(cki_icls_t)); - icls->core = core; - icls->parent = parent; - icls->shares_lock = SPIN_LOCK_UNLOCKED; - - init_icls_one(icls); - - if (parent == NULL) { - icls->cnt_guarantee = CKI_ROOTSECTORATE; - icls->cnt_unused = CKI_ROOTSECTORATE; - icls->cnt_limit = CKI_ROOTSECTORATE; - atomic_set(&(icls->cfqpriv.sectorate),icls->cnt_limit); - } - try_module_get(THIS_MODULE); - return icls; -} - -static void cki_free(void *res) -{ - cki_icls_t *icls = res, *parres, *childres; - ckrm_core_class_t *child = NULL; - int maxlimit, resid = cki_rcbs.resid; - - - if (!res) - return; - - /* Deallocate CFQ queues */ - - /* Currently CFQ queues are deallocated when empty. Since no task - * should belong to this icls, no new requests will get added to the - * CFQ queue. - * - * When CFQ switches to persistent queues, call its "put" function - * so it gets deallocated after the last pending request is serviced. 
- * - */ - - parres = ckrm_get_res_class(icls->parent, resid, cki_icls_t); - if (!parres) { - printk(KERN_ERR "cki_free: error getting " - "resclass from core \n"); - return; - } - - /* Update parent's shares */ - spin_lock(&parres->shares_lock); - - child_guarantee_changed(&parres->shares, icls->shares.my_guarantee, 0); - parres->cnt_unused += icls->cnt_guarantee; - - // run thru parent's children and get the new max_limit of the parent - ckrm_lock_hier(parres->core); - maxlimit = 0; - while ((child = ckrm_get_next_child(parres->core, child)) != NULL) { - childres = ckrm_get_res_class(child, resid, cki_icls_t); - if (maxlimit < childres->shares.my_limit) { - maxlimit = childres->shares.my_limit; - } - } - ckrm_unlock_hier(parres->core); - if (parres->shares.cur_max_limit < maxlimit) { - parres->shares.cur_max_limit = maxlimit; - } - spin_unlock(&parres->shares_lock); - - kfree(res); - module_put(THIS_MODULE); - return; -} - - -static int cki_setshare(void *res, struct ckrm_shares *new) -{ - cki_icls_t *icls = res, *parres; - struct ckrm_shares *cur, *par; - int rc = -EINVAL, resid = cki_rcbs.resid; - - if (!icls) - return rc; - - cur = &icls->shares; - if (icls->parent) { - parres = - ckrm_get_res_class(icls->parent, resid, cki_icls_t); - if (!parres) { - pr_debug("cki_setshare: invalid resclass\n"); - return -EINVAL; - } - spin_lock(&parres->shares_lock); - spin_lock(&icls->shares_lock); - par = &parres->shares; - } else { - spin_lock(&icls->shares_lock); - parres = NULL; - par = NULL; - } - - rc = set_shares(new, cur, par); - - if ((!rc) && parres) { - if (parres->cnt_guarantee == CKRM_SHARE_DONTCARE) { - parres->cnt_unused = CKRM_SHARE_DONTCARE; - } else if (par->total_guarantee) { - u64 temp = (u64) par->unused_guarantee * - parres->cnt_guarantee; - do_div(temp, par->total_guarantee); - parres->cnt_unused = (int) temp; - } else { - parres->cnt_unused = 0; - } - cki_recalc_propagate(res, parres); - } - spin_unlock(&icls->shares_lock); - if (icls->parent) { - spin_unlock(&parres->shares_lock); - } - return rc; -} - -static int cki_getshare(void *res, struct ckrm_shares * shares) -{ - cki_icls_t *icls = res; - - if (!icls) - return -EINVAL; - *shares = icls->shares; - return 0; -} - -static int cki_getstats(void *res, struct seq_file *sfile) -{ - cki_icls_t *icls = res; - - if (!icls) - return -EINVAL; - - seq_printf(sfile, "abs limit %d\n",icls->cnt_limit); - seq_printf(sfile, "skip %d timdout %d avsec %lu rate %ld " - " sec0 %ld sec1 %ld\n", - icls->cfqpriv.nskip, - icls->cfqpriv.timedout, - icls->cfqpriv.navsec, - atomic_read(&(icls->cfqpriv.sectorate)), - (unsigned long)icls->cfqpriv.sec[0], - (unsigned long)icls->cfqpriv.sec[1]); - - return 0; -} - -static int cki_resetstats(void *res) -{ - cki_icls_t *icls = res; - - if (!res) - return -EINVAL; - - init_icls_stats(icls); - return 0; -} - -static int cki_showconfig(void *res, struct seq_file *sfile) -{ - return -ENOSYS; -} - -static int cki_setconfig(void *res, const char *cfgstr) -{ - return -ENOSYS; -} - -static void cki_chgcls(void *tsk, void *oldres, void *newres) -{ - /* cki_icls_t *oldicls = oldres, *newicls = newres; */ - - /* Nothing needs to be done - * Future requests from task will go to the new class's CFQ q - * Old ones will continue to get satisfied from the original q - * - * Once CFQ moves to a persistent queue model and if refcounts on - * icls's CFQ queues are used, a decrement op would be needed here - */ - - return; -} - - - -struct ckrm_res_ctlr cki_rcbs = { - .res_name = "io", - .res_hdepth = 1, - .resid = -1, 
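/*
 * Expository note, inferred from cki_init() further below rather than
 * stated here: resid == -1 means "not yet registered". cki_init() only
 * calls ckrm_register_res_ctlr() while resid is still -1, so the real
 * controller id is assigned exactly once at registration time.
 */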
- .res_alloc = cki_alloc, - .res_free = cki_free, - .set_share_values = cki_setshare, - .get_share_values = cki_getshare, - .get_stats = cki_getstats, - .reset_stats = cki_resetstats, - .show_config = cki_showconfig, - .set_config = cki_setconfig, - .change_resclass = cki_chgcls, -}; - - - -int __init cki_init(void) -{ - struct ckrm_classtype *clstype; - int resid = cki_rcbs.resid; - - clstype = ckrm_find_classtype_by_name("taskclass"); - if (clstype == NULL) { - printk(KERN_INFO "init_cki: classtype not found\n"); - return -ENOENT; - } - - if (resid == -1) { - resid = ckrm_register_res_ctlr(clstype, &cki_rcbs); - if (resid != -1) { - cki_rcbs.classtype = clstype; - cki_cfq_set(cki_tsk_icls,cki_tsk_ioprio,cki_tsk_cfqpriv); - } - } - - return 0; -} - -void __exit cki_exit(void) -{ - ckrm_unregister_res_ctlr(&cki_rcbs); - cki_rcbs.resid = -1; - cki_rcbs.classtype = NULL; - cki_cfq_set(NULL,NULL,NULL); -} - -module_init(cki_init) -module_exit(cki_exit) - -MODULE_AUTHOR("Shailabh Nagar "); -MODULE_DESCRIPTION("CKRM Disk I/O Resource Controller"); -MODULE_LICENSE("GPL"); - diff --git a/drivers/block/ckrm-iostub.c b/drivers/block/ckrm-iostub.c deleted file mode 100644 index f4012545b..000000000 --- a/drivers/block/ckrm-iostub.c +++ /dev/null @@ -1,78 +0,0 @@ -/* ckrm-iostub.c - Stub file for ckrm_io module - * - * Copyright (C) Shailabh Nagar, IBM Corp. 2004 - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -/* Changes - * - * 07 Aug 2004: Created - * - */ - -#include -#include -#include - -static spinlock_t stub_lock = SPIN_LOCK_UNLOCKED; - -static icls_tsk_t tskiclstub; -static icls_ioprio_t tskiopriostub; -static icls_tsk_t tskcfqprivstub; - -void cki_cfq_set(icls_tsk_t tskicls, icls_ioprio_t tskioprio, icls_tsk_t tskcfqpriv) -{ - spin_lock(&stub_lock); - tskiclstub = tskicls; - tskiopriostub = tskioprio; - tskcfqprivstub = tskcfqpriv; - spin_unlock(&stub_lock); -} - -void *cki_hash_key(struct task_struct *tsk) -{ - void *ret; - spin_lock(&stub_lock); - if (tskiclstub) - ret = (*tskiclstub)(tsk); - else - ret = (void *) tsk->tgid; - spin_unlock(&stub_lock); - return ret; -} - -int cki_ioprio(struct task_struct *tsk) -{ - int ret; - spin_lock(&stub_lock); - if (tskiopriostub) - ret = (*tskiopriostub)(tsk); - else - ret = tsk->ioprio; - spin_unlock(&stub_lock); - return ret; -} - -void *cki_cfqpriv(struct task_struct *tsk) -{ - void *ret; - spin_lock(&stub_lock); - if (tskiclstub) - ret = (*tskcfqprivstub)(tsk); - else - ret = NULL; - spin_unlock(&stub_lock); - return ret; -} - -EXPORT_SYMBOL(cki_cfq_set); -EXPORT_SYMBOL(cki_hash_key); -EXPORT_SYMBOL(cki_ioprio); -EXPORT_SYMBOL(cki_cfqpriv); diff --git a/fs/Makefile b/fs/Makefile index c58878280..49e9b11b4 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -93,6 +93,5 @@ obj-$(CONFIG_JFS_FS) += jfs/ obj-$(CONFIG_XFS_FS) += xfs/ obj-$(CONFIG_AFS_FS) += afs/ obj-$(CONFIG_BEFS_FS) += befs/ -obj-$(CONFIG_RCFS_FS) += rcfs/ obj-$(CONFIG_HOSTFS) += hostfs/ obj-$(CONFIG_HPPFS) += hppfs/ diff --git a/fs/exec.c b/fs/exec.c index 4aa484b1f..dd40d3010 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -47,8 +47,6 @@ #include #include #include -#include -#include #include #include @@ -564,7 +562,6 @@ static int exec_mmap(struct mm_struct *mm) activate_mm(active_mm, mm); task_unlock(tsk); 
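/*
 * A hedged reading, not stated in the patch itself: the deleted hook
 * just below, ckrm_task_change_mm(tsk, old_mm, mm), appears to have
 * told CKRM that exec installed a fresh mm, so per-class accounting
 * could follow the task to its new address space.
 */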
arch_pick_mmap_layout(mm); - ckrm_task_change_mm(tsk, old_mm, mm); if (old_mm) { if (active_mm != old_mm) BUG(); mmput(old_mm); @@ -1070,7 +1067,6 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) fput(bprm->file); bprm->file = NULL; current->did_exec = 1; - ckrm_cb_exec(bprm->filename); return retval; } read_lock(&binfmt_lock); diff --git a/fs/proc/array.c b/fs/proc/array.c index fe7f82315..b6378ee4f 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -544,21 +544,3 @@ int proc_pid_statm(struct task_struct *task, char *buffer) return sprintf(buffer,"%d %d %d %d %d %d %d\n", size, resident, shared, text, lib, data, 0); } - - -int proc_pid_delay(struct task_struct *task, char * buffer) -{ - int res; - - res = sprintf(buffer,"%u %llu %llu %u %llu %u %llu\n", - (unsigned int) get_delay(task,runs), - (uint64_t) get_delay(task,runcpu_total), - (uint64_t) get_delay(task,waitcpu_total), - (unsigned int) get_delay(task,num_iowaits), - (uint64_t) get_delay(task,iowait_total), - (unsigned int) get_delay(task,num_memwaits), - (uint64_t) get_delay(task,mem_iowait_total) - ); - return res; -} - diff --git a/fs/proc/base.c b/fs/proc/base.c index 65f144adb..aada8d4d6 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1408,13 +1408,6 @@ static struct dentry *proc_pident_lookup(struct inode *dir, inode->i_fop = &proc_info_file_operations; ei->op.proc_read = proc_pid_nx_info; break; -#ifdef CONFIG_DELAY_ACCT - case PROC_TID_DELAY_ACCT: - case PROC_TGID_DELAY_ACCT: - inode->i_fop = &proc_info_file_operations; - ei->op.proc_read = proc_pid_schedstat; - break; -#endif #ifdef CONFIG_SCHEDSTATS case PROC_TID_SCHEDSTAT: case PROC_TGID_SCHEDSTAT: diff --git a/fs/rcfs/Makefile b/fs/rcfs/Makefile deleted file mode 100644 index 29575223e..000000000 --- a/fs/rcfs/Makefile +++ /dev/null @@ -1,10 +0,0 @@ -# -# Makefile for rcfs routines. -# - -obj-$(CONFIG_RCFS_FS) += rcfs.o - -rcfs-objs := super.o inode.o dir.o rootdir.o magic.o tc_magic.o socket_fs.o - -rcfs-objs-$(CONFIG_CKRM_TYPE_TASKCLASS) += tc_magic.o -rcfs-objs-$(CONFIG_CKRM_TYPE_SOCKETCLASS) += socket_fs.o diff --git a/fs/rcfs/dir.c b/fs/rcfs/dir.c deleted file mode 100644 index 048fe09bd..000000000 --- a/fs/rcfs/dir.c +++ /dev/null @@ -1,336 +0,0 @@ -/* - * fs/rcfs/dir.c - * - * Copyright (C) Shailabh Nagar, IBM Corp. 2004 - * Vivek Kashyap, IBM Corp. 2004 - * - * - * Directory operations for rcfs - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -/* Changes - * - * 08 Mar 2004 - * Created. 
- */ - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - - - -#define rcfs_positive(dentry) ((dentry)->d_inode && !d_unhashed((dentry))) - -int rcfs_empty(struct dentry *dentry) -{ - struct dentry *child; - int ret = 0; - - spin_lock(&dcache_lock); - list_for_each_entry(child, &dentry->d_subdirs, d_child) - if (!rcfs_is_magic(child) && rcfs_positive(child)) - goto out; - ret = 1; -out: - spin_unlock(&dcache_lock); - return ret; -} - - - - -/* Directory inode operations */ - - -int -rcfs_create(struct inode *dir, struct dentry *dentry, int mode, - struct nameidata *nd) -{ - return rcfs_mknod(dir, dentry, mode | S_IFREG, 0); -} -EXPORT_SYMBOL(rcfs_create); - - -/* Symlinks permitted ?? */ -int -rcfs_symlink(struct inode * dir, struct dentry *dentry, const char * symname) -{ - struct inode *inode; - int error = -ENOSPC; - - inode = rcfs_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0); - if (inode) { - int l = strlen(symname)+1; - error = page_symlink(inode, symname, l); - if (!error) { - if (dir->i_mode & S_ISGID) - inode->i_gid = dir->i_gid; - d_instantiate(dentry, inode); - dget(dentry); - } else - iput(inode); - } - return error; -} -EXPORT_SYMBOL(rcfs_symlink); - -int -rcfs_create_coredir(struct inode *dir, struct dentry *dentry) -{ - - struct rcfs_inode_info *ripar, *ridir; - int sz; - - ripar = RCFS_I(dir); - ridir = RCFS_I(dentry->d_inode); - - // Inform RC's - do Core operations - if (ckrm_is_core_valid(ripar->core)) { - sz = strlen(ripar->name) + strlen(dentry->d_name.name) + 2 ; - ridir->name = kmalloc(sz, GFP_KERNEL); - if (!ridir->name) { - return -ENOMEM; - } - snprintf(ridir->name, sz,"%s/%s", ripar->name, - dentry->d_name.name); - ridir->core = (*(ripar->core->classtype->alloc)) - (ripar->core,ridir->name); - } - else { - printk(KERN_ERR "rcfs_mkdir: Invalid parent core %p\n", - ripar->core); - return -EINVAL; - } - - return 0; -} -EXPORT_SYMBOL(rcfs_create_coredir); - - -int -rcfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) -{ - - int retval = 0; - ckrm_classtype_t *clstype; - -#if 0 - struct dentry *pd = list_entry(dir->i_dentry.next, struct dentry, - d_alias); - if ((!strcmp(pd->d_name.name, "/") && - !strcmp(dentry->d_name.name, "ce"))) { - // Call CE's mkdir if it has registered, else fail. 
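/*
 * Context note, pieced together from elsewhere in this patch: the
 * rcfs_eng_callbacks table tested below is the rbce_eng_callback_t
 * (mkdir/rmdir/mnt/umnt hooks, declared in include/linux/ckrm_ce.h)
 * that a classification engine registers via rcfs_register_engine()
 * in fs/rcfs/rootdir.c; this disabled branch would have delegated
 * mkdir under /rcfs/ce to that engine.
 */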
- if (rcfs_eng_callbacks.mkdir) { - return (*rcfs_eng_callbacks.mkdir)(dir, dentry, mode); - } else { - return -EINVAL; - } - } -#endif - - if (_rcfs_mknod(dir, dentry, mode | S_IFDIR, 0)) { - printk(KERN_ERR "rcfs_mkdir: error in _rcfs_mknod\n"); - return retval; - } - - dir->i_nlink++; - - // Inherit parent's ops since _rcfs_mknod assigns noperm ops - dentry->d_inode->i_op = dir->i_op; - dentry->d_inode->i_fop = dir->i_fop; - - - retval = rcfs_create_coredir(dir, dentry); - if (retval) { - simple_rmdir(dir,dentry); - return retval; - // goto mkdir_err; - } - - // create the default set of magic files - clstype = (RCFS_I(dentry->d_inode))->core->classtype; - rcfs_create_magic(dentry, &(((struct rcfs_magf*)clstype->mfdesc)[1]), - clstype->mfcount-1); - - return retval; - -//mkdir_err: - dir->i_nlink--; - return retval; -} -EXPORT_SYMBOL(rcfs_mkdir); - - -int -rcfs_rmdir(struct inode * dir, struct dentry * dentry) -{ - struct rcfs_inode_info *ri = RCFS_I(dentry->d_inode); - -#if 0 - struct dentry *pd = list_entry(dir->i_dentry.next, - struct dentry, d_alias); - if ((!strcmp(pd->d_name.name, "/") && - !strcmp(dentry->d_name.name, "ce"))) { - // Call CE's mkdir if it has registered, else fail. - if (rcfs_eng_callbacks.rmdir) { - return (*rcfs_eng_callbacks.rmdir)(dir, dentry); - } else { - return simple_rmdir(dir, dentry); - } - } - else if ((!strcmp(pd->d_name.name, "/") && - !strcmp(dentry->d_name.name, "network"))) { - return -EPERM; - } -#endif - - if (!rcfs_empty(dentry)) { - printk(KERN_ERR "rcfs_rmdir: directory not empty\n"); - goto out; - } - - // Core class removal - - if (ri->core == NULL) { - printk(KERN_ERR "rcfs_rmdir: core==NULL\n"); - // likely a race condition - return 0; - } - - if ((*(ri->core->classtype->free))(ri->core)) { - printk(KERN_ERR "rcfs_rmdir: ckrm_free_core_class failed\n"); - goto out; - } - ri->core = NULL ; // just to be safe - - // Clear magic files only after core successfully removed - rcfs_clear_magic(dentry); - - return simple_rmdir(dir, dentry); - -out: - return -EBUSY; -} -EXPORT_SYMBOL(rcfs_rmdir); - - -int -rcfs_unlink(struct inode *dir, struct dentry *dentry) -{ - // -ENOENT and not -ENOPERM to allow rm -rf to work despite - // magic files being present - return -ENOENT; -} -EXPORT_SYMBOL(rcfs_unlink); - -// rename is allowed on directories only -int -rcfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) -{ - if (S_ISDIR(old_dentry->d_inode->i_mode)) - return simple_rename(old_dir, old_dentry, new_dir, new_dentry); - else - return -EINVAL; -} -EXPORT_SYMBOL(rcfs_rename); - - -struct inode_operations rcfs_dir_inode_operations = { - .create = rcfs_create, - .lookup = simple_lookup, - .link = simple_link, - .unlink = rcfs_unlink, - .symlink = rcfs_symlink, - .mkdir = rcfs_mkdir, - .rmdir = rcfs_rmdir, - .mknod = rcfs_mknod, - .rename = rcfs_rename, -}; - - - - - -int -rcfs_root_create(struct inode *dir, struct dentry *dentry, int mode, - struct nameidata *nd) -{ - return -EPERM; -} - - -int -rcfs_root_symlink(struct inode * dir, struct dentry *dentry, - const char * symname) -{ - return -EPERM; -} - -int -rcfs_root_mkdir(struct inode *dir, struct dentry *dentry, int mode) -{ - return -EPERM; -} - -int -rcfs_root_rmdir(struct inode * dir, struct dentry * dentry) -{ - return -EPERM; -} - -int -rcfs_root_unlink(struct inode *dir, struct dentry *dentry) -{ - return -EPERM; -} - -int -rcfs_root_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) -{ - return -EPERM; -} - -int 
-rcfs_root_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) -{ - return -EPERM; -} - -struct inode_operations rcfs_rootdir_inode_operations = { - .create = rcfs_root_create, - .lookup = simple_lookup, - .link = simple_link, - .unlink = rcfs_root_unlink, - .symlink = rcfs_root_symlink, - .mkdir = rcfs_root_mkdir, - .rmdir = rcfs_root_rmdir, - .mknod = rcfs_root_mknod, - .rename = rcfs_root_rename, -}; diff --git a/fs/rcfs/inode.c b/fs/rcfs/inode.c deleted file mode 100644 index d9be67394..000000000 --- a/fs/rcfs/inode.c +++ /dev/null @@ -1,204 +0,0 @@ -/* - * fs/rcfs/inode.c - * - * Copyright (C) Shailabh Nagar, IBM Corp. 2004 - * Vivek Kashyap, IBM Corp. 2004 - * - * - * Resource class filesystem (rcfs) forming the - * user interface to Class-based Kernel Resource Management (CKRM). - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -/* Changes - * - * 05 Mar 2004 - * Created. - * 06 Mar 2004 - * Parsing for shares added - */ - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - - - -// Address of variable used as flag to indicate a magic file, -// ; value unimportant -int RCFS_IS_MAGIC; - - -struct inode *rcfs_get_inode(struct super_block *sb, int mode, dev_t dev) -{ - struct inode * inode = new_inode(sb); - - if (inode) { - inode->i_mode = mode; - inode->i_uid = current->fsuid; - inode->i_gid = current->fsgid; - inode->i_blksize = PAGE_CACHE_SIZE; - inode->i_blocks = 0; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - switch (mode & S_IFMT) { - default: - init_special_inode(inode, mode, dev); - break; - case S_IFREG: - // Treat as default assignment */ - inode->i_op = &rcfs_file_inode_operations; - // inode->i_fop = &rcfs_file_operations; - break; - case S_IFDIR: - // inode->i_op = &rcfs_dir_inode_operations; - inode->i_op = &rcfs_rootdir_inode_operations; - inode->i_fop = &simple_dir_operations; - - // directory inodes start off with i_nlink == 2 - // (for "." 
entry) - - inode->i_nlink++; - break; - case S_IFLNK: - inode->i_op = &page_symlink_inode_operations; - break; - } - } - return inode; -} - - - -int -_rcfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) -{ - struct inode *inode; - int error = -EPERM; - - if (dentry->d_inode) - return -EEXIST; - - inode = rcfs_get_inode(dir->i_sb, mode, dev); - if (inode) { - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - inode->i_mode |= S_ISGID; - } - d_instantiate(dentry, inode); - dget(dentry); - error = 0; - } - - return error; -} -EXPORT_SYMBOL(_rcfs_mknod); - - -int -rcfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) -{ - // User can only create directories, not files - if ((mode & S_IFMT) != S_IFDIR) - return -EINVAL; - - return dir->i_op->mkdir(dir, dentry, mode); -} -EXPORT_SYMBOL(rcfs_mknod); - - -struct dentry * -rcfs_create_internal(struct dentry *parent, struct rcfs_magf *magf, int magic) -{ - struct qstr qstr; - struct dentry *mfdentry ; - - // Get new dentry for name - qstr.name = magf->name; - qstr.len = strlen(magf->name); - qstr.hash = full_name_hash(magf->name,qstr.len); - mfdentry = lookup_hash(&qstr,parent); - - if (!IS_ERR(mfdentry)) { - int err; - - down(&parent->d_inode->i_sem); - if (magic && (magf->mode & S_IFDIR)) - err = parent->d_inode->i_op->mkdir(parent->d_inode, - mfdentry, magf->mode); - else { - err =_rcfs_mknod(parent->d_inode,mfdentry, - magf->mode,0); - // _rcfs_mknod doesn't increment parent's link count, - // i_op->mkdir does. - parent->d_inode->i_nlink++; - } - up(&parent->d_inode->i_sem); - - if (err) { - dput(mfdentry); - return mfdentry; - } - } - return mfdentry ; -} -EXPORT_SYMBOL(rcfs_create_internal); - -int -rcfs_delete_internal(struct dentry *mfdentry) -{ - struct dentry *parent ; - - if (!mfdentry || !mfdentry->d_parent) - return -EINVAL; - - parent = mfdentry->d_parent; - - if (!mfdentry->d_inode) { - return 0; - } - down(&mfdentry->d_inode->i_sem); - if (S_ISDIR(mfdentry->d_inode->i_mode)) - simple_rmdir(parent->d_inode, mfdentry); - else - simple_unlink(parent->d_inode, mfdentry); - up(&mfdentry->d_inode->i_sem); - - d_delete(mfdentry); - - return 0; -} -EXPORT_SYMBOL(rcfs_delete_internal); - -struct inode_operations rcfs_file_inode_operations = { - .getattr = simple_getattr, -}; - - - - - - diff --git a/fs/rcfs/magic.c b/fs/rcfs/magic.c deleted file mode 100644 index 8a811cbaf..000000000 --- a/fs/rcfs/magic.c +++ /dev/null @@ -1,530 +0,0 @@ -/* - * fs/rcfs/magic.c - * - * Copyright (C) Shailabh Nagar, IBM Corp. 2004 - * (C) Vivek Kashyap, IBM Corp. 2004 - * (C) Chandra Seetharaman, IBM Corp. 2004 - * (C) Hubertus Franke, IBM Corp. 2004 - * - * File operations for common magic files in rcfs, - * the user interface for CKRM. - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ - -/* - * Changes - * - * 23 Apr 2004 - * Created from code kept earlier in fs/rcfs/magic_*.c - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -/* - * Macros - * - * generic macros to assist in writing magic fileops - * - */ - -#define MAGIC_SHOW(FUNC) \ -static int \ -FUNC ## _show(struct seq_file *s, void *v) \ -{ \ - int rc=0; \ - ssize_t precnt; \ - ckrm_core_class_t *core ; \ - \ - core = (ckrm_core_class_t *) \ - (((struct rcfs_inode_info *)s->private)->core); \ - \ - if (!ckrm_is_core_valid(core)) { \ - return -EINVAL; \ - } \ - precnt = s->count ; \ - if (core->classtype->show_ ## FUNC) \ - rc = (* core->classtype->show_ ## FUNC)(core, s); \ - \ - if (s->count == precnt) \ - seq_printf(s, "No data to display\n"); \ - return rc; \ -}; - -#define MAGIC_OPEN(FUNC) \ -static int \ -FUNC ## _open(struct inode *inode, struct file *file) \ -{ \ - struct rcfs_inode_info *ri; \ - int ret=-EINVAL; \ - \ - if (file->f_dentry && file->f_dentry->d_parent) { \ - \ - ri = RCFS_I(file->f_dentry->d_parent->d_inode); \ - ret = single_open(file,FUNC ## _show, (void *)ri); \ - } \ - return ret; \ -} - -#define MAGIC_CLOSE(FUNC) \ -static int \ -FUNC ## _close(struct inode *inode, struct file *file) \ -{ \ - return single_release(inode,file); \ -} - -#define MAGIC_PARSE(FUNC) \ -static int \ -FUNC ## _parse(char *options, char **resstr, char **otherstr) \ -{ \ - char *p; \ - *resstr = NULL; \ - \ - if (!options) \ - return 0; \ - \ - while ((p = strsep(&options, ",")) != NULL) { \ - substring_t args[MAX_OPT_ARGS]; \ - int token; \ - \ - if (!*p) \ - continue; \ - \ - token = match_token(p, FUNC##_tokens, args); \ - switch (token) { \ - case FUNC ## _res_type: \ - *resstr = match_strdup(args); \ - if (!strcmp(#FUNC, "config")) { \ - char *str = p + strlen(p) + 1; \ - *otherstr = kmalloc(strlen(str) + 1, \ - GFP_KERNEL); \ - if (*otherstr == NULL) { \ - kfree(*resstr); \ - *resstr = NULL; \ - return 0; \ - } else { \ - strcpy(*otherstr, str); \ - return 1; \ - } \ - } \ - break; \ - case FUNC ## _str: \ - *otherstr = match_strdup(args); \ - break; \ - default: \ - return 0; \ - } \ - } \ - return (*resstr != NULL); \ -} - -#define MAGIC_WRITE(FUNC,CLSTYPEFUN) \ -static ssize_t \ -FUNC ## _write(struct file *file, const char __user *buf, \ - size_t count, loff_t *ppos) \ -{ \ - struct rcfs_inode_info *ri = \ - RCFS_I(file->f_dentry->d_parent->d_inode); \ - char *optbuf, *otherstr=NULL, *resname=NULL; \ - int done, rc = 0; \ - ckrm_core_class_t *core ; \ - \ - core = ri->core; \ - if (!ckrm_is_core_valid(core)) \ - return -EINVAL; \ - \ - if ((ssize_t) count < 0 \ - || (ssize_t) count > FUNC ## _max_input_size) \ - return -EINVAL; \ - \ - if (!access_ok(VERIFY_READ, buf, count)) \ - return -EFAULT; \ - \ - down(&(ri->vfs_inode.i_sem)); \ - \ - optbuf = kmalloc(FUNC ## _max_input_size, GFP_KERNEL); \ - __copy_from_user(optbuf, buf, count); \ - if (optbuf[count-1] == '\n') \ - optbuf[count-1]='\0'; \ - \ - done = FUNC ## _parse(optbuf, &resname, &otherstr); \ - \ - if (!done) { \ - printk(KERN_ERR "Error parsing FUNC \n"); \ - goto FUNC ## _write_out; \ - } \ - \ - if (core->classtype-> CLSTYPEFUN) { \ - rc = (*core->classtype->CLSTYPEFUN) \ - (core, resname, otherstr); \ - if (rc) { \ - printk(KERN_ERR "FUNC_write: CLSTYPEFUN error\n"); \ - goto FUNC ## _write_out; \ - } \ - } \ - \ -FUNC ## _write_out: \ - up(&(ri->vfs_inode.i_sem)); \ - kfree(optbuf); \ - kfree(otherstr); \ - kfree(resname); \ - return rc ? 
rc : count; \ -} - -#define MAGIC_RD_FILEOPS(FUNC) \ -struct file_operations FUNC ## _fileops = { \ - .open = FUNC ## _open, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = FUNC ## _close, \ -}; \ -EXPORT_SYMBOL(FUNC ## _fileops); - -#define MAGIC_RDWR_FILEOPS(FUNC) \ -struct file_operations FUNC ## _fileops = { \ - .open = FUNC ## _open, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = FUNC ## _close, \ - .write = FUNC ## _write, \ -}; \ -EXPORT_SYMBOL(FUNC ## _fileops); - -/* - * Shared function used by Members / Reclassify - */ - -#define MEMBERS_MAX_INPUT_SIZE 100 - -static ssize_t -members_reclassify_write(struct file *file, const char __user * buf, - size_t count, loff_t * ppos, int manual) -{ - struct rcfs_inode_info *ri = RCFS_I(file->f_dentry->d_inode); - char *optbuf; - int rc = -EINVAL; - ckrm_classtype_t *clstype; - - if ((ssize_t) count < 0 || (ssize_t) count > MEMBERS_MAX_INPUT_SIZE) - return -EINVAL; - if (!access_ok(VERIFY_READ, buf, count)) - return -EFAULT; - down(&(ri->vfs_inode.i_sem)); - optbuf = kmalloc(MEMBERS_MAX_INPUT_SIZE, GFP_KERNEL); - __copy_from_user(optbuf, buf, count); - if (optbuf[count - 1] == '\n') - optbuf[count - 1] = '\0'; - clstype = ri->core->classtype; - if (clstype->forced_reclassify) - rc = (*clstype->forced_reclassify) (manual ? ri->core: NULL, optbuf); - up(&(ri->vfs_inode.i_sem)); - kfree(optbuf); - return (!rc ? count : rc); - -} - -/* - * Reclassify - * - * pseudo file for reclassification of an object through CE - */ - -static ssize_t -reclassify_write(struct file *file, const char __user * buf, - size_t count, loff_t * ppos) -{ - return members_reclassify_write(file,buf,count,ppos,0); -} - -struct file_operations reclassify_fileops = { - .write = reclassify_write, -}; - -EXPORT_SYMBOL_GPL(reclassify_fileops); - -/* - * Config - * - * Set/get configuration parameters of a class. - */ - -/* - * Currently there are no per-class config parameters defined. - * Use existing code as a template - */ - -#define config_max_input_size 300 - -enum config_token_t { - config_str, config_res_type, config_err -}; - -static match_table_t config_tokens = { - {config_res_type, "res=%s"}, - {config_err, NULL}, -}; - -MAGIC_PARSE(config); -MAGIC_WRITE(config, set_config); -MAGIC_SHOW(config); -MAGIC_OPEN(config); -MAGIC_CLOSE(config); - -MAGIC_RDWR_FILEOPS(config); - -/* - * Members - * - * List members of a class - */ - -MAGIC_SHOW(members); -MAGIC_OPEN(members); -MAGIC_CLOSE(members); - -static ssize_t -members_write(struct file *file, const char __user * buf, - size_t count, loff_t * ppos) -{ - return members_reclassify_write(file,buf,count,ppos,1); -} - -MAGIC_RDWR_FILEOPS(members); - -/* - * Stats - * - * Get/reset class statistics - * No standard set of stats defined. Each resource controller chooses - * its own set of statistics to maintain and export. - */ - -#define stats_max_input_size 50 - -enum stats_token_t { - stats_res_type, stats_str, stats_err -}; - -static match_table_t stats_tokens = { - {stats_res_type, "res=%s"}, - {stats_str, NULL}, - {stats_err, NULL}, -}; - -MAGIC_PARSE(stats); -MAGIC_WRITE(stats, reset_stats); -MAGIC_SHOW(stats); -MAGIC_OPEN(stats); -MAGIC_CLOSE(stats); - -MAGIC_RDWR_FILEOPS(stats); - -/* - * Shares - * - * Set/get shares of a taskclass. 
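 * A hypothetical usage sketch (class path and resource name are
 * illustrative only, derived from the shares_tokens table below):
 *
 *   echo "res=cpu,guarantee=25,limit=50" > /rcfs/taskclass/c1/shares
 *
 * which sets this class's guarantee and limit as fractions of the
 * parent's total_guarantee and max_limit.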
- * Share types and semantics are defined by rcfs and ckrm core - */ - -#define SHARES_MAX_INPUT_SIZE 300 - -/* - * The enums for the share types should match the indices expected by - * array parameter to ckrm_set_resshare - * - * Note only the first NUM_SHAREVAL enums correspond to share types, - * the remaining ones are for token matching purposes - */ - -enum share_token_t { - MY_GUAR, MY_LIM, TOT_GUAR, MAX_LIM, SHARE_RES_TYPE, SHARE_ERR -}; - -/* Token matching for parsing input to this magic file */ -static match_table_t shares_tokens = { - {SHARE_RES_TYPE, "res=%s"}, - {MY_GUAR, "guarantee=%d"}, - {MY_LIM, "limit=%d"}, - {TOT_GUAR, "total_guarantee=%d"}, - {MAX_LIM, "max_limit=%d"}, - {SHARE_ERR, NULL} -}; - -static int -shares_parse(char *options, char **resstr, struct ckrm_shares *shares) -{ - char *p; - int option; - - if (!options) - return 1; - while ((p = strsep(&options, ",")) != NULL) { - substring_t args[MAX_OPT_ARGS]; - int token; - - if (!*p) - continue; - token = match_token(p, shares_tokens, args); - switch (token) { - case SHARE_RES_TYPE: - *resstr = match_strdup(args); - break; - case MY_GUAR: - if (match_int(args, &option)) - return 0; - shares->my_guarantee = option; - break; - case MY_LIM: - if (match_int(args, &option)) - return 0; - shares->my_limit = option; - break; - case TOT_GUAR: - if (match_int(args, &option)) - return 0; - shares->total_guarantee = option; - break; - case MAX_LIM: - if (match_int(args, &option)) - return 0; - shares->max_limit = option; - break; - default: - return 0; - } - } - return 1; -} - -static ssize_t -shares_write(struct file *file, const char __user * buf, - size_t count, loff_t * ppos) -{ - struct inode *inode = file->f_dentry->d_inode; - struct rcfs_inode_info *ri; - char *optbuf; - int rc = 0; - struct ckrm_core_class *core; - int done; - char *resname = NULL; - - struct ckrm_shares newshares = { - CKRM_SHARE_UNCHANGED, - CKRM_SHARE_UNCHANGED, - CKRM_SHARE_UNCHANGED, - CKRM_SHARE_UNCHANGED, - CKRM_SHARE_UNCHANGED, - CKRM_SHARE_UNCHANGED - }; - if ((ssize_t) count < 0 || (ssize_t) count > SHARES_MAX_INPUT_SIZE) - return -EINVAL; - if (!access_ok(VERIFY_READ, buf, count)) - return -EFAULT; - ri = RCFS_I(file->f_dentry->d_parent->d_inode); - if (!ri || !ckrm_is_core_valid((ckrm_core_class_t *) (ri->core))) { - printk(KERN_ERR "shares_write: Error accessing core class\n"); - return -EFAULT; - } - down(&inode->i_sem); - core = ri->core; - optbuf = kmalloc(SHARES_MAX_INPUT_SIZE, GFP_KERNEL); - if (!optbuf) { - up(&inode->i_sem); - return -ENOMEM; - } - __copy_from_user(optbuf, buf, count); - if (optbuf[count - 1] == '\n') - optbuf[count - 1] = '\0'; - done = shares_parse(optbuf, &resname, &newshares); - if (!done) { - printk(KERN_ERR "Error parsing shares\n"); - rc = -EINVAL; - goto write_out; - } - if (core->classtype->set_shares) { - rc = (*core->classtype->set_shares) (core, resname, &newshares); - if (rc) { - printk(KERN_ERR - "shares_write: resctlr share set error\n"); - goto write_out; - } - } - printk(KERN_ERR "Set %s shares to %d %d %d %d\n", - resname, - newshares.my_guarantee, - newshares.my_limit, - newshares.total_guarantee, newshares.max_limit); - rc = count; - -write_out: - up(&inode->i_sem); - kfree(optbuf); - kfree(resname); - return rc; -} - -MAGIC_SHOW(shares); -MAGIC_OPEN(shares); -MAGIC_CLOSE(shares); - -MAGIC_RDWR_FILEOPS(shares); - -/* - * magic file creation/deletion - */ - -int rcfs_clear_magic(struct dentry *parent) -{ - struct dentry *mftmp, *mfdentry; - - list_for_each_entry_safe(mfdentry, mftmp, 
&parent->d_subdirs, d_child) { - if (!rcfs_is_magic(mfdentry)) - continue; - if (rcfs_delete_internal(mfdentry)) - printk(KERN_ERR - "rcfs_clear_magic: error deleting one\n"); - } - return 0; -} - -EXPORT_SYMBOL_GPL(rcfs_clear_magic); - -int rcfs_create_magic(struct dentry *parent, struct rcfs_magf magf[], int count) -{ - int i; - struct dentry *mfdentry; - - for (i = 0; i < count; i++) { - mfdentry = rcfs_create_internal(parent, &magf[i], 0); - if (IS_ERR(mfdentry)) { - rcfs_clear_magic(parent); - return -ENOMEM; - } - RCFS_I(mfdentry->d_inode)->core = RCFS_I(parent->d_inode)->core; - mfdentry->d_fsdata = &RCFS_IS_MAGIC; - if (magf[i].i_fop) - mfdentry->d_inode->i_fop = magf[i].i_fop; - if (magf[i].i_op) - mfdentry->d_inode->i_op = magf[i].i_op; - } - return 0; -} - -EXPORT_SYMBOL_GPL(rcfs_create_magic); diff --git a/fs/rcfs/rootdir.c b/fs/rcfs/rootdir.c deleted file mode 100644 index 54e199add..000000000 --- a/fs/rcfs/rootdir.c +++ /dev/null @@ -1,227 +0,0 @@ -/* - * fs/rcfs/rootdir.c - * - * Copyright (C) Vivek Kashyap, IBM Corp. 2004 - * - * - * Functions for creating root directories and magic files - * for classtypes and classification engines under rcfs - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -/* - * Changes - * - * 08 April 2004 - * Created. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - -rbce_eng_callback_t rcfs_eng_callbacks = { - NULL, NULL -}; - -int rcfs_register_engine(rbce_eng_callback_t * rcbs) -{ - if (!rcbs->mkdir || rcfs_eng_callbacks.mkdir) { - return -EINVAL; - } - rcfs_eng_callbacks = *rcbs; - rcfs_engine_regd++; - return 0; -} - -EXPORT_SYMBOL_GPL(rcfs_register_engine); - -int rcfs_unregister_engine(rbce_eng_callback_t * rcbs) -{ - if (!rcbs->mkdir || !rcfs_eng_callbacks.mkdir || - (rcbs->mkdir != rcfs_eng_callbacks.mkdir)) { - return -EINVAL; - } - rcfs_eng_callbacks.mkdir = NULL; - rcfs_eng_callbacks.rmdir = NULL; - rcfs_engine_regd--; - return 0; -} - -EXPORT_SYMBOL(rcfs_unregister_engine); - -/* - * rcfs_mkroot - * Create and return a "root" dentry under /rcfs. 
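 * (for example /rcfs/taskclass for the "taskclass" classtype, since
 * rcfs_register_classtype() below names each root after its classtype)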
- * Also create associated magic files - * - * @mfdesc: array of rcfs_magf describing root dir and its magic files - * @count: number of entries in mfdesc - * @core: core class to be associated with root - * @rootde: output parameter to return the newly created root dentry - */ - -int rcfs_mkroot(struct rcfs_magf *mfdesc, int mfcount, struct dentry **rootde) -{ - int sz; - struct rcfs_magf *rootdesc = &mfdesc[0]; - struct dentry *dentry; - struct rcfs_inode_info *rootri; - - if ((mfcount < 0) || (!mfdesc)) - return -EINVAL; - - rootdesc = &mfdesc[0]; - printk(KERN_DEBUG "allocating classtype root <%s>\n", rootdesc->name); - dentry = rcfs_create_internal(rcfs_rootde, rootdesc, 0); - - if (!dentry) { - printk(KERN_ERR "Could not create %s\n", rootdesc->name); - return -ENOMEM; - } - rootri = RCFS_I(dentry->d_inode); - sz = strlen(rootdesc->name) + strlen(RCFS_ROOT) + 2; - rootri->name = kmalloc(sz, GFP_KERNEL); - if (!rootri->name) { - printk(KERN_ERR "Error allocating name for %s\n", - rootdesc->name); - rcfs_delete_internal(dentry); - return -ENOMEM; - } - snprintf(rootri->name, sz, "%s/%s", RCFS_ROOT, rootdesc->name); - if (rootdesc->i_fop) - dentry->d_inode->i_fop = rootdesc->i_fop; - if (rootdesc->i_op) - dentry->d_inode->i_op = rootdesc->i_op; - - /* set output parameters */ - *rootde = dentry; - - return 0; -} - -EXPORT_SYMBOL_GPL(rcfs_mkroot); - -int rcfs_rmroot(struct dentry *rootde) -{ - struct rcfs_inode_info *ri; - - if (!rootde) - return -EINVAL; - - rcfs_clear_magic(rootde); - ri = RCFS_I(rootde->d_inode); - kfree(ri->name); - ri->name = NULL; - rcfs_delete_internal(rootde); - return 0; -} - -EXPORT_SYMBOL_GPL(rcfs_rmroot); - -int rcfs_register_classtype(ckrm_classtype_t * clstype) -{ - int rc; - struct rcfs_inode_info *rootri; - struct rcfs_magf *mfdesc; - - if (genmfdesc[clstype->mfidx] == NULL) { - return -ENOMEM; - } - - clstype->mfdesc = (void *)genmfdesc[clstype->mfidx]->rootmf; - clstype->mfcount = genmfdesc[clstype->mfidx]->rootmflen; - - mfdesc = (struct rcfs_magf *)clstype->mfdesc; - - /* rcfs root entry has the same name as the classtype */ - strncpy(mfdesc[0].name, clstype->name, RCFS_MAGF_NAMELEN); - - rc = rcfs_mkroot(mfdesc, clstype->mfcount, - (struct dentry **)&(clstype->rootde)); - if (rc) - return rc; - rootri = RCFS_I(((struct dentry *)(clstype->rootde))->d_inode); - rootri->core = clstype->default_class; - clstype->default_class->name = rootri->name; - ckrm_core_grab(clstype->default_class); - - /* Create magic files under root */ - if ((rc = rcfs_create_magic(clstype->rootde, &mfdesc[1], - clstype->mfcount - 1))) { - kfree(rootri->name); - rootri->name = NULL; - rcfs_delete_internal(clstype->rootde); - return rc; - } - return rc; -} - -EXPORT_SYMBOL_GPL(rcfs_register_classtype); - -int rcfs_deregister_classtype(ckrm_classtype_t * clstype) -{ - int rc; - - rc = rcfs_rmroot((struct dentry *)clstype->rootde); - if (!rc) { - clstype->default_class->name = NULL; - ckrm_core_drop(clstype->default_class); - } - return rc; -} - -EXPORT_SYMBOL_GPL(rcfs_deregister_classtype); - -#ifdef CONFIG_CKRM_TYPE_TASKCLASS -extern struct rcfs_mfdesc tc_mfdesc; -#endif - -#ifdef CONFIG_CKRM_TYPE_SOCKETCLASS -extern struct rcfs_mfdesc sock_mfdesc; -#endif - -/* Common root and magic file entries. - * root name, root permissions, magic file names and magic file permissions - * are needed by all entities (classtypes and classification engines) existing - * under the rcfs mount point - * - * The common sets of these attributes are listed here as a table. 
Individual - * classtypes and classification engines can simply specify the index into the - * table to initialize their magf entries. - */ - -struct rcfs_mfdesc *genmfdesc[CKRM_MAX_CLASSTYPES] = { -#ifdef CONFIG_CKRM_TYPE_TASKCLASS - &tc_mfdesc, -#else - NULL, -#endif -#ifdef CONFIG_CKRM_TYPE_SOCKETCLASS - &sock_mfdesc, -#else - NULL, -#endif - -}; diff --git a/fs/rcfs/socket_fs.c b/fs/rcfs/socket_fs.c deleted file mode 100644 index 492fb092c..000000000 --- a/fs/rcfs/socket_fs.c +++ /dev/null @@ -1,338 +0,0 @@ -/* ckrm_socketaq.c - * - * Copyright (C) Vivek Kashyap, IBM Corp. 2004 - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -/* Changes - * Initial version - */ - -/******************************************************************************* - * Socket class type - * - * Defines the root structure for socket based classes. Currently only inbound - * connection control is supported based on prioritized accept queues. - ******************************************************************************/ - - -#include -#include - -extern int rcfs_create(struct inode *,struct dentry *, int, struct nameidata *); -extern int rcfs_unlink(struct inode *, struct dentry *); -extern int rcfs_symlink(struct inode *, struct dentry *, const char *); -extern int rcfs_mknod(struct inode *, struct dentry *, int mode, dev_t); -extern int rcfs_mkdir(struct inode *, struct dentry *, int); -extern int rcfs_rmdir(struct inode *, struct dentry *); -extern int rcfs_rename(struct inode *, struct dentry *, struct inode *, - struct dentry *); - -extern int rcfs_create_coredir(struct inode *, struct dentry *); -int sock_mkdir(struct inode *, struct dentry *, int mode); -int sock_rmdir(struct inode *, struct dentry *); - - -int sock_create_noperm(struct inode *, struct dentry *,int, struct nameidata *); -int sock_unlink_noperm(struct inode *,struct dentry *); -int sock_mkdir_noperm(struct inode *,struct dentry *,int); -int sock_rmdir_noperm(struct inode *,struct dentry *); -int sock_mknod_noperm(struct inode *,struct dentry *,int, dev_t); - -void sock_set_directory(void); - -extern struct file_operations config_fileops, - members_fileops, - shares_fileops, - stats_fileops, - target_fileops; - - -struct inode_operations my_iops = { - .create = rcfs_create, - .lookup = simple_lookup, - .link = simple_link, - .unlink = rcfs_unlink, - .symlink = rcfs_symlink, - .mkdir = sock_mkdir, - .rmdir = sock_rmdir, - .mknod = rcfs_mknod, - .rename = rcfs_rename, -}; - -struct inode_operations class_iops = { - .create = sock_create_noperm, - .lookup = simple_lookup, - .link = simple_link, - .unlink = sock_unlink_noperm, - .symlink = rcfs_symlink, - .mkdir = sock_mkdir_noperm, - .rmdir = sock_rmdir_noperm, - .mknod = sock_mknod_noperm, - .rename = rcfs_rename, -}; - -struct inode_operations sub_iops = { - .create = sock_create_noperm, - .lookup = simple_lookup, - .link = simple_link, - .unlink = sock_unlink_noperm, - .symlink = rcfs_symlink, - .mkdir = sock_mkdir_noperm, - .rmdir = sock_rmdir_noperm, - .mknod = sock_mknod_noperm, - .rename = rcfs_rename, -}; - -struct rcfs_magf def_magf = { - .mode = RCFS_DEFAULT_DIR_MODE, - .i_op = &sub_iops, - .i_fop = NULL, -}; - -struct rcfs_magf sock_rootdesc[] = { - { - // .name = should not be set, copy from
classtype name, - .mode = RCFS_DEFAULT_DIR_MODE, - .i_op = &my_iops, - //.i_fop = &simple_dir_operations, - .i_fop = NULL, - }, - { - .name = "members", - .mode = RCFS_DEFAULT_FILE_MODE, - .i_op = &my_iops, - .i_fop = &members_fileops, - }, - { - .name = "target", - .mode = RCFS_DEFAULT_FILE_MODE, - .i_op = &my_iops, - .i_fop = &target_fileops, - }, -}; - -struct rcfs_magf sock_magf[] = { - { - .name = "config", - .mode = RCFS_DEFAULT_FILE_MODE, - .i_op = &my_iops, - .i_fop = &config_fileops, - }, - { - .name = "members", - .mode = RCFS_DEFAULT_FILE_MODE, - .i_op = &my_iops, - .i_fop =&members_fileops, - }, - { - .name = "shares", - .mode = RCFS_DEFAULT_FILE_MODE, - .i_op = &my_iops, - .i_fop = &shares_fileops, - }, - { - .name = "stats", - .mode = RCFS_DEFAULT_FILE_MODE, - .i_op = &my_iops, - .i_fop = &stats_fileops, - }, - { - .name = "target", - .mode = RCFS_DEFAULT_FILE_MODE, - .i_op = &my_iops, - .i_fop = &target_fileops, - }, -}; - -struct rcfs_magf sub_magf[] = { - { - .name = "config", - .mode = RCFS_DEFAULT_FILE_MODE, - .i_op = &my_iops, - .i_fop = &config_fileops, - }, - { - .name = "shares", - .mode = RCFS_DEFAULT_FILE_MODE, - .i_op = &my_iops, - .i_fop = &shares_fileops, - }, - { - .name = "stats", - .mode = RCFS_DEFAULT_FILE_MODE, - .i_op = &my_iops, - .i_fop = &stats_fileops, - }, -}; - -struct rcfs_mfdesc sock_mfdesc = { - .rootmf = sock_rootdesc, - .rootmflen = (sizeof(sock_rootdesc)/sizeof(struct rcfs_magf)), -}; - - -#define SOCK_MAX_MAGF (sizeof(sock_magf)/sizeof(struct rcfs_magf)) -#define LAQ_MAX_SUBMAGF (sizeof(sub_magf)/sizeof(struct rcfs_magf)) - -int -sock_rmdir(struct inode *p, struct dentry *me) -{ - struct dentry *mftmp, *mfdentry ; - - // delete all magic sub directories - list_for_each_entry_safe(mfdentry, mftmp, &me->d_subdirs, d_child) { - if (S_ISDIR(mfdentry->d_inode->i_mode)) - rcfs_rmdir(me->d_inode, mfdentry); - } - // delete ourselves - rcfs_rmdir(p,me); - - return 0; -} - -#ifdef NUM_ACCEPT_QUEUES -#define LAQ_NUM_ACCEPT_QUEUES NUM_ACCEPT_QUEUES -#else -#define LAQ_NUM_ACCEPT_QUEUES 0 -#endif - -int -sock_mkdir(struct inode *dir, struct dentry *dentry, int mode) -{ - int retval = 0; - int i,j; - struct dentry *pentry, *mfdentry; - - if (_rcfs_mknod(dir, dentry, mode | S_IFDIR, 0)) { - printk(KERN_ERR "rcfs_mkdir: error reaching parent\n"); - return retval; - } - - // Needed if only _rcfs_mknod is used instead of i_op->mkdir - dir->i_nlink++; - - retval = rcfs_create_coredir(dir, dentry); - if (retval) - goto mkdir_err; - - /* create the default set of magic files */ - for (i =0; i < SOCK_MAX_MAGF; i++) { - mfdentry = rcfs_create_internal(dentry, &sock_magf[i],0); - mfdentry->d_fsdata = &RCFS_IS_MAGIC; - RCFS_I(mfdentry->d_inode)->core = - RCFS_I(dentry->d_inode)->core; - if (sock_magf[i].i_fop) - mfdentry->d_inode->i_fop = sock_magf[i].i_fop; - if (sock_magf[i].i_op) - mfdentry->d_inode->i_op = sock_magf[i].i_op; - } - - for (i=1; i < LAQ_NUM_ACCEPT_QUEUES; i++) { - j = sprintf(def_magf.name, "%d",i); - def_magf.name[j] = '\0'; - - pentry = rcfs_create_internal(dentry, &def_magf,0); - retval = rcfs_create_coredir(dentry->d_inode, pentry); - if (retval) - goto mkdir_err; - for (j=0; j < LAQ_MAX_SUBMAGF; j++) { - mfdentry = rcfs_create_internal(pentry, &sub_magf[j],0); - mfdentry->d_fsdata = &RCFS_IS_MAGIC; - RCFS_I(mfdentry->d_inode)->core = - RCFS_I(pentry->d_inode)->core; - if (sub_magf[j].i_fop) - mfdentry->d_inode->i_fop = sub_magf[j].i_fop; - if (sub_magf[j].i_op) - mfdentry->d_inode->i_op = sub_magf[j].i_op; - } - pentry->d_inode->i_op = 
&sub_iops; - } - dentry->d_inode->i_op = &class_iops; - return 0; - -mkdir_err: - // Needed - dir->i_nlink--; - return retval; -} -#ifndef NUM_ACCEPT_QUEUES -#define NUM_ACCEPT_QUEUES 0 -#endif - -char * -sock_get_name(struct ckrm_core_class *c) -{ - char *p = (char *)c->name; - - while(*p) - p++; - while( *p != '/' && p != c->name) - p--; - - return ++p; -} - -int -sock_create_noperm(struct inode *dir,struct dentry *dentry,int mode, struct nameidata *nd) -{ - return -EPERM; -} - -int -sock_unlink_noperm(struct inode *dir,struct dentry *dentry) -{ - return -EPERM; -} - -int -sock_mkdir_noperm(struct inode *dir,struct dentry *dentry, int mode) -{ - return -EPERM; -} - -int -sock_rmdir_noperm(struct inode *dir,struct dentry *dentry) -{ - return -EPERM; -} - -int -sock_mknod_noperm(struct inode *dir,struct dentry *dentry,int mode, dev_t dev) -{ - return -EPERM; -} - -#if 0 -void -sock_set_directory() -{ - struct dentry *pentry, *dentry; - - pentry = rcfs_set_magf_byname("listen_aq", (void *)&my_dir_magf[0]); - if (pentry) { - dentry = rcfs_create_internal(pentry, &my_dir_magf[1],0); - if (my_dir_magf[1].i_fop) - dentry->d_inode->i_fop = my_dir_magf[1].i_fop; - RCFS_I(dentry->d_inode)->core = - RCFS_I(pentry->d_inode)->core; - dentry = rcfs_create_internal(pentry, &my_dir_magf[2],0); - if (my_dir_magf[2].i_fop) - dentry->d_inode->i_fop = my_dir_magf[2].i_fop; - RCFS_I(dentry->d_inode)->core = - RCFS_I(pentry->d_inode)->core; - } - else { - printk(KERN_ERR "Could not create /rcfs/listen_aq\n" - "Perhaps /rcfs needs to be mounted\n"); - } -} -#endif - diff --git a/fs/rcfs/super.c b/fs/rcfs/super.c deleted file mode 100644 index 8403f28d6..000000000 --- a/fs/rcfs/super.c +++ /dev/null @@ -1,304 +0,0 @@ -/* - * fs/rcfs/super.c - * - * Copyright (C) Shailabh Nagar, IBM Corp. 2004 - * Vivek Kashyap, IBM Corp. 2004 - * - * Super block operations for rcfs - * - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -/* - * Changes - * - * 08 Mar 2004 - * Created. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -static kmem_cache_t *rcfs_inode_cachep; - -struct rcfs_inode_info *RCFS_I(struct inode *inode) -{ - return container_of(inode, struct rcfs_inode_info, vfs_inode); -} - -EXPORT_SYMBOL_GPL(RCFS_I); - -static struct inode *rcfs_alloc_inode(struct super_block *sb) -{ - struct rcfs_inode_info *ri; - ri = (struct rcfs_inode_info *)kmem_cache_alloc(rcfs_inode_cachep, - SLAB_KERNEL); - if (!ri) - return NULL; - ri->name = NULL; - return &ri->vfs_inode; -} - -static void rcfs_destroy_inode(struct inode *inode) -{ - struct rcfs_inode_info *ri = RCFS_I(inode); - - kfree(ri->name); - kmem_cache_free(rcfs_inode_cachep, ri); -} - -static void -rcfs_init_once(void *foo, kmem_cache_t * cachep, unsigned long flags) -{ - struct rcfs_inode_info *ri = (struct rcfs_inode_info *)foo; - - if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == - SLAB_CTOR_CONSTRUCTOR) - inode_init_once(&ri->vfs_inode); -} - -int rcfs_init_inodecache(void) -{ - rcfs_inode_cachep = kmem_cache_create("rcfs_inode_cache", - sizeof(struct rcfs_inode_info), - 0, - SLAB_HWCACHE_ALIGN | - SLAB_RECLAIM_ACCOUNT, - rcfs_init_once, NULL); - if (rcfs_inode_cachep == NULL) - return -ENOMEM; - return 0; -} - -void rcfs_destroy_inodecache(void) -{ - printk(KERN_WARNING "destroy inodecache was called\n"); - if (kmem_cache_destroy(rcfs_inode_cachep)) - printk(KERN_INFO - "rcfs_inode_cache: not all structures were freed\n"); -} - -struct super_operations rcfs_super_ops = { - .alloc_inode = rcfs_alloc_inode, - .destroy_inode = rcfs_destroy_inode, - .statfs = simple_statfs, - .drop_inode = generic_delete_inode, -}; - -struct dentry *rcfs_rootde; /* redundant; can also get it from sb */ -static struct inode *rcfs_root; -static struct rcfs_inode_info *rcfs_rootri; - -static int rcfs_fill_super(struct super_block *sb, void *data, int silent) -{ - struct inode *inode; - struct dentry *root; - struct rcfs_inode_info *rootri; - struct ckrm_classtype *clstype; - int i, rc; - - sb->s_fs_info = NULL; - if (rcfs_mounted) { - return -EPERM; - } - rcfs_mounted++; - - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; - sb->s_magic = RCFS_MAGIC; - sb->s_op = &rcfs_super_ops; - inode = rcfs_get_inode(sb, S_IFDIR | 0755, 0); - if (!inode) - return -ENOMEM; - inode->i_op = &rcfs_rootdir_inode_operations; - - root = d_alloc_root(inode); - if (!root) { - iput(inode); - return -ENOMEM; - } - sb->s_root = root; - - /* Link inode and core class */ - rootri = RCFS_I(inode); - rootri->name = kmalloc(strlen(RCFS_ROOT) + 1, GFP_KERNEL); - if (!rootri->name) { - d_delete(root); - iput(inode); - return -ENOMEM; - } - strcpy(rootri->name, RCFS_ROOT); - rootri->core = NULL; - - rcfs_root = inode; - sb->s_fs_info = rcfs_root = inode; - rcfs_rootde = root; - rcfs_rootri = rootri; - - /* register metatypes */ - for (i = 0; i < CKRM_MAX_CLASSTYPES; i++) { - clstype = ckrm_classtypes[i]; - if (clstype == NULL) - continue; - printk(KERN_DEBUG "A non null classtype\n"); - - if ((rc = rcfs_register_classtype(clstype))) - continue; /* could return with an error too */ - } - - /* - * do post-mount initializations needed by CE - * this is distinct from CE registration done on rcfs module load - */ - if (rcfs_engine_regd) { - if (rcfs_eng_callbacks.mnt) - if ((rc = (*rcfs_eng_callbacks.mnt) ())) { - printk(KERN_ERR "Error in CE mnt %d\n", rc); - } - } - /* - * Following comment 
handled by code above; keep nonetheless if it - * can be done better - * - * register CE's with rcfs - * check if CE loaded - * call rcfs_register_engine for each classtype - * AND rcfs_mkroot (preferably subsume latter in former) - */ - return 0; -} - -static struct super_block *rcfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) -{ - return get_sb_nodev(fs_type, flags, data, rcfs_fill_super); -} - -void rcfs_kill_sb(struct super_block *sb) -{ - int i, rc; - struct ckrm_classtype *clstype; - - if (sb->s_fs_info != rcfs_root) { - generic_shutdown_super(sb); - return; - } - rcfs_mounted--; - - for (i = 0; i < CKRM_MAX_CLASSTYPES; i++) { - clstype = ckrm_classtypes[i]; - if (clstype == NULL || clstype->rootde == NULL) - continue; - - if ((rc = rcfs_deregister_classtype(clstype))) { - printk(KERN_ERR "Error removing classtype %s\n", - clstype->name); - } - } - - /* - * do pre-umount shutdown needed by CE - * this is distinct from CE deregistration done on rcfs module unload - */ - if (rcfs_engine_regd) { - if (rcfs_eng_callbacks.umnt) - if ((rc = (*rcfs_eng_callbacks.umnt) ())) { - printk(KERN_ERR "Error in CE umnt %d\n", rc); - /* TODO: return ; until error handling improves */ - } - } - /* - * Following comment handled by code above; keep nonetheless if it - * can be done better - * - * deregister CE with rcfs - * Check if loaded - * if ce is in one directory /rcfs/ce, - * rcfs_deregister_engine for all classtypes within above - * codebase - * followed by - * rcfs_rmroot here - * if ce in multiple (per-classtype) directories - * call rbce_deregister_engine within ckrm_deregister_classtype - * - * following will automatically clear rcfs root entry including its - * rcfs_inode_info - */ - - generic_shutdown_super(sb); -} - -static struct file_system_type rcfs_fs_type = { - .name = "rcfs", - .get_sb = rcfs_get_sb, - .kill_sb = rcfs_kill_sb, -}; - -struct rcfs_functions my_rcfs_fn = { - .mkroot = rcfs_mkroot, - .rmroot = rcfs_rmroot, - .register_classtype = rcfs_register_classtype, - .deregister_classtype = rcfs_deregister_classtype, -}; - -extern struct rcfs_functions rcfs_fn; - -static int __init init_rcfs_fs(void) -{ - int ret; - - ret = register_filesystem(&rcfs_fs_type); - if (ret) - goto init_register_err; - ret = rcfs_init_inodecache(); - if (ret) - goto init_cache_err; - rcfs_fn = my_rcfs_fn; - /* - * Due to tight coupling of this module with ckrm - * do not allow this module to be removed. - */ - try_module_get(THIS_MODULE); - return ret; - -init_cache_err: - unregister_filesystem(&rcfs_fs_type); -init_register_err: - return ret; -} - -static void __exit exit_rcfs_fs(void) -{ - rcfs_destroy_inodecache(); - unregister_filesystem(&rcfs_fs_type); -} - -module_init(init_rcfs_fs) -module_exit(exit_rcfs_fs) - -MODULE_LICENSE("GPL"); diff --git a/fs/rcfs/tc_magic.c b/fs/rcfs/tc_magic.c deleted file mode 100644 index 16864094c..000000000 --- a/fs/rcfs/tc_magic.c +++ /dev/null @@ -1,94 +0,0 @@ -/* - * fs/rcfs/tc_magic.c - * - * Copyright (C) Shailabh Nagar, IBM Corp. 2004 - * (C) Vivek Kashyap, IBM Corp. 2004 - * (C) Chandra Seetharaman, IBM Corp. 2004 - * (C) Hubertus Franke, IBM Corp. 
2004 - * - * - * define magic fileops for taskclass classtype - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -/* Changes - * - * 23 Apr 2004 - * Created. - * - */ - -#include -#include - - -/******************************************************************************* - * Taskclass general - * - * Define structures for taskclass root directory and its magic files - * In taskclasses, there is one set of magic files, created automatically under - * the taskclass root (upon classtype registration) and each directory (class) - * created subsequently. However, classtypes can also choose to have different - * sets of magic files created under their root and other directories under root - * using their mkdir function. RCFS only provides helper functions for creating - * the root directory and its magic files - * - *******************************************************************************/ - -#define TC_FILE_MODE (S_IFREG | S_IRUGO | S_IWUSR) - -#define NR_TCROOTMF 6 -struct rcfs_magf tc_rootdesc[NR_TCROOTMF] = { - /* First entry must be root */ - { -// .name = should not be set, copy from classtype name - .mode = RCFS_DEFAULT_DIR_MODE, - .i_op = &rcfs_dir_inode_operations, - .i_fop = &simple_dir_operations, - }, - /* Rest are root's magic files */ - { - .name = "target", - .mode = TC_FILE_MODE, - .i_fop = &target_fileops, - .i_op = &rcfs_file_inode_operations, - }, - { - .name = "config", - .mode = TC_FILE_MODE, - .i_fop = &config_fileops, - .i_op = &rcfs_file_inode_operations, - }, - { - .name = "members", - .mode = TC_FILE_MODE, - .i_fop = &members_fileops, - .i_op = &rcfs_file_inode_operations, - }, - { - .name = "stats", - .mode = TC_FILE_MODE, - .i_fop = &stats_fileops, - .i_op = &rcfs_file_inode_operations, - }, - { - .name = "shares", - .mode = TC_FILE_MODE, - .i_fop = &shares_fileops, - .i_op = &rcfs_file_inode_operations, - }, -}; - -struct rcfs_mfdesc tc_mfdesc = { - .rootmf = tc_rootdesc, - .rootmflen = NR_TCROOTMF, -}; - - diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h index e16118309..82567d329 100644 --- a/include/asm-x86_64/unistd.h +++ b/include/asm-x86_64/unistd.h @@ -564,16 +564,6 @@ __SYSCALL(__NR_kexec_load, sys_kexec_load) __SYSCALL(__NR_waitid, sys_waitid) #define __NR_syscall_max __NR_waitid -#ifdef USE_IOPRIO_SYSCALLS -#warning MEF if necessary may need to adjust ioprio syscalls -#define __NR_ioprio_set 247 -__SYSCALL(__NR_ioprio_set, sys_ioprio_set); -#define __NR_ioprio_get 248 -__SYSCALL(__NR_ioprio_get, sys_ioprio_get); -#else -#warning MEF not including sys_ioprio_{set,get} syscalls -#endif - #ifndef __NO_STUBS /* user-visible error numbers are in the range -1 - -4095 */ diff --git a/include/linux/ckrm-io.h b/include/linux/ckrm-io.h deleted file mode 100644 index 70277c7ef..000000000 --- a/include/linux/ckrm-io.h +++ /dev/null @@ -1,41 +0,0 @@ -/* linux/drivers/block/ckrm_io.c : Block I/O Resource Controller for CKRM - * - * Copyright (C) Shailabh Nagar, IBM Corp. 2004 - * - * - * Provides best-effort block I/O bandwidth control for CKRM - * This file provides the CKRM API. The underlying scheduler is a - * modified Complete-Fair Queueing (CFQ) iosched. 
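 * (Expository note: the coupling to CFQ is indirect; as the stub in
 * drivers/block/ckrm-iostub.c above shows, CFQ resolves a task to its
 * class data through function pointers installed by cki_cfq_set(),
 * which is what allows this controller to be built as a module.)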
- * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -/* Changes - * - * 29 July 2004 - * Third complete rewrite for CKRM's current API - * - */ - - -#ifndef _LINUX_CKRM_IO_H -#define _LINUX_CKRM_IO_H - -typedef void *(*icls_tsk_t) (struct task_struct *tsk); -typedef int (*icls_ioprio_t) (struct task_struct *tsk); - -#ifdef CONFIG_CKRM_RES_BLKIO - -extern void *cki_tsk_icls (struct task_struct *tsk); -extern int cki_tsk_ioprio (struct task_struct *tsk); -extern void *cki_tsk_cfqpriv (struct task_struct *tsk); - -#endif /* CONFIG_CKRM_RES_BLKIO */ - -#endif diff --git a/include/linux/ckrm_ce.h b/include/linux/ckrm_ce.h deleted file mode 100644 index ac53ef3c6..000000000 --- a/include/linux/ckrm_ce.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * ckrm_ce.h - Header file to be used by Classification Engine of CKRM - * - * Copyright (C) Hubertus Franke, IBM Corp. 2003 - * (C) Shailabh Nagar, IBM Corp. 2003 - * (C) Chandra Seetharaman, IBM Corp. 2003 - * - * Provides data structures, macros and kernel API of CKRM for - * classification engine. - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - */ - -/* Changes - * - * 12 Nov 2003 - * Created. 
- * 22 Apr 2004 - * Adopted to classtypes - */ - -#ifndef _LINUX_CKRM_CE_H -#define _LINUX_CKRM_CE_H - -#ifdef CONFIG_CKRM - -#include - -/* - * Action parameters identifying the cause of a task<->class notify callback - * these can perculate up to user daemon consuming records send by the - * classification engine - */ - -#ifdef __KERNEL__ - -typedef void *(*ce_classify_fct_t) (enum ckrm_event event, void *obj, ...); -typedef void (*ce_notify_fct_t) (enum ckrm_event event, void *classobj, - void *obj); - -typedef struct ckrm_eng_callback { - /* general state information */ - int always_callback; /* set if CE should always be called back - regardless of numclasses */ - - /* callbacks which are called without holding locks */ - - unsigned long c_interest; /* set of classification events of - * interest to CE - */ - - /* generic classify */ - ce_classify_fct_t classify; - - /* class added */ - void (*class_add) (const char *name, void *core, int classtype); - - /* class deleted */ - void (*class_delete) (const char *name, void *core, int classtype); - - /* callbacks which are called while holding task_lock(tsk) */ - unsigned long n_interest; /* set of notification events of - * interest to CE - */ - /* notify on class switch */ - ce_notify_fct_t notify; -} ckrm_eng_callback_t; - -struct inode; -struct dentry; - -typedef struct rbce_eng_callback { - int (*mkdir) (struct inode *, struct dentry *, int); /* mkdir */ - int (*rmdir) (struct inode *, struct dentry *); /* rmdir */ - int (*mnt) (void); - int (*umnt) (void); -} rbce_eng_callback_t; - -extern int ckrm_register_engine(const char *name, ckrm_eng_callback_t *); -extern int ckrm_unregister_engine(const char *name); - -extern void *ckrm_classobj(const char *, int *classtype); -extern int get_exe_path_name(struct task_struct *t, char *filename, - int max_size); - -extern int rcfs_register_engine(rbce_eng_callback_t *); -extern int rcfs_unregister_engine(rbce_eng_callback_t *); - -extern int ckrm_reclassify(int pid); - -#ifndef _LINUX_CKRM_RC_H - -extern void ckrm_core_grab(void *); -extern void ckrm_core_drop(void *); -#endif - -#endif /* CONFIG_CKRM */ -#endif /* __KERNEL__ */ -#endif /* _LINUX_CKRM_CE_H */ diff --git a/include/linux/ckrm_classqueue.h b/include/linux/ckrm_classqueue.h deleted file mode 100644 index 1453f5e1b..000000000 --- a/include/linux/ckrm_classqueue.h +++ /dev/null @@ -1,130 +0,0 @@ -/* include/linux/ckrm_classqueue.h : cpu control for CKRM - * - * Copyright (C) Haoqiang Zheng, IBM Corp. 2003 - * (C) Hubertus Franke, IBM Corp. 2003 - * - * Circular queue functionality for CKRM cpu controller - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -/* Changes - * - * Aug 28, 2003 - * Created. 
- * July 07, 2004
- *   clean up, add comments
- *
- */
-
-#ifndef _CKRM_CLASSQUEUE_H
-#define _CKRM_CLASSQUEUE_H
-
-#include 
-
-#define CLASSQUEUE_SIZE 1024	// acb: changed from 128
-//#define CLASSQUEUE_SIZE 128
-#define CQ_BITMAP_SIZE ((((CLASSQUEUE_SIZE+1+7)/8)+sizeof(long)-1)/sizeof(long))
-
-/**
- * struct cq_prio_array: duplicates prio_array defined in sched.c
- *
- * This data structure is duplicated to keep the ckrm_classqueue
- * implementation modular.
- */
-struct cq_prio_array {
-	int nr_active;
-	unsigned long bitmap[CQ_BITMAP_SIZE];
-	struct list_head queue[CLASSQUEUE_SIZE];
-};
-
-/**
- * struct classqueue_struct - a runqueue of class local runqueues
- * @array: priority array
- * @base: base priority
- * @base_offset: index in array for the base
- *
- * A classqueue can be thought of as a runqueue of classes (instead of
- * a runqueue of tasks).  As with the task runqueue, each processor has
- * a classqueue.  A class enters the classqueue when the first task in
- * its local runqueue shows up, and leaves it when the last task in its
- * local runqueue leaves.  Class local runqueues are ordered based on
- * their priority.
- *
- * status:
- *   hzheng: is a 32bit base long enough?
- */
-struct classqueue_struct {
-	struct cq_prio_array array;
-	unsigned long base;
-	unsigned long base_offset;
-};
-
-/**
- * struct cq_node_struct - the link object between class local runqueue and classqueue
- * @list: links the class local runqueue to the classqueue
- * @prio: class priority, calculated from its progress (cvt) and urgency (top_priority)
- * @index: real index into the classqueue array, calculated based on priority
- *
- * NOTE: make sure list is empty when it's not in the classqueue
- */
-struct cq_node_struct {
-	struct list_head list;
-	int prio;
-	int index;
-};
-typedef struct cq_node_struct cq_node_t;
-
-typedef unsigned long long CVT_t;	// cumulative virtual time
-
-static inline void cq_node_init(cq_node_t * node)
-{
-	node->prio = 0;
-	node->index = -1;
-	INIT_LIST_HEAD(&node->list);
-}
-
-/* is the class in the classqueue? */
-static inline int cls_in_classqueue(cq_node_t * node)
-{
-	return !list_empty(&node->list);
-}
-
-/* initialize the data structure */
-int classqueue_init(struct classqueue_struct *cq);
-
-/* add the class to the classqueue */
-void classqueue_enqueue(struct classqueue_struct *cq, cq_node_t * node, int prio);
-
-/**
- * classqueue_dequeue - remove the class from the classqueue
- *
- * internal:
- *   called when the last task is removed from the queue
- *   checked on load balancing and schedule
- *   hzheng: why don't I call it on class_dequeue_task?
- */ -void classqueue_dequeue(struct classqueue_struct *cq, cq_node_t * node); - -/*change the position of the class in classqueue*/ -void classqueue_update_prio(struct classqueue_struct *cq, cq_node_t * node, int new_prio); - -/*return the first class in classqueue*/ -cq_node_t *classqueue_get_head(struct classqueue_struct *cq); - -/*update the base priority of the classqueue*/ -void classqueue_update_base(struct classqueue_struct *cq); - -/** - * class_compare_prio: compare the priority of this two nodes - */ -static inline int class_compare_prio(struct cq_node_struct* node1, struct cq_node_struct* node2) -{ - return ( node1->prio - node2->prio); -} - -#endif diff --git a/include/linux/ckrm_events.h b/include/linux/ckrm_events.h deleted file mode 100644 index 14cfbe33a..000000000 --- a/include/linux/ckrm_events.h +++ /dev/null @@ -1,192 +0,0 @@ -/* - * ckrm_events.h - Class-based Kernel Resource Management (CKRM) - * event handling - * - * Copyright (C) Hubertus Franke, IBM Corp. 2003,2004 - * (C) Shailabh Nagar, IBM Corp. 2003 - * (C) Chandra Seetharaman, IBM Corp. 2003 - * - * - * Provides a base header file including macros and basic data structures. - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - */ - -/* - * Changes - * - * 28 Aug 2003 - * Created. - * 06 Nov 2003 - * Made modifications to suit the new RBCE module. - * 10 Nov 2003 - * Added callbacks_active and surrounding logic. Added task paramter - * for all CE callbacks. - * 19 Nov 2004 - * New Event callback structure - */ - -#ifndef _LINUX_CKRM_EVENTS_H -#define _LINUX_CKRM_EVENTS_H - -#ifdef CONFIG_CKRM - -/* - * Data structure and function to get the list of registered - * resource controllers. - */ - -/* - * CKRM defines a set of events at particular points in the kernel - * at which callbacks registered by various class types are called - */ - -enum ckrm_event { - /* - * we distinguish these events types: - * - * (a) CKRM_LATCHABLE_EVENTS - * events can be latched for event callbacks by classtypes - * - * (b) CKRM_NONLATACHBLE_EVENTS - * events can not be latched but can be used to call classification - * - * (c) event that are used for notification purposes - * range: [ CKRM_EVENT_CANNOT_CLASSIFY .. ) - */ - - /* events (a) */ - - CKRM_LATCHABLE_EVENTS, - - CKRM_EVENT_NEWTASK = CKRM_LATCHABLE_EVENTS, - CKRM_EVENT_FORK, - CKRM_EVENT_EXIT, - CKRM_EVENT_EXEC, - CKRM_EVENT_UID, - CKRM_EVENT_GID, - CKRM_EVENT_XID, - CKRM_EVENT_LOGIN, - CKRM_EVENT_USERADD, - CKRM_EVENT_USERDEL, - CKRM_EVENT_LISTEN_START, - CKRM_EVENT_LISTEN_STOP, - CKRM_EVENT_APPTAG, - - /* events (b) */ - - CKRM_NONLATCHABLE_EVENTS, - - CKRM_EVENT_RECLASSIFY = CKRM_NONLATCHABLE_EVENTS, - - /* events (c) */ - - CKRM_NOTCLASSIFY_EVENTS, - - CKRM_EVENT_MANUAL = CKRM_NOTCLASSIFY_EVENTS, - - CKRM_NUM_EVENTS -}; -#endif - -#ifdef __KERNEL__ -#ifdef CONFIG_CKRM - -/* - * CKRM event callback specification for the classtypes or resource controllers - * typically an array is specified using CKRM_EVENT_SPEC terminated with - * CKRM_EVENT_SPEC_LAST and then that array is registered using - * ckrm_register_event_set. 
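- * A minimal sketch (handler names are invented for illustration):
- *
- *	static void my_fork_cb(void *arg);
- *	static void my_exit_cb(void *arg);
- *
- *	static struct ckrm_event_spec my_especs[] = {
- *		CKRM_EVENT_SPEC(FORK, my_fork_cb),
- *		CKRM_EVENT_SPEC(EXIT, my_exit_cb),
- *		CKRM_EVENT_SPEC_LAST
- *	};
- *	...
- *	rc = ckrm_register_event_set(my_especs);
- *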
- * Individual registration of event_cb is also possible - */ - -typedef void (*ckrm_event_cb) (void *arg); - -struct ckrm_hook_cb { - ckrm_event_cb fct; - struct ckrm_hook_cb *next; -}; - -struct ckrm_event_spec { - enum ckrm_event ev; - struct ckrm_hook_cb cb; -}; - -#define CKRM_EVENT_SPEC(EV,FCT) { CKRM_EVENT_##EV, \ - { (ckrm_event_cb)FCT, NULL } } - -int ckrm_register_event_set(struct ckrm_event_spec especs[]); -int ckrm_unregister_event_set(struct ckrm_event_spec especs[]); -int ckrm_register_event_cb(enum ckrm_event ev, struct ckrm_hook_cb *cb); -int ckrm_unregister_event_cb(enum ckrm_event ev, struct ckrm_hook_cb *cb); - -extern void ckrm_invoke_event_cb_chain(enum ckrm_event ev, void *arg); - -#define CKRM_DEF_CB(EV,fct) \ -static inline void ckrm_cb_##fct(void) \ -{ \ - ckrm_invoke_event_cb_chain(CKRM_EVENT_##EV,NULL); \ -} - -#define CKRM_DEF_CB_ARG(EV,fct,argtp) \ -static inline void ckrm_cb_##fct(argtp arg) \ -{ \ - ckrm_invoke_event_cb_chain(CKRM_EVENT_##EV,(void*)arg); \ -} - -#else /* !CONFIG_CKRM */ - -#define CKRM_DEF_CB(EV,fct) \ -static inline void ckrm_cb_##fct(void) { } - -#define CKRM_DEF_CB_ARG(EV,fct,argtp) \ -static inline void ckrm_cb_##fct(argtp arg) { } - -#endif /* CONFIG_CKRM */ - -/* - * define the CKRM event functions - * EVENT FCT ARG - */ - -/* forward declarations for function arguments */ -struct task_struct; -struct sock; -struct user_struct; - -CKRM_DEF_CB_ARG(FORK, fork, struct task_struct *); -CKRM_DEF_CB_ARG(EXEC, exec, const char *); -CKRM_DEF_CB(UID, uid); -CKRM_DEF_CB(GID, gid); -CKRM_DEF_CB_ARG(XID, xid, struct task_struct *); -CKRM_DEF_CB(APPTAG, apptag); -CKRM_DEF_CB(LOGIN, login); -CKRM_DEF_CB_ARG(USERADD, useradd, struct user_struct *); -CKRM_DEF_CB_ARG(USERDEL, userdel, struct user_struct *); -CKRM_DEF_CB_ARG(LISTEN_START, listen_start, struct sock *); -CKRM_DEF_CB_ARG(LISTEN_STOP, listen_stop, struct sock *); - -/* some other functions required */ -#ifdef CONFIG_CKRM -extern void ckrm_init(void); -extern void ckrm_cb_newtask(struct task_struct *); -extern void ckrm_cb_exit(struct task_struct *); -#else -#define ckrm_init() do { } while (0) -#define ckrm_cb_newtask(x) do { } while (0) -#define ckrm_cb_exit(x) do { } while (0) -#endif - -extern int get_exe_path_name(struct task_struct *, char *, int); - -#endif /* __KERNEL__ */ -#endif /* _LINUX_CKRM_EVENTS_H */ diff --git a/include/linux/ckrm_mem.h b/include/linux/ckrm_mem.h deleted file mode 100644 index 3712aefb9..000000000 --- a/include/linux/ckrm_mem.h +++ /dev/null @@ -1,105 +0,0 @@ -/* include/linux/ckrm_mem.h : memory control for CKRM - * - * Copyright (C) Jiantao Kong, IBM Corp. 2003 - * (C) Shailabh Nagar, IBM Corp. 2003 - * (C) Chandra Seetharaman, IBM Corp. 2004 - * - * - * Memory control functions of the CKRM kernel API - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
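- *
- * Illustration of the event macros defined in ckrm_events.h above:
- * CKRM_DEF_CB_ARG(FORK, fork, struct task_struct *) expands to
- *
- *	static inline void ckrm_cb_fork(struct task_struct *arg)
- *	{
- *		ckrm_invoke_event_cb_chain(CKRM_EVENT_FORK, (void *)arg);
- *	}
- *
- * which is the hook form invoked from the core kernel paths.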
- * - */ - -#ifndef _LINUX_CKRM_MEM_H -#define _LINUX_CKRM_MEM_H - -#ifdef CONFIG_CKRM_RES_MEM - -#include -#include -#include - -struct ckrm_zone { - struct list_head active_list; - struct list_head inactive_list; - - unsigned long nr_active; // # of pages in the active list - unsigned long nr_inactive; // # of pages in the inactive list - unsigned long active_over; - unsigned long inactive_over; - - unsigned long shrink_active; - unsigned long shrink_inactive; - long shrink_weight; - unsigned long shrink_flag; - - struct list_head victim_list; // list of ckrm_zones chosen for shrinking - struct zone *zone; - struct ckrm_mem_res *memcls; -}; - -struct ckrm_mem_res { - unsigned long flags; - struct ckrm_core_class *core; // the core i am part of... - struct ckrm_core_class *parent; // parent of the core i am part of.... - struct ckrm_shares shares; - struct list_head mcls_list; // list of all 1-level classes - struct list_head shrink_list; // list of classes need to be shrunk - struct kref nr_users; // # of references to this class/data structure - atomic_t pg_total; // # of pages used by this class - int pg_guar; // # of pages this class is guaranteed - int pg_limit; // max # of pages this class can get - int pg_borrowed; // # of pages this class borrowed from its parent - int pg_lent; // # of pages this class lent to its children - int pg_unused; // # of pages left to this class (after giving the - // guarantees to children. need to borrow from parent if - // more than this is needed. - int impl_guar; // implicit guarantee for class with don't care guar - int nr_dontcare; // # of children with don't care guarantee - struct ckrm_zone ckrm_zone[MAX_NR_ZONES]; - int shrink_count; - unsigned long last_shrink; - int over_limit_failures; - int shrink_pages; // # of pages to free in this class - int hier; // hiearchy, root = 0 -}; - -extern atomic_t ckrm_mem_real_count; -extern unsigned int ckrm_tot_lru_pages; -extern int ckrm_nr_mem_classes; -extern struct list_head ckrm_shrink_list; -extern struct list_head ckrm_memclass_list; -extern spinlock_t ckrm_mem_lock; -extern struct ckrm_res_ctlr mem_rcbs; -extern struct ckrm_mem_res *ckrm_mem_root_class; - -#define page_ckrmzone(page) ((page)->ckrm_zone) - -#define CLS_SHRINK_BIT (1) - -// used in flags. set when a class is more than 90% of its maxlimit -#define MEM_AT_LIMIT 1 - -extern void ckrm_init_mm_to_task(struct mm_struct *, struct task_struct *); -extern void ckrm_mem_evaluate_mm(struct mm_struct *, struct ckrm_mem_res *); -extern void ckrm_at_limit(struct ckrm_mem_res *); -extern int ckrm_memclass_valid(struct ckrm_mem_res *); -extern int ckrm_mem_get_shrink_to(void); -extern void check_memclass(struct ckrm_mem_res *, char *); -extern void memclass_release(struct kref *); - -#else - -#define ckrm_init_mm_to_current(a) do {} while (0) -#define ckrm_mem_evaluate_mm(a) do {} while (0) -#define ckrm_init_mm_to_task(a,b) do {} while (0) - -#endif // CONFIG_CKRM_RES_MEM - -#endif //_LINUX_CKRM_MEM_H diff --git a/include/linux/ckrm_mem_inline.h b/include/linux/ckrm_mem_inline.h deleted file mode 100644 index 1166956b7..000000000 --- a/include/linux/ckrm_mem_inline.h +++ /dev/null @@ -1,403 +0,0 @@ -/* include/linux/ckrm_mem_inline.h : memory control for CKRM - * - * Copyright (C) Jiantao Kong, IBM Corp. 2003 - * (C) Shailabh Nagar, IBM Corp. 2003 - * (C) Chandra Seetharaman, IBM Corp. 
2004 - * - * - * Memory control functions of the CKRM kernel API - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -#ifndef _LINUX_CKRM_MEM_INLINE_H_ -#define _LINUX_CKRM_MEM_INLINE_H_ - -#include -#include -#include - - -#ifdef CONFIG_CKRM_RES_MEM - -#define INACTIVE 0 -#define ACTIVE 1 - -static inline struct ckrm_mem_res * -ckrm_get_mem_class(struct task_struct *tsk) -{ - return ckrm_get_res_class(tsk->taskclass, mem_rcbs.resid, - struct ckrm_mem_res); -} - -#define ckrm_shrink_list_empty() list_empty(&ckrm_shrink_list) - -static inline void -ckrm_set_shrink(struct ckrm_zone *cz) -{ - set_bit(CLS_SHRINK_BIT, &cz->shrink_flag); -} - -static inline int -ckrm_test_set_shrink(struct ckrm_zone *cz) -{ - return test_and_set_bit(CLS_SHRINK_BIT, &cz->shrink_flag); -} - -static inline void -ckrm_clear_shrink(struct ckrm_zone *cz) -{ - clear_bit(CLS_SHRINK_BIT, &cz->shrink_flag); -} - -/* - * Currently, a shared page that is shared by multiple classes is charged - * to a class with max available guarantee. Simply replace this function - * for other policies. - */ -static inline int -ckrm_mem_share_compare(struct ckrm_mem_res *a, struct ckrm_mem_res *b) -{ - if (a == NULL) - return -(b != NULL); - if (b == NULL) - return 0; - if (a->pg_guar == b->pg_guar) - return 0; - if (a->pg_guar == CKRM_SHARE_DONTCARE) - return 1; - if (b->pg_guar == CKRM_SHARE_DONTCARE) - return -1; - return (a->pg_unused - b->pg_unused); -} - -static inline void -incr_use_count(struct ckrm_mem_res *cls, int borrow) -{ - extern int ckrm_mem_shrink_at; - if (unlikely(!cls)) - return; - BUG_ON(!ckrm_memclass_valid(cls)); - atomic_inc(&cls->pg_total); - - if (borrow) - cls->pg_lent++; - if ((cls->pg_guar == CKRM_SHARE_DONTCARE) || - (atomic_read(&cls->pg_total) > cls->pg_unused)) { - struct ckrm_mem_res *parcls = ckrm_get_res_class(cls->parent, - mem_rcbs.resid, struct ckrm_mem_res); - if (parcls) { - incr_use_count(parcls, 1); - cls->pg_borrowed++; - } - } else { - atomic_inc(&ckrm_mem_real_count); - } - if (unlikely((cls->pg_limit != CKRM_SHARE_DONTCARE) && - (atomic_read(&cls->pg_total) >= - ((ckrm_mem_shrink_at * cls->pg_limit) / 100)) && - ((cls->flags & MEM_AT_LIMIT) != MEM_AT_LIMIT))) { - ckrm_at_limit(cls); - } - return; -} - -static inline void -decr_use_count(struct ckrm_mem_res *cls, int borrowed) -{ - if (unlikely(!cls)) - return; - BUG_ON(!ckrm_memclass_valid(cls)); - atomic_dec(&cls->pg_total); - if (borrowed) - cls->pg_lent--; - if (cls->pg_borrowed > 0) { - struct ckrm_mem_res *parcls = ckrm_get_res_class(cls->parent, - mem_rcbs.resid, struct ckrm_mem_res); - if (parcls) { - decr_use_count(parcls, 1); - cls->pg_borrowed--; - return; - } - } - atomic_dec(&ckrm_mem_real_count); -} - -static inline void -ckrm_set_page_class(struct page *page, struct ckrm_mem_res *cls) -{ - if (unlikely(cls == NULL)) { - cls = ckrm_mem_root_class; - } - if (likely(cls != NULL)) { - struct ckrm_zone *czone = &cls->ckrm_zone[page_zonenum(page)]; - if (unlikely(page->ckrm_zone)) { - kref_put(&cls->nr_users, memclass_release); - } - page->ckrm_zone = czone; - kref_get(&cls->nr_users); - } else { - page->ckrm_zone = NULL; - } -} - -static inline void -ckrm_set_pages_class(struct page *pages, int numpages, struct ckrm_mem_res *cls) -{ - int i; - for (i = 0; i < 
numpages; pages++, i++) { - ckrm_set_page_class(pages, cls); - } -} - -static inline void -ckrm_clear_page_class(struct page *page) -{ - if (likely(page->ckrm_zone != NULL)) { - if (CkrmAccount(page)) { - decr_use_count(page->ckrm_zone->memcls, 0); - ClearCkrmAccount(page); - } - kref_put(&page->ckrm_zone->memcls->nr_users, memclass_release); - page->ckrm_zone = NULL; - } -} - -static inline void -ckrm_change_page_class(struct page *page, struct ckrm_mem_res *newcls) -{ - struct ckrm_zone *old_czone = page->ckrm_zone, *new_czone; - struct ckrm_mem_res *oldcls; - - if (unlikely(!old_czone || !newcls)) { - BUG_ON(CkrmAccount(page)); - return; - } - BUG_ON(!CkrmAccount(page)); - - oldcls = old_czone->memcls; - if (oldcls == NULL || (oldcls == newcls)) - return; - - kref_put(&oldcls->nr_users, memclass_release); - decr_use_count(oldcls, 0); - - page->ckrm_zone = new_czone = &newcls->ckrm_zone[page_zonenum(page)]; - - kref_get(&newcls->nr_users); - incr_use_count(newcls, 0); - - list_del(&page->lru); - if (PageActive(page)) { - old_czone->nr_active--; - new_czone->nr_active++; - list_add(&page->lru, &new_czone->active_list); - } else { - old_czone->nr_inactive--; - new_czone->nr_inactive++; - list_add(&page->lru, &new_czone->inactive_list); - } -} - -static inline void -ckrm_mem_inc_active(struct page *page) -{ - struct ckrm_mem_res *cls = ckrm_get_mem_class(current) ?: ckrm_mem_root_class; - - if (cls == NULL) - return; - BUG_ON(CkrmAccount(page)); - BUG_ON(page->ckrm_zone != NULL); - - ckrm_set_page_class(page, cls); - incr_use_count(cls, 0); - SetCkrmAccount(page); - BUG_ON(page->ckrm_zone == NULL); - page->ckrm_zone->nr_active++; - list_add(&page->lru, &page->ckrm_zone->active_list); -} - -static inline void -ckrm_mem_dec_active(struct page *page) -{ - if (page->ckrm_zone == NULL) - return; - BUG_ON(page->ckrm_zone->memcls == NULL); - BUG_ON(!CkrmAccount(page)); - - list_del(&page->lru); - page->ckrm_zone->nr_active--; - ckrm_clear_page_class(page); -} - - -static inline void -ckrm_mem_inc_inactive(struct page *page) -{ - struct ckrm_mem_res *cls = ckrm_get_mem_class(current) ?: ckrm_mem_root_class; - - if (cls == NULL) - return; - BUG_ON(CkrmAccount(page)); - BUG_ON(page->ckrm_zone != NULL); - - ckrm_set_page_class(page, cls); - incr_use_count(cls, 0); - SetCkrmAccount(page); - BUG_ON(page->ckrm_zone == NULL); - page->ckrm_zone->nr_inactive++; - list_add(&page->lru, &page->ckrm_zone->inactive_list); -} - -static inline void -ckrm_mem_dec_inactive(struct page *page) -{ - if (page->ckrm_zone == NULL) - return; - BUG_ON(page->ckrm_zone->memcls == NULL); - BUG_ON(!CkrmAccount(page)); - - page->ckrm_zone->nr_inactive--; - list_del(&page->lru); - ckrm_clear_page_class(page); -} - -static inline int -ckrm_class_limit_ok(struct ckrm_mem_res *cls) -{ - int ret; - extern int ckrm_mem_fail_over; - - if ((mem_rcbs.resid == -1) || !cls) { - return 1; - } - if (cls->pg_limit == CKRM_SHARE_DONTCARE) { - struct ckrm_mem_res *parcls = ckrm_get_res_class(cls->parent, - mem_rcbs.resid, struct ckrm_mem_res); - ret = (parcls ? ckrm_class_limit_ok(parcls) : 0); - } else { - ret = (atomic_read(&cls->pg_total) <= - ((ckrm_mem_fail_over * cls->pg_limit) / 100)); - } - - if (ret == 0) { - // if we are failing... 
just nudge the back end
-		ckrm_at_limit(cls);
-	}
-	return ret;
-}
-
-// task/mm initializations/cleanup
-
-static inline void
-ckrm_task_mm_init(struct task_struct *tsk)
-{
-	INIT_LIST_HEAD(&tsk->mm_peers);
-}
-
-static inline void
-ckrm_task_change_mm(struct task_struct *tsk, struct mm_struct *oldmm, struct mm_struct *newmm)
-{
-	if (oldmm) {
-		spin_lock(&oldmm->peertask_lock);
-		list_del(&tsk->mm_peers);
-		ckrm_mem_evaluate_mm(oldmm, NULL);
-		spin_unlock(&oldmm->peertask_lock);
-	}
-	spin_lock(&newmm->peertask_lock);
-	list_add_tail(&tsk->mm_peers, &newmm->tasklist);
-	ckrm_mem_evaluate_mm(newmm, NULL);
-	spin_unlock(&newmm->peertask_lock);
-}
-
-static inline void
-ckrm_task_clear_mm(struct task_struct *tsk, struct mm_struct *mm)
-{
-	spin_lock(&mm->peertask_lock);
-	list_del_init(&tsk->mm_peers);
-	ckrm_mem_evaluate_mm(mm, NULL);
-	spin_unlock(&mm->peertask_lock);
-}
-
-static inline void
-ckrm_mm_init(struct mm_struct *mm)
-{
-	INIT_LIST_HEAD(&mm->tasklist);
-	mm->peertask_lock = SPIN_LOCK_UNLOCKED;
-}
-
-static inline void
-ckrm_mm_setclass(struct mm_struct *mm, struct ckrm_mem_res *cls)
-{
-	mm->memclass = cls;
-	kref_get(&cls->nr_users);
-}
-
-static inline void
-ckrm_mm_clearclass(struct mm_struct *mm)
-{
-	if (mm->memclass) {
-		kref_put(&mm->memclass->nr_users, memclass_release);
-		mm->memclass = NULL;
-	}
-}
-
-static inline void
-ckrm_zone_inc_active(struct ckrm_zone *czone, int cnt)
-{
-	czone->nr_active += cnt;
-}
-
-static inline void
-ckrm_zone_inc_inactive(struct ckrm_zone *czone, int cnt)
-{
-	czone->nr_inactive += cnt;
-}
-
-static inline void
-ckrm_zone_dec_active(struct ckrm_zone *czone, int cnt)
-{
-	czone->nr_active -= cnt;
-}
-
-static inline void
-ckrm_zone_dec_inactive(struct ckrm_zone *czone, int cnt)
-{
-	czone->nr_inactive -= cnt;
-}
-
-#else // !CONFIG_CKRM_RES_MEM
-
-#define ckrm_set_page_class(a,b)	do{}while(0)
-#define ckrm_set_pages_class(a,b,c)	do{}while(0)
-#define ckrm_clear_page_class(a)	do{}while(0)
-#define ckrm_clear_pages_class(a,b)	do{}while(0)
-#define ckrm_change_page_class(a,b)	do{}while(0)
-#define ckrm_change_pages_class(a,b,c)	do{}while(0)
-#define ckrm_mem_inc_active(a)		do{}while(0)
-#define ckrm_mem_dec_active(a)		do{}while(0)
-#define ckrm_mem_inc_inactive(a)	do{}while(0)
-#define ckrm_mem_dec_inactive(a)	do{}while(0)
-#define ckrm_shrink_list_empty()	(1)
-#define ckrm_kick_page(a,b)		(0)
-#define ckrm_class_limit_ok(a)		(1)
-#define ckrm_task_mm_init(a)		do{}while(0)
-#define ckrm_task_clear_mm(a, b)	do{}while(0)
-#define ckrm_task_change_mm(a, b, c)	do{}while(0)
-#define ckrm_mm_init(a)			do{}while(0)
-#define ckrm_mm_setclass(a, b)		do{}while(0)
-#define ckrm_mm_clearclass(a)		do{}while(0)
-#define ckrm_zone_inc_active(a, b)	do{}while(0)
-#define ckrm_zone_inc_inactive(a, b)	do{}while(0)
-#define ckrm_zone_dec_active(a, b)	do{}while(0)
-#define ckrm_zone_dec_inactive(a, b)	do{}while(0)
-
-#endif // CONFIG_CKRM_RES_MEM
-
-#endif // _LINUX_CKRM_MEM_INLINE_H_
diff --git a/include/linux/ckrm_net.h b/include/linux/ckrm_net.h
deleted file mode 100644
index 0cbf784bb..000000000
--- a/include/linux/ckrm_net.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* ckrm_net.h - Header file to be used by Resource controllers of CKRM
- *
- * Copyright (C) Vivek Kashyap , IBM Corp. 2004
- *
- * Provides data structures, macros and kernel API of CKRM for
- * resource controllers.
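- *
- * A worked example of the limit checks in ckrm_mem_inline.h above
- * (the numbers are samples, not defaults from this tree): with
- * pg_limit = 1000, ckrm_mem_shrink_at = 90 and ckrm_mem_fail_over = 110,
- * incr_use_count() triggers ckrm_at_limit() once pg_total reaches
- * (90 * 1000) / 100 = 900 pages, while ckrm_class_limit_ok() still
- * returns 1 until pg_total exceeds (110 * 1000) / 100 = 1100 pages.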
- * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -#ifndef _LINUX_CKRM_NET_H -#define _LINUX_CKRM_NET_H - -struct ckrm_sock_class; - -struct ckrm_net_struct { - int ns_type; // type of net class - struct sock *ns_sk; // pointer to socket - pid_t ns_tgid; // real process id - pid_t ns_pid; // calling thread's pid - int ns_family; // IPPROTO_IPV4 || IPPROTO_IPV6 - // Currently only IPV4 is supported - union { - __u32 ns_dipv4; // V4 listener's address - } ns_daddr; - __u16 ns_dport; // listener's port - __u16 ns_sport; // sender's port - atomic_t ns_refcnt; - struct ckrm_sock_class *core; - struct list_head ckrm_link; -}; - -#define ns_daddrv4 ns_daddr.ns_dipv4 - -#endif diff --git a/include/linux/ckrm_rc.h b/include/linux/ckrm_rc.h deleted file mode 100644 index 06e2d2aff..000000000 --- a/include/linux/ckrm_rc.h +++ /dev/null @@ -1,355 +0,0 @@ -/* - * ckrm_rc.h - Header file to be used by Resource controllers of CKRM - * - * Copyright (C) Hubertus Franke, IBM Corp. 2003 - * (C) Shailabh Nagar, IBM Corp. 2003 - * (C) Chandra Seetharaman, IBM Corp. 2003 - * (C) Vivek Kashyap , IBM Corp. 2004 - * - * Provides data structures, macros and kernel API of CKRM for - * resource controllers. - * - * More details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -/* - * Changes - * - * 12 Nov 2003 - * Created. 
- */ - -#ifndef _LINUX_CKRM_RC_H -#define _LINUX_CKRM_RC_H - -#ifdef __KERNEL__ - -#ifdef CONFIG_CKRM - -#include -#include -#include -#include - -#define CKRM_MAX_CLASSTYPES 32 /* maximum number of class types */ -#define CKRM_MAX_CLASSTYPE_NAME 32 /* maximum classtype name length */ - -#define CKRM_MAX_RES_CTLRS 8 /* maximum resource controllers per classtype */ -#define CKRM_MAX_RES_NAME 128 /* maximum resource controller name length */ - -struct ckrm_core_class; -struct ckrm_classtype; - -/* - * Share specifications - */ - -typedef struct ckrm_shares { - int my_guarantee; - int my_limit; - int total_guarantee; - int max_limit; - int unused_guarantee; /* not used as parameters */ - int cur_max_limit; /* not used as parameters */ -} ckrm_shares_t; - -#define CKRM_SHARE_UNCHANGED (-1) -#define CKRM_SHARE_DONTCARE (-2) -#define CKRM_SHARE_DFLT_TOTAL_GUARANTEE (100) -#define CKRM_SHARE_DFLT_MAX_LIMIT (100) - -/* - * RESOURCE CONTROLLERS - */ - -/* resource controller callback structure */ - -typedef struct ckrm_res_ctlr { - char res_name[CKRM_MAX_RES_NAME]; - int res_hdepth; /* maximum hierarchy */ - int resid; /* (for now) same as the enum resid */ - struct ckrm_classtype *classtype; /* classtype owning this res ctlr */ - - /* allocate/free new resource class object for resource controller */ - void *(*res_alloc) (struct ckrm_core_class * this, - struct ckrm_core_class * parent); - void (*res_free) (void *); - - /* set/get limits/guarantees for a resource controller class */ - int (*set_share_values) (void *, struct ckrm_shares * shares); - int (*get_share_values) (void *, struct ckrm_shares * shares); - - /* statistics and configuration access */ - int (*get_stats) (void *, struct seq_file *); - int (*reset_stats) (void *); - int (*show_config) (void *, struct seq_file *); - int (*set_config) (void *, const char *cfgstr); - - void (*change_resclass) (void *, void *, void *); -} ckrm_res_ctlr_t; - -/* - * CKRM_CLASSTYPE - * - * A object describes a dimension for CKRM to classify - * along. Need to provide methods to create and manipulate class objects in - * this dimension - */ - -/* list of predefined class types, we always recognize */ -#define CKRM_CLASSTYPE_TASK_CLASS 0 -#define CKRM_CLASSTYPE_SOCKET_CLASS 1 -#define CKRM_RESV_CLASSTYPES 2 /* always +1 of last known type */ - -#define CKRM_MAX_TYPENAME_LEN 32 - -typedef struct ckrm_classtype { - /* TODO: Review for cache alignment */ - - /* resource controllers */ - - spinlock_t res_ctlrs_lock; /* protect res ctlr related data */ - int max_res_ctlrs; /* max number of res ctlrs allowed */ - int max_resid; /* max resid used */ - int resid_reserved; /* max number of reserved controllers */ - long bit_res_ctlrs; /* bitmap of resource ID used */ - atomic_t nr_resusers[CKRM_MAX_RES_CTLRS]; - ckrm_res_ctlr_t *res_ctlrs[CKRM_MAX_RES_CTLRS]; - - /* state about my classes */ - - struct ckrm_core_class *default_class; - struct list_head classes; /* link all classes of this classtype */ - int num_classes; - - /* state about my ce interaction */ - atomic_t ce_regd; /* if CE registered */ - int ce_cb_active; /* if Callbacks active */ - atomic_t ce_nr_users; /* number of active transient calls */ - struct ckrm_eng_callback ce_callbacks; /* callback engine */ - - /* Begin classtype-rcfs private data. No rcfs/fs specific types used. 
*/ - - int mfidx; /* Index into genmfdesc array used to initialize */ - void *mfdesc; /* Array of descriptors of root and magic files */ - int mfcount; /* length of above array */ - void *rootde; /* root dentry created by rcfs */ - /* End rcfs private data */ - - char name[CKRM_MAX_TYPENAME_LEN]; /* currently same as mfdesc[0]->name */ - /* but could be different */ - int typeID; /* unique TypeID */ - int maxdepth; /* maximum depth supported */ - - /* functions to be called on any class type by external API's */ - - struct ckrm_core_class *(*alloc) (struct ckrm_core_class * parent, - const char *name); - int (*free) (struct ckrm_core_class * cls); - int (*show_members) (struct ckrm_core_class *, struct seq_file *); - int (*show_stats) (struct ckrm_core_class *, struct seq_file *); - int (*show_config) (struct ckrm_core_class *, struct seq_file *); - int (*show_shares) (struct ckrm_core_class *, struct seq_file *); - - int (*reset_stats) (struct ckrm_core_class *, const char *resname, - const char *); - int (*set_config) (struct ckrm_core_class *, const char *resname, - const char *cfgstr); - int (*set_shares) (struct ckrm_core_class *, const char *resname, - struct ckrm_shares * shares); - int (*forced_reclassify) (struct ckrm_core_class *, const char *); - - /* functions to be called on a class type by ckrm internals */ - - /* class initialization for new RC */ - void (*add_resctrl) (struct ckrm_core_class *, int resid); -} ckrm_classtype_t; - -/* - * CKRM CORE CLASS - * common part to any class structure (i.e. instance of a classtype) - */ - -/* - * basic definition of a hierarchy that is to be used by the the CORE classes - * and can be used by the resource class objects - */ - -#define CKRM_CORE_MAGIC 0xBADCAFFE - -typedef struct ckrm_hnode { - struct ckrm_core_class *parent; - struct list_head siblings; - struct list_head children; -} ckrm_hnode_t; - -typedef struct ckrm_core_class { - struct ckrm_classtype *classtype; - void *res_class[CKRM_MAX_RES_CTLRS]; /* resource classes */ - spinlock_t class_lock; /* protects list,array above */ - - struct list_head objlist; /* generic object list */ - struct list_head clslist; /* peer classtype classes */ - struct dentry *dentry; /* dentry of inode in the RCFS */ - int magic; - - struct ckrm_hnode hnode; /* hierarchy */ - rwlock_t hnode_rwlock; /* protects hnode above. */ - atomic_t refcnt; - const char *name; - int delayed; /* core deletion delayed */ - /* because of race conditions */ -} ckrm_core_class_t; - -/* type coerce between derived class types and ckrm core class type */ -#define class_type(type,coreptr) container_of(coreptr,type,core) -#define class_core(clsptr) (&(clsptr)->core) -/* locking classes */ -#define class_lock(coreptr) spin_lock(&(coreptr)->class_lock) -#define class_unlock(coreptr) spin_unlock(&(coreptr)->class_lock) -/* what type is a class of ISA */ -#define class_isa(clsptr) (class_core(clsptr)->classtype) - -/* - * OTHER - */ - -#define ckrm_get_res_class(rescls, resid, type) \ - ((type*) (((resid != -1) && ((rescls) != NULL) \ - && ((rescls) != (void *)-1)) ? 
\ - ((struct ckrm_core_class *)(rescls))->res_class[resid] : NULL)) - - -extern int ckrm_register_res_ctlr(struct ckrm_classtype *, ckrm_res_ctlr_t *); -extern int ckrm_unregister_res_ctlr(ckrm_res_ctlr_t *); - -extern int ckrm_validate_and_grab_core(struct ckrm_core_class *core); -extern int ckrm_init_core_class(struct ckrm_classtype *clstype, - struct ckrm_core_class *dcore, - struct ckrm_core_class *parent, - const char *name); -extern int ckrm_release_core_class(struct ckrm_core_class *); - -/* TODO: can disappear after cls del debugging */ - -extern struct ckrm_res_ctlr *ckrm_resctlr_lookup(struct ckrm_classtype *type, - const char *resname); - -extern void ckrm_lock_hier(struct ckrm_core_class *); -extern void ckrm_unlock_hier(struct ckrm_core_class *); -extern struct ckrm_core_class *ckrm_get_next_child(struct ckrm_core_class *, - struct ckrm_core_class *); - -extern void child_guarantee_changed(struct ckrm_shares *, int, int); -extern void child_maxlimit_changed(struct ckrm_shares *, int); -extern int set_shares(struct ckrm_shares *, struct ckrm_shares *, - struct ckrm_shares *); - -/* classtype registration and lookup */ -extern int ckrm_register_classtype(struct ckrm_classtype *clstype); -extern int ckrm_unregister_classtype(struct ckrm_classtype *clstype); -extern struct ckrm_classtype *ckrm_find_classtype_by_name(const char *name); - -/* default functions that can be used in classtypes's function table */ -extern int ckrm_class_show_shares(struct ckrm_core_class *core, - struct seq_file *seq); -extern int ckrm_class_show_stats(struct ckrm_core_class *core, - struct seq_file *seq); -extern int ckrm_class_show_config(struct ckrm_core_class *core, - struct seq_file *seq); -extern int ckrm_class_set_config(struct ckrm_core_class *core, - const char *resname, const char *cfgstr); -extern int ckrm_class_set_shares(struct ckrm_core_class *core, - const char *resname, - struct ckrm_shares *shares); -extern int ckrm_class_reset_stats(struct ckrm_core_class *core, - const char *resname, const char *unused); - -static inline void ckrm_core_grab(struct ckrm_core_class *core) -{ - if (core) - atomic_inc(&core->refcnt); -} - -static inline void ckrm_core_drop(struct ckrm_core_class *core) -{ - /* only make definition available in this context */ - extern void ckrm_free_core_class(struct ckrm_core_class *core); - if (core && (atomic_dec_and_test(&core->refcnt))) - ckrm_free_core_class(core); -} - -static inline unsigned int ckrm_is_core_valid(ckrm_core_class_t * core) -{ - return (core && (core->magic == CKRM_CORE_MAGIC)); -} - -/* - * iterate through all associate resource controllers: - * requires following arguments (ckrm_core_class *cls, - * ckrm_res_ctrl *ctlr, - * void *robj, - * int bmap) - */ - -#define forall_class_resobjs(cls,rcbs,robj,bmap) \ - for ( bmap=((cls->classtype)->bit_res_ctlrs) ; \ - ({ int rid; ((rid=ffs(bmap)-1) >= 0) && \ - (bmap &= ~(1<classtype->res_ctlrs[rid]) \ - && (robj=cls->res_class[rid]))); }); \ - ) - -extern struct ckrm_classtype *ckrm_classtypes[]; - -/* - * CE Invocation interface - */ - -#define ce_protect(ctype) (atomic_inc(&((ctype)->ce_nr_users))) -#define ce_release(ctype) (atomic_dec(&((ctype)->ce_nr_users))) - -/* CE Classification callbacks with */ - -#define CE_CLASSIFY_NORET(ctype, event, objs_to_classify...) 
\ -do { \ - if ((ctype)->ce_cb_active \ - && (test_bit(event,&(ctype)->ce_callbacks.c_interest))) \ - (*(ctype)->ce_callbacks.classify)(event, \ - objs_to_classify); \ -} while (0) - -#define CE_CLASSIFY_RET(ret, ctype, event, objs_to_classify...) \ -do { \ - if ((ctype)->ce_cb_active \ - && (test_bit(event,&(ctype)->ce_callbacks.c_interest))) \ - ret = (*(ctype)->ce_callbacks.classify)(event, \ - objs_to_classify);\ -} while (0) - -#define CE_NOTIFY(ctype, event, cls, objs_to_classify) \ -do { \ - if ((ctype)->ce_cb_active \ - && (test_bit(event,&(ctype)->ce_callbacks.n_interest))) \ - (*(ctype)->ce_callbacks.notify)(event, \ - cls,objs_to_classify); \ -} while (0) - -/* - * RCFS related - */ - -/* vars needed by other modules/core */ - -extern int rcfs_mounted; -extern int rcfs_engine_regd; - -#endif /* CONFIG_CKRM */ -#endif /* __KERNEL__ */ -#endif /* _LINUX_CKRM_RC_H */ diff --git a/include/linux/ckrm_sched.h b/include/linux/ckrm_sched.h deleted file mode 100644 index 088e06c5d..000000000 --- a/include/linux/ckrm_sched.h +++ /dev/null @@ -1,562 +0,0 @@ -/* include/linux/ckrm_sched.h - Supports CKRM scheduling - * - * Copyright (C) Haoqiang Zheng, IBM Corp. 2004 - * Copyright (C) Hubertus Franke, IBM Corp. 2004 - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -#ifndef _CKRM_SCHED_H -#define _CKRM_SCHED_H - -#include -#include -#include - -#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) - -struct prio_array { - unsigned int nr_active; - unsigned long bitmap[BITMAP_SIZE]; - struct list_head queue[MAX_PRIO]; -}; - -#ifdef CONFIG_CKRM_CPU_SCHEDULE -#define rq_active(p,rq) (get_task_lrq(p)->active) -#define rq_expired(p,rq) (get_task_lrq(p)->expired) -int __init init_ckrm_sched_res(void); -#else -#define rq_active(p,rq) (rq->active) -#define rq_expired(p,rq) (rq->expired) -static inline void init_ckrm_sched_res(void) {} -static inline int ckrm_cpu_monitor_init(void) {return 0;} -#endif //CONFIG_CKRM_CPU_SCHEDULE - -#ifdef CONFIG_CKRM_CPU_SCHEDULE -struct ckrm_runqueue { - cq_node_t classqueue_linkobj; /*links in classqueue */ - struct ckrm_cpu_class *cpu_class; // class it belongs to - struct classqueue_struct *classqueue; // classqueue it belongs tow - unsigned long long uncounted_ns; - - prio_array_t *active, *expired, arrays[2]; - /* - set to 0 on init, become null or array switch - set to jiffies whenever an non-interactive job expires - reset to jiffies if expires - */ - unsigned long expired_timestamp; - - /* - * highest priority of tasks in active - * initialized to be MAX_PRIO - * updated on enqueue, dequeue - */ - int top_priority; - CVT_t local_cvt; - - unsigned long lrq_load; - int local_weight; - - - /* - * unused CPU time accumulated while thoe class - * is inactive goes to savings - * - * initialized to be 0 - * a class can't accumulate more than SAVING_THRESHOLD of savings - */ - unsigned long long savings; - - unsigned long magic; //for debugging -}; - -typedef struct ckrm_runqueue ckrm_lrq_t; - -/** - * ckrm_cpu_class_stat - cpu usage statistics maintained for each class - * - */ -struct ckrm_cpu_class_stat { - spinlock_t stat_lock; - - unsigned long long total_ns; /*how much nano-secs it has consumed */ - - struct ckrm_cpu_demand_stat local_stats[NR_CPUS]; - - /* - * - */ - 
unsigned long max_demand;	/* the maximum a class can consume */
-	int egrt,megrt;		/* effective guarantee, my effective guarantee */
-	int ehl,mehl;		/* effective hard limit, my effective hard limit */
-
-	/*
-	 * eshare: for both the default class and its children
-	 * meshare: just for the default class
-	 */
-	int eshare;
-	int meshare;
-};
-
-#define CKRM_CPU_CLASS_MAGIC 0x7af2abe3
-
-#define USAGE_SAMPLE_FREQ  HZ	// sample every second
-#define NS_PER_SAMPLE (USAGE_SAMPLE_FREQ*(NSEC_PER_SEC/HZ))
-#define USAGE_WINDOW_SIZE 60	// keep the last 60 samples
-
-struct ckrm_usage {
-	unsigned long samples[USAGE_WINDOW_SIZE];	// recorded usages
-	unsigned long sample_pointer;	// pointer for the sliding window
-	unsigned long long last_ns;	// ns for last sample
-	long long last_sample_jiffies;	// in number of jiffies
-};
-
-/*
- * Manages the class status.
- * There should be only one instance of this object for each class
- * in the whole system.
- */
-struct ckrm_cpu_class {
-	struct ckrm_core_class *core;
-	struct ckrm_core_class *parent;
-	struct ckrm_shares shares;
-	spinlock_t cnt_lock;	// always grab the parent's lock first and then the child's
-	struct ckrm_cpu_class_stat stat;
-	struct list_head links;	// for linking up in cpu classes
-	ckrm_lrq_t local_queues[NR_CPUS];	// runqueues
-	struct ckrm_usage usage;
-	unsigned long magic;	// for debugging
-};
-
-#define cpu_class_weight(cls) (cls->stat.meshare)
-#define local_class_weight(lrq) (lrq->local_weight)
-
-static inline int valid_cpu_class(struct ckrm_cpu_class * cls)
-{
-	return (cls && cls->magic == CKRM_CPU_CLASS_MAGIC);
-}
-
-struct classqueue_struct *get_cpu_classqueue(int cpu);
-struct ckrm_cpu_class * get_default_cpu_class(void);
-
-
-static inline void ckrm_usage_init(struct ckrm_usage* usage)
-{
-	int i;
-
-	for (i=0; i < USAGE_WINDOW_SIZE; i++)
-		usage->samples[i] = 0;
-	usage->sample_pointer = 0;
-	usage->last_ns = 0;
-	usage->last_sample_jiffies = 0;
-}
-
-/*
- * This function can be called at any frequency; it is self-contained.
- */
-static inline void ckrm_sample_usage(struct ckrm_cpu_class* clsptr)
-{
-	struct ckrm_usage* usage = &clsptr->usage;
-	unsigned long long cur_sample;
-	int duration = jiffies - usage->last_sample_jiffies;
-
-	// jiffies doesn't start from 0,
-	// so it needs to be handled properly
-	if (unlikely(!usage->last_sample_jiffies))
-		usage->last_sample_jiffies = jiffies;
-
-	// called too frequently
-	if (duration < USAGE_SAMPLE_FREQ)
-		return;
-
-	usage->last_sample_jiffies = jiffies;
-
-	cur_sample = clsptr->stat.total_ns - usage->last_ns;
-	usage->last_ns = clsptr->stat.total_ns;
-
-	// scale it based on the sample duration
-	cur_sample *= ((USAGE_SAMPLE_FREQ << 15)/duration);
-	cur_sample >>= 15;
-	usage->samples[usage->sample_pointer] = cur_sample;
-	// printk("sample = %llu jiffies=%lu \n",cur_sample, jiffies);
-
-	usage->sample_pointer++;
-	if (usage->sample_pointer >= USAGE_WINDOW_SIZE)
-		usage->sample_pointer = 0;
-}
-
-// duration is specified in number of jiffies
-// returns the usage in percent
-static inline int get_ckrm_usage(struct ckrm_cpu_class* clsptr, int duration)
-{
-	int nr_samples = duration/USAGE_SAMPLE_FREQ?:1;
-	struct ckrm_usage* usage = &clsptr->usage;
-	unsigned long long total = 0;
-	int i, idx;
-
-	if (nr_samples > USAGE_WINDOW_SIZE)
-		nr_samples = USAGE_WINDOW_SIZE;
-
-	idx = usage->sample_pointer;
-	for (i = 0; i < nr_samples; i++) {
-		if (!idx)
-			idx = USAGE_WINDOW_SIZE;
-		idx--;
-		total += usage->samples[idx];
-	}
-	total *= 100;
-	do_div(total,nr_samples);
-	do_div(total,NS_PER_SAMPLE);
-	do_div(total,cpus_weight(cpu_online_map));
-	return total;
-}
-
-
-#define lrq_nr_running(lrq) \
-	(lrq->active->nr_active + lrq->expired->nr_active)
-
-static inline ckrm_lrq_t *
-get_ckrm_lrq(struct ckrm_cpu_class*cls, int cpu)
-{
-	return &(cls->local_queues[cpu]);
-}
-
-static inline ckrm_lrq_t *get_task_lrq(struct task_struct *p)
-{
-	return &(p->cpu_class->local_queues[task_cpu(p)]);
-}
-
-#define task_list_entry(list)  list_entry(list,struct task_struct,run_list)
-#define class_list_entry(list) list_entry(list,struct ckrm_runqueue,classqueue_linkobj)
-
-/* some additional interfaces exported from sched.c */
-struct runqueue;
-extern rwlock_t class_list_lock;
-extern struct list_head active_cpu_classes;
-unsigned int task_timeslice(task_t *p);
-void _ckrm_cpu_change_class(task_t *task, struct ckrm_cpu_class *newcls);
-
-void init_cpu_classes(void);
-void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares);
-void ckrm_cpu_change_class(void *task, void *old, void *new);
-
-#define CPU_DEMAND_ENQUEUE 0
-#define CPU_DEMAND_DEQUEUE 1
-#define CPU_DEMAND_DESCHEDULE 2
-#define CPU_DEMAND_INIT 3
-
-/* functions exported by ckrm_cpu_monitor.c */
-void ckrm_cpu_monitor(int check_min);
-int ckrm_cpu_monitor_init(void);
-void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat);
-void cpu_demand_event(struct ckrm_cpu_demand_stat* local_stat, int event, unsigned long long len);
-void adjust_local_weight(void);
-
-#define get_task_lrq_stat(p) (&(p)->cpu_class->stat.local_stats[task_cpu(p)])
-#define get_cls_local_stat(cls,cpu) (&(cls)->stat.local_stats[cpu])
-#define get_rq_local_stat(lrq,cpu) (get_cls_local_stat((lrq)->cpu_class,cpu))
-
-/********************************************************************
- * Parameters that determine how quickly CVTs progress and how much
- * priority can impact an LRQ's runqueue position.  See also
- * get_effective_prio().  These parameters need to be adjusted
- * in accordance with the following example.
- *
- * CLASS_QUANTIZER:
- *
- * A class with a 50% share can execute 500 ms per second, i.e. ~2^29 ns.
- * Its share will be set to 512 = 2^9.  The global CLASSQUEUE_SIZE is
- * set to 2^7.  With CLASS_QUANTIZER = 16, the local_cvt of this class
- * increases by 2^29/2^9 = 2^20 = 1024K per second, which maps to
- * 2^(20-16) = 16 slots per second.  The same math shows that a class
- * with any share value covers 16 slots per second, so 2^8 total slots
- * give a good track of 8 seconds of system execution.
- *
- * PRIORITY_QUANTIZER:
- *
- * Determines how much the top priority of a class can impact the slot
- * bonus.  There are 40 nice priorities, ranging from -20 to 19, with a
- * default nice of 0.  A value of 2 allows up to 5 slots of improvement
- * when a task within the class has a nice value of -20 in the RQ; thus
- * a 50% class can see ~300 msec of starvation.
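- *
- * For instance (sample values, purely illustrative): a local runqueue
- * with local_cvt = 2^21 and top_priority = 100 is placed by
- * get_effective_prio() below at (2^21 >> 16) + (100 >> 2) = 32 + 25
- * = 57, when the urgency term is compiled in.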
- * - *******************************************************************/ - -#define CLASS_QUANTIZER 16 //shift from ns to increase class bonus -#define PRIORITY_QUANTIZER 2 //controls how much a high prio task can borrow - -#define CKRM_SHARE_ACCURACY 13 -#define NSEC_PER_MS 1000000 -#define NSEC_PER_JIFFIES (NSEC_PER_SEC/HZ) - - -#define MAX_SAVINGS_ABSOLUTE (10LLU*NSEC_PER_SEC) // 10 seconds - -#define CVT_UPDATE_TICK ((HZ/2)?:1) - -// ABSOLUTE_CKRM_TUNING determines whether classes can make up -// lost time in absolute time or in relative values - -#define ABSOLUTE_CKRM_TUNING // preferred due to more predictable behavior - -#ifdef ABSOLUTE_CKRM_TUNING - -#define MAX_SAVINGS MAX_SAVINGS_ABSOLUTE -//an absolute bonus of 200ms for classes when reactivated -#define INTERACTIVE_BONUS(lrq) ((200*NSEC_PER_MS)/local_class_weight(lrq)) -#define SAVINGS_LEAK_SPEED (CVT_UPDATE_TICK/10*NSEC_PER_JIFFIES) - -#define scale_cvt(val,lrq) ((val)*local_class_weight(lrq)) -#define unscale_cvt(val,lrq) (do_div(val,local_class_weight(lrq))) - -#else - -#define MAX_SAVINGS (MAX_SAVINGS_ABSOLUTE >> CKRM_SHARE_ACCURACY) -/* - * to improve system responsiveness - * an inactive class is put a little bit ahead of the current class when it wakes up - * the amount is set in normalized term to simplify the calculation - * for class with 100% share, it can be 2s ahead - * while for class with 10% share, it can be 200ms ahead - */ -#define INTERACTIVE_BONUS(lrq) (2*NSEC_PER_MS) - -/* - * normalized savings can't be more than MAX_NORMALIZED_SAVINGS - * based on the current configuration - * this means that a class with share 100% will accumulate 10s at most - * while a class with 1% of the share can only accumulate 100ms - */ - -//a class with share 100% can get 100ms every 500ms -//while a class with share 10% can only get 10ms every 500ms -#define SAVINGS_LEAK_SPEED ((CVT_UPDATE_TICK/5*NSEC_PER_JIFFIES) >> CKRM_SHARE_ACCURACY) - -#define scale_cvt(val,lrq) (val) -#define unscale_cvt(val,lrq) (val) - -#endif - - -/** - * get_effective_prio: return the effective priority of a class local queue - * - * class priority = progress * a + urgency * b - * progress = queue cvt - * urgency = queue top priority - * a and b are scaling factors - * currently, prio increases by 1 if either: top_priority increase by one - * or, local_cvt increases by 4ms - */ -static inline int get_effective_prio(ckrm_lrq_t * lrq) -{ - int prio; - - prio = lrq->local_cvt >> CLASS_QUANTIZER; // cumulative usage -#ifndef URGENCY_SUPPORT -#warning "ACB removing urgency calculation from get_effective_prio" -#else - prio += lrq->top_priority >> PRIORITY_QUANTIZER; // queue urgency -#endif - - return prio; -} - -CVT_t get_local_cur_cvt(int cpu); - -/** - * update_class_priority: - * - * called whenever cvt or top_priority changes - * - * internal: (calling structure) - * update_class_priority - * -- set_top_priority - * -- class_enqueue_task - * -- class_dequeue_task - * -- rq_get_next_task (queue switch) - * -- update_local_cvt - * -- schedule - */ -static inline void update_class_priority(ckrm_lrq_t *local_rq) -{ - int effective_prio = get_effective_prio(local_rq); - classqueue_update_prio(local_rq->classqueue, - &local_rq->classqueue_linkobj, - effective_prio); -} - -/* - * set the new top priority and reposition the queue - * called when: task enqueue/dequeue and queue switch - */ -static inline void set_top_priority(ckrm_lrq_t *lrq, - int new_priority) -{ - lrq->top_priority = new_priority; - update_class_priority(lrq); -} - -/* - * task_load: how 
much load this task counts - */ -static inline unsigned long task_load(struct task_struct* p) -{ - return (task_timeslice(p) * p->demand_stat.cpu_demand); -} - -/* - * runqueue load is the local_weight of all the classes on this cpu - * must be called with class_list_lock held - */ -static inline unsigned long ckrm_cpu_load(int cpu) -{ - struct ckrm_cpu_class *clsptr; - ckrm_lrq_t* lrq; - struct ckrm_cpu_demand_stat* l_stat; - int total_load = 0; - int load; - - list_for_each_entry(clsptr,&active_cpu_classes,links) { - lrq = get_ckrm_lrq(clsptr,cpu); - l_stat = get_cls_local_stat(clsptr,cpu); - load = lrq->local_weight; - if (l_stat->cpu_demand < load) - load = l_stat->cpu_demand; - total_load += load; - } - return total_load; -} - -static inline void class_enqueue_task(struct task_struct *p, - prio_array_t * array) -{ - ckrm_lrq_t *lrq; - int effective_prio; - - lrq = get_task_lrq(p); - - cpu_demand_event(&p->demand_stat,CPU_DEMAND_ENQUEUE,0); - lrq->lrq_load += task_load(p); - - if ((p->prio < lrq->top_priority) && (array == lrq->active)) - set_top_priority(lrq, p->prio); - - if (! cls_in_classqueue(&lrq->classqueue_linkobj)) { - cpu_demand_event(get_task_lrq_stat(p),CPU_DEMAND_ENQUEUE,0); - effective_prio = get_effective_prio(lrq); - classqueue_enqueue(lrq->classqueue, &lrq->classqueue_linkobj, effective_prio); - } - -} - -static inline void class_dequeue_task(struct task_struct *p, - prio_array_t * array) -{ - ckrm_lrq_t *lrq = get_task_lrq(p); - unsigned long load = task_load(p); - - BUG_ON(lrq->lrq_load < load); - lrq->lrq_load -= load; - - cpu_demand_event(&p->demand_stat,CPU_DEMAND_DEQUEUE,0); - - if ((array == lrq->active) && (p->prio == lrq->top_priority) - && list_empty(&(array->queue[p->prio]))) - set_top_priority(lrq, - find_next_bit(array->bitmap, MAX_PRIO, - p->prio)); -} - -/* - * called after a task is switched out. 
Update the local cvt accounting - * we need to stick with long instead of long long due to nonexistent 64-bit division - */ -static inline void update_local_cvt(struct task_struct *p, unsigned long nsec) -{ - ckrm_lrq_t * lrq = get_task_lrq(p); - - unsigned long cvt_inc = nsec / local_class_weight(lrq); - - lrq->local_cvt += cvt_inc; - lrq->uncounted_ns += nsec; - - update_class_priority(lrq); -} - -static inline int class_preempts_curr(struct task_struct * p, struct task_struct* curr) -{ - struct cq_node_struct* node1 = &(get_task_lrq(p)->classqueue_linkobj); - struct cq_node_struct* node2 = &(get_task_lrq(curr)->classqueue_linkobj); - - return (class_compare_prio(node1,node2) < 0); -} - -/* - * return a random value with range [0, (val-1)] - */ -static inline int get_ckrm_rand(unsigned long val) -{ - int rand; - static int last_rand[NR_CPUS]; - int cpu = smp_processor_id(); - - rand = last_rand[cpu]; - rand ++; - if (rand >= val) - rand = 0; - - last_rand[cpu] = rand; - return rand; -} - -void update_class_cputime(int this_cpu); - -/**********************************************/ -/* PID_LOAD_BALANCING */ -/**********************************************/ -struct ckrm_load_struct { - unsigned long load_p; /*propotional*/ - unsigned long load_i; /*integral */ - long load_d; /*derivative */ -}; - -typedef struct ckrm_load_struct ckrm_load_t; - -static inline void ckrm_load_init(ckrm_load_t* ckrm_load) { - ckrm_load->load_p = 0; - ckrm_load->load_i = 0; - ckrm_load->load_d = 0; -} - -void ckrm_load_sample(ckrm_load_t* ckrm_load,int cpu); -long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group); -#define rq_ckrm_load(rq) (&((rq)->ckrm_load)) - -static inline void ckrm_sched_tick(unsigned long j,int this_cpu,struct ckrm_load_struct* ckrm_load) -{ - read_lock(&class_list_lock); - -#ifdef CONFIG_SMP - ckrm_load_sample(ckrm_load,this_cpu); -#endif - - if (! (j % CVT_UPDATE_TICK)) { - // printk("ckrm_sched j=%lu\n",j); - classqueue_update_base(get_cpu_classqueue(this_cpu)); - update_class_cputime(this_cpu); - } - - read_unlock(&class_list_lock); -} - -#endif //CONFIG_CKRM_CPU_SCHEDULE - -#endif diff --git a/include/linux/ckrm_tc.h b/include/linux/ckrm_tc.h deleted file mode 100644 index 5949af190..000000000 --- a/include/linux/ckrm_tc.h +++ /dev/null @@ -1,50 +0,0 @@ -/* ckrm_tc.h - Header file to be used by task class users - * - * Copyright (C) Hubertus Franke, IBM Corp. 2003, 2004 - * - * Provides data structures, macros and kernel API for the - * classtype, taskclass. - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - */ - -/* Changes - * - * 12 Apr 2004 - * Created. 
- */ - -#ifndef _LINUX_CKRM_TC_H_ -#define _LINUX_CKRM_TC_H_ - -#ifdef CONFIG_CKRM_TYPE_TASKCLASS -#include - -#define TASK_CLASS_TYPE_NAME "taskclass" - -typedef struct ckrm_task_class { - struct ckrm_core_class core; -} ckrm_task_class_t; - -// Index into genmfdesc array, defined in rcfs/dir_modules.c, -// which has the mfdesc entry that taskclass wants to use -#define TC_MF_IDX 0 - -extern int ckrm_forced_reclassify_pid(int, struct ckrm_task_class *); - -#else // CONFIG_CKRM_TYPE_TASKCLASS - -#define ckrm_forced_reclassify_pid(a, b) (0) - -#endif - -#endif // _LINUX_CKRM_TC_H_ diff --git a/include/linux/ckrm_tsk.h b/include/linux/ckrm_tsk.h deleted file mode 100644 index f61453901..000000000 --- a/include/linux/ckrm_tsk.h +++ /dev/null @@ -1,35 +0,0 @@ -/* ckrm_tsk.h - No. of tasks resource controller for CKRM - * - * Copyright (C) Chandra Seetharaman, IBM Corp. 2003 - * - * Provides No. of tasks resource controller for CKRM - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -#ifndef _LINUX_CKRM_TSK_H -#define _LINUX_CKRM_TSK_H - -#ifdef CONFIG_CKRM_TYPE_TASKCLASS -#include - -typedef int (*get_ref_t) (struct ckrm_core_class *, int); -typedef void (*put_ref_t) (struct ckrm_core_class *); - -extern int numtasks_get_ref(struct ckrm_core_class *, int); -extern void numtasks_put_ref(struct ckrm_core_class *); -extern void ckrm_numtasks_register(get_ref_t, put_ref_t); - -#else /* CONFIG_CKRM_TYPE_TASKCLASS */ - -#define numtasks_get_ref(core_class, ref) (1) -#define numtasks_put_ref(core_class) do {} while (0) - -#endif /* CONFIG_CKRM_TYPE_TASKCLASS */ -#endif /* _LINUX_CKRM_TSK_H */ diff --git a/include/linux/crbce.h b/include/linux/crbce.h deleted file mode 100644 index 6a2190dd8..000000000 --- a/include/linux/crbce.h +++ /dev/null @@ -1,175 +0,0 @@ -/* - * crbce.h - * - * Copyright (C) Hubertus Franke, IBM Corp. 2003 - * - * This file contains the type definition of the record - * created by the CRBCE CKRM classification engine - * - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- * - * - */ - - -/* - * Changes - * - * 2003-11-11 Created by H.Franke - * 2003-12-01 Sanitized for Delivery by H.Franke - * - */ - -#ifndef CRBCE_RECORDS_H -#define CRBCE_RECORDS_H - -#ifdef __KERNEL__ -#include -#else -#define CONFIG_CKRM -#define CONFIG_CRBCE -#define CONFIG_DELAY_ACCT -#endif - -#include -#include -#include - -#define CRBCE_UKCC_NAME "crbce_ukcc" -#define CRBCE_UKCC_PATH "/mnt/relayfs" - -#define CRBCE_UKCC_PATH_NAME CRBCE_UKCC_PATH"/"CRBCE_UKCC_NAME - -#define CRBCE_MAX_CLASS_NAME_LEN 256 - -/**************************************************************** - * - * CRBCE EVENT SET is an extension to the standard CKRM_EVENTS - * - ****************************************************************/ -enum { - - /* we use the standard CKRM_EVENT_<..> - * to identify reclassification cause actions - * and extend by additional ones we need - */ - - /* up event flow */ - - CRBCE_REC_EXIT = CKRM_NUM_EVENTS, - CRBCE_REC_DATA_DELIMITER, - CRBCE_REC_SAMPLE, - CRBCE_REC_TASKINFO, - CRBCE_REC_SYS_INFO, - CRBCE_REC_CLASS_INFO, - CRBCE_REC_KERNEL_CMD_DONE, - CRBCE_REC_UKCC_FULL, - - /* down command issuance */ - CRBCE_REC_KERNEL_CMD, - - CRBCE_NUM_EVENTS -}; - -struct task_sample_info { - uint32_t cpu_running; - uint32_t cpu_waiting; - uint32_t io_delayed; - uint32_t memio_delayed; -}; - -/********************************************* - * KERNEL -> USER records * - *********************************************/ - -/* records come either with or without a time stamp */ -struct crbce_hdr { - int type; - pid_t pid; -}; - -struct crbce_hdr_ts { - int type; - pid_t pid; - uint32_t jiffies; - uint64_t cls; -}; - -/* individual records */ - -struct crbce_rec_fork { - struct crbce_hdr_ts hdr; - pid_t ppid; -}; - -struct crbce_rec_data_delim { - struct crbce_hdr_ts hdr; - int is_stop; /* 0 start, 1 stop */ -}; - -struct crbce_rec_task_data { - struct crbce_hdr_ts hdr; - struct task_sample_info sample; - struct task_delay_info delay; -}; - -struct crbce_ukcc_full { - struct crbce_hdr_ts hdr; -}; - -struct crbce_class_info { - struct crbce_hdr_ts hdr; - int action; - int namelen; - char name[CRBCE_MAX_CLASS_NAME_LEN]; -}; - -/********************************************* - * USER -> KERNEL records * - *********************************************/ - -enum crbce_kernel_cmd { - CRBCE_CMD_START, - CRBCE_CMD_STOP, - CRBCE_CMD_SET_TIMER, - CRBCE_CMD_SEND_DATA, -}; - -struct crbce_command { - int type; /* we need this for the K->U reflection */ - int cmd; - uint32_t len; /* added in the kernel for reflection */ -}; - -#define set_cmd_hdr(rec,tok) \ -((rec).hdr.type=CRBCE_REC_KERNEL_CMD,(rec).hdr.cmd=(tok)) - -struct crbce_cmd_done { - struct crbce_command hdr; - int rc; -}; - -struct crbce_cmd { - struct crbce_command hdr; -}; - -struct crbce_cmd_send_data { - struct crbce_command hdr; - int delta_mode; -}; - -struct crbce_cmd_settimer { - struct crbce_command hdr; - uint32_t interval; /* in msec ..
0 means stop */ -}; - -#endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 93a6a10d5..9c73197d9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1683,27 +1683,5 @@ static inline char *alloc_secdata(void) static inline void free_secdata(void *secdata) { } #endif /* CONFIG_SECURITY */ - -/* io priorities */ - -#define IOPRIO_NR 21 - -#define IOPRIO_IDLE 0 -#define IOPRIO_NORM 10 -#define IOPRIO_RT 20 - -asmlinkage int sys_ioprio_set(int ioprio); -asmlinkage int sys_ioprio_get(void); - -/* common structure for cfq & ckrm I/O controller */ -typedef struct cfqlim { - int nskip; - unsigned long navsec; - int timedout; - atomic_t sectorate; - u64 sec[2]; -} cfqlim_t ; - - #endif /* __KERNEL__ */ #endif /* _LINUX_FS_H */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 8f535d451..f4143cabe 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -116,7 +116,6 @@ extern struct group_info init_groups; .vx_info = NULL, \ .nid = 0, \ .nx_info = NULL, \ - .ioprio = IOPRIO_NORM, \ } diff --git a/include/linux/mm.h b/include/linux/mm.h index d025bcbc6..bb5b3596b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -13,7 +13,6 @@ #include #include #include -#include struct mempolicy; struct anon_vma; @@ -238,9 +237,6 @@ struct page { void *virtual; /* Kernel virtual address (NULL if not kmapped, ie. highmem) */ #endif /* WANT_PAGE_VIRTUAL */ -#ifdef CONFIG_CKRM_RES_MEM - struct ckrm_zone *ckrm_zone; -#endif // CONFIG_CKRM_RES_MEM }; /* diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 0402eb087..47762ca69 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -1,57 +1,40 @@ -#include static inline void add_page_to_active_list(struct zone *zone, struct page *page) { -#ifndef CONFIG_CKRM_RES_MEM list_add(&page->lru, &zone->active_list); -#endif zone->nr_active++; - ckrm_mem_inc_active(page); } static inline void add_page_to_inactive_list(struct zone *zone, struct page *page) { -#ifndef CONFIG_CKRM_RES_MEM list_add(&page->lru, &zone->inactive_list); -#endif zone->nr_inactive++; - ckrm_mem_inc_inactive(page); } static inline void del_page_from_active_list(struct zone *zone, struct page *page) { -#ifndef CONFIG_CKRM_RES_MEM list_del(&page->lru); -#endif zone->nr_active--; - ckrm_mem_dec_active(page); } static inline void del_page_from_inactive_list(struct zone *zone, struct page *page) { -#ifndef CONFIG_CKRM_RES_MEM list_del(&page->lru); -#endif zone->nr_inactive--; - ckrm_mem_dec_inactive(page); } static inline void del_page_from_lru(struct zone *zone, struct page *page) { -#ifndef CONFIG_CKRM_RES_MEM list_del(&page->lru); -#endif if (PageActive(page)) { ClearPageActive(page); zone->nr_active--; - ckrm_mem_dec_active(page); } else { zone->nr_inactive--; - ckrm_mem_dec_inactive(page); } } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 08dd6a0f7..fb782c81b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -138,10 +138,8 @@ struct zone { /* Fields commonly accessed by the page reclaim scanner */ spinlock_t lru_lock; -#ifndef CONFIG_CKRM_RES_MEM - struct list_head active_list; - struct list_head inactive_list; -#endif + struct list_head active_list; + struct list_head inactive_list; unsigned long nr_scan_active; unsigned long nr_scan_inactive; unsigned long nr_active; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index c99f570b7..04a2555e0 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -75,11 +75,6 @@ #define 
PG_mappedtodisk 17 /* Has blocks allocated on-disk */ #define PG_reclaim 18 /* To be reclaimed asap */ -#ifdef CONFIG_CKRM_RES_MEM -#define PG_ckrm_account 19 /* This page is accounted by CKRM */ -#endif - - /* * Global page accounting. One instance per CPU. Only unsigned longs are * allowed. @@ -302,12 +297,6 @@ extern unsigned long __read_page_state(unsigned offset); #define PageSwapCache(page) 0 #endif -#ifdef CONFIG_CKRM_RES_MEM -#define CkrmAccount(page) test_bit(PG_ckrm_account, &(page)->flags) -#define SetCkrmAccount(page) set_bit(PG_ckrm_account, &(page)->flags) -#define ClearCkrmAccount(page) clear_bit(PG_ckrm_account, &(page)->flags) -#endif - struct page; /* forward declaration */ int test_clear_page_dirty(struct page *page); diff --git a/include/linux/rbce.h b/include/linux/rbce.h deleted file mode 100644 index 91afba9ba..000000000 --- a/include/linux/rbce.h +++ /dev/null @@ -1,127 +0,0 @@ -/* Rule-based Classification Engine (RBCE) module - * - * Copyright (C) Hubertus Franke, IBM Corp. 2003 - * (C) Chandra Seetharaman, IBM Corp. 2003 - * - * Module for loading of classification policies and providing - * a user API for Class-based Kernel Resource Management (CKRM) - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - * - */ - -/* Changes - * - * 25 Mar 2004 - * Integrate RBCE and CRBE into a single module - * - */ - -#ifndef RBCE_H -#define RBCE_H - -// data types defined in main rbcemod.c -struct rbce_private_data; -struct rbce_class; -struct ckrm_core_class; - -#ifndef RBCE_EXTENSION - -/**************************************************************************** - * - * RBCE STANDALONE VERSION, NO CHOICE FOR DATA COLLECTION - * - ****************************************************************************/ - -#ifdef RBCE_SHOW_INCL -#warning " ... RBCE .." -#endif - -#define RBCE_MOD_DESCR "Rule Based Classification Engine Module for CKRM" -#define RBCE_MOD_NAME "rbce" - -/* extension to private data: NONE */ -struct rbce_ext_private_data { - /* empty data */ -}; -static inline void init_ext_private_data(struct rbce_private_data *dst) -{ -} - -/* sending notification to user: NONE */ - -static void notify_class_action(struct rbce_class *cls, int action) -{ -} -static inline void send_fork_notification(struct task_struct *tsk, - struct ckrm_core_class *cls) -{ -} -static inline void send_exit_notification(struct task_struct *tsk) -{ -} -static inline void send_manual_notification(struct task_struct *tsk) -{ -} - -/* extension initialization and destruction at module init and exit */ -static inline int init_rbce_ext_pre(void) -{ - return 0; -} -static inline int init_rbce_ext_post(void) -{ - return 0; -} -static inline void exit_rbce_ext(void) -{ -} - -#else - -/*************************************************************************** - * - * RBCE with User Level Notification - * - ***************************************************************************/ - -#ifdef RBCE_SHOW_INCL -#warning " ... CRBCE .." -#ifdef RBCE_DO_SAMPLE -#warning " ... CRBCE doing sampling ..." -#endif -#ifdef RBCE_DO_DELAY -#warning " ... CRBCE doing delay ..." 
-#endif -#endif - -#define RBCE_MOD_DESCR "Rule Based Classification Engine Module" \ - "with Data Sampling/Delivery for CKRM" -#define RBCE_MOD_NAME "crbce" - -#include - -struct rbce_ext_private_data { - struct task_sample_info sample; -}; - -static void notify_class_action(struct rbce_class *cls, int action); -#if 0 -static void send_fork_notification(struct task_struct *tsk, - struct ckrm_core_class *cls); -static void send_exit_notification(struct task_struct *tsk); -static void send_manual_notification(struct task_struct *tsk); -#endif - -#endif - -#endif // RBCE_H diff --git a/include/linux/rcfs.h b/include/linux/rcfs.h deleted file mode 100644 index e7846e136..000000000 --- a/include/linux/rcfs.h +++ /dev/null @@ -1,96 +0,0 @@ -#ifndef _LINUX_RCFS_H -#define _LINUX_RCFS_H - -#include -#include -#include -#include -#include - -/* - * The following declarations cannot be included in any of ckrm*.h files - * without jumping hoops. Remove later when rearrangements done - */ - -#define RCFS_MAGIC 0x4feedbac -#define RCFS_MAGF_NAMELEN 20 -extern int RCFS_IS_MAGIC; - -#define rcfs_is_magic(dentry) ((dentry)->d_fsdata == &RCFS_IS_MAGIC) - -typedef struct rcfs_inode_info { - ckrm_core_class_t *core; - char *name; - struct inode vfs_inode; -} rcfs_inode_info_t; - -#define RCFS_DEFAULT_DIR_MODE (S_IFDIR | S_IRUGO | S_IXUGO) -#define RCFS_DEFAULT_FILE_MODE (S_IFREG | S_IRUSR | S_IWUSR | S_IRGRP |S_IROTH) - -struct rcfs_magf { - char name[RCFS_MAGF_NAMELEN]; - int mode; - struct inode_operations *i_op; - struct file_operations *i_fop; -}; - -struct rcfs_mfdesc { - struct rcfs_magf *rootmf; /* Root directory and its magic files */ - int rootmflen; /* length of above array */ - /* - * Can have a different magf describing magic files - * for non-root entries too. 
- */ -}; - -extern struct rcfs_mfdesc *genmfdesc[]; - -extern struct rcfs_inode_info *RCFS_I(struct inode *inode); - -int rcfs_empty(struct dentry *); -struct inode *rcfs_get_inode(struct super_block *, int, dev_t); -int rcfs_mknod(struct inode *, struct dentry *, int, dev_t); -int _rcfs_mknod(struct inode *, struct dentry *, int, dev_t); -int rcfs_mkdir(struct inode *, struct dentry *, int); -ckrm_core_class_t *rcfs_make_core(struct dentry *, struct ckrm_core_class *); -struct dentry *rcfs_set_magf_byname(char *, void *); - -struct dentry *rcfs_create_internal(struct dentry *, struct rcfs_magf *, int); -int rcfs_delete_internal(struct dentry *); -int rcfs_create_magic(struct dentry *, struct rcfs_magf *, int); -int rcfs_clear_magic(struct dentry *); - -extern struct super_operations rcfs_super_ops; -extern struct address_space_operations rcfs_aops; - -extern struct inode_operations rcfs_dir_inode_operations; -extern struct inode_operations rcfs_rootdir_inode_operations; -extern struct inode_operations rcfs_file_inode_operations; - -extern struct file_operations target_fileops; -extern struct file_operations shares_fileops; -extern struct file_operations stats_fileops; -extern struct file_operations config_fileops; -extern struct file_operations members_fileops; -extern struct file_operations reclassify_fileops; -extern struct file_operations rcfs_file_operations; - -/* Callbacks into rcfs from ckrm */ - -typedef struct rcfs_functions { - int (*mkroot) (struct rcfs_magf *, int, struct dentry **); - int (*rmroot) (struct dentry *); - int (*register_classtype) (ckrm_classtype_t *); - int (*deregister_classtype) (ckrm_classtype_t *); -} rcfs_fn_t; - -int rcfs_register_classtype(ckrm_classtype_t *); -int rcfs_deregister_classtype(ckrm_classtype_t *); -int rcfs_mkroot(struct rcfs_magf *, int, struct dentry **); -int rcfs_rmroot(struct dentry *); - -#define RCFS_ROOT "/rcfs" /* TODO: Should use the mount point */ -extern struct dentry *rcfs_rootde; -extern rbce_eng_callback_t rcfs_eng_callbacks; - -#endif /* _LINUX_RCFS_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 9cb07d16b..04ac189e0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -95,7 +95,6 @@ extern unsigned long avenrun[]; /* Load averages */ extern int nr_threads; extern int last_pid; DECLARE_PER_CPU(unsigned long, process_counts); -// DECLARE_PER_CPU(struct runqueue, runqueues); -- removed after ckrm cpu v7 merge extern int nr_processes(void); extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); @@ -140,7 +139,6 @@ struct sched_param { #ifdef __KERNEL__ -#include #include /* @@ -265,11 +263,6 @@ struct mm_struct { struct kioctx *ioctx_list; struct kioctx default_kioctx; -#ifdef CONFIG_CKRM_RES_MEM - struct ckrm_mem_res *memclass; - struct list_head tasklist; /* list of all tasks sharing this address space */ - spinlock_t peertask_lock; /* protect above tasklist */ -#endif }; struct sighand_struct { @@ -531,25 +524,6 @@ int set_current_groups(struct group_info *group_info); struct audit_context; /* See audit.c */ struct mempolicy; -#ifdef CONFIG_CKRM_CPU_SCHEDULE -/** - * ckrm_cpu_demand_stat - used to track the cpu demand of a task/class - * @run: how much time it has been running since the counter started - * @total: total time since the counter started - * @last_sleep: the last time it sleeps, last_sleep = 0 when not sleeping - * @recalc_interval: how often do we recalculate the cpu_demand - * @cpu_demand: moving average of run/total - */ -struct 
ckrm_cpu_demand_stat { - unsigned long long run; - unsigned long long total; - unsigned long long last_sleep; - unsigned long long recalc_interval; - unsigned long cpu_demand; /*estimated cpu demand */ -}; -#endif - - struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ struct thread_info *thread_info; @@ -718,25 +692,6 @@ struct task_struct { struct mempolicy *mempolicy; short il_next; /* could be shared with used_math */ #endif - -#ifdef CONFIG_CKRM - spinlock_t ckrm_tsklock; - void *ce_data; -#ifdef CONFIG_CKRM_TYPE_TASKCLASS - // .. Hubertus should change to CONFIG_CKRM_TYPE_TASKCLASS - struct ckrm_task_class *taskclass; - struct list_head taskclass_link; -#ifdef CONFIG_CKRM_CPU_SCHEDULE - struct ckrm_cpu_class *cpu_class; - //track cpu demand of this task - struct ckrm_cpu_demand_stat demand_stat; -#endif //CONFIG_CKRM_CPU_SCHEDULE -#endif // CONFIG_CKRM_TYPE_TASKCLASS -#ifdef CONFIG_CKRM_RES_MEM - struct list_head mm_peers; // list of tasks using same mm_struct -#endif // CONFIG_CKRM_RES_MEM -#endif // CONFIG_CKRM - struct task_delay_info delays; }; static inline pid_t process_group(struct task_struct *tsk) @@ -789,8 +744,6 @@ do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0) #define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */ #define PF_BORROWED_MM 0x00400000 /* I am a kthread doing use_mm */ #define PF_RELOCEXEC 0x00800000 /* relocate shared libraries */ -#define PF_MEMIO 0x01000000 /* I am potentially doing I/O for mem */ -#define PF_IOWAIT 0x02000000 /* I am waiting on disk I/O */ #ifdef CONFIG_SMP extern int set_cpus_allowed(task_t *p, cpumask_t new_mask); diff --git a/include/linux/taskdelays.h b/include/linux/taskdelays.h deleted file mode 100644 index 698b23b61..000000000 --- a/include/linux/taskdelays.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef _LINUX_TASKDELAYS_H -#define _LINUX_TASKDELAYS_H - -#include - -struct task_delay_info { -#ifdef CONFIG_DELAY_ACCT - /* delay statistics in usecs */ - unsigned long runs; - unsigned long waitcpu_total; - unsigned long runcpu_total; - unsigned long iowait_total; - unsigned long mem_iowait_total; - unsigned long num_iowaits; - unsigned long num_memwaits; -#endif -}; - -#endif // _LINUX_TASKDELAYS_H - diff --git a/include/linux/tcp.h b/include/linux/tcp.h index b8a1a3544..871b60137 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -128,10 +128,6 @@ enum { #define TCP_INFO 11 /* Information about this connection. 
*/ #define TCP_QUICKACK 12 /* Block/reenable quick acks */ -#ifdef CONFIG_ACCEPT_QUEUES -#define TCP_ACCEPTQ_SHARE 13 /* Set accept queue share */ -#endif - #define TCPI_OPT_TIMESTAMPS 1 #define TCPI_OPT_SACK 2 #define TCPI_OPT_WSCALE 4 @@ -194,18 +190,6 @@ struct tcp_info __u32 tcpi_total_retrans; }; -#ifdef CONFIG_ACCEPT_QUEUES - -#define NUM_ACCEPT_QUEUES 8 /* Must be power of 2 */ - -struct tcp_acceptq_info { - unsigned char acceptq_shares; - unsigned long acceptq_wait_time; - unsigned int acceptq_qcount; - unsigned int acceptq_count; -}; -#endif - #ifdef __KERNEL__ #include @@ -399,9 +383,7 @@ struct tcp_opt { /* FIFO of established children */ struct open_request *accept_queue; -#ifndef CONFIG_ACCEPT_QUEUES struct open_request *accept_queue_tail; -#endif unsigned int keepalive_time; /* time before keep alive takes place */ unsigned int keepalive_intvl; /* time interval between keep alive probes */ int linger2; @@ -453,21 +435,6 @@ struct tcp_opt { __u32 last_cwnd; /* the last snd_cwnd */ __u32 last_stamp; /* time when updated last_cwnd */ } bictcp; - -#ifdef CONFIG_ACCEPT_QUEUES - /* move to listen opt... */ - char class_index; - struct { - struct open_request *aq_head; - struct open_request *aq_tail; - unsigned int aq_cnt; - unsigned int aq_ratio; - unsigned int aq_count; - unsigned int aq_qcount; - unsigned int aq_backlog; - unsigned int aq_wait_time; - } acceptq[NUM_ACCEPT_QUEUES]; -#endif }; /* WARNING: don't change the layout of the members in tcp_sock! */ diff --git a/include/net/sock.h b/include/net/sock.h index c2f958c16..6a6ef8886 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -258,7 +258,6 @@ struct sock { struct timeval sk_stamp; struct socket *sk_socket; void *sk_user_data; - void *sk_ns; // For use by CKRM struct module *sk_owner; struct page *sk_sndmsg_page; __u32 sk_sndmsg_off; @@ -419,7 +418,6 @@ static inline int sock_flag(struct sock *sk, enum sock_flags flag) return test_bit(flag, &sk->sk_flags); } -#ifndef CONFIG_ACCEPT_QUEUES static inline void sk_acceptq_removed(struct sock *sk) { sk->sk_ack_backlog--; @@ -434,7 +432,6 @@ static inline int sk_acceptq_is_full(struct sock *sk) { return sk->sk_ack_backlog > sk->sk_max_ack_backlog; } -#endif /* * Compute minimal free write space needed to queue new packets. diff --git a/include/net/tcp.h b/include/net/tcp.h index b7591b7bd..8afbb54ae 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -672,10 +672,6 @@ struct open_request { struct tcp_v6_open_req v6_req; #endif } af; -#ifdef CONFIG_ACCEPT_QUEUES - unsigned long acceptq_time_stamp; - int acceptq_class; -#endif }; /* SLAB cache for open requests. 
*/ @@ -1803,79 +1799,12 @@ struct tcp_listen_opt { u8 max_qlen_log; /* log_2 of maximal queued SYNs */ int qlen; -#ifdef CONFIG_ACCEPT_QUEUES - int qlen_young[NUM_ACCEPT_QUEUES]; -#else int qlen_young; -#endif int clock_hand; u32 hash_rnd; struct open_request *syn_table[TCP_SYNQ_HSIZE]; }; -#ifdef CONFIG_ACCEPT_QUEUES -static inline void sk_acceptq_removed(struct sock *sk, int class) -{ - tcp_sk(sk)->acceptq[class].aq_backlog--; -} - -static inline void sk_acceptq_added(struct sock *sk, int class) -{ - tcp_sk(sk)->acceptq[class].aq_backlog++; -} - -static inline int sk_acceptq_is_full(struct sock *sk, int class) -{ - return tcp_sk(sk)->acceptq[class].aq_backlog > - sk->sk_max_ack_backlog; -} - -static inline void tcp_set_acceptq(struct tcp_opt *tp, struct open_request *req) -{ - int class = req->acceptq_class; - int prev_class; - - if (!tp->acceptq[class].aq_ratio) { - req->acceptq_class = 0; - class = 0; - } - - tp->acceptq[class].aq_qcount++; - req->acceptq_time_stamp = jiffies; - - if (tp->acceptq[class].aq_tail) { - req->dl_next = tp->acceptq[class].aq_tail->dl_next; - tp->acceptq[class].aq_tail->dl_next = req; - tp->acceptq[class].aq_tail = req; - } else { /* if first request in the class */ - tp->acceptq[class].aq_head = req; - tp->acceptq[class].aq_tail = req; - - prev_class = class - 1; - while (prev_class >= 0) { - if (tp->acceptq[prev_class].aq_tail) - break; - prev_class--; - } - if (prev_class < 0) { - req->dl_next = tp->accept_queue; - tp->accept_queue = req; - } - else { - req->dl_next = tp->acceptq[prev_class].aq_tail->dl_next; - tp->acceptq[prev_class].aq_tail->dl_next = req; - } - } -} -static inline void tcp_acceptq_queue(struct sock *sk, struct open_request *req, - struct sock *child) -{ - tcp_set_acceptq(tcp_sk(sk),req); - req->sk = child; - sk_acceptq_added(sk,req->acceptq_class); -} - -#else static inline void tcp_acceptq_queue(struct sock *sk, struct open_request *req, struct sock *child) { @@ -1893,41 +1822,6 @@ static inline void tcp_acceptq_queue(struct sock *sk, struct open_request *req, req->dl_next = NULL; } -#endif - - -#ifdef CONFIG_ACCEPT_QUEUES -static inline void -tcp_synq_removed(struct sock *sk, struct open_request *req) -{ - struct tcp_listen_opt *lopt = tcp_sk(sk)->listen_opt; - - if (--lopt->qlen == 0) - tcp_delete_keepalive_timer(sk); - if (req->retrans == 0) - lopt->qlen_young[req->acceptq_class]--; -} - -static inline void tcp_synq_added(struct sock *sk, struct open_request *req) -{ - struct tcp_listen_opt *lopt = tcp_sk(sk)->listen_opt; - - if (lopt->qlen++ == 0) - tcp_reset_keepalive_timer(sk, TCP_TIMEOUT_INIT); - lopt->qlen_young[req->acceptq_class]++; -} - -static inline int tcp_synq_len(struct sock *sk) -{ - return tcp_sk(sk)->listen_opt->qlen; -} - -static inline int tcp_synq_young(struct sock *sk, int class) -{ - return tcp_sk(sk)->listen_opt->qlen_young[class]; -} - -#else static inline void tcp_synq_removed(struct sock *sk, struct open_request *req) @@ -1958,7 +1852,6 @@ static inline int tcp_synq_young(struct sock *sk) { return tcp_sk(sk)->listen_opt->qlen_young; } -#endif static inline int tcp_synq_is_full(struct sock *sk) { diff --git a/init/Kconfig b/init/Kconfig index 509119525..1ca9fa1b7 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -138,187 +138,6 @@ config BSD_PROCESS_ACCT_V3 for processing it. A preliminary version of these tools is available at . 
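For orientation before the hunk below: the removed CKRM menu defined the whole CONFIG_CKRM option family. The following is a sketch of how these options would have appeared in the .config of a build with the controllers enabled, assuming the defaults stated in the help texts and, hypothetically, the vanilla RBCE engine selected as a module; actual values depend on the individual configuration:

CONFIG_CKRM=y
CONFIG_RCFS_FS=y
CONFIG_CKRM_TYPE_TASKCLASS=y
CONFIG_CKRM_RES_NULL=m
CONFIG_CKRM_RES_MEM=y
CONFIG_CKRM_TYPE_SOCKETCLASS=y
CONFIG_CKRM_RES_NUMTASKS=m
CONFIG_CKRM_RES_NUMTASKS_FORKRATE=y
CONFIG_CKRM_CPU_SCHEDULE=y
CONFIG_CKRM_RES_BLKIO=m
# CONFIG_CKRM_CPU_SCHEDULE_AT_BOOT is not set
CONFIG_CKRM_RES_LISTENAQ=m
CONFIG_CKRM_RBCE=m
# CONFIG_CKRM_CRBCE is not set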
-menu "Class Based Kernel Resource Management" - -config CKRM - bool "Class Based Kernel Resource Management Core" - depends on EXPERIMENTAL - help - Class-based Kernel Resource Management is a framework for controlling - and monitoring resource allocation of user-defined groups of tasks or - incoming socket connections. For more information, please visit - http://ckrm.sf.net. - - If you say Y here, enable the Resource Class File System and atleast - one of the resource controllers below. Say N if you are unsure. - -config RCFS_FS - tristate "Resource Class File System (User API)" - depends on CKRM - help - RCFS is the filesystem API for CKRM. This separate configuration - option is provided only for debugging and will eventually disappear - since rcfs will be automounted whenever CKRM is configured. - - Say N if unsure, Y if you've enabled CKRM, M to debug rcfs - initialization. - -config CKRM_TYPE_TASKCLASS - bool "Class Manager for Task Groups" - depends on CKRM && RCFS_FS - help - TASKCLASS provides the extensions for CKRM to track task classes - This is the base to enable task class based resource control for - cpu, memory and disk I/O. - - Say N if unsure - -config CKRM_RES_NULL - tristate "Null Tasks Resource Manager" - depends on CKRM_TYPE_TASKCLASS - default m - -config CKRM_RES_MEM - bool "Class based physical memory controller" - default y - depends on CKRM - help - Provide the basic support for collecting physical memory usage - information among classes. Say Y if you want to know the memory - usage of each class. - -config CKRM_TYPE_SOCKETCLASS - bool "Class Manager for socket groups" - depends on CKRM && RCFS_FS - help - Provides a Null Resource Controller for CKRM that is purely for - demonstration purposes. - - Say N if unsure, Y to use the feature. - - -config CKRM_RES_NUMTASKS - tristate "Number of Tasks Resource Manager" - depends on CKRM_TYPE_TASKCLASS - default m - help - Provides a Resource Controller for CKRM that allows limiting number of - tasks a task class can have. - - Say N if unsure, Y to use the feature. - -config CKRM_RES_NUMTASKS_FORKRATE - tristate "Number of Tasks Resource Manager for Fork Rate" - depends on CKRM_RES_NUMTASKS - default y - help - Provides a Resource Controller for CKRM that allows limiting the rate - of tasks a task class can fork per hour. - - Say N if unsure, Y to use the feature. - - -config CKRM_CPU_SCHEDULE - bool "CKRM CPU scheduler" - depends on CKRM_TYPE_TASKCLASS - default y - help - Use CKRM CPU scheduler instead of Linux Scheduler - - Say N if unsure, Y to use the feature. - -config CKRM_RES_BLKIO - tristate " Disk I/O Resource Controller" - depends on CKRM_TYPE_TASKCLASS && IOSCHED_CFQ - default m - help - Provides a resource controller for best-effort block I/O - bandwidth control. The controller attempts this by proportional - servicing of requests in the I/O scheduler. However, seek - optimizations and reordering by device drivers/disk controllers may - alter the actual bandwidth delivered to a class. - - Say N if unsure, Y to use the feature. - -config CKRM_CPU_SCHEDULE_AT_BOOT - bool "Turn on at boot time" - depends on CKRM_CPU_SCHEDULE - default n - help - Enable CKRM CPU Scheduler at boot time. Otherwise - it can be turned on dynamically at runtime. If not - turned on the default Linux Scheduler behavior - will be obtained. 
- - Say N if unsure, Y to use this feature. - -config CKRM_TYPE_SOCKETCLASS - bool "Class Manager for socket groups" - depends on CKRM - help - SOCKET provides the extensions for CKRM to track per socket - classes. This is the base to enable socket based resource - control for inbound connection control, bandwidth control etc. - - Say N if unsure. - -config CKRM_RES_LISTENAQ - tristate "Multiple Accept Queues Resource Manager" - depends on CKRM_TYPE_SOCKETCLASS && ACCEPT_QUEUES - default m - help - Provides a resource controller for CKRM to prioritize inbound - connection requests. See inbound control description for - "IP: TCP Multiple accept queues support". If you chose that - option, choose this one to control the queue weights. - - If unsure, say N. - -choice - prompt "Classification Engine" - depends on CKRM && RCFS_FS - optional - help - Select a classification engine (CE) that assists in - automatic classification of kernel objects managed by CKRM when - they are created. Without a CE, a user must manually - classify objects into classes. Processes inherit their parent's - classification. - - Only one engine can be built into the kernel, though all can be - built as modules (only one will load). - - If unsure, say N. - -config CKRM_RBCE - tristate "Vanilla RBCE" - help - Vanilla Rule-based Classification Engine (RBCE). Rules for - classifying kernel objects are created/deleted/modified through - an RCFS directory using a filesystem interface. - - Any CE is optional. If unsure, say N. - -config CKRM_CRBCE - tristate "Enhanced RBCE" - depends on DELAY_ACCT && RELAYFS_FS - help - Enhanced Rule-based Classification Engine (CRBCE). Like the Vanilla - RBCE, rules for classifying kernel objects are created, deleted and - modified through an RCFS directory using a filesystem interface - (requires CKRM_RCFS configured). - - In addition, CRBCE provides per-process delay data - (requires DELAY_ACCT configured) and makes information on significant - kernel events available to userspace tools through relayfs - (requires RELAYFS_FS configured). - - Any CE is optional. If unsure, say N.
- -endchoice - -endmenu - config SYSCTL bool "Sysctl support" ---help--- diff --git a/init/main.c b/init/main.c index 4efd7b84f..c9f50b731 100644 --- a/init/main.c +++ b/init/main.c @@ -46,8 +46,6 @@ #include #include #include -#include -#include #include #include @@ -535,7 +533,6 @@ asmlinkage void __init start_kernel(void) rcu_init(); init_IRQ(); pidhash_init(); - ckrm_init(); init_timers(); softirq_init(); time_init(); @@ -729,9 +726,6 @@ static int init(void * unused) */ populate_rootfs(); do_basic_setup(); - - init_ckrm_sched_res(); - sched_init_smp(); /* diff --git a/kernel/Makefile b/kernel/Makefile index 2096afd0c..0368746b5 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -7,7 +7,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ sysctl.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o intermodule.o extable.o params.o posix-timers.o \ - kthread.o wait.o kfifo.o sys_ni.o dump.o ckrm/ + kthread.o wait.o kfifo.o sys_ni.o dump.o # mod-subdirs := vserver @@ -31,7 +31,6 @@ obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_IKCONFIG_PROC) += configs.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o -obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_classqueue.o ckrm_sched.o obj-$(CONFIG_AUDIT) += audit.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_KPROBES) += kprobes.o diff --git a/kernel/ckrm/Makefile b/kernel/ckrm/Makefile deleted file mode 100644 index 0c3c98036..000000000 --- a/kernel/ckrm/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -# -# Makefile for CKRM -# - -ifeq ($(CONFIG_CKRM),y) - obj-y = ckrm_events.o ckrm.o ckrmutils.o ckrm_numtasks_stub.o rbce/ -endif - -obj-$(CONFIG_CKRM_TYPE_TASKCLASS) += ckrm_tc.o -obj-$(CONFIG_CKRM_TYPE_SOCKETCLASS) += ckrm_sockc.o -obj-$(CONFIG_CKRM_RES_NUMTASKS) += ckrm_numtasks.o -obj-$(CONFIG_CKRM_RES_LISTENAQ) += ckrm_listenaq.o -obj-$(CONFIG_CKRM_CPU_SCHEDULE) += ckrm_cpu_class.o ckrm_cpu_monitor.o -obj-$(CONFIG_CKRM_RES_MEM) += ckrm_mem.o -obj-$(CONFIG_CKRM_RES_NULL) += ckrm_null_class.o diff --git a/kernel/ckrm/ckrm.c b/kernel/ckrm/ckrm.c deleted file mode 100644 index 278aec95b..000000000 --- a/kernel/ckrm/ckrm.c +++ /dev/null @@ -1,927 +0,0 @@ -/* ckrm.c - Class-based Kernel Resource Management (CKRM) - * - * Copyright (C) Hubertus Franke, IBM Corp. 2003, 2004 - * (C) Shailabh Nagar, IBM Corp. 2003, 2004 - * (C) Chandra Seetharaman, IBM Corp. 2003 - * (C) Vivek Kashyap, IBM Corp. 2004 - * - * - * Provides kernel API of CKRM for in-kernel, per-resource controllers - * (one each for cpu, memory, io, network) and callbacks for - * classification modules. - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -/* - * Changes - * - * 28 Aug 2003 - * Created. - * 06 Nov 2003 - * Made modifications to suit the new RBCE module. - * 10 Nov 2003 - * Fixed a bug in fork and exit callbacks. Added callbacks_active and - * surrounding logic. Added task parameter for all CE callbacks. - * 23 Mar 2004 - * Moved to reference-counted class objects and correct locking - * 19 Apr 2004 - * Integrated ckrm hooks, classtypes, ...
- * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -rwlock_t ckrm_class_lock = RW_LOCK_UNLOCKED; /* protects classlists */ - -struct rcfs_functions rcfs_fn; -EXPORT_SYMBOL_GPL(rcfs_fn); - -int rcfs_engine_regd; /* rcfs state needed by another module */ -EXPORT_SYMBOL_GPL(rcfs_engine_regd); - -int rcfs_mounted; -EXPORT_SYMBOL_GPL(rcfs_mounted); - -/* - * Helper Functions - */ - -/* - * Return TRUE if the given resource is registered. - */ -inline unsigned int is_res_regd(struct ckrm_classtype *clstype, int resid) -{ - return ((resid >= 0) && (resid < clstype->max_resid) && - test_bit(resid, &clstype->bit_res_ctlrs) - ); -} - -/* - * Return TRUE if the given core class pointer is valid. - */ -static struct ckrm_res_ctlr *ckrm_resctlr_lookup(struct ckrm_classtype *clstype, - const char *resname) -{ - int resid = -1; - - if (!clstype || !resname) { - return NULL; - } - for (resid = 0; resid < clstype->max_resid; resid++) { - if (test_bit(resid, &clstype->bit_res_ctlrs)) { - struct ckrm_res_ctlr *rctrl = clstype->res_ctlrs[resid]; - if (!strncmp(resname, rctrl->res_name, - CKRM_MAX_RES_NAME)) - return rctrl; - } - } - return NULL; -} - - -/* given a classname return the class handle and its classtype*/ -void *ckrm_classobj(const char *classname, int *classTypeID) -{ - int i; - - *classTypeID = -1; - if (!classname || !*classname) { - return NULL; - } - - read_lock(&ckrm_class_lock); - for (i = 0; i < CKRM_MAX_CLASSTYPES; i++) { - struct ckrm_classtype *ctype = ckrm_classtypes[i]; - struct ckrm_core_class *core; - - if (ctype == NULL) - continue; - list_for_each_entry(core, &ctype->classes, clslist) { - if (core->name && !strcmp(core->name, classname)) { - // FIXME: should grep reference.. - read_unlock(&ckrm_class_lock); - *classTypeID = ctype->typeID; - return core; - } - } - } - read_unlock(&ckrm_class_lock); - return NULL; -} - -EXPORT_SYMBOL_GPL(is_res_regd); -EXPORT_SYMBOL_GPL(ckrm_classobj); - -/* - * Internal Functions/macros - */ - -static inline void set_callbacks_active(struct ckrm_classtype *ctype) -{ - ctype->ce_cb_active = ((atomic_read(&ctype->ce_regd) > 0) && - (ctype->ce_callbacks.always_callback - || (ctype->num_classes > 1))); -} - -int ckrm_validate_and_grab_core(struct ckrm_core_class *core) -{ - int rc = 0; - read_lock(&ckrm_class_lock); - if (likely(ckrm_is_core_valid(core))) { - ckrm_core_grab(core); - rc = 1; - } - read_unlock(&ckrm_class_lock); - return rc; -} - -/* - * Interfaces for classification engine - */ - -/* - * Registering a callback structure by the classification engine. - * - * Returns typeId of class on success -errno for failure. - */ -int ckrm_register_engine(const char *typename, ckrm_eng_callback_t * ecbs) -{ - struct ckrm_classtype *ctype; - - ctype = ckrm_find_classtype_by_name(typename); - if (ctype == NULL) - return (-ENOENT); - - atomic_inc(&ctype->ce_regd); - - /* another engine registered or trying to register ? 
*/ - if (atomic_read(&ctype->ce_regd) != 1) { - atomic_dec(&ctype->ce_regd); - return (-EBUSY); - } - - /* - * One of the following must be set: - * classify, class_delete (due to object reference) or - * notify (case where notification is supported but not classification) - * The function pointer must be set the moment the mask is non-null - */ - if (!(((ecbs->classify) && (ecbs->class_delete)) || (ecbs->notify)) || - (ecbs->c_interest && ecbs->classify == NULL) || - (ecbs->n_interest && ecbs->notify == NULL)) { - atomic_dec(&ctype->ce_regd); - return (-EINVAL); - } - - ctype->ce_callbacks = *ecbs; - set_callbacks_active(ctype); - - if (ctype->ce_callbacks.class_add) { - struct ckrm_core_class *core; - - read_lock(&ckrm_class_lock); - list_for_each_entry(core, &ctype->classes, clslist) { - (*ctype->ce_callbacks.class_add) (core->name, core, - ctype->typeID); - } - read_unlock(&ckrm_class_lock); - } - return ctype->typeID; -} - -/* - * Unregistering a callback structure by the classification engine. - * - * Returns 0 on success -errno for failure. - */ -int ckrm_unregister_engine(const char *typename) -{ - struct ckrm_classtype *ctype; - - ctype = ckrm_find_classtype_by_name(typename); - if (ctype == NULL) - return (-ENOENT); - - ctype->ce_cb_active = 0; - if (atomic_read(&ctype->ce_nr_users) > 1) { - /* Somebody is currently using the engine, cannot deregister. */ - return (-EAGAIN); - } - atomic_set(&ctype->ce_regd, 0); - memset(&ctype->ce_callbacks, 0, sizeof(ckrm_eng_callback_t)); - return 0; -} - -/* - * Interfaces to manipulate class (core or resource) hierarchies - */ - -static void -ckrm_add_child(struct ckrm_core_class *parent, struct ckrm_core_class *child) -{ - struct ckrm_hnode *cnode = &child->hnode; - - if (!ckrm_is_core_valid(child)) { - printk(KERN_ERR "Invalid child %p given in ckrm_add_child\n", - child); - return; - } - class_lock(child); - INIT_LIST_HEAD(&cnode->children); - INIT_LIST_HEAD(&cnode->siblings); - - if (parent) { - struct ckrm_hnode *pnode; - - if (!ckrm_is_core_valid(parent)) { - printk(KERN_ERR - "Invalid parent %p given in ckrm_add_child\n", - parent); - parent = NULL; - } else { - pnode = &parent->hnode; - write_lock(&parent->hnode_rwlock); - list_add(&cnode->siblings, &pnode->children); - write_unlock(&parent->hnode_rwlock); - } - } - cnode->parent = parent; - class_unlock(child); - return; -} - -static int ckrm_remove_child(struct ckrm_core_class *child) -{ - struct ckrm_hnode *cnode, *pnode; - struct ckrm_core_class *parent; - - if (!ckrm_is_core_valid(child)) { - printk(KERN_ERR "Invalid child %p given" - " in ckrm_remove_child\n", - child); - return 0; - } - - cnode = &child->hnode; - parent = cnode->parent; - if (!ckrm_is_core_valid(parent)) { - printk(KERN_ERR "Invalid parent %p in ckrm_remove_child\n", - parent); - return 0; - } - - pnode = &parent->hnode; - - class_lock(child); - /* ensure that the node does not have children */ - if (!list_empty(&cnode->children)) { - class_unlock(child); - return 0; - } - write_lock(&parent->hnode_rwlock); - list_del(&cnode->siblings); - write_unlock(&parent->hnode_rwlock); - cnode->parent = NULL; - class_unlock(child); - return 1; -} - -void ckrm_lock_hier(struct ckrm_core_class *parent) -{ - if (ckrm_is_core_valid(parent)) { - read_lock(&parent->hnode_rwlock); - } -} - -void ckrm_unlock_hier(struct ckrm_core_class *parent) -{ - if (ckrm_is_core_valid(parent)) { - read_unlock(&parent->hnode_rwlock); - } -} - -/* - * hnode_rwlock of the parent core class must be held in read mode.
- * external callers should have called ckrm_lock_hier before calling this - * function. - */ -#define hnode_2_core(ptr) \ -((ptr)? container_of(ptr, struct ckrm_core_class, hnode) : NULL) - -struct ckrm_core_class *ckrm_get_next_child(struct ckrm_core_class *parent, - struct ckrm_core_class *child) -{ - struct list_head *cnode; - struct ckrm_hnode *next_cnode; - struct ckrm_core_class *next_childcore; - - if (!ckrm_is_core_valid(parent)) { - printk(KERN_ERR "Invalid parent %p in ckrm_get_next_child\n", - parent); - return NULL; - } - if (list_empty(&parent->hnode.children)) { - return NULL; - } - if (child) { - if (!ckrm_is_core_valid(child)) { - printk(KERN_ERR - "Invalid child %p in ckrm_get_next_child\n", - child); - return NULL; - } - cnode = child->hnode.siblings.next; - } else { - cnode = parent->hnode.children.next; - } - - if (cnode == &parent->hnode.children) { /* back at the anchor */ - return NULL; - } - - next_cnode = container_of(cnode, struct ckrm_hnode, siblings); - next_childcore = hnode_2_core(next_cnode); - - if (!ckrm_is_core_valid(next_childcore)) { - printk(KERN_ERR - "Invalid next child %p in ckrm_get_next_child\n", - next_childcore); - return NULL; - } - return next_childcore; -} - -EXPORT_SYMBOL_GPL(ckrm_lock_hier); -EXPORT_SYMBOL_GPL(ckrm_unlock_hier); -EXPORT_SYMBOL_GPL(ckrm_get_next_child); - -static void -ckrm_alloc_res_class(struct ckrm_core_class *core, - struct ckrm_core_class *parent, int resid) -{ - - struct ckrm_classtype *clstype; - /* - * Allocate a resource class only if the resource controller has - * registered with core and the engine requests the class. - */ - if (!ckrm_is_core_valid(core)) - return; - clstype = core->classtype; - core->res_class[resid] = NULL; - - if (test_bit(resid, &clstype->bit_res_ctlrs)) { - ckrm_res_ctlr_t *rcbs; - - atomic_inc(&clstype->nr_resusers[resid]); - rcbs = clstype->res_ctlrs[resid]; - - if (rcbs && rcbs->res_alloc) { - core->res_class[resid] = - (*rcbs->res_alloc) (core, parent); - if (core->res_class[resid]) - return; - printk(KERN_ERR "Error creating res class\n"); - } - atomic_dec(&clstype->nr_resusers[resid]); - } -} - -/* - * Initialize a core class - * - */ - -#define CLS_DEBUG(fmt, args...) \ -do { /* printk("%s: " fmt, __FUNCTION__ , ## args); */ } while (0) - -int -ckrm_init_core_class(struct ckrm_classtype *clstype, - struct ckrm_core_class *dcore, - struct ckrm_core_class *parent, const char *name) -{ - /* TODO: Should replace name with dentry or add dentry? */ - int i; - - /* TODO: How is this used in initialization? */ - CLS_DEBUG("name %s => %p\n", name ?
name : "default", dcore); - if ((dcore != clstype->default_class) && (!ckrm_is_core_valid(parent))){ - printk(KERN_DEBUG "error not a valid parent %p\n", parent); - return -EINVAL; - } - dcore->classtype = clstype; - dcore->magic = CKRM_CORE_MAGIC; - dcore->name = name; - dcore->class_lock = SPIN_LOCK_UNLOCKED; - dcore->hnode_rwlock = RW_LOCK_UNLOCKED; - dcore->delayed = 0; - - atomic_set(&dcore->refcnt, 0); - write_lock(&ckrm_class_lock); - - INIT_LIST_HEAD(&dcore->objlist); - list_add_tail(&dcore->clslist, &clstype->classes); - - clstype->num_classes++; - set_callbacks_active(clstype); - - write_unlock(&ckrm_class_lock); - ckrm_add_child(parent, dcore); - - for (i = 0; i < clstype->max_resid; i++) - ckrm_alloc_res_class(dcore, parent, i); - - /* fix for race condition seen in stress with numtasks */ - if (parent) - ckrm_core_grab(parent); - - ckrm_core_grab(dcore); - return 0; -} - -static void ckrm_free_res_class(struct ckrm_core_class *core, int resid) -{ - /* - * Free a resource class only if the resource controller has - * registered with core - */ - if (core->res_class[resid]) { - ckrm_res_ctlr_t *rcbs; - struct ckrm_classtype *clstype = core->classtype; - - atomic_inc(&clstype->nr_resusers[resid]); - rcbs = clstype->res_ctlrs[resid]; - - if (rcbs->res_free) { - (*rcbs->res_free) (core->res_class[resid]); - // compensate inc in alloc - atomic_dec(&clstype->nr_resusers[resid]); - } - atomic_dec(&clstype->nr_resusers[resid]); - } - core->res_class[resid] = NULL; -} - -/* - * Free a core class - * requires that all tasks were previously reassigned to another class - * - * Returns 0 on success -errno on failure. - */ - -void ckrm_free_core_class(struct ckrm_core_class *core) -{ - int i; - struct ckrm_classtype *clstype = core->classtype; - struct ckrm_core_class *parent = core->hnode.parent; - - CLS_DEBUG("core=%p:%s parent=%p:%s\n", core, core->name, parent, - parent->name); - if (core->delayed) { - /* this core was marked as late */ - printk(KERN_DEBUG "class <%s> finally deleted %lu\n", core->name, jiffies); - } - if (ckrm_remove_child(core) == 0) { - printk(KERN_DEBUG "Core class removal failed. Chilren present\n"); - } - for (i = 0; i < clstype->max_resid; i++) { - ckrm_free_res_class(core, i); - } - - write_lock(&ckrm_class_lock); - /* Clear the magic, so we would know if this core is reused. */ - core->magic = 0; -#if 0 /* Dynamic not yet enabled */ - core->res_class = NULL; -#endif - /* Remove this core class from its linked list. */ - list_del(&core->clslist); - clstype->num_classes--; - set_callbacks_active(clstype); - write_unlock(&ckrm_class_lock); - - /* fix for race condition seen in stress with numtasks */ - if (parent) - ckrm_core_drop(parent); - - kfree(core); -} - -int ckrm_release_core_class(struct ckrm_core_class *core) -{ - if (!ckrm_is_core_valid(core)) { - // Invalid core - return (-EINVAL); - } - - if (core == core->classtype->default_class) - return 0; - - /* need to make sure that the classgot really dropped */ - if (atomic_read(&core->refcnt) != 1) { - CLS_DEBUG("class <%s> deletion delayed refcnt=%d jif=%ld\n", - core->name, atomic_read(&core->refcnt), jiffies); - core->delayed = 1; /* just so we have a ref point */ - } - ckrm_core_drop(core); - return 0; -} - -/* - * Interfaces for the resource controller - */ -/* - * Registering a callback structure by the resource controller. - * - * Returns the resource id(0 or +ve) on success, -errno for failure. 
- */ -static int -ckrm_register_res_ctlr_intern(struct ckrm_classtype *clstype, - ckrm_res_ctlr_t * rcbs) -{ - int resid, ret, i; - - if (!rcbs) - return -EINVAL; - - resid = rcbs->resid; - - spin_lock(&clstype->res_ctlrs_lock); - printk(KERN_WARNING "resid is %d name is %s %s\n", - resid, rcbs->res_name, clstype->res_ctlrs[resid]->res_name); - if (resid >= 0) { - if ((resid < CKRM_MAX_RES_CTLRS) - && (clstype->res_ctlrs[resid] == NULL)) { - clstype->res_ctlrs[resid] = rcbs; - atomic_set(&clstype->nr_resusers[resid], 0); - set_bit(resid, &clstype->bit_res_ctlrs); - ret = resid; - if (resid >= clstype->max_resid) { - clstype->max_resid = resid + 1; - } - } else { - ret = -EBUSY; - } - spin_unlock(&clstype->res_ctlrs_lock); - return ret; - } - for (i = clstype->resid_reserved; i < clstype->max_res_ctlrs; i++) { - if (clstype->res_ctlrs[i] == NULL) { - clstype->res_ctlrs[i] = rcbs; - rcbs->resid = i; - atomic_set(&clstype->nr_resusers[i], 0); - set_bit(i, &clstype->bit_res_ctlrs); - if (i >= clstype->max_resid) { - clstype->max_resid = i + 1; - } - spin_unlock(&clstype->res_ctlrs_lock); - return i; - } - } - spin_unlock(&clstype->res_ctlrs_lock); - return (-ENOMEM); -} - -int -ckrm_register_res_ctlr(struct ckrm_classtype *clstype, ckrm_res_ctlr_t * rcbs) -{ - struct ckrm_core_class *core; - int resid; - - resid = ckrm_register_res_ctlr_intern(clstype, rcbs); - - if (resid >= 0) { - /* run through all classes and create the resource class - * object and if necessary "initialize" class in context - * of this resource - */ - read_lock(&ckrm_class_lock); - list_for_each_entry(core, &clstype->classes, clslist) { - printk(KERN_INFO "CKRM .. create res clsobj for resource <%s>" - "class <%s> par=%p\n", rcbs->res_name, - core->name, core->hnode.parent); - ckrm_alloc_res_class(core, core->hnode.parent, resid); - - if (clstype->add_resctrl) { - /* FIXME: this should be mandatory */ - (*clstype->add_resctrl) (core, resid); - } - } - read_unlock(&ckrm_class_lock); - } - return resid; -} - -/* - * Unregistering a callback structure by the resource controller. - * - * Returns 0 on success -errno for failure.
- */ -int ckrm_unregister_res_ctlr(struct ckrm_res_ctlr *rcbs) -{ - struct ckrm_classtype *clstype = rcbs->classtype; - struct ckrm_core_class *core = NULL; - int resid = rcbs->resid; - - if ((clstype == NULL) || (resid < 0)) { - return -EINVAL; - } - /* TODO: probably need to also call deregistration function */ - - read_lock(&ckrm_class_lock); - /* free up this resource from all the classes */ - list_for_each_entry(core, &clstype->classes, clslist) { - ckrm_free_res_class(core, resid); - } - read_unlock(&ckrm_class_lock); - - if (atomic_read(&clstype->nr_resusers[resid])) { - return -EBUSY; - } - - spin_lock(&clstype->res_ctlrs_lock); - clstype->res_ctlrs[resid] = NULL; - clear_bit(resid, &clstype->bit_res_ctlrs); - clstype->max_resid = fls(clstype->bit_res_ctlrs); - rcbs->resid = -1; - spin_unlock(&clstype->res_ctlrs_lock); - - return 0; -} - -/* - * Class Type Registration - */ - -/* TODO: What locking is needed here?*/ - -struct ckrm_classtype *ckrm_classtypes[CKRM_MAX_CLASSTYPES]; -EXPORT_SYMBOL_GPL(ckrm_classtypes); - -int ckrm_register_classtype(struct ckrm_classtype *clstype) -{ - int tid = clstype->typeID; - - if (tid != -1) { - if ((tid < 0) || (tid > CKRM_MAX_CLASSTYPES) - || (ckrm_classtypes[tid])) - return -EINVAL; - } else { - int i; - for (i = CKRM_RESV_CLASSTYPES; i < CKRM_MAX_CLASSTYPES; i++) { - if (ckrm_classtypes[i] == NULL) { - tid = i; - break; - } - } - } - if (tid == -1) - return -EBUSY; - clstype->typeID = tid; - ckrm_classtypes[tid] = clstype; - - /* TODO: Need to call the callbacks of the RCFS client */ - if (rcfs_fn.register_classtype) { - (*rcfs_fn.register_classtype) (clstype); - /* No error return for now. */ - } - return tid; -} - -int ckrm_unregister_classtype(struct ckrm_classtype *clstype) -{ - int tid = clstype->typeID; - - if ((tid < 0) || (tid > CKRM_MAX_CLASSTYPES) - || (ckrm_classtypes[tid] != clstype)) - return -EINVAL; - - if (rcfs_fn.deregister_classtype) { - (*rcfs_fn.deregister_classtype) (clstype); - // No error return for now - } - - ckrm_classtypes[tid] = NULL; - clstype->typeID = -1; - return 0; -} - -struct ckrm_classtype *ckrm_find_classtype_by_name(const char *name) -{ - int i; - for (i = 0; i < CKRM_MAX_CLASSTYPES; i++) { - struct ckrm_classtype *ctype = ckrm_classtypes[i]; - if (ctype && !strncmp(ctype->name, name, CKRM_MAX_TYPENAME_LEN)) - return ctype; - } - return NULL; -} - -/* - * Generic Functions that can be used as default functions - * in almost all classtypes - * (a) function iterator over all resource classes of a class - * (b) function invoker on a named resource - */ - -int ckrm_class_show_shares(struct ckrm_core_class *core, struct seq_file *seq) -{ - int i; - struct ckrm_res_ctlr *rcbs; - struct ckrm_classtype *clstype = core->classtype; - struct ckrm_shares shares; - - for (i = 0; i < clstype->max_resid; i++) { - atomic_inc(&clstype->nr_resusers[i]); - rcbs = clstype->res_ctlrs[i]; - if (rcbs && rcbs->get_share_values) { - (*rcbs->get_share_values) (core->res_class[i], &shares); - seq_printf(seq,"res=%s,guarantee=%d,limit=%d," - "total_guarantee=%d,max_limit=%d\n", - rcbs->res_name, shares.my_guarantee, - shares.my_limit, shares.total_guarantee, - shares.max_limit); - } - atomic_dec(&clstype->nr_resusers[i]); - } - return 0; -} - -int ckrm_class_show_stats(struct ckrm_core_class *core, struct seq_file *seq) -{ - int i; - struct ckrm_res_ctlr *rcbs; - struct ckrm_classtype *clstype = core->classtype; - - for (i = 0; i < clstype->max_resid; i++) { - atomic_inc(&clstype->nr_resusers[i]); - rcbs = clstype->res_ctlrs[i]; - 
if (rcbs && rcbs->get_stats) - (*rcbs->get_stats) (core->res_class[i], seq); - atomic_dec(&clstype->nr_resusers[i]); - } - return 0; -} - -int ckrm_class_show_config(struct ckrm_core_class *core, struct seq_file *seq) -{ - int i; - struct ckrm_res_ctlr *rcbs; - struct ckrm_classtype *clstype = core->classtype; - - for (i = 0; i < clstype->max_resid; i++) { - atomic_inc(&clstype->nr_resusers[i]); - rcbs = clstype->res_ctlrs[i]; - if (rcbs && rcbs->show_config) - (*rcbs->show_config) (core->res_class[i], seq); - atomic_dec(&clstype->nr_resusers[i]); - } - return 0; -} - -int ckrm_class_set_config(struct ckrm_core_class *core, const char *resname, - const char *cfgstr) -{ - struct ckrm_classtype *clstype = core->classtype; - struct ckrm_res_ctlr *rcbs = ckrm_resctlr_lookup(clstype, resname); - int rc; - - if (rcbs == NULL || rcbs->set_config == NULL) - return -EINVAL; - rc = (*rcbs->set_config) (core->res_class[rcbs->resid], cfgstr); - return rc; -} - -#define legalshare(a) \ - ( ((a) >=0) \ - || ((a) == CKRM_SHARE_UNCHANGED) \ - || ((a) == CKRM_SHARE_DONTCARE) ) - -int ckrm_class_set_shares(struct ckrm_core_class *core, const char *resname, - struct ckrm_shares *shares) -{ - struct ckrm_classtype *clstype = core->classtype; - struct ckrm_res_ctlr *rcbs; - int rc; - - /* Check for legal values */ - if (!legalshare(shares->my_guarantee) || !legalshare(shares->my_limit) - || !legalshare(shares->total_guarantee) - || !legalshare(shares->max_limit)) - return -EINVAL; - - rcbs = ckrm_resctlr_lookup(clstype, resname); - if (rcbs == NULL || rcbs->set_share_values == NULL) - return -EINVAL; - rc = (*rcbs->set_share_values) (core->res_class[rcbs->resid], shares); - return rc; -} - -int ckrm_class_reset_stats(struct ckrm_core_class *core, const char *resname, - const char *unused) -{ - struct ckrm_classtype *clstype = core->classtype; - struct ckrm_res_ctlr *rcbs = ckrm_resctlr_lookup(clstype, resname); - int rc; - - if (rcbs == NULL || rcbs->reset_stats == NULL) - return -EINVAL; - rc = (*rcbs->reset_stats) (core->res_class[rcbs->resid]); - return rc; -} - -/* - * Initialization - */ - -void ckrm_cb_newtask(struct task_struct *tsk) -{ - tsk->ce_data = NULL; - spin_lock_init(&tsk->ckrm_tsklock); - ckrm_invoke_event_cb_chain(CKRM_EVENT_NEWTASK, tsk); -} - -void ckrm_cb_exit(struct task_struct *tsk) -{ - ckrm_invoke_event_cb_chain(CKRM_EVENT_EXIT, tsk); - tsk->ce_data = NULL; -} - -void __init ckrm_init(void) -{ - printk(KERN_DEBUG "CKRM Initialization\n"); - - // prepare init_task and then rely on inheritance of properties - ckrm_cb_newtask(&init_task); - - // register/initialize the Metatypes - -#ifdef CONFIG_CKRM_TYPE_TASKCLASS - { - extern void ckrm_meta_init_taskclass(void); - ckrm_meta_init_taskclass(); - } -#endif -#ifdef CONFIG_CKRM_TYPE_SOCKETCLASS - { - extern void ckrm_meta_init_sockclass(void); - ckrm_meta_init_sockclass(); - } -#endif - printk("CKRM Initialization done\n"); -} - -EXPORT_SYMBOL_GPL(ckrm_register_engine); -EXPORT_SYMBOL_GPL(ckrm_unregister_engine); - -EXPORT_SYMBOL_GPL(ckrm_register_res_ctlr); -EXPORT_SYMBOL_GPL(ckrm_unregister_res_ctlr); - -EXPORT_SYMBOL_GPL(ckrm_init_core_class); -EXPORT_SYMBOL_GPL(ckrm_free_core_class); -EXPORT_SYMBOL_GPL(ckrm_release_core_class); - -EXPORT_SYMBOL_GPL(ckrm_register_classtype); -EXPORT_SYMBOL_GPL(ckrm_unregister_classtype); -EXPORT_SYMBOL_GPL(ckrm_find_classtype_by_name); - -EXPORT_SYMBOL_GPL(ckrm_core_grab); -EXPORT_SYMBOL_GPL(ckrm_core_drop); -EXPORT_SYMBOL_GPL(ckrm_is_core_valid); -EXPORT_SYMBOL_GPL(ckrm_validate_and_grab_core); - 
-EXPORT_SYMBOL_GPL(ckrm_register_event_set); -EXPORT_SYMBOL_GPL(ckrm_unregister_event_set); -EXPORT_SYMBOL_GPL(ckrm_register_event_cb); -EXPORT_SYMBOL_GPL(ckrm_unregister_event_cb); - -EXPORT_SYMBOL_GPL(ckrm_class_show_stats); -EXPORT_SYMBOL_GPL(ckrm_class_show_config); -EXPORT_SYMBOL_GPL(ckrm_class_show_shares); - -EXPORT_SYMBOL_GPL(ckrm_class_set_config); -EXPORT_SYMBOL_GPL(ckrm_class_set_shares); - -EXPORT_SYMBOL_GPL(ckrm_class_reset_stats); diff --git a/kernel/ckrm/ckrm_cpu_class.c b/kernel/ckrm/ckrm_cpu_class.c deleted file mode 100644 index 929c22d97..000000000 --- a/kernel/ckrm/ckrm_cpu_class.c +++ /dev/null @@ -1,388 +0,0 @@ -/* kernel/ckrm/ckrm_cpu_class.c - CPU Class resource controller for CKRM - * - * Copyright (C) Haoqiang Zheng, IBM Corp. 2004 - * (C) Hubertus Franke, IBM Corp. 2004 - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct ckrm_res_ctlr cpu_rcbs; - -/** - * insert_cpu_class - insert a class to active_cpu_class list - * - * insert the class in decreasing order of class weight - */ -static inline void insert_cpu_class(struct ckrm_cpu_class *cls) -{ - list_add(&cls->links,&active_cpu_classes); -} - -/* - * initialize a class object and its local queues - */ -void init_cpu_class(struct ckrm_cpu_class *cls,ckrm_shares_t* shares) -{ - int i,j,k; - prio_array_t *array; - ckrm_lrq_t* queue; - - cls->shares = *shares; - cls->cnt_lock = SPIN_LOCK_UNLOCKED; - ckrm_cpu_stat_init(&cls->stat); - ckrm_usage_init(&cls->usage); - cls->magic = CKRM_CPU_CLASS_MAGIC; - - for (i = 0 ; i < NR_CPUS ; i++) { - queue = &cls->local_queues[i]; - queue->active = queue->arrays; - queue->expired = queue->arrays+1; - - for (j = 0; j < 2; j++) { - array = queue->arrays + j; - for (k = 0; k < MAX_PRIO; k++) { - INIT_LIST_HEAD(array->queue + k); - __clear_bit(k, array->bitmap); - } - // delimiter for bitsearch - __set_bit(MAX_PRIO, array->bitmap); - array->nr_active = 0; - } - - queue->expired_timestamp = 0; - - queue->cpu_class = cls; - queue->classqueue = get_cpu_classqueue(i); - queue->top_priority = MAX_PRIO; - cq_node_init(&queue->classqueue_linkobj); - queue->local_cvt = 0; - queue->lrq_load = 0; - queue->local_weight = cpu_class_weight(cls); - queue->uncounted_ns = 0; - queue->savings = 0; - queue->magic = 0x43FF43D7; - } - - // add to class list - write_lock(&class_list_lock); - insert_cpu_class(cls); - write_unlock(&class_list_lock); -} - -static inline void set_default_share(ckrm_shares_t *shares) -{ - shares->my_guarantee = 0; - shares->total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - shares->unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - shares->my_limit = CKRM_SHARE_DFLT_MAX_LIMIT; - shares->max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; - shares->cur_max_limit = 0; -} - -struct ckrm_cpu_class * ckrm_get_cpu_class(struct ckrm_core_class *core) -{ - struct ckrm_cpu_class * cls; - cls = ckrm_get_res_class(core, cpu_rcbs.resid, struct ckrm_cpu_class); - if (valid_cpu_class(cls)) - return cls; - else - return NULL; -} - - -void* ckrm_alloc_cpu_class(struct ckrm_core_class *core, struct ckrm_core_class *parent) -{ - struct ckrm_cpu_class *cls; - - if (! 
parent) /*root class*/ - cls = get_default_cpu_class(); - else - cls = (struct ckrm_cpu_class *) kmalloc(sizeof(struct ckrm_cpu_class),GFP_ATOMIC); - - if (cls) { - ckrm_shares_t shares; - if ((! parent) && (core)) { - /* - * the default class is already initialized - * so only update the core structure - */ - cls->core = core; - } else { - set_default_share(&shares); - init_cpu_class(cls,&shares); - cls->core = core; - cls->parent = parent; - } - } else - printk(KERN_ERR"alloc_cpu_class failed\n"); - - return cls; -} - -/* - * hzheng: this is not a stable implementation - * need to check race condition issue here - */ -static void ckrm_free_cpu_class(void *my_res) -{ - struct ckrm_cpu_class *cls = my_res, *parres, *childres; - ckrm_core_class_t *child = NULL; - int maxlimit; - ckrm_lrq_t* queue; - int i; - - if (!cls) - return; - - /*the default class can't be freed*/ - if (cls == get_default_cpu_class()) - return; -#if 1 -#warning "ACB: Remove freed class from any classqueues [PL #4233]" - for (i = 0 ; i < NR_CPUS ; i++) { - queue = &cls->local_queues[i]; - if (cls_in_classqueue(&queue->classqueue_linkobj)) - classqueue_dequeue(queue->classqueue, - &queue->classqueue_linkobj); - } -#endif - - // Assuming there will be no children when this function is called - parres = ckrm_get_cpu_class(cls->parent); - - // return child's limit/guarantee to parent node - spin_lock(&parres->cnt_lock); - child_guarantee_changed(&parres->shares, cls->shares.my_guarantee, 0); - // run thru parent's children and get the new max_limit of the parent - ckrm_lock_hier(parres->core); - maxlimit = 0; - while ((child = ckrm_get_next_child(parres->core, child)) != NULL) { - childres = ckrm_get_cpu_class(child); - if (maxlimit < childres->shares.my_limit) { - maxlimit = childres->shares.my_limit; - } - } - ckrm_unlock_hier(parres->core); - if (parres->shares.cur_max_limit < maxlimit) { - parres->shares.cur_max_limit = maxlimit; - } - - spin_unlock(&parres->cnt_lock); - - write_lock(&class_list_lock); - list_del(&cls->links); - write_unlock(&class_list_lock); - - kfree(cls); - - //call ckrm_cpu_monitor after class removed - ckrm_cpu_monitor(0); -} - -/* - * the system will adjust to the new share automatically - */ -int ckrm_cpu_set_share(void *my_res, struct ckrm_shares *new_share) -{ - struct ckrm_cpu_class *parres, *cls = my_res; - struct ckrm_shares *cur = &cls->shares, *par; - int rc = -EINVAL; - - if (!cls) - return rc; - - if (cls->parent) { - parres = ckrm_get_cpu_class(cls->parent); - spin_lock(&parres->cnt_lock); - spin_lock(&cls->cnt_lock); - par = &parres->shares; - } else { - spin_lock(&cls->cnt_lock); - par = NULL; - parres = NULL; - } - - /* - * hzheng: CKRM_SHARE_DONTCARE should be handled - */ - if (new_share->my_guarantee == CKRM_SHARE_DONTCARE) - new_share->my_guarantee = 0; - - rc = set_shares(new_share, cur, par); - if (cur->my_limit == CKRM_SHARE_DONTCARE) - cur->my_limit = cur->max_limit; - - - spin_unlock(&cls->cnt_lock); - if (cls->parent) { - spin_unlock(&parres->cnt_lock); - } - - //call ckrm_cpu_monitor after changes are changed - ckrm_cpu_monitor(0); - - return rc; -} - -static int ckrm_cpu_get_share(void *my_res, - struct ckrm_shares *shares) -{ - struct ckrm_cpu_class *cls = my_res; - - if (!cls) - return -EINVAL; - *shares = cls->shares; - return 0; -} - -int ckrm_cpu_get_stats(void *my_res, struct seq_file * sfile) -{ - struct ckrm_cpu_class *cls = my_res; - struct ckrm_cpu_class_stat* stat = &cls->stat; - ckrm_lrq_t* lrq; - int i; - - if (!cls) - return -EINVAL; - - seq_printf(sfile, 
"-------- CPU Class Status Start---------\n"); - seq_printf(sfile, "Share:\n\tgrt= %d limit= %d total_grt= %d max_limit= %d\n", - cls->shares.my_guarantee, - cls->shares.my_limit, - cls->shares.total_guarantee, - cls->shares.max_limit); - seq_printf(sfile, "\tunused_grt= %d cur_max_limit= %d\n", - cls->shares.unused_guarantee, - cls->shares.cur_max_limit); - - seq_printf(sfile, "Effective:\n\tegrt= %d\n",stat->egrt); - seq_printf(sfile, "\tmegrt= %d\n",stat->megrt); - seq_printf(sfile, "\tehl= %d\n",stat->ehl); - seq_printf(sfile, "\tmehl= %d\n",stat->mehl); - seq_printf(sfile, "\teshare= %d\n",stat->eshare); - seq_printf(sfile, "\tmeshare= %d\n",cpu_class_weight(cls)); - seq_printf(sfile, "\tmax_demand= %lu\n",stat->max_demand); - seq_printf(sfile, "\ttotal_ns= %llu\n",stat->total_ns); - seq_printf(sfile, "\tusage(2,10,60)= %d %d %d\n", - get_ckrm_usage(cls,2*HZ), - get_ckrm_usage(cls,10*HZ), - get_ckrm_usage(cls,60*HZ) - ); - for_each_online_cpu(i) { - lrq = get_ckrm_lrq(cls,i); - seq_printf(sfile, "\tlrq %d demand= %lu weight= %d lrq_load= %lu cvt= %llu sav= %llu\n",i,stat->local_stats[i].cpu_demand,local_class_weight(lrq),lrq->lrq_load,lrq->local_cvt,lrq->savings); - } - - seq_printf(sfile, "-------- CPU Class Status END ---------\n"); - - return 0; -} - -/* - * task will remain in the same cpu but on a different local runqueue - */ -void ckrm_cpu_change_class(void *task, void *old, void *new) -{ - struct task_struct *tsk = task; - struct ckrm_cpu_class *newcls = new; - - /*sanity checking*/ - if (!task || ! old || !new) - return; - - _ckrm_cpu_change_class(tsk,newcls); -} - -/*dummy function, not used*/ -static int ckrm_cpu_show_config(void *my_res, struct seq_file *sfile) -{ - struct ckrm_cpu_class *cls = my_res; - - if (!cls) - return -EINVAL; - - seq_printf(sfile, "cls=%s,parameter=somevalue\n","ckrm_cpu class"); - return 0; -} - -/*dummy function, not used*/ -static int ckrm_cpu_set_config(void *my_res, const char *cfgstr) -{ - struct ckrm_cpu_class *cls = my_res; - - if (!cls) - return -EINVAL; - printk(KERN_DEBUG "ckrm_cpu config='%s'\n",cfgstr); - return 0; -} - -struct ckrm_res_ctlr cpu_rcbs = { - .res_name = "cpu", - .res_hdepth = 1, - .resid = -1, - .res_alloc = ckrm_alloc_cpu_class, - .res_free = ckrm_free_cpu_class, - .set_share_values = ckrm_cpu_set_share, - .get_share_values = ckrm_cpu_get_share, - .get_stats = ckrm_cpu_get_stats, - .show_config = ckrm_cpu_show_config, - .set_config = ckrm_cpu_set_config, - .change_resclass = ckrm_cpu_change_class, -}; - -int __init init_ckrm_sched_res(void) -{ - struct ckrm_classtype *clstype; - int resid = cpu_rcbs.resid; - - clstype = ckrm_find_classtype_by_name("taskclass"); - if (clstype == NULL) { - printk(KERN_INFO" Unknown ckrm classtype"); - return -ENOENT; - } - - if (resid == -1) { /*not registered */ - resid = ckrm_register_res_ctlr(clstype,&cpu_rcbs); - printk(KERN_DEBUG "........init_ckrm_sched_res , resid= %d\n",resid); - } - return 0; -} - -/* - * initialize the class structure - * add the default class: class 0 - */ -void init_cpu_classes(void) -{ - int i; - - //init classqueues for each processor - for (i=0; i < NR_CPUS; i++) - classqueue_init(get_cpu_classqueue(i)); - - /* - * hzheng: initialize the default cpu class - * required for E14/E15 since ckrm_init is called after sched_init - */ - ckrm_alloc_cpu_class(NULL,NULL); - } - - -EXPORT_SYMBOL(ckrm_get_cpu_class); diff --git a/kernel/ckrm/ckrm_cpu_monitor.c b/kernel/ckrm/ckrm_cpu_monitor.c deleted file mode 100644 index 5f59b375e..000000000 --- 
a/kernel/ckrm/ckrm_cpu_monitor.c
+++ /dev/null
@@ -1,1023 +0,0 @@
-/* ckrm_cpu_monitor.c - Hierarchical CKRM CPU Resource Monitor
- *
- * Copyright (C) Haoqiang Zheng, IBM Corp. 2004
- * (C) Hubertus Franke, IBM Corp. 2004
- *
- * Latest version, more details at http://ckrm.sf.net
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- */
-
-/* Changes
- *
- * 23 June 2004: Created
- *
- */
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#warning MEF I cannot believe that vserver changes force the following include statement: FIX THIS!
-#include
-
-#define CPU_MONITOR_INTERVAL (HZ) /*how often do we adjust the shares*/
-#define CKRM_SHARE_MAX (1 << CKRM_SHARE_ACCURACY)
-
-static inline int get_soft_limit(struct ckrm_cpu_class *cls)
-{
-    return cls->shares.my_limit;
-}
-
-static inline int get_mysoft_limit(struct ckrm_cpu_class *cls)
-{
-    return cls->shares.total_guarantee;
-}
-
-static inline int get_hard_limit(struct ckrm_cpu_class *cls)
-{
-    return cls->shares.total_guarantee;
-}
-
-static inline int get_myhard_limit(struct ckrm_cpu_class *cls)
-{
-    return cls->shares.total_guarantee;
-}
-
-
-static inline void cpu_demand_stat_init(struct ckrm_cpu_demand_stat *local_stat, int type)
-{
-    unsigned long long now = sched_clock();
-
-    local_stat->run = 0;
-    local_stat->total = 0;
-    local_stat->last_sleep = now;
-    switch (type) {
-    case CPU_DEMAND_TP_CLASS:
-        local_stat->recalc_interval = CPU_DEMAND_CLASS_RECALC;
-        local_stat->cpu_demand = 0;
-        break;
-    case CPU_DEMAND_TP_TASK:
-        local_stat->recalc_interval = CPU_DEMAND_TASK_RECALC;
-        //for a task, the initial cpu_demand is copied from its parent
-        break;
-    default:
-        BUG();
-    }
-}
-
-void ckrm_cpu_stat_init(struct ckrm_cpu_class_stat *stat)
-{
-    int i;
-
-    stat->stat_lock = SPIN_LOCK_UNLOCKED;
-    stat->total_ns = 0;
-    stat->max_demand = 0;
-
-    for (i = 0; i < NR_CPUS; i++) {
-        cpu_demand_stat_init(&stat->local_stats[i], CPU_DEMAND_TP_CLASS);
-    }
-
-    stat->egrt = 0;
-    stat->megrt = 0;
-    stat->ehl = CKRM_SHARE_MAX; /*default: no limit*/
-    stat->mehl = CKRM_SHARE_MAX; /*default: no limit */
-
-    stat->eshare = CKRM_SHARE_MAX;
-    stat->meshare = CKRM_SHARE_MAX;
-}
-
-/**********************************************/
-/* cpu demand */
-/**********************************************/
-
-/*
- * How CPU demand is calculated:
- * consider the class local runqueue (clr) first
- * at any time, a clr can be in one of the following three states
- * -- run: a task belonging to this class is running on this cpu
- * -- wait: at least one of its tasks is running, but the class is not running
- * -- sleep: none of the tasks of this class is runnable
- *
- * cpu_demand(t1,t2) = r(t1,t2)/(r(t1,t2)+s(t1,t2))
- *
- * the cpu_demand of a class =
- *    sum of cpu_demand of all the class local runqueues
- */
-
-/**
- * update_cpu_demand_stat -
- *
- * should be called whenever the state of a task/task local queue changes
- * -- on deschedule: report how long the class ran
- * -- on enqueue: report how long it slept
- *
- * how often should we recalculate the cpu demand?
- * the number is in ns
- */
-static inline void update_cpu_demand_stat(struct ckrm_cpu_demand_stat *local_stat, int state, unsigned long long len)
-{
-    local_stat->total += len;
-    if (state == CKRM_CPU_DEMAND_RUN)
-        local_stat->run += len;
-
-    if (local_stat->total >= local_stat->recalc_interval) {
-        local_stat->total >>= CKRM_SHARE_ACCURACY;
-        if (unlikely(local_stat->run > 0xFFFFFFFF))
-            local_stat->run = 0xFFFFFFFF;
-
-        if (local_stat->total > 0xFFFFFFFF)
-            local_stat->total = 0xFFFFFFFF;
-
-        do_div(local_stat->run, (unsigned long)local_stat->total);
-
-        if (local_stat->total > 0xFFFFFFFF) //happens after a very long sleep
-            local_stat->cpu_demand = local_stat->run;
-        else {
-            local_stat->cpu_demand += local_stat->run;
-            local_stat->cpu_demand >>= 1;
-        }
-        local_stat->total = 0;
-        local_stat->run = 0;
-    }
-}
-
-/**
- * cpu_demand_event - a cpu_demand event occurred
- * @event: one of the following three events:
- *    CPU_DEMAND_ENQUEUE: local class enqueue
- *    CPU_DEMAND_DEQUEUE: local class dequeue
- *    CPU_DEMAND_DESCHEDULE: a task belonging to this local class was descheduled
- * @len: valid only for CPU_DEMAND_DESCHEDULE, how long the task has run
- */
-void cpu_demand_event(struct ckrm_cpu_demand_stat *local_stat, int event, unsigned long long len)
-{
-    switch (event) {
-    case CPU_DEMAND_ENQUEUE:
-        len = sched_clock() - local_stat->last_sleep;
-        local_stat->last_sleep = 0;
-        update_cpu_demand_stat(local_stat, CKRM_CPU_DEMAND_SLEEP, len);
-        break;
-    case CPU_DEMAND_DEQUEUE:
-        if (!local_stat->last_sleep) {
-            local_stat->last_sleep = sched_clock();
-        }
-        break;
-    case CPU_DEMAND_DESCHEDULE:
-        update_cpu_demand_stat(local_stat, CKRM_CPU_DEMAND_RUN, len);
-        break;
-    case CPU_DEMAND_INIT: //for task init only
-        cpu_demand_stat_init(local_stat, CPU_DEMAND_TP_TASK);
-        break;
-    default:
-        BUG();
-    }
-}
-
-/**
- * check all the class local queues
- *
- * to deal with excessively long run/sleep states
- * -- whenever ckrm_cpu_monitor is called, check if the class is in the
- *    sleep state; if yes, update the sleep record
- */
-static inline void cpu_demand_check_sleep(struct ckrm_cpu_class_stat *stat, int cpu)
-{
-    struct ckrm_cpu_demand_stat *local_stat = &stat->local_stats[cpu];
-    unsigned long long sleep, now;
-    if (local_stat->last_sleep) {
-        now = sched_clock();
-        sleep = now - local_stat->last_sleep;
-        local_stat->last_sleep = now;
-        update_cpu_demand_stat(local_stat, CKRM_CPU_DEMAND_SLEEP, sleep);
-    }
-}
-
-/**
- * get_self_cpu_demand - get cpu demand of the class itself (excluding children)
- *
- * self_cpu_demand = sum(cpu demand of all local queues)
- */
-static inline unsigned long get_self_cpu_demand(struct ckrm_cpu_class_stat *stat)
-{
-    int cpu_demand = 0;
-    int i;
-    int cpuonline = 0;
-
-    for_each_online_cpu(i) {
-        cpu_demand_check_sleep(stat, i);
-        cpu_demand += stat->local_stats[i].cpu_demand;
-        cpuonline++;
-    }
-
-    return (cpu_demand / cpuonline);
-}
-
-/*
- * my max demand = min(cpu_demand, my effective hard limit)
- */
-static inline unsigned long get_mmax_demand(struct ckrm_cpu_class_stat *stat)
-{
-    unsigned long mmax_demand = get_self_cpu_demand(stat);
-    if (mmax_demand > stat->mehl)
-        mmax_demand = stat->mehl;
-
-    return mmax_demand;
-}
-
-/**
- * update_max_demand: update effective cpu demand for each class
- * return -1 on error
- *
- * Assume: the root_core->parent == NULL
- */
-static int update_max_demand(struct ckrm_core_class *root_core)
-{
-    struct ckrm_core_class *cur_core, *child_core;
-    struct ckrm_cpu_class *cls, *c_cls;
-    int ret = -1;
-
-    cur_core = root_core;
-    child_core = NULL;
-
- repeat:
-    if (!cur_core) { //normal exit
-        ret = 0;
-        goto out;
-    }
-
-    cls = ckrm_get_cpu_class(cur_core);
-    if (!
cls) //invalid c_cls, abort - goto out; - - if (!child_core) //first child - cls->stat.max_demand = get_mmax_demand(&cls->stat); - else { - c_cls = ckrm_get_cpu_class(child_core); - if (c_cls) - cls->stat.max_demand += c_cls->stat.max_demand; - else //invalid c_cls, abort - goto out; - } - - //check class hard limit - if (cls->stat.max_demand > cls->stat.ehl) - cls->stat.max_demand = cls->stat.ehl; - - //next child - child_core = ckrm_get_next_child(cur_core, child_core); - if (child_core) { - //go down - cur_core = child_core; - child_core = NULL; - goto repeat; - } else { //no more child, go back - child_core = cur_core; - cur_core = child_core->hnode.parent; - } - goto repeat; - out: - return ret; -} - -/**********************************************/ -/* effective guarantee & limit */ -/**********************************************/ -static inline void set_eshare(struct ckrm_cpu_class_stat *stat, - int new_share) -{ - if (!new_share) - new_share = 1; - - BUG_ON(new_share < 0); - stat->eshare = new_share; -} - -static inline void set_meshare(struct ckrm_cpu_class_stat *stat, - int new_share) -{ - if (!new_share) - new_share = 1; - - BUG_ON(new_share < 0); - stat->meshare = new_share; -} - -/** - *update_child_effective - update egrt, ehl, mehl for all children of parent - *@parent: the parent node - *return -1 if anything wrong - * - */ -static int update_child_effective(struct ckrm_core_class *parent) -{ - struct ckrm_cpu_class *p_cls = ckrm_get_cpu_class(parent); - struct ckrm_core_class *child_core; - int ret = -1; - - if (! p_cls) - return ret; - - child_core = ckrm_get_next_child(parent, NULL); - while (child_core) { - struct ckrm_cpu_class *c_cls = ckrm_get_cpu_class(child_core); - if (! c_cls) - return ret; - - c_cls->stat.egrt = - p_cls->stat.egrt * - c_cls->shares.my_guarantee / p_cls->shares.total_guarantee; - - c_cls->stat.megrt = c_cls->stat.egrt * c_cls->shares.unused_guarantee - / c_cls->shares.total_guarantee; - - c_cls->stat.ehl = - p_cls->stat.ehl * - get_hard_limit(c_cls) / p_cls->shares.total_guarantee; - - c_cls->stat.mehl = - c_cls->stat.ehl * - get_myhard_limit(c_cls) / c_cls->shares.total_guarantee; - - set_eshare(&c_cls->stat,c_cls->stat.egrt); - set_meshare(&c_cls->stat,c_cls->stat.megrt); - - - child_core = ckrm_get_next_child(parent, child_core); - }; - return 0; -} - -/** - * update_effectives: update egrt, ehl, mehl for the whole tree - * should be called only when class structure changed - * - * return -1 if anything wrong happened (eg: the structure changed during the process) - */ -static int update_effectives(struct ckrm_core_class *root_core) -{ - struct ckrm_core_class *cur_core, *child_core; - struct ckrm_cpu_class *cls; - int ret = -1; - - cur_core = root_core; - child_core = NULL; - cls = ckrm_get_cpu_class(cur_core); - - //initialize the effectives for root - cls->stat.egrt = CKRM_SHARE_MAX; /*egrt of the root is always 100% */ - cls->stat.megrt = cls->stat.egrt * cls->shares.unused_guarantee - / cls->shares.total_guarantee; - cls->stat.ehl = CKRM_SHARE_MAX * get_hard_limit(cls) - / cls->shares.total_guarantee; - cls->stat.mehl = cls->stat.ehl * get_myhard_limit(cls) - / cls->shares.total_guarantee; - set_eshare(&cls->stat,cls->stat.egrt); - set_meshare(&cls->stat,cls->stat.megrt); - - repeat: - //check exit - if (!cur_core) - return 0; - - //visit this node only once - if (! 
child_core)
-        if (update_child_effective(cur_core) < 0)
-            return ret; //invalid cur_core node
-
-    //next child
-    child_core = ckrm_get_next_child(cur_core, child_core);
-
-    if (child_core) {
-        //go down to the next hier
-        cur_core = child_core;
-        child_core = NULL;
-    } else { //no more children, go back
-        child_core = cur_core;
-        cur_core = child_core->hnode.parent;
-    }
-    goto repeat;
-}
-
-/**********************************************/
-/* surplus allocation */
-/**********************************************/
-
-/*
- * surplus = egrt - demand
- * if surplus < 0, surplus = 0
- */
-static inline int get_node_surplus(struct ckrm_cpu_class *cls)
-{
-    int surplus = cls->stat.egrt - cls->stat.max_demand;
-
-    if (surplus < 0)
-        surplus = 0;
-
-    return surplus;
-}
-
-static inline int get_my_node_surplus(struct ckrm_cpu_class *cls)
-{
-    int surplus = cls->stat.megrt - get_mmax_demand(&cls->stat);
-
-    if (surplus < 0)
-        surplus = 0;
-
-    return surplus;
-}
-
-/**
- * consume_surplus: decides how much surplus a node can consume
- * @check_sl: if check_sl is set, then check the soft limit
- * return how much is consumed
- * e.g. surplus=200, my_guarantee=30, total_guarantee=100: the child may
- * take up to 200*30/100 = 60, subject to the demand/limit caps below
- *
- * implements all the CKRM scheduling requirements
- * assume c_cls is valid
- */
-static inline int consume_surplus(int surplus,
-                  struct ckrm_cpu_class *c_cls,
-                  struct ckrm_cpu_class *p_cls,
-                  int check_sl
-                  )
-{
-    int consumed = 0;
-    int inc_limit;
-    int total_grt = p_cls->shares.total_guarantee;
-
-    BUG_ON(surplus < 0);
-
-    /*can't consume more than demand or hard limit*/
-    if (c_cls->stat.eshare >= c_cls->stat.max_demand)
-        goto out;
-
-    //the surplus allocation is proportional to grt
-    consumed =
-        surplus * c_cls->shares.my_guarantee / total_grt;
-
-    if (!consumed) //no more share
-        goto out;
-
-    //hard limit and demand limit
-    inc_limit = c_cls->stat.max_demand - c_cls->stat.eshare;
-
-    if (check_sl) {
-        int esl = p_cls->stat.eshare * get_soft_limit(c_cls)
-            / total_grt;
-        if (esl < c_cls->stat.max_demand)
-            inc_limit = esl - c_cls->stat.eshare;
-    }
-
-    if (consumed > inc_limit)
-        consumed = inc_limit;
-
-    BUG_ON(consumed < 0);
- out:
-    return consumed;
-}
-
-/*
- * how much can a node consume for itself?
- */
-static inline int consume_self_surplus(int surplus,
-                       struct ckrm_cpu_class *p_cls,
-                       int check_sl
-                       )
-{
-    int consumed = 0;
-    int inc_limit;
-    int total_grt = p_cls->shares.total_guarantee;
-    int max_demand = get_mmax_demand(&p_cls->stat);
-
-    BUG_ON(surplus < 0);
-
-    /*can't consume more than demand or hard limit*/
-    if (p_cls->stat.meshare >= max_demand)
-        goto out;
-
-    //the surplus allocation is proportional to grt
-    consumed =
-        surplus * p_cls->shares.unused_guarantee / total_grt;
-
-    if (!consumed) //no more share
-        goto out;
-
-    //hard limit and demand limit
-    inc_limit = max_demand - p_cls->stat.meshare;
-
-    if (check_sl) {
-        int mesl = p_cls->stat.eshare * get_mysoft_limit(p_cls)
-            / total_grt;
-        if (mesl < max_demand)
-            inc_limit = mesl - p_cls->stat.meshare;
-    }
-
-    if (consumed > inc_limit)
-        consumed = inc_limit;
-
-    BUG_ON(consumed < 0);
- out:
-    return consumed;
-}
-
-
-/*
- * allocate surplus to all its children and also its default class
- */
-static int alloc_surplus_single_round(
-    int surplus,
-    struct ckrm_core_class *parent,
-    struct ckrm_cpu_class *p_cls,
-    int check_sl)
-{
-    struct ckrm_cpu_class *c_cls;
-    struct ckrm_core_class *child_core = NULL;
-    int total_consumed = 0, consumed;
-
-    //first allocate to the default class
-    consumed =
-        consume_self_surplus(surplus, p_cls, check_sl);
-
-    if (consumed > 0) {
-        set_meshare(&p_cls->stat, p_cls->stat.meshare + consumed);
-        total_consumed += consumed;
-    }
-
-    do {
-        child_core = ckrm_get_next_child(parent, child_core);
-        if (child_core) {
-            c_cls = ckrm_get_cpu_class(child_core);
-            if (!c_cls)
-                return -1;
-
-            consumed =
-                consume_surplus(surplus, c_cls,
-                        p_cls, check_sl);
-            if (consumed > 0) {
-                set_eshare(&c_cls->stat, c_cls->stat.eshare + consumed);
-                total_consumed += consumed;
-            }
-        }
-    } while (child_core);
-
-    return total_consumed;
-}
-
-/**
- * alloc_surplus_node: re-allocate the shares for children under parent
- * @parent: parent node
- * return 0 on success, -1 on error
- *
- * task:
- * 1. get the total surplus
- * 2. allocate the surplus
- * 3. set the effective_share of each node
- */
-static int alloc_surplus_node(struct ckrm_core_class *parent)
-{
-    struct ckrm_cpu_class *p_cls, *c_cls;
-    int total_surplus, consumed;
-    int check_sl;
-    int ret = -1;
-    struct ckrm_core_class *child_core = NULL;
-
-    p_cls = ckrm_get_cpu_class(parent);
-    if (!p_cls)
-        goto realloc_out;
-
-    /*
-     * get total surplus
-     */
-    total_surplus = p_cls->stat.eshare - p_cls->stat.egrt;
-    BUG_ON(total_surplus < 0);
-    total_surplus += get_my_node_surplus(p_cls);
-
-    do {
-        child_core = ckrm_get_next_child(parent, child_core);
-        if (child_core) {
-            c_cls = ckrm_get_cpu_class(child_core);
-            if (!c_cls)
-                goto realloc_out;
-
-            total_surplus += get_node_surplus(c_cls);
-        }
-    } while (child_core);
-
-
-    if (!total_surplus) {
-        ret = 0;
-        goto realloc_out;
-    }
-
-    /*
-     * distribute the surplus
-     * first with check_sl enabled
-     * once all the classes have reached their soft limit, disable check_sl and try again
-     */
-
-    check_sl = 1;
-    do {
-        consumed = alloc_surplus_single_round(total_surplus, parent, p_cls, check_sl);
-        if (consumed < 0) //something is wrong
-            goto realloc_out;
-
-        if (!consumed)
-            check_sl = 0;
-        else
-            total_surplus -= consumed;
-
-    } while ((total_surplus > 0) && (consumed || check_sl));
-
-    ret = 0;
-
- realloc_out:
-    return ret;
-}
-
-/**
- * alloc_surplus - reallocate unused shares
- *
- * class A's unused share should be allocated to its siblings
- * the re-allocation goes downward from the top
- */
-static int alloc_surplus(struct ckrm_core_class *root_core)
-{
-    struct ckrm_core_class *cur_core, *child_core;
-    // struct ckrm_cpu_class *cls;
-    int ret = -1;
-
-    /*initialize*/
-    cur_core = root_core;
-    child_core = NULL;
-    // cls = ckrm_get_cpu_class(cur_core);
-
-    /*the ckrm idle tasks get whatever is remaining*/
-    /*hzheng: uncomment the following line for hard limit support */
-    // update_ckrm_idle(CKRM_SHARE_MAX - cls->stat.max_demand);
-
- repeat:
-    //check exit
-    if (!cur_core)
-        return 0;
-
-    //visit this node only once
-    if (!child_core)
-        if (alloc_surplus_node(cur_core) < 0)
-            return ret;
-
-    //next child
-    child_core = ckrm_get_next_child(cur_core, child_core);
-    if (child_core) {
-        //go down
-        cur_core = child_core;
-        child_core = NULL;
-        goto repeat;
-    } else { //no more children, go back
-        child_core = cur_core;
-        cur_core = child_core->hnode.parent;
-    }
-    goto repeat;
-}
-
-/**********************************************/
-/* CKRM Idle Tasks */
-/**********************************************/
-struct ckrm_cpu_class ckrm_idle_class_obj, *ckrm_idle_class;
-struct task_struct *ckrm_idle_tasks[NR_CPUS];
-
-/*how many ckrm idle tasks should I wake up?*/
-static inline int get_nr_idle(unsigned long surplus)
-{
-    int cpu_online = cpus_weight(cpu_online_map);
-    int nr_idle = 0;
-
-    nr_idle = surplus * cpu_online;
-    nr_idle >>= CKRM_SHARE_ACCURACY;
-
-    if (surplus)
-        nr_idle++;
-
-    if (nr_idle > cpu_online)
-        nr_idle = cpu_online;
-
-    return nr_idle;
-}
-
-/**
- * update_ckrm_idle: update the status of the idle class according to the new surplus
- * surplus: new system surplus
- *
- * Task:
- * -- update the share of the idle class
- * -- wake up idle tasks according to the surplus
- */
-void update_ckrm_idle(unsigned long surplus)
-{
-    int nr_idle = get_nr_idle(surplus);
-    int i;
-    struct task_struct *idle_task;
-
-    set_eshare(&ckrm_idle_class->stat, surplus);
-    set_meshare(&ckrm_idle_class->stat, surplus);
-    /*wake up nr_idle idle tasks*/
-    for_each_online_cpu(i) {
-        idle_task = ckrm_idle_tasks[i];
-        if (!idle_task)
-            continue;
-        if (unlikely(idle_task->cpu_class != ckrm_idle_class)) {
-            ckrm_cpu_change_class(idle_task,
-                          idle_task->cpu_class,
-                          ckrm_idle_class);
-        }
-        if (i < nr_idle) {
-            //activate it
-            wake_up_process(idle_task);
-        } else {
-            //deactivate it
-            idle_task->state = TASK_INTERRUPTIBLE;
-            set_tsk_need_resched(idle_task);
-        }
-    }
-}
-
-static int ckrm_cpu_idled(void *nothing)
-{
-    set_user_nice(current, 19);
-    daemonize("ckrm_idle_task");
-
-    //deactivate it; it will be awakened by ckrm_cpu_monitor
-    current->state = TASK_INTERRUPTIBLE;
-    schedule();
-
-    /*similar to cpu_idle */
-    while (1) {
-        while (!need_resched()) {
-            ckrm_cpu_monitor(1);
-            if (current_cpu_data.hlt_works_ok) {
-                local_irq_disable();
-                if (!need_resched()) {
-                    set_tsk_need_resched(current);
-                    safe_halt();
-                } else
-                    local_irq_enable();
-            }
-        }
-        schedule();
-    }
-    return 0;
-}
-
-/**
- * ckrm_start_ckrm_idle:
- * create the ckrm_idle_class and start the idle tasks
- *
- */
-void ckrm_start_ckrm_idle(void)
-{
-    int i;
-    int ret;
-    ckrm_shares_t shares;
-
-    ckrm_idle_class = &ckrm_idle_class_obj;
-    memset(ckrm_idle_class, 0, sizeof(*ckrm_idle_class));
-    /*don't care about the shares */
-    init_cpu_class(ckrm_idle_class, &shares);
-    printk(KERN_INFO "ckrm idle class %p created\n", ckrm_idle_class);
-
-    for_each_online_cpu(i) {
-        ret = kernel_thread(ckrm_cpu_idled, 0, CLONE_KERNEL);
-
-        /*warn on error, but the system should still work without it*/
-        if (ret < 0)
-            printk(KERN_ERR "Warn: can't start ckrm idle tasks\n");
-        else {
-            ckrm_idle_tasks[i] = find_task_by_pid(ret);
-            if (!ckrm_idle_tasks[i])
-                printk(KERN_ERR "Warn: can't find ckrm idle task %d\n", ret);
-        }
-    }
-}
-
-/**********************************************/
-/* Local Weight */
-/**********************************************/
-/**
- * adjust_lrq_weight: adjust the local weight for each cpu
- *
- * lrq->local_weight = lrq->lrq_load * class_weight / total_pressure
- */
-static void adjust_lrq_weight(struct ckrm_cpu_class *clsptr, int cpu_online)
-{
-    unsigned long total_pressure = 0;
-    ckrm_lrq_t *lrq;
-    int i;
-    unsigned long class_weight;
-    unsigned long long lw;
-
-    //get the total pressure
-    for_each_online_cpu(i) {
-        lrq = get_ckrm_lrq(clsptr, i);
-        total_pressure += lrq->lrq_load;
-    }
-
-#define FIX_SHARES
-#ifdef FIX_SHARES
-#warning "ACB: fix share initialization problem [PL #4227]"
-#else
-    if (!total_pressure)
-        return;
-#endif
-
-    class_weight = cpu_class_weight(clsptr) * cpu_online;
-
-    /*
-     * update the weight for each cpu, minimum is 1
-     */
-    for_each_online_cpu(i) {
-        lrq = get_ckrm_lrq(clsptr, i);
-        if (!lrq->lrq_load)
-            /*give an idle class a high share to boost interactivity */
-            lw = cpu_class_weight(clsptr);
-        else {
-#ifdef FIX_SHARES
-            if (!total_pressure)
-                return;
-#endif
-            lw = lrq->lrq_load * class_weight;
-            do_div(lw, total_pressure);
-            if (!lw)
-                lw = 1;
-            else if (lw > CKRM_SHARE_MAX)
-                lw = CKRM_SHARE_MAX;
-        }
-
-        lrq->local_weight = lw;
-    }
-}
-
-/*
- * assume called with the class_list_lock read lock held
- */
-
-void adjust_local_weight(void)
-{
-    static spinlock_t lock = SPIN_LOCK_UNLOCKED;
-    struct ckrm_cpu_class *clsptr;
-    int cpu_online;
-
-    //do nothing if someone is already holding the lock
-    if (!
spin_trylock(&lock)) - return; - - cpu_online = cpus_weight(cpu_online_map); - - //class status: demand, share,total_ns prio, index - list_for_each_entry(clsptr,&active_cpu_classes,links) { - adjust_lrq_weight(clsptr,cpu_online); - } - - spin_unlock(&lock); -} - -/**********************************************/ -/* Main */ -/**********************************************/ -/** - *ckrm_cpu_monitor - adjust relative shares of the classes based on their progress - *@check_min: if check_min is set, the call can't be within 100ms of last call - * - * this function is called every CPU_MONITOR_INTERVAL - * it computes the cpu demand of each class - * and re-allocate the un-used shares to other classes - */ -void ckrm_cpu_monitor(int check_min) -{ - static spinlock_t lock = SPIN_LOCK_UNLOCKED; - static unsigned long long last_check = 0; - struct ckrm_core_class *root_core = get_default_cpu_class()->core; - unsigned long long now; -#define MIN_CPU_MONITOR_INTERVAL 100000000UL - - if (!root_core) - return; - - //do nothing if someone already holding the lock - if (! spin_trylock(&lock)) - return; - - read_lock(&class_list_lock); - - now = sched_clock(); - - //consecutive check should be at least 100ms apart - if (check_min && ((now - last_check) < MIN_CPU_MONITOR_INTERVAL)) - goto outunlock; - - last_check = now; - - if (update_effectives(root_core) != 0) - goto outunlock; - - if (update_max_demand(root_core) != 0) - goto outunlock; - -#ifndef ALLOC_SURPLUS_SUPPORT -#warning "MEF taking out alloc_surplus" -#else - if (alloc_surplus(root_core) != 0) - goto outunlock; -#endif - - adjust_local_weight(); - - outunlock: - read_unlock(&class_list_lock); - spin_unlock(&lock); -} - -/*****************************************************/ -/* Supporting Functions */ -/*****************************************************/ -static pid_t cpu_monitor_pid = -1; -static int thread_exit = 0; - -static int ckrm_cpu_monitord(void *nothing) -{ - daemonize("ckrm_cpu_ctrld"); - current->flags |= PF_NOFREEZE; - - for (;;) { - /*sleep for sometime before next try*/ - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(CPU_MONITOR_INTERVAL); - ckrm_cpu_monitor(1); - if (thread_exit) { - break; - } - } - cpu_monitor_pid = -1; - thread_exit = 2; - printk(KERN_DEBUG "cpu_monitord exit\n"); - return 0; -} - -void ckrm_start_monitor(void) -{ - cpu_monitor_pid = kernel_thread(ckrm_cpu_monitord, 0, CLONE_KERNEL); - if (cpu_monitor_pid < 0) { - printk(KERN_DEBUG "ckrm_cpu_monitord for failed\n"); - } -} - -void ckrm_kill_monitor(void) -{ - printk(KERN_DEBUG "killing process %d\n", cpu_monitor_pid); - if (cpu_monitor_pid > 0) { - thread_exit = 1; - while (thread_exit != 2) { - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(CPU_MONITOR_INTERVAL); - } - } -} - -int ckrm_cpu_monitor_init(void) -{ - ckrm_start_monitor(); - /*hzheng: uncomment the following like for hard limit support */ - // ckrm_start_ckrm_idle(); - return 0; -} - -void ckrm_cpu_monitor_exit(void) -{ - ckrm_kill_monitor(); -} - -module_init(ckrm_cpu_monitor_init); -module_exit(ckrm_cpu_monitor_exit); - -MODULE_AUTHOR("Haoqiang Zheng "); -MODULE_DESCRIPTION("Hierarchical CKRM CPU Resource Monitor"); -MODULE_LICENSE("GPL"); diff --git a/kernel/ckrm/ckrm_events.c b/kernel/ckrm/ckrm_events.c deleted file mode 100644 index aad5e2538..000000000 --- a/kernel/ckrm/ckrm_events.c +++ /dev/null @@ -1,97 +0,0 @@ -/* ckrm_events.c - Class-based Kernel Resource Management (CKRM) - * - event handling routines - * - * Copyright (C) Hubertus Franke, IBM Corp. 
2003, 2004 - * (C) Chandra Seetharaman, IBM Corp. 2003 - * - * - * Provides API for event registration and handling for different - * classtypes. - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -/* Changes - * - * 29 Sep 2004 - * Separated from ckrm.c - * - */ - -#include -#include -#include - -/******************************************************************* - * Event callback invocation - *******************************************************************/ - -struct ckrm_hook_cb *ckrm_event_callbacks[CKRM_NONLATCHABLE_EVENTS]; - -/* Registration / Deregistration / Invocation functions */ - -int ckrm_register_event_cb(enum ckrm_event ev, struct ckrm_hook_cb *cb) -{ - struct ckrm_hook_cb **cbptr; - - if ((ev < CKRM_LATCHABLE_EVENTS) || (ev >= CKRM_NONLATCHABLE_EVENTS)) - return 1; - cbptr = &ckrm_event_callbacks[ev]; - while (*cbptr != NULL) - cbptr = &((*cbptr)->next); - *cbptr = cb; - return 0; -} - -int ckrm_unregister_event_cb(enum ckrm_event ev, struct ckrm_hook_cb *cb) -{ - struct ckrm_hook_cb **cbptr; - - if ((ev < CKRM_LATCHABLE_EVENTS) || (ev >= CKRM_NONLATCHABLE_EVENTS)) - return -1; - cbptr = &ckrm_event_callbacks[ev]; - while ((*cbptr != NULL) && (*cbptr != cb)) - cbptr = &((*cbptr)->next); - if (*cbptr) - (*cbptr)->next = cb->next; - return (*cbptr == NULL); -} - -int ckrm_register_event_set(struct ckrm_event_spec especs[]) -{ - struct ckrm_event_spec *espec = especs; - - for (espec = especs; espec->ev != -1; espec++) - ckrm_register_event_cb(espec->ev, &espec->cb); - return 0; -} - -int ckrm_unregister_event_set(struct ckrm_event_spec especs[]) -{ - struct ckrm_event_spec *espec = especs; - - for (espec = especs; espec->ev != -1; espec++) - ckrm_unregister_event_cb(espec->ev, &espec->cb); - return 0; -} - -#define ECC_PRINTK(fmt, args...) \ -// printk("%s: " fmt, __FUNCTION__ , ## args) - -void ckrm_invoke_event_cb_chain(enum ckrm_event ev, void *arg) -{ - struct ckrm_hook_cb *cb, *anchor; - - ECC_PRINTK("%d %x\n", current, ev, arg); - if ((anchor = ckrm_event_callbacks[ev]) != NULL) { - for (cb = anchor; cb; cb = cb->next) - (*cb->fct) (arg); - } -} - diff --git a/kernel/ckrm/ckrm_listenaq.c b/kernel/ckrm/ckrm_listenaq.c deleted file mode 100644 index 103e3f957..000000000 --- a/kernel/ckrm/ckrm_listenaq.c +++ /dev/null @@ -1,495 +0,0 @@ -/* ckrm_listenaq.c - accept queue resource controller - * - * Copyright (C) Vivek Kashyap, IBM Corp. 2004 - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -/* Changes - * Initial version - */ - -/* Code Description: TBD - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#define hnode_2_core(ptr) \ - ((ptr) ? 
container_of(ptr, struct ckrm_core_class, hnode) : NULL)
-
-#define CKRM_SAQ_MAX_DEPTH 3 // 0 => /rcfs
-                             // 1 => socket_aq
-                             // 2 => socket_aq/listen_class
-                             // 3 => socket_aq/listen_class/accept_queues
-                             // 4 => Not allowed
-
-typedef struct ckrm_laq_res {
-    spinlock_t reslock;
-    atomic_t refcnt;
-    struct ckrm_shares shares;
-    struct ckrm_core_class *core;
-    struct ckrm_core_class *pcore;
-    int my_depth;
-    int my_id;
-    unsigned int min_ratio;
-} ckrm_laq_res_t;
-
-static int my_resid = -1;
-
-extern struct ckrm_core_class *rcfs_create_under_netroot(char *, int, int);
-extern struct ckrm_core_class *rcfs_make_core(struct dentry *,
-                          struct ckrm_core_class *);
-
-void laq_res_hold(struct ckrm_laq_res *res)
-{
-    atomic_inc(&res->refcnt);
-    return;
-}
-
-void laq_res_put(struct ckrm_laq_res *res)
-{
-    if (atomic_dec_and_test(&res->refcnt))
-        kfree(res);
-    return;
-}
-
-/* Initialize rescls values
- */
-static void laq_res_initcls(void *my_res)
-{
-    ckrm_laq_res_t *res = my_res;
-
-    res->shares.my_guarantee = CKRM_SHARE_DONTCARE;
-    res->shares.my_limit = CKRM_SHARE_DONTCARE;
-    res->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-    res->shares.max_limit = CKRM_SHARE_DFLT_MAX_LIMIT;
-    res->shares.unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
-    res->shares.cur_max_limit = 0;
-}
-
-static int atoi(char *s)
-{
-    int k = 0;
-    while (*s)
-        k = *s++ - '0' + (k * 10);
-    return k;
-}
-
-static char *laq_get_name(struct ckrm_core_class *c)
-{
-    char *p = (char *)c->name;
-
-    while (*p)
-        p++;
-    while (*p != '/' && p != c->name)
-        p--;
-
-    return ++p;
-}
-
-static void *laq_res_alloc(struct ckrm_core_class *core,
-               struct ckrm_core_class *parent)
-{
-    ckrm_laq_res_t *res, *pres;
-    int pdepth;
-
-    if (parent)
-        pres = ckrm_get_res_class(parent, my_resid, ckrm_laq_res_t);
-    else
-        pres = NULL;
-
-    if (core == core->classtype->default_class)
-        pdepth = 1;
-    else {
-        if (!parent)
-            return NULL;
-        pdepth = 1 + pres->my_depth;
-    }
-
-    res = kmalloc(sizeof(ckrm_laq_res_t), GFP_ATOMIC);
-    if (res) {
-        memset(res, 0, sizeof(ckrm_laq_res_t));
-        spin_lock_init(&res->reslock);
-        laq_res_hold(res);
-        res->my_depth = pdepth;
-        if (pdepth == 2) // listen class
-            res->my_id = 0;
-        else if (pdepth == 3)
-            res->my_id = atoi(laq_get_name(core));
-        res->core = core;
-        res->pcore = parent;
-
-        // rescls in place, now initialize contents other than
-        // hierarchy pointers
-        laq_res_initcls(res); // acts as initialising value
-    }
-
-    return res;
-}
-
-static void laq_res_free(void *my_res)
-{
-    ckrm_laq_res_t *res = (ckrm_laq_res_t *) my_res;
-    ckrm_laq_res_t *parent;
-
-    if (!res)
-        return;
-
-    if (res->my_depth != 3) {
-        kfree(res);
-        return;
-    }
-
-    parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t);
-    if (!parent) // Should never happen
-        return;
-
-    spin_lock(&parent->reslock);
-    spin_lock(&res->reslock);
-
-    // return the child's guarantee to the parent node
-    // Limits have no meaning for accept queue control
-    child_guarantee_changed(&parent->shares, res->shares.my_guarantee, 0);
-
-    spin_unlock(&res->reslock);
-    laq_res_put(res);
-    spin_unlock(&parent->reslock);
-    return;
-}
-
-/**************************************************************************
- * SHARES ***
- **************************************************************************/
-
-void laq_set_aq_value(struct ckrm_net_struct *ns, unsigned int *aq_ratio)
-{
-    int i;
-    struct tcp_opt *tp;
-
-    tp = tcp_sk(ns->ns_sk);
-    for (i = 0; i < NUM_ACCEPT_QUEUES; i++)
-        tp->acceptq[i].aq_ratio = aq_ratio[i];
-    return;
-}
-void
laq_set_aq_values(ckrm_laq_res_t * parent, unsigned int *aq_ratio) -{ - - struct ckrm_net_struct *ns; - struct ckrm_core_class *core = parent->core; - - class_lock(core); - list_for_each_entry(ns, &core->objlist, ckrm_link) { - laq_set_aq_value(ns, aq_ratio); - } - class_unlock(core); - return; -} - -static void calculate_aq_ratios(ckrm_laq_res_t * res, unsigned int *aq_ratio) -{ - struct ckrm_hnode *chnode; - ckrm_laq_res_t *child; - unsigned int min; - int i; - - min = aq_ratio[0] = (unsigned int)res->shares.unused_guarantee; - - list_for_each_entry(chnode, &res->core->hnode.children, siblings) { - child = hnode_2_core(chnode)->res_class[my_resid]; - - aq_ratio[child->my_id] = - (unsigned int)child->shares.my_guarantee; - if (aq_ratio[child->my_id] == CKRM_SHARE_DONTCARE) - aq_ratio[child->my_id] = 0; - if (aq_ratio[child->my_id] && - ((unsigned int)aq_ratio[child->my_id] < min)) - min = (unsigned int)child->shares.my_guarantee; - } - - if (min == 0) { - min = 1; - // default takes all if nothing specified - aq_ratio[0] = 1; - } - res->min_ratio = min; - - for (i = 0; i < NUM_ACCEPT_QUEUES; i++) - aq_ratio[i] = aq_ratio[i] / min; -} - -static int laq_set_share_values(void *my_res, struct ckrm_shares *shares) -{ - ckrm_laq_res_t *res = my_res; - ckrm_laq_res_t *parent; - unsigned int aq_ratio[NUM_ACCEPT_QUEUES]; - int rc = 0; - - if (!res) - return -EINVAL; - - if (!res->pcore) { - // something is badly wrong - printk(KERN_ERR "socketaq internal inconsistency\n"); - return -EBADF; - } - - parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t); - if (!parent) // socketclass does not have a share interface - return -EINVAL; - - // Ensure that we ignore limit values - shares->my_limit = CKRM_SHARE_DONTCARE; - shares->max_limit = CKRM_SHARE_UNCHANGED; - - if (res->my_depth == 0) { - printk(KERN_ERR "socketaq bad entry\n"); - return -EBADF; - } else if (res->my_depth == 1) { - // can't be written to. This is an internal default. - return -EINVAL; - } else if (res->my_depth == 2) { - //nothin to inherit - if (!shares->total_guarantee) { - return -EINVAL; - } - parent = res; - shares->my_guarantee = CKRM_SHARE_DONTCARE; - } else if (res->my_depth == 3) { - // accept queue itself. - shares->total_guarantee = CKRM_SHARE_UNCHANGED; - } - - ckrm_lock_hier(parent->pcore); - spin_lock(&parent->reslock); - rc = set_shares(shares, &res->shares, - (parent == res) ? 
NULL : &parent->shares); - if (rc) { - spin_unlock(&res->reslock); - ckrm_unlock_hier(res->pcore); - return rc; - } - calculate_aq_ratios(parent, aq_ratio); - laq_set_aq_values(parent, aq_ratio); - spin_unlock(&parent->reslock); - ckrm_unlock_hier(parent->pcore); - - return rc; -} - -static int laq_get_share_values(void *my_res, struct ckrm_shares *shares) -{ - ckrm_laq_res_t *res = my_res; - - if (!res) - return -EINVAL; - *shares = res->shares; - return 0; -} - -/************************************************************************** - * STATS *** - **************************************************************************/ - -void -laq_print_aq_stats(struct seq_file *sfile, struct tcp_acceptq_info *taq, int i) -{ - seq_printf(sfile, "Class %d connections:\n\taccepted: %u\n\t" - "queued: %u\n\twait_time: %u\n", - i, taq->acceptq_count, taq->acceptq_qcount, - jiffies_to_msecs(taq->acceptq_wait_time)); - - if (i) - return; - - for (i = 1; i < NUM_ACCEPT_QUEUES; i++) { - taq[0].acceptq_wait_time += taq[i].acceptq_wait_time; - taq[0].acceptq_qcount += taq[i].acceptq_qcount; - taq[0].acceptq_count += taq[i].acceptq_count; - } - - seq_printf(sfile, "Totals :\n\taccepted: %u\n\t" - "queued: %u\n\twait_time: %u\n", - taq->acceptq_count, taq->acceptq_qcount, - jiffies_to_msecs(taq->acceptq_wait_time)); - - return; -} - -void -laq_get_aq_stats(ckrm_laq_res_t * pres, ckrm_laq_res_t * mres, - struct tcp_acceptq_info *taq) -{ - struct ckrm_net_struct *ns; - struct ckrm_core_class *core = pres->core; - struct tcp_opt *tp; - int a = mres->my_id; - int z; - - if (a == 0) - z = NUM_ACCEPT_QUEUES; - else - z = a + 1; - - // XXX Instead of holding a class_lock introduce a rw - // lock to be write locked by listen callbacks and read locked here. - // - VK - class_lock(pres->core); - list_for_each_entry(ns, &core->objlist, ckrm_link) { - tp = tcp_sk(ns->ns_sk); - for (; a < z; a++) { - taq->acceptq_wait_time += tp->acceptq[a].aq_wait_time; - taq->acceptq_qcount += tp->acceptq[a].aq_qcount; - taq->acceptq_count += tp->acceptq[a].aq_count; - taq++; - } - } - class_unlock(pres->core); -} - -static int laq_get_stats(void *my_res, struct seq_file *sfile) -{ - ckrm_laq_res_t *res = my_res; - ckrm_laq_res_t *parent; - struct tcp_acceptq_info taq[NUM_ACCEPT_QUEUES]; - int rc = 0; - - if (!res) - return -EINVAL; - - if (!res->pcore) { - // something is badly wrong - printk(KERN_ERR "socketaq internal inconsistency\n"); - return -EBADF; - } - - parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t); - if (!parent) { // socketclass does not have a stat interface - printk(KERN_ERR "socketaq internal fs inconsistency\n"); - return -EINVAL; - } - - memset(taq, 0, sizeof(struct tcp_acceptq_info) * NUM_ACCEPT_QUEUES); - - switch (res->my_depth) { - - default: - case 0: - printk(KERN_ERR "socket class bad entry\n"); - rc = -EBADF; - break; - - case 1: // can't be read from. this is internal default. - // return -EINVAL - rc = -EINVAL; - break; - - case 2: // return the default and total - ckrm_lock_hier(res->core); // block any deletes - laq_get_aq_stats(res, res, &taq[0]); - laq_print_aq_stats(sfile, &taq[0], 0); - ckrm_unlock_hier(res->core); // block any deletes - break; - - case 3: - ckrm_lock_hier(parent->core); // block any deletes - laq_get_aq_stats(parent, res, &taq[res->my_id]); - laq_print_aq_stats(sfile, &taq[res->my_id], res->my_id); - ckrm_unlock_hier(parent->core); // block any deletes - break; - } - - return rc; -} - -/* - * The network connection is reclassified to this class. Update its shares. 
- * The socket lock is held. - */ -static void laq_change_resclass(void *n, void *old, void *r) -{ - struct ckrm_net_struct *ns = (struct ckrm_net_struct *)n; - struct ckrm_laq_res *res = (struct ckrm_laq_res *)r; - unsigned int aq_ratio[NUM_ACCEPT_QUEUES]; - - if (res->my_depth != 2) - return; - - // a change to my_depth == 3 ie. the accept classes cannot happen. - // there is no target file - if (res->my_depth == 2) { // it is one of the socket classes - ckrm_lock_hier(res->pcore); - // share rule: hold parent resource lock. then self. - // However, since my_depth == 1 is a generic class it is not - // needed here. Self lock is enough. - spin_lock(&res->reslock); - calculate_aq_ratios(res, aq_ratio); - class_lock(res->pcore); - laq_set_aq_value(ns, aq_ratio); - class_unlock(res->pcore); - spin_unlock(&res->reslock); - ckrm_unlock_hier(res->pcore); - } - - return; -} - -struct ckrm_res_ctlr laq_rcbs = { - .res_name = "listenaq", - .resid = -1, // dynamically assigned - .res_alloc = laq_res_alloc, - .res_free = laq_res_free, - .set_share_values = laq_set_share_values, - .get_share_values = laq_get_share_values, - .get_stats = laq_get_stats, - .change_resclass = laq_change_resclass, - //.res_initcls = laq_res_initcls, //HUBERTUS: unnecessary !! -}; - -int __init init_ckrm_laq_res(void) -{ - struct ckrm_classtype *clstype; - int resid; - - clstype = ckrm_find_classtype_by_name("socketclass"); - if (clstype == NULL) { - printk(KERN_INFO " Unknown ckrm classtype"); - return -ENOENT; - } - - if (my_resid == -1) { - resid = ckrm_register_res_ctlr(clstype, &laq_rcbs); - if (resid >= 0) - my_resid = resid; - printk("........init_ckrm_listen_aq_res -> %d\n", my_resid); - } - return 0; - -} - -void __exit exit_ckrm_laq_res(void) -{ - ckrm_unregister_res_ctlr(&laq_rcbs); - my_resid = -1; -} - -module_init(init_ckrm_laq_res) - module_exit(exit_ckrm_laq_res) - - MODULE_LICENSE("GPL"); diff --git a/kernel/ckrm/ckrm_mem.c b/kernel/ckrm/ckrm_mem.c deleted file mode 100644 index 736b579c7..000000000 --- a/kernel/ckrm/ckrm_mem.c +++ /dev/null @@ -1,981 +0,0 @@ -/* ckrm_mem.c - Memory Resource Manager for CKRM - * - * Copyright (C) Chandra Seetharaman, IBM Corp. 2004 - * - * Provides a Memory Resource controller for CKRM - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#define MEM_NAME "mem" - -#define CKRM_MEM_MAX_HIERARCHY 2 // allows only upto 2 levels - 0, 1 & 2 - -/* all 1-level memory_share_class are chained together */ -LIST_HEAD(ckrm_memclass_list); -LIST_HEAD(ckrm_shrink_list); -spinlock_t ckrm_mem_lock; // protects both lists above -unsigned int ckrm_tot_lru_pages; // total # of pages in the system - // currently doesn't handle memory add/remove -struct ckrm_mem_res *ckrm_mem_root_class; -atomic_t ckrm_mem_real_count = ATOMIC_INIT(0); -static void ckrm_mem_evaluate_all_pages(struct ckrm_mem_res *); -int ckrm_nr_mem_classes = 0; - -EXPORT_SYMBOL_GPL(ckrm_memclass_list); -EXPORT_SYMBOL_GPL(ckrm_shrink_list); -EXPORT_SYMBOL_GPL(ckrm_mem_lock); -EXPORT_SYMBOL_GPL(ckrm_tot_lru_pages); -EXPORT_SYMBOL_GPL(ckrm_mem_root_class); -EXPORT_SYMBOL_GPL(ckrm_mem_real_count); -EXPORT_SYMBOL_GPL(ckrm_nr_mem_classes); - -/* Initialize rescls values - * May be called on each rcfs unmount or as part of error recovery - * to make share values sane. - * Does not traverse hierarchy reinitializing children. - */ - -void -memclass_release(struct kref *kref) -{ - struct ckrm_mem_res *cls = container_of(kref, struct ckrm_mem_res, nr_users); - BUG_ON(ckrm_memclass_valid(cls)); - kfree(cls); -} -EXPORT_SYMBOL_GPL(memclass_release); - -static void -set_ckrm_tot_pages(void) -{ - struct zone *zone; - int tot_lru_pages = 0; - - for_each_zone(zone) { - tot_lru_pages += zone->nr_active; - tot_lru_pages += zone->nr_inactive; - tot_lru_pages += zone->free_pages; - } - ckrm_tot_lru_pages = tot_lru_pages; -} - -static void -mem_res_initcls_one(struct ckrm_mem_res *res) -{ - int zindex = 0; - struct zone *zone; - - memset(res, 0, sizeof(struct ckrm_mem_res)); - - res->shares.my_guarantee = CKRM_SHARE_DONTCARE; - res->shares.my_limit = CKRM_SHARE_DONTCARE; - res->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - res->shares.max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; - res->shares.unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - res->shares.cur_max_limit = 0; - - res->pg_guar = CKRM_SHARE_DONTCARE; - res->pg_limit = CKRM_SHARE_DONTCARE; - - INIT_LIST_HEAD(&res->shrink_list); - INIT_LIST_HEAD(&res->mcls_list); - - for_each_zone(zone) { - INIT_LIST_HEAD(&res->ckrm_zone[zindex].active_list); - INIT_LIST_HEAD(&res->ckrm_zone[zindex].inactive_list); - INIT_LIST_HEAD(&res->ckrm_zone[zindex].victim_list); - res->ckrm_zone[zindex].nr_active = 0; - res->ckrm_zone[zindex].nr_inactive = 0; - res->ckrm_zone[zindex].zone = zone; - res->ckrm_zone[zindex].memcls = res; - zindex++; - } - - res->pg_unused = 0; - res->nr_dontcare = 1; // for default class - kref_init(&res->nr_users); -} - -static void -set_impl_guar_children(struct ckrm_mem_res *parres) -{ - ckrm_core_class_t *child = NULL; - struct ckrm_mem_res *cres; - int nr_dontcare = 1; // for defaultclass - int guar, impl_guar; - int resid = mem_rcbs.resid; - - ckrm_lock_hier(parres->core); - while ((child = ckrm_get_next_child(parres->core, child)) != NULL) { - cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res); - // treat NULL cres as don't care as that child is just being - // created. - // FIXME: need a better way to handle this case. - if (!cres || cres->pg_guar == CKRM_SHARE_DONTCARE) { - nr_dontcare++; - } - } - - parres->nr_dontcare = nr_dontcare; - guar = (parres->pg_guar == CKRM_SHARE_DONTCARE) ? 
parres->impl_guar : parres->pg_unused;
-    impl_guar = guar / parres->nr_dontcare;
-
-    while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
-        cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res);
-        if (cres && cres->pg_guar == CKRM_SHARE_DONTCARE) {
-            cres->impl_guar = impl_guar;
-            set_impl_guar_children(cres);
-        }
-    }
-    ckrm_unlock_hier(parres->core);
-
-}
-
-void
-check_memclass(struct ckrm_mem_res *res, char *str)
-{
-    int i, act = 0, inact = 0;
-    struct zone *zone;
-    struct ckrm_zone *ckrm_zone;
-    struct list_head *pos;
-    struct page *page;
-
-#if 0
-    printk("Check<%s> %s: total=%d\n",
-        str, res->core->name, atomic_read(&res->pg_total));
-#endif
-    for (i = 0; i < MAX_NR_ZONES; i++) {
-        act = 0; inact = 0;
-        ckrm_zone = &res->ckrm_zone[i];
-        zone = ckrm_zone->zone;
-        spin_lock_irq(&zone->lru_lock);
-        pos = ckrm_zone->inactive_list.next;
-        while (pos != &ckrm_zone->inactive_list) {
-            page = list_entry(pos, struct page, lru);
-            pos = pos->next;
-            inact++;
-        }
-        pos = ckrm_zone->active_list.next;
-        while (pos != &ckrm_zone->active_list) {
-            page = list_entry(pos, struct page, lru);
-            pos = pos->next;
-            act++;
-        }
-        spin_unlock_irq(&zone->lru_lock);
-#if 0
-        printk("Check<%s>(zone=%d): act %ld, inae %ld lact %d lina %d\n",
-            str, i, ckrm_zone->nr_active, ckrm_zone->nr_inactive,
-            act, inact);
-#endif
-    }
-}
-EXPORT_SYMBOL_GPL(check_memclass);
-
-static void *
-mem_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent)
-{
-    struct ckrm_mem_res *res, *pres;
-
-    if (mem_rcbs.resid == -1) {
-        return NULL;
-    }
-
-    pres = ckrm_get_res_class(parent, mem_rcbs.resid, struct ckrm_mem_res);
-    if (pres && (pres->hier == CKRM_MEM_MAX_HIERARCHY)) {
-        printk(KERN_ERR "MEM_RC: only allows a hierarchy of %d\n",
-            CKRM_MEM_MAX_HIERARCHY);
-        return NULL;
-    }
-
-    if (unlikely((parent == NULL) && (ckrm_mem_root_class != NULL))) {
-        printk(KERN_ERR "MEM_RC: Only one root class is allowed\n");
-        return NULL;
-    }
-
-    if (unlikely((parent != NULL) && (ckrm_mem_root_class == NULL))) {
-        printk(KERN_ERR "MEM_RC: child class with no root class!!");
-        return NULL;
-    }
-
-    res = kmalloc(sizeof(struct ckrm_mem_res), GFP_ATOMIC);
-
-    if (res) {
-        mem_res_initcls_one(res);
-        res->core = core;
-        res->parent = parent;
-        spin_lock_irq(&ckrm_mem_lock);
-        list_add(&res->mcls_list, &ckrm_memclass_list);
-        spin_unlock_irq(&ckrm_mem_lock);
-        if (parent == NULL) {
-            // I am part of the root class. So, set the max to the
-            // number of pages available
-            res->pg_guar = ckrm_tot_lru_pages;
-            res->pg_unused = ckrm_tot_lru_pages;
-            res->pg_limit = ckrm_tot_lru_pages;
-            res->hier = 0;
-            ckrm_mem_root_class = res;
-        } else {
-            int guar;
-            res->hier = pres->hier + 1;
-            set_impl_guar_children(pres);
-            guar = (pres->pg_guar == CKRM_SHARE_DONTCARE) ?
-                pres->impl_guar : pres->pg_unused;
-            res->impl_guar = guar / pres->nr_dontcare;
-        }
-        ckrm_nr_mem_classes++;
-    }
-    else
-        printk(KERN_ERR "MEM_RC: alloc: GFP_ATOMIC failed\n");
-    return res;
-}
-
-/*
- * It is the caller's responsibility to make sure that the parent only
- * has children that are to be accounted, i.e. if a new child is added
- * this function should be called after it has been added, and if a
- * child is deleted this should be called after the child is removed.
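- * (Otherwise the maximum over the children's my_limit values, which
- *  child_maxlimit_changed_local() computes below, would go stale.)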
- */ -static void -child_maxlimit_changed_local(struct ckrm_mem_res *parres) -{ - int maxlimit = 0; - struct ckrm_mem_res *childres; - ckrm_core_class_t *child = NULL; - - // run thru parent's children and get the new max_limit of the parent - ckrm_lock_hier(parres->core); - while ((child = ckrm_get_next_child(parres->core, child)) != NULL) { - childres = ckrm_get_res_class(child, mem_rcbs.resid, - struct ckrm_mem_res); - if (maxlimit < childres->shares.my_limit) { - maxlimit = childres->shares.my_limit; - } - } - ckrm_unlock_hier(parres->core); - parres->shares.cur_max_limit = maxlimit; -} - -/* - * Recalculate the guarantee and limit in # of pages... and propagate the - * same to children. - * Caller is responsible for protecting res and for the integrity of parres - */ -static void -recalc_and_propagate(struct ckrm_mem_res * res, struct ckrm_mem_res * parres) -{ - ckrm_core_class_t *child = NULL; - struct ckrm_mem_res *cres; - int resid = mem_rcbs.resid; - struct ckrm_shares *self = &res->shares; - - if (parres) { - struct ckrm_shares *par = &parres->shares; - - // calculate pg_guar and pg_limit - // - if (parres->pg_guar == CKRM_SHARE_DONTCARE || - self->my_guarantee == CKRM_SHARE_DONTCARE) { - res->pg_guar = CKRM_SHARE_DONTCARE; - } else if (par->total_guarantee) { - u64 temp = (u64) self->my_guarantee * parres->pg_guar; - do_div(temp, par->total_guarantee); - res->pg_guar = (int) temp; - res->impl_guar = CKRM_SHARE_DONTCARE; - } else { - res->pg_guar = 0; - res->impl_guar = CKRM_SHARE_DONTCARE; - } - - if (parres->pg_limit == CKRM_SHARE_DONTCARE || - self->my_limit == CKRM_SHARE_DONTCARE) { - res->pg_limit = CKRM_SHARE_DONTCARE; - } else if (par->max_limit) { - u64 temp = (u64) self->my_limit * parres->pg_limit; - do_div(temp, par->max_limit); - res->pg_limit = (int) temp; - } else { - res->pg_limit = 0; - } - } - - // Calculate unused units - if (res->pg_guar == CKRM_SHARE_DONTCARE) { - res->pg_unused = CKRM_SHARE_DONTCARE; - } else if (self->total_guarantee) { - u64 temp = (u64) self->unused_guarantee * res->pg_guar; - do_div(temp, self->total_guarantee); - res->pg_unused = (int) temp; - } else { - res->pg_unused = 0; - } - - // propagate to children - ckrm_lock_hier(res->core); - while ((child = ckrm_get_next_child(res->core, child)) != NULL) { - cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res); - recalc_and_propagate(cres, res); - } - ckrm_unlock_hier(res->core); - return; -} - -static void -mem_res_free(void *my_res) -{ - struct ckrm_mem_res *res = my_res; - struct ckrm_mem_res *pres; - - if (!res) - return; - - ckrm_mem_evaluate_all_pages(res); - - pres = ckrm_get_res_class(res->parent, mem_rcbs.resid, - struct ckrm_mem_res); - - if (pres) { - child_guarantee_changed(&pres->shares, - res->shares.my_guarantee, 0); - child_maxlimit_changed_local(pres); - recalc_and_propagate(pres, NULL); - set_impl_guar_children(pres); - } - - res->shares.my_guarantee = 0; - res->shares.my_limit = 0; - res->pg_guar = 0; - res->pg_limit = 0; - res->pg_unused = 0; - - spin_lock_irq(&ckrm_mem_lock); - list_del_init(&res->mcls_list); - spin_unlock_irq(&ckrm_mem_lock); - - res->core = NULL; - res->parent = NULL; - kref_put(&res->nr_users, memclass_release); - ckrm_nr_mem_classes--; - return; -} - -static int -mem_set_share_values(void *my_res, struct ckrm_shares *shares) -{ - struct ckrm_mem_res *res = my_res; - struct ckrm_mem_res *parres; - int rc; - - if (!res) - return -EINVAL; - - parres = ckrm_get_res_class(res->parent, mem_rcbs.resid, - struct ckrm_mem_res); - - rc = 
set_shares(shares, &res->shares, parres ? &parres->shares : NULL); - - if ((rc == 0) && (parres != NULL)) { - child_maxlimit_changed_local(parres); - recalc_and_propagate(parres, NULL); - set_impl_guar_children(parres); - } - - return rc; -} - -static int -mem_get_share_values(void *my_res, struct ckrm_shares *shares) -{ - struct ckrm_mem_res *res = my_res; - - if (!res) - return -EINVAL; - *shares = res->shares; - return 0; -} - -static int -mem_get_stats(void *my_res, struct seq_file *sfile) -{ - struct ckrm_mem_res *res = my_res; - struct zone *zone; - int active = 0, inactive = 0, fr = 0; - - if (!res) - return -EINVAL; - - seq_printf(sfile, "--------- Memory Resource stats start ---------\n"); - if (res == ckrm_mem_root_class) { - int i = 0; - for_each_zone(zone) { - active += zone->nr_active; - inactive += zone->nr_inactive; - fr += zone->free_pages; - i++; - } - seq_printf(sfile,"System: tot_pages=%d,active=%d,inactive=%d" - ",free=%d\n", ckrm_tot_lru_pages, - active, inactive, fr); - } - seq_printf(sfile, "Number of pages used(including pages lent to" - " children): %d\n", atomic_read(&res->pg_total)); - seq_printf(sfile, "Number of pages guaranteed: %d\n", - res->pg_guar); - seq_printf(sfile, "Maximum limit of pages: %d\n", - res->pg_limit); - seq_printf(sfile, "Total number of pages available" - "(after serving guarantees to children): %d\n", - res->pg_unused); - seq_printf(sfile, "Number of pages lent to children: %d\n", - res->pg_lent); - seq_printf(sfile, "Number of pages borrowed from the parent: %d\n", - res->pg_borrowed); - seq_printf(sfile, "---------- Memory Resource stats end ----------\n"); - - return 0; -} - -static void -mem_change_resclass(void *tsk, void *old, void *new) -{ - struct mm_struct *mm; - struct task_struct *task = tsk, *t1; - struct ckrm_mem_res *prev_mmcls; - - if (!task->mm || (new == old) || (old == (void *) -1)) - return; - - mm = task->active_mm; - spin_lock(&mm->peertask_lock); - prev_mmcls = mm->memclass; - - if (new == NULL) { - list_del_init(&task->mm_peers); - } else { - int found = 0; - list_for_each_entry(t1, &mm->tasklist, mm_peers) { - if (t1 == task) { - found++; - break; - } - } - if (!found) { - list_del_init(&task->mm_peers); - list_add_tail(&task->mm_peers, &mm->tasklist); - } - } - - spin_unlock(&mm->peertask_lock); - ckrm_mem_evaluate_mm(mm, (struct ckrm_mem_res *) new); - return; -} - -#define MEM_FAIL_OVER "fail_over" -#define MEM_SHRINK_AT "shrink_at" -#define MEM_SHRINK_TO "shrink_to" -#define MEM_SHRINK_COUNT "num_shrinks" -#define MEM_SHRINK_INTERVAL "shrink_interval" - -int ckrm_mem_fail_over = 110; -int ckrm_mem_shrink_at = 90; -static int ckrm_mem_shrink_to = 80; -static int ckrm_mem_shrink_count = 10; -static int ckrm_mem_shrink_interval = 10; - -EXPORT_SYMBOL_GPL(ckrm_mem_fail_over); -EXPORT_SYMBOL_GPL(ckrm_mem_shrink_at); - -static int -mem_show_config(void *my_res, struct seq_file *sfile) -{ - struct ckrm_mem_res *res = my_res; - - if (!res) - return -EINVAL; - - seq_printf(sfile, "res=%s,%s=%d,%s=%d,%s=%d,%s=%d,%s=%d\n", - MEM_NAME, - MEM_FAIL_OVER, ckrm_mem_fail_over, - MEM_SHRINK_AT, ckrm_mem_shrink_at, - MEM_SHRINK_TO, ckrm_mem_shrink_to, - MEM_SHRINK_COUNT, ckrm_mem_shrink_count, - MEM_SHRINK_INTERVAL, ckrm_mem_shrink_interval); - - return 0; -} - -// config file is available only at the root level, -// so assuming my_res to be the system level class -enum memclass_token { - mem_fail_over, - mem_shrink_at, - mem_shrink_to, - mem_shrink_count, - mem_shrink_interval, - mem_err -}; - -static match_table_t mem_tokens = 
{ - {mem_fail_over, MEM_FAIL_OVER "=%d"}, - {mem_shrink_at, MEM_SHRINK_AT "=%d"}, - {mem_shrink_to, MEM_SHRINK_TO "=%d"}, - {mem_shrink_count, MEM_SHRINK_COUNT "=%d"}, - {mem_shrink_interval, MEM_SHRINK_INTERVAL "=%d"}, - {mem_err, NULL}, -}; - -static int -mem_set_config(void *my_res, const char *cfgstr) -{ - char *p; - struct ckrm_mem_res *res = my_res; - int err = 0, val; - - if (!res) - return -EINVAL; - - while ((p = strsep((char**)&cfgstr, ",")) != NULL) { - substring_t args[MAX_OPT_ARGS]; - int token; - if (!*p) - continue; - - token = match_token(p, mem_tokens, args); - switch (token) { - case mem_fail_over: - if (match_int(args, &val) || (val <= 0)) { - err = -EINVAL; - } else { - ckrm_mem_fail_over = val; - } - break; - case mem_shrink_at: - if (match_int(args, &val) || (val <= 0)) { - err = -EINVAL; - } else { - ckrm_mem_shrink_at = val; - } - break; - case mem_shrink_to: - if (match_int(args, &val) || (val < 0) || (val > 100)) { - err = -EINVAL; - } else { - ckrm_mem_shrink_to = val; - } - break; - case mem_shrink_count: - if (match_int(args, &val) || (val <= 0)) { - err = -EINVAL; - } else { - ckrm_mem_shrink_count = val; - } - break; - case mem_shrink_interval: - if (match_int(args, &val) || (val <= 0)) { - err = -EINVAL; - } else { - ckrm_mem_shrink_interval = val; - } - break; - default: - err = -EINVAL; - } - } - return err; -} - -static int -mem_reset_stats(void *my_res) -{ - struct ckrm_mem_res *res = my_res; - printk(KERN_INFO "MEM_RC: reset stats called for class %s\n", - res->core->name); - return 0; -} - -struct ckrm_res_ctlr mem_rcbs = { - .res_name = MEM_NAME, - .res_hdepth = CKRM_MEM_MAX_HIERARCHY, - .resid = -1, - .res_alloc = mem_res_alloc, - .res_free = mem_res_free, - .set_share_values = mem_set_share_values, - .get_share_values = mem_get_share_values, - .get_stats = mem_get_stats, - .change_resclass = mem_change_resclass, - .show_config = mem_show_config, - .set_config = mem_set_config, - .reset_stats = mem_reset_stats, -}; - -EXPORT_SYMBOL_GPL(mem_rcbs); - -int __init -init_ckrm_mem_res(void) -{ - struct ckrm_classtype *clstype; - int resid = mem_rcbs.resid; - - set_ckrm_tot_pages(); - spin_lock_init(&ckrm_mem_lock); - clstype = ckrm_find_classtype_by_name("taskclass"); - if (clstype == NULL) { - printk(KERN_INFO " Unknown ckrm classtype"); - return -ENOENT; - } - - if (resid == -1) { - resid = ckrm_register_res_ctlr(clstype, &mem_rcbs); - if (resid != -1) { - mem_rcbs.classtype = clstype; - } - } - return ((resid < 0) ? 
resid : 0); -} - -void __exit -exit_ckrm_mem_res(void) -{ - ckrm_unregister_res_ctlr(&mem_rcbs); - mem_rcbs.resid = -1; -} - -module_init(init_ckrm_mem_res) -module_exit(exit_ckrm_mem_res) - -int -ckrm_mem_get_shrink_to(void) -{ - return ckrm_mem_shrink_to; -} - -void -ckrm_at_limit(struct ckrm_mem_res *cls) -{ - struct zone *zone; - unsigned long now = jiffies; - - if (!cls || (cls->pg_limit == CKRM_SHARE_DONTCARE) || - ((cls->flags & MEM_AT_LIMIT) == MEM_AT_LIMIT)) { - return; - } - if ((cls->last_shrink > now) /* jiffies wrapped around */ || - (cls->last_shrink + (ckrm_mem_shrink_interval * HZ)) < now) { - cls->last_shrink = now; - cls->shrink_count = 0; - } - cls->shrink_count++; - if (cls->shrink_count > ckrm_mem_shrink_count) { - return; - } - spin_lock_irq(&ckrm_mem_lock); - list_add(&cls->shrink_list, &ckrm_shrink_list); - spin_unlock_irq(&ckrm_mem_lock); - cls->flags |= MEM_AT_LIMIT; - for_each_zone(zone) { - wakeup_kswapd(zone); - break; // only once is enough - } -} - -static int -ckrm_mem_evaluate_page_anon(struct page* page) -{ - struct ckrm_mem_res* pgcls = page_ckrmzone(page)->memcls; - struct ckrm_mem_res* maxshareclass = NULL; - struct anon_vma *anon_vma = (struct anon_vma *) page->mapping; - struct vm_area_struct *vma; - struct mm_struct* mm; - int ret = 0; - - spin_lock(&anon_vma->lock); - BUG_ON(list_empty(&anon_vma->head)); - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { - mm = vma->vm_mm; - if (!maxshareclass || ckrm_mem_share_compare(maxshareclass, - mm->memclass) < 0) { - maxshareclass = mm->memclass; - } - } - spin_unlock(&anon_vma->lock); - - if (!maxshareclass) { - maxshareclass = ckrm_mem_root_class; - } - if (pgcls != maxshareclass) { - ckrm_change_page_class(page, maxshareclass); - ret = 1; - } - return ret; -} - -static int -ckrm_mem_evaluate_page_file(struct page* page) -{ - struct ckrm_mem_res* pgcls = page_ckrmzone(page)->memcls; - struct ckrm_mem_res* maxshareclass = NULL; - struct address_space *mapping = page->mapping; - struct vm_area_struct *vma = NULL; - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - struct prio_tree_iter iter; - struct mm_struct* mm; - int ret = 0; - - if (!mapping) - return 0; - - if (!spin_trylock(&mapping->i_mmap_lock)) - return 0; - - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, - pgoff, pgoff) { - mm = vma->vm_mm; - if (!maxshareclass || ckrm_mem_share_compare(maxshareclass, - mm->memclass)<0) - maxshareclass = mm->memclass; - } - spin_unlock(&mapping->i_mmap_lock); - - if (!maxshareclass) { - maxshareclass = ckrm_mem_root_class; - } - if (pgcls != maxshareclass) { - ckrm_change_page_class(page, maxshareclass); - ret = 1; - } - return ret; -} - -static int -ckrm_mem_evaluate_page(struct page* page) -{ - int ret = 0; - BUG_ON(page->ckrm_zone == NULL); - if (page->mapping) { - if (PageAnon(page)) - ret = ckrm_mem_evaluate_page_anon(page); - else - ret = ckrm_mem_evaluate_page_file(page); - } - return ret; -} - -static void -ckrm_mem_evaluate_all_pages(struct ckrm_mem_res* res) -{ - struct page *page; - struct ckrm_zone *ckrm_zone; - struct zone *zone; - struct list_head *pos, *next; - int i; - - check_memclass(res, "bef_eval_all_pgs"); - for (i = 0; i < MAX_NR_ZONES; i++) { - ckrm_zone = &res->ckrm_zone[i]; - zone = ckrm_zone->zone; - spin_lock_irq(&zone->lru_lock); - pos = ckrm_zone->inactive_list.next; - while (pos != &ckrm_zone->inactive_list) { - next = pos->next; - page = list_entry(pos, struct page, lru); - if (!ckrm_mem_evaluate_page(page)) - ckrm_change_page_class(page, - 
ckrm_mem_root_class); - pos = next; - } - pos = ckrm_zone->active_list.next; - while (pos != &ckrm_zone->active_list) { - next = pos->next; - page = list_entry(pos, struct page, lru); - if (!ckrm_mem_evaluate_page(page)) - ckrm_change_page_class(page, - ckrm_mem_root_class); - pos = next; - } - spin_unlock_irq(&zone->lru_lock); - } - check_memclass(res, "aft_eval_all_pgs"); - return; -} - -static inline int -class_migrate_pmd(struct mm_struct* mm, struct vm_area_struct* vma, - pmd_t* pmdir, unsigned long address, unsigned long end) -{ - pte_t *pte; - unsigned long pmd_end; - - if (pmd_none(*pmdir)) - return 0; - BUG_ON(pmd_bad(*pmdir)); - - pmd_end = (address + PMD_SIZE) & PMD_MASK; - if (end > pmd_end) - end = pmd_end; - - do { - pte = pte_offset_map(pmdir, address); - if (pte_present(*pte)) { - struct page *page = pte_page(*pte); - BUG_ON(mm->memclass == NULL); - if (page->mapping && page->ckrm_zone) { - struct zone *zone = page->ckrm_zone->zone; - spin_lock_irq(&zone->lru_lock); - ckrm_change_page_class(page, mm->memclass); - spin_unlock_irq(&zone->lru_lock); - } - } - address += PAGE_SIZE; - pte_unmap(pte); - pte++; - } while (address && (address < end)); - return 0; -} - -static inline int -class_migrate_pgd(struct mm_struct* mm, struct vm_area_struct* vma, - pgd_t* pgdir, unsigned long address, unsigned long end) -{ - pmd_t* pmd; - unsigned long pgd_end; - - if (pgd_none(*pgdir)) - return 0; - BUG_ON(pgd_bad(*pgdir)); - - pmd = pmd_offset(pgdir, address); - pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK; - - if (pgd_end && (end > pgd_end)) - end = pgd_end; - - do { - class_migrate_pmd(mm, vma, pmd, address, end); - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address && (address < end)); - return 0; -} - -static inline int -class_migrate_vma(struct mm_struct* mm, struct vm_area_struct* vma) -{ - pgd_t* pgdir; - unsigned long address, end; - - address = vma->vm_start; - end = vma->vm_end; - - pgdir = pgd_offset(vma->vm_mm, address); - do { - class_migrate_pgd(mm, vma, pgdir, address, end); - address = (address + PGDIR_SIZE) & PGDIR_MASK; - pgdir++; - } while (address && (address < end)); - return 0; -} - -/* this function is called with mm->peertask_lock held */ -void -ckrm_mem_evaluate_mm(struct mm_struct* mm, struct ckrm_mem_res *def) -{ - struct task_struct *task; - struct ckrm_mem_res *maxshareclass = def; - struct vm_area_struct *vma; - - if (list_empty(&mm->tasklist)) { - /* We leave the mm->memclass untouched since we believe that one - * mm with no task associated will be deleted soon or attach - * with another task later. - */ - return; - } - - list_for_each_entry(task, &mm->tasklist, mm_peers) { - struct ckrm_mem_res* cls = ckrm_get_mem_class(task); - if (!cls) - continue; - if (!maxshareclass || - ckrm_mem_share_compare(maxshareclass, cls) < 0) - maxshareclass = cls; - } - - if (maxshareclass && (mm->memclass != maxshareclass)) { - if (mm->memclass) { - kref_put(&mm->memclass->nr_users, memclass_release); - } - mm->memclass = maxshareclass; - kref_get(&maxshareclass->nr_users); - - /* Go through all VMAs to migrate pages */ - down_read(&mm->mmap_sem); - vma = mm->mmap; - while (vma) { - class_migrate_vma(mm, vma); - vma = vma->vm_next; - } - up_read(&mm->mmap_sem); - } - return; -} - -void -ckrm_init_mm_to_task(struct mm_struct * mm, struct task_struct *task) -{ - spin_lock(&mm->peertask_lock); - if (!list_empty(&task->mm_peers)) { - printk(KERN_ERR "MEM_RC: Task list NOT empty!! 
emptying...\n"); - list_del_init(&task->mm_peers); - } - list_add_tail(&task->mm_peers, &mm->tasklist); - spin_unlock(&mm->peertask_lock); - if (mm->memclass != ckrm_get_mem_class(task)) - ckrm_mem_evaluate_mm(mm, NULL); - return; -} - -int -ckrm_memclass_valid(struct ckrm_mem_res *cls) -{ - struct ckrm_mem_res *tmp; - unsigned long flags; - - if (!cls || list_empty(&cls->mcls_list)) { - return 0; - } - spin_lock_irqsave(&ckrm_mem_lock, flags); - list_for_each_entry(tmp, &ckrm_memclass_list, mcls_list) { - if (tmp == cls) { - spin_unlock_irqrestore(&ckrm_mem_lock, flags); - return 1; - } - } - spin_unlock_irqrestore(&ckrm_mem_lock, flags); - return 0; -} - -MODULE_LICENSE("GPL"); diff --git a/kernel/ckrm/ckrm_memcore.c b/kernel/ckrm/ckrm_memcore.c deleted file mode 100644 index eeeba2466..000000000 --- a/kernel/ckrm/ckrm_memcore.c +++ /dev/null @@ -1,628 +0,0 @@ -/* ckrm_memcore.c - Memory Resource Manager for CKRM - * - * Copyright (C) Jiantao Kong, IBM Corp. 2003 - * (C) Chandra Seetharaman, IBM Corp. 2004 - * - * Provides a Memory Resource controller for CKRM - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#define MEM_RES_NAME "mem" - -#define CKRM_MEM_MAX_HIERARCHY 2 /* allows only up to 2 levels - 0, 1 & 2 */ - -/* all 1-level memory_share_class are chained together */ -LIST_HEAD(ckrm_memclass_list); -spinlock_t ckrm_mem_lock; /* protects list above */ -unsigned int ckrm_tot_lru_pages; /* # of pages in the system */ -int ckrm_nr_mem_classes = 0; -struct ckrm_mem_res *ckrm_mem_root_class; -atomic_t ckrm_mem_real_count = ATOMIC_INIT(0); - -EXPORT_SYMBOL_GPL(ckrm_memclass_list); -EXPORT_SYMBOL_GPL(ckrm_mem_lock); -EXPORT_SYMBOL_GPL(ckrm_tot_lru_pages); -EXPORT_SYMBOL_GPL(ckrm_nr_mem_classes); -EXPORT_SYMBOL_GPL(ckrm_mem_root_class); -EXPORT_SYMBOL_GPL(ckrm_mem_real_count); - -void -memclass_release(struct kref *kref) -{ - struct ckrm_mem_res *cls = container_of(kref, - struct ckrm_mem_res, nr_users); - kfree(cls); -} -EXPORT_SYMBOL_GPL(memclass_release); - -static void -set_ckrm_tot_pages(void) -{ - struct zone *zone; - int tot_lru_pages = 0; - - for_each_zone(zone) { - tot_lru_pages += zone->nr_active; - tot_lru_pages += zone->nr_inactive; - tot_lru_pages += zone->free_pages; - } - ckrm_tot_lru_pages = tot_lru_pages; -} - -/* Initialize rescls values - * May be called on each rcfs unmount or as part of error recovery - * to make share values sane. - * Does not traverse hierarchy reinitializing children.
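 * (A caller that needs a consistent subtree is expected to reinitialize
 * each child itself and then recompute the derived values, e.g. via
 * recalc_and_propagate().)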
- */ -static void -mem_res_initcls_one(struct ckrm_mem_res *res) -{ - int zindex = 0; - struct zone *zone; - - memset(res, 0, sizeof(struct ckrm_mem_res)); - - res->shares.my_guarantee = CKRM_SHARE_DONTCARE; - res->shares.my_limit = CKRM_SHARE_DONTCARE; - res->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - res->shares.max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; - res->shares.unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - res->shares.cur_max_limit = 0; - - res->pg_guar = CKRM_SHARE_DONTCARE; - res->pg_limit = CKRM_SHARE_DONTCARE; - - INIT_LIST_HEAD(&res->mcls_list); - INIT_LIST_HEAD(&res->shrink_list); - - for_each_zone(zone) { - INIT_LIST_HEAD(&res->ckrm_zone[zindex].active_list); - INIT_LIST_HEAD(&res->ckrm_zone[zindex].inactive_list); - INIT_LIST_HEAD(&res->ckrm_zone[zindex].victim_list); - res->ckrm_zone[zindex].nr_active = 0; - res->ckrm_zone[zindex].nr_inactive = 0; - res->ckrm_zone[zindex].zone = zone; - res->ckrm_zone[zindex].memcls = res; - zindex++; - } - - res->pg_unused = 0; - res->nr_dontcare = 1; /* for default class */ - kref_init(&res->nr_users); -} - -static void -set_impl_guar_children(struct ckrm_mem_res *parres) -{ - struct ckrm_core_class *child = NULL; - struct ckrm_mem_res *cres; - int nr_dontcare = 1; // for defaultclass - int guar, impl_guar; - int resid = mem_rcbs.resid; - - ckrm_lock_hier(parres->core); - while ((child = ckrm_get_next_child(parres->core, child)) != NULL) { - cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res); - // treat NULL cres as don't care as that child is just being - // created. - // FIXME: need a better way to handle this case. - if (!cres || cres->pg_guar == CKRM_SHARE_DONTCARE) { - nr_dontcare++; - } - } - - parres->nr_dontcare = nr_dontcare; - guar = (parres->pg_guar == CKRM_SHARE_DONTCARE) ? - parres->impl_guar : parres->pg_unused; - impl_guar = guar / parres->nr_dontcare; - - while ((child = ckrm_get_next_child(parres->core, child)) != NULL) { - cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res); - if (cres && cres->pg_guar == CKRM_SHARE_DONTCARE) { - cres->impl_guar = impl_guar; - set_impl_guar_children(cres); - } - } - ckrm_unlock_hier(parres->core); - -} - -static void * -mem_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent) -{ - struct ckrm_mem_res *res, *pres; - - BUG_ON(mem_rcbs.resid == -1); - - pres = ckrm_get_res_class(parent, mem_rcbs.resid, struct ckrm_mem_res); - if (pres && (pres->hier == CKRM_MEM_MAX_HIERARCHY)) { - printk(KERN_ERR "MEM_RC: only allows hieararchy of %d\n", - CKRM_MEM_MAX_HIERARCHY); - return NULL; - } - - if ((parent == NULL) && (ckrm_mem_root_class != NULL)) { - printk(KERN_ERR "MEM_RC: Only one root class is allowed\n"); - return NULL; - } - - if ((parent != NULL) && (ckrm_mem_root_class == NULL)) { - printk(KERN_ERR "MEM_RC: child class with no root class!!"); - return NULL; - } - - res = kmalloc(sizeof(struct ckrm_mem_res), GFP_ATOMIC); - - if (res) { - mem_res_initcls_one(res); - res->core = core; - res->parent = parent; - spin_lock(&ckrm_mem_lock); - list_add(&res->mcls_list, &ckrm_memclass_list); - spin_unlock(&ckrm_mem_lock); - if (parent == NULL) { - /* I am the root class. So, set the max to * - * number of pages available in the system */ - res->pg_guar = ckrm_tot_lru_pages; - res->pg_unused = ckrm_tot_lru_pages; - res->pg_limit = ckrm_tot_lru_pages; - res->hier = 0; - ckrm_mem_root_class = res; - } else { - int guar; - res->hier = pres->hier + 1; - set_impl_guar_children(pres); - guar = (pres->pg_guar == CKRM_SHARE_DONTCARE) ? 
- pres->impl_guar : pres->pg_unused; - res->impl_guar = guar / pres->nr_dontcare; - } - ckrm_nr_mem_classes++; - } else - printk(KERN_ERR "MEM_RC: alloc: GFP_ATOMIC failed\n"); - return res; -} - -/* - * It is the caller's responsibility to make sure that the parent only - * has chilren that are to be accounted. i.e if a new child is added - * this function should be called after it has been added, and if a - * child is deleted this should be called after the child is removed. - */ -static void -child_maxlimit_changed_local(struct ckrm_mem_res *parres) -{ - int maxlimit = 0; - struct ckrm_mem_res *childres; - struct ckrm_core_class *child = NULL; - - /* run thru parent's children and get new max_limit of parent */ - ckrm_lock_hier(parres->core); - while ((child = ckrm_get_next_child(parres->core, child)) != NULL) { - childres = ckrm_get_res_class(child, mem_rcbs.resid, - struct ckrm_mem_res); - if (maxlimit < childres->shares.my_limit) { - maxlimit = childres->shares.my_limit; - } - } - ckrm_unlock_hier(parres->core); - parres->shares.cur_max_limit = maxlimit; -} - -/* - * Recalculate the guarantee and limit in # of pages... and propagate the - * same to children. - * Caller is responsible for protecting res and for the integrity of parres - */ -static void -recalc_and_propagate(struct ckrm_mem_res * res, struct ckrm_mem_res * parres) -{ - struct ckrm_core_class *child = NULL; - struct ckrm_mem_res *cres; - int resid = mem_rcbs.resid; - struct ckrm_shares *self = &res->shares; - - if (parres) { - struct ckrm_shares *par = &parres->shares; - - /* calculate pg_guar and pg_limit */ - if (parres->pg_guar == CKRM_SHARE_DONTCARE || - self->my_guarantee == CKRM_SHARE_DONTCARE) { - res->pg_guar = CKRM_SHARE_DONTCARE; - } else if (par->total_guarantee) { - u64 temp = (u64) self->my_guarantee * parres->pg_guar; - do_div(temp, par->total_guarantee); - res->pg_guar = (int) temp; - res->impl_guar = CKRM_SHARE_DONTCARE; - } else { - res->pg_guar = 0; - res->impl_guar = CKRM_SHARE_DONTCARE; - } - - if (parres->pg_limit == CKRM_SHARE_DONTCARE || - self->my_limit == CKRM_SHARE_DONTCARE) { - res->pg_limit = CKRM_SHARE_DONTCARE; - } else if (par->max_limit) { - u64 temp = (u64) self->my_limit * parres->pg_limit; - do_div(temp, par->max_limit); - res->pg_limit = (int) temp; - } else { - res->pg_limit = 0; - } - } - - /* Calculate unused units */ - if (res->pg_guar == CKRM_SHARE_DONTCARE) { - res->pg_unused = CKRM_SHARE_DONTCARE; - } else if (self->total_guarantee) { - u64 temp = (u64) self->unused_guarantee * res->pg_guar; - do_div(temp, self->total_guarantee); - res->pg_unused = (int) temp; - } else { - res->pg_unused = 0; - } - - /* propagate to children */ - ckrm_lock_hier(res->core); - while ((child = ckrm_get_next_child(res->core, child)) != NULL) { - cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res); - recalc_and_propagate(cres, res); - } - ckrm_unlock_hier(res->core); - return; -} - -static void -mem_res_free(void *my_res) -{ - struct ckrm_mem_res *res = my_res; - struct ckrm_mem_res *pres; - - if (!res) - return; - - ckrm_mem_migrate_all_pages(res, ckrm_mem_root_class); - - pres = ckrm_get_res_class(res->parent, mem_rcbs.resid, - struct ckrm_mem_res); - - if (pres) { - child_guarantee_changed(&pres->shares, - res->shares.my_guarantee, 0); - child_maxlimit_changed_local(pres); - recalc_and_propagate(pres, NULL); - set_impl_guar_children(pres); - } - - /* - * Making it all zero as freeing of data structure could - * happen later. 
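 * (The kref taken in mem_res_initcls_one() keeps the object alive until
 * the final kref_put() below, so a straggling reader sees zeroed share
 * values rather than freed memory.)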
- */ - res->shares.my_guarantee = 0; - res->shares.my_limit = 0; - res->pg_guar = 0; - res->pg_limit = 0; - res->pg_unused = 0; - - spin_lock(&ckrm_mem_lock); - list_del_init(&res->mcls_list); - spin_unlock(&ckrm_mem_lock); - - res->core = NULL; - res->parent = NULL; - kref_put(&res->nr_users, memclass_release); - ckrm_nr_mem_classes--; - return; -} - -static int -mem_set_share_values(void *my_res, struct ckrm_shares *shares) -{ - struct ckrm_mem_res *res = my_res; - struct ckrm_mem_res *parres; - int rc; - - if (!res) - return -EINVAL; - - parres = ckrm_get_res_class(res->parent, mem_rcbs.resid, - struct ckrm_mem_res); - - rc = set_shares(shares, &res->shares, parres ? &parres->shares : NULL); - - if ((rc == 0) && (parres != NULL)) { - child_maxlimit_changed_local(parres); - recalc_and_propagate(parres, NULL); - set_impl_guar_children(parres); - } - - return rc; -} - -static int -mem_get_share_values(void *my_res, struct ckrm_shares *shares) -{ - struct ckrm_mem_res *res = my_res; - - if (!res) - return -EINVAL; - printk(KERN_INFO "get_share called for %s resource of class %s\n", - MEM_RES_NAME, res->core->name); - *shares = res->shares; - return 0; -} - -static int -mem_get_stats(void *my_res, struct seq_file *sfile) -{ - struct ckrm_mem_res *res = my_res; - struct zone *zone; - int active = 0, inactive = 0, fr = 0; - - if (!res) - return -EINVAL; - - seq_printf(sfile, "--------- Memory Resource stats start ---------\n"); - if (res == ckrm_mem_root_class) { - int i = 0; - for_each_zone(zone) { - active += zone->nr_active; - inactive += zone->nr_inactive; - fr += zone->free_pages; - i++; - } - seq_printf(sfile,"System: tot_pages=%d,active=%d,inactive=%d" - ",free=%d\n", ckrm_tot_lru_pages, - active, inactive, fr); - } - seq_printf(sfile, "Number of pages used(including pages lent to" - " children): %d\n", atomic_read(&res->pg_total)); - seq_printf(sfile, "Number of pages guaranteed: %d\n", - res->pg_guar); - seq_printf(sfile, "Maximum limit of pages: %d\n", - res->pg_limit); - seq_printf(sfile, "Total number of pages available" - "(after serving guarantees to children): %d\n", - res->pg_unused); - seq_printf(sfile, "Number of pages lent to children: %d\n", - res->pg_lent); - seq_printf(sfile, "Number of pages borrowed from the parent: %d\n", - res->pg_borrowed); - seq_printf(sfile, "---------- Memory Resource stats end ----------\n"); - - return 0; -} - -static void -mem_change_resclass(void *tsk, void *old, void *new) -{ - struct mm_struct *mm; - struct task_struct *task = tsk, *t1; - struct ckrm_mem_res *prev_mmcls; - - if (!task->mm || (new == old) || (old == (void *) -1)) - return; - - mm = task->active_mm; - spin_lock(&mm->peertask_lock); - prev_mmcls = mm->memclass; - - if (new == NULL) { - list_del_init(&task->mm_peers); - } else { - int found = 0; - list_for_each_entry(t1, &mm->tasklist, mm_peers) { - if (t1 == task) { - found++; - break; - } - } - if (!found) { - list_del_init(&task->mm_peers); - list_add_tail(&task->mm_peers, &mm->tasklist); - } - } - - spin_unlock(&mm->peertask_lock); - ckrm_mem_migrate_mm(mm, (struct ckrm_mem_res *) new); - return; -} - -#define MEM_FAIL_OVER "fail_over" -#define MEM_SHRINK_AT "shrink_at" -#define MEM_SHRINK_TO "shrink_to" -#define MEM_SHRINK_COUNT "num_shrinks" -#define MEM_SHRINK_INTERVAL "shrink_interval" - -int ckrm_mem_fail_at = 110; -int ckrm_mem_shrink_at = 90; -int ckrm_mem_shrink_to = 80; -int ckrm_mem_shrink_count = 10; -int ckrm_mem_shrink_interval = 10; - -EXPORT_SYMBOL_GPL(ckrm_mem_fail_at); -EXPORT_SYMBOL_GPL(ckrm_mem_shrink_at); 
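/*
 * Illustrative sketch (not part of the original file): how the shrink
 * tunables above combine.  ckrm_shrink_atlimit() in ckrm_memctlr.c
 * computes the same quantity inline just before waking kswapd; the
 * helper name below is invented for this example only.
 */
static inline int ckrm_mem_shrink_target(struct ckrm_mem_res *cls)
{
	/* reclaim from current usage down to shrink_to percent of the
	 * class's page limit */
	return atomic_read(&cls->pg_total) -
		(ckrm_mem_shrink_to * cls->pg_limit) / 100;
}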
-EXPORT_SYMBOL_GPL(ckrm_mem_shrink_to); - -static int -mem_show_config(void *my_res, struct seq_file *sfile) -{ - struct ckrm_mem_res *res = my_res; - - if (!res) - return -EINVAL; - - seq_printf(sfile, "res=%s,%s=%d,%s=%d,%s=%d,%s=%d,%s=%d\n", - MEM_RES_NAME, - MEM_FAIL_OVER, ckrm_mem_fail_at, - MEM_SHRINK_AT, ckrm_mem_shrink_at, - MEM_SHRINK_TO, ckrm_mem_shrink_to, - MEM_SHRINK_COUNT, ckrm_mem_shrink_count, - MEM_SHRINK_INTERVAL, ckrm_mem_shrink_interval); - - return 0; -} - -typedef int __bitwise memclass_token_t; - -enum memclass_token { - mem_fail_over = (__force memclass_token_t) 1, - mem_shrink_at = (__force memclass_token_t) 2, - mem_shrink_to = (__force memclass_token_t) 3, - mem_shrink_count = (__force memclass_token_t) 4, - mem_shrink_interval = (__force memclass_token_t) 5, - mem_err = (__force memclass_token_t) 6 -}; - -static match_table_t mem_tokens = { - {mem_fail_over, MEM_FAIL_OVER "=%d"}, - {mem_shrink_at, MEM_SHRINK_AT "=%d"}, - {mem_shrink_to, MEM_SHRINK_TO "=%d"}, - {mem_shrink_count, MEM_SHRINK_COUNT "=%d"}, - {mem_shrink_interval, MEM_SHRINK_INTERVAL "=%d"}, - {mem_err, NULL}, -}; - -static int -mem_set_config(void *my_res, const char *cfgstr) -{ - char *p; - struct ckrm_mem_res *res = my_res; - int err = 0, val; - - if (!res) - return -EINVAL; - - while ((p = strsep((char**)&cfgstr, ",")) != NULL) { - substring_t args[MAX_OPT_ARGS]; - int token; - if (!*p) - continue; - - token = match_token(p, mem_tokens, args); - switch (token) { - case mem_fail_over: - if (match_int(args, &val) || (val <= 0)) { - err = -EINVAL; - } else { - ckrm_mem_fail_at = val; - } - break; - case mem_shrink_at: - if (match_int(args, &val) || (val <= 0)) { - err = -EINVAL; - } else { - ckrm_mem_shrink_at = val; - } - break; - case mem_shrink_to: - if (match_int(args, &val) || (val < 0) || (val > 100)) { - err = -EINVAL; - } else { - ckrm_mem_shrink_to = val; - } - break; - case mem_shrink_count: - if (match_int(args, &val) || (val <= 0)) { - err = -EINVAL; - } else { - ckrm_mem_shrink_count = val; - } - break; - case mem_shrink_interval: - if (match_int(args, &val) || (val <= 0)) { - err = -EINVAL; - } else { - ckrm_mem_shrink_interval = val; - } - break; - default: - err = -EINVAL; - } - } - return err; -} - -static int -mem_reset_stats(void *my_res) -{ - struct ckrm_mem_res *res = my_res; - printk(KERN_INFO "MEM_RC: reset stats called for class %s\n", - res->core->name); - return 0; -} - -struct ckrm_res_ctlr mem_rcbs = { - .res_name = MEM_RES_NAME, - .res_hdepth = CKRM_MEM_MAX_HIERARCHY, - .resid = -1, - .res_alloc = mem_res_alloc, - .res_free = mem_res_free, - .set_share_values = mem_set_share_values, - .get_share_values = mem_get_share_values, - .get_stats = mem_get_stats, - .change_resclass = mem_change_resclass, - .show_config = mem_show_config, - .set_config = mem_set_config, - .reset_stats = mem_reset_stats, -}; - -EXPORT_SYMBOL_GPL(mem_rcbs); - -int __init -init_ckrm_mem_res(void) -{ - struct ckrm_classtype *clstype; - int resid = mem_rcbs.resid; - - set_ckrm_tot_pages(); - spin_lock_init(&ckrm_mem_lock); - clstype = ckrm_find_classtype_by_name("taskclass"); - if (clstype == NULL) { - printk(KERN_INFO " Unknown ckrm classtype"); - return -ENOENT; - } - - if (resid == -1) { - resid = ckrm_register_res_ctlr(clstype, &mem_rcbs); - if (resid != -1) { - mem_rcbs.classtype = clstype; - } - } - return ((resid < 0) ? 
resid : 0); -} - -void __exit -exit_ckrm_mem_res(void) -{ - ckrm_unregister_res_ctlr(&mem_rcbs); - mem_rcbs.resid = -1; -} - -module_init(init_ckrm_mem_res) -module_exit(exit_ckrm_mem_res) -MODULE_LICENSE("GPL"); diff --git a/kernel/ckrm/ckrm_memctlr.c b/kernel/ckrm/ckrm_memctlr.c deleted file mode 100644 index a8ae7a6a6..000000000 --- a/kernel/ckrm/ckrm_memctlr.c +++ /dev/null @@ -1,439 +0,0 @@ -/* ckrm_memctlr.c - Basic routines for the CKRM memory controller - * - * Copyright (C) Jiantao Kong, IBM Corp. 2003 - * (C) Chandra Seetharaman, IBM Corp. 2004 - * - * Provides a Memory Resource controller for CKRM - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -#include -#include -#include - -static int -ckrm_mem_evaluate_page_anon(struct page* page) -{ - struct ckrm_mem_res* pgcls = page_ckrmzone(page)->memcls; - struct ckrm_mem_res* maxshareclass = NULL; - struct anon_vma *anon_vma = (struct anon_vma *) page->mapping; - struct vm_area_struct *vma; - struct mm_struct* mm; - int ret = 0; - - if (!spin_trylock(&anon_vma->lock)) - return 0; - BUG_ON(list_empty(&anon_vma->head)); - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { - mm = vma->vm_mm; - if (!maxshareclass || ckrm_mem_share_compare(maxshareclass, - mm->memclass) < 0) { - maxshareclass = mm->memclass; - } - } - spin_unlock(&anon_vma->lock); - - if (!maxshareclass) { - maxshareclass = ckrm_mem_root_class; - } - if (pgcls != maxshareclass) { - ckrm_change_page_class(page, maxshareclass); - ret = 1; - } - return ret; -} - -static int -ckrm_mem_evaluate_page_file(struct page* page) -{ - struct ckrm_mem_res* pgcls = page_ckrmzone(page)->memcls; - struct ckrm_mem_res* maxshareclass = NULL; - struct address_space *mapping = page->mapping; - struct vm_area_struct *vma = NULL; - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - struct prio_tree_iter iter; - struct mm_struct* mm; - int ret = 0; - - if (!mapping) - return 0; - - if (!spin_trylock(&mapping->i_mmap_lock)) - return 0; - - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, - pgoff, pgoff) { - mm = vma->vm_mm; - if (!maxshareclass || ckrm_mem_share_compare(maxshareclass, - mm->memclass)<0) - maxshareclass = mm->memclass; - } - spin_unlock(&mapping->i_mmap_lock); - - if (!maxshareclass) { - maxshareclass = ckrm_mem_root_class; - } - if (pgcls != maxshareclass) { - ckrm_change_page_class(page, maxshareclass); - ret = 1; - } - return ret; -} - -static int -ckrm_mem_evaluate_page(struct page* page) -{ - int ret = 0; - if (page->mapping) { - if (PageAnon(page)) - ret = ckrm_mem_evaluate_page_anon(page); - else - ret = ckrm_mem_evaluate_page_file(page); - } - return ret; -} - -void -ckrm_mem_migrate_all_pages(struct ckrm_mem_res* from, struct ckrm_mem_res* def) -{ - int i; - struct page *page; - struct zone *zone; - struct list_head *pos, *next; - struct ckrm_zone *ckrm_zone; - - for (i = 0; i < MAX_NR_ZONES; i++) { - ckrm_zone = &from->ckrm_zone[i]; - zone = ckrm_zone->zone; - spin_lock_irq(&zone->lru_lock); - pos = ckrm_zone->inactive_list.next; - while (pos != &ckrm_zone->inactive_list) { - next = pos->next; - page = list_entry(pos, struct page, lru); - if (ckrm_mem_evaluate_page(page)) - ckrm_change_page_class(page, def); - pos = next; - } - pos = ckrm_zone->active_list.next; - while 
(pos != &ckrm_zone->active_list) { - next = pos->next; - page = list_entry(pos, struct page, lru); - if (ckrm_mem_evaluate_page(page)) - ckrm_change_page_class(page, def); - pos = next; - } - spin_unlock_irq(&zone->lru_lock); - } - return; -} - -static inline int -class_migrate_pmd(struct mm_struct* mm, struct vm_area_struct* vma, - pmd_t* pmdir, unsigned long address, unsigned long end) -{ - pte_t *pte; - unsigned long pmd_end; - - if (pmd_none(*pmdir)) - return 0; - BUG_ON(pmd_bad(*pmdir)); - - pmd_end = (address+ PMD_SIZE) & PMD_MASK; - if (end > pmd_end) - end = pmd_end; - - do { - pte = pte_offset_map(pmdir, address); - if (pte_present(*pte)) { - struct page *page = pte_page(*pte); - struct ckrm_zone *czone = page_ckrmzone(page); - if (page->mapping && czone) { - struct zone *zone = czone->zone; - spin_lock_irq(&zone->lru_lock); - ckrm_change_page_class(page, mm->memclass); - spin_unlock_irq(&zone->lru_lock); - } - } - address += PAGE_SIZE; - pte_unmap(pte); - pte++; - } while(address && (address < end)); - return 0; -} - -static inline int -class_migrate_pgd(struct mm_struct* mm, struct vm_area_struct* vma, - pgd_t* pgdir, unsigned long address, unsigned long end) -{ - pmd_t* pmd; - unsigned long pgd_end; - - if (pgd_none(*pgdir)) - return 0; - BUG_ON(pgd_bad(*pgdir)); - - pmd = pmd_offset(pgdir, address); - pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK; - - if (pgd_end && (end > pgd_end)) - end = pgd_end; - - do { - class_migrate_pmd(mm, vma, pmd, address, end); - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address && (address < end)); - return 0; -} - -static inline int -class_migrate_vma(struct mm_struct* mm, struct vm_area_struct* vma) -{ - pgd_t* pgdir; - unsigned long address, end; - - address = vma->vm_start; - end = vma->vm_end; - - pgdir = pgd_offset(vma->vm_mm, address); - do { - class_migrate_pgd(mm, vma, pgdir, address, end); - address = (address + PGDIR_SIZE) & PGDIR_MASK; - pgdir++; - } while(address && (address < end)); - return 0; -} - -/* this function is called with mm->peertask_lock hold */ -void -ckrm_mem_migrate_mm(struct mm_struct* mm, struct ckrm_mem_res *def) -{ - struct task_struct *task; - struct vm_area_struct *vma; - struct ckrm_mem_res *maxshareclass = def; - - if (list_empty(&mm->tasklist)) { - /* We leave the mm->memclass untouched since we believe that one - * mm with no task associated will be deleted soon or attach - * with another task later. 
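 * (Once a task is attached, a later call to this function re-runs the
 * loop below and adopts the highest-share class among the attached
 * tasks.)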
- */ - return; - } - - list_for_each_entry(task, &mm->tasklist, mm_peers) { - struct ckrm_mem_res* cls = ckrm_get_mem_class(task); - if (!cls) - continue; - if (!maxshareclass || - ckrm_mem_share_compare(maxshareclass,cls)<0 ) - maxshareclass = cls; - } - - if (maxshareclass && (mm->memclass != maxshareclass)) { - if (mm->memclass) { - kref_put(&mm->memclass->nr_users, memclass_release); - } - mm->memclass = maxshareclass; - kref_get(&maxshareclass->nr_users); - - /* Go through all VMA to migrate pages */ - down_read(&mm->mmap_sem); - vma = mm->mmap; - while(vma) { - class_migrate_vma(mm, vma); - vma = vma->vm_next; - } - up_read(&mm->mmap_sem); - } - return; -} - -static int -shrink_weight(struct ckrm_zone *czone) -{ - u64 temp; - struct zone *zone = czone->zone; - struct ckrm_mem_res *cls = czone->memcls; - int zone_usage, zone_guar, zone_total, guar, ret, cnt; - - zone_usage = czone->nr_active + czone->nr_inactive; - czone->active_over = czone->inactive_over = 0; - - if (zone_usage < SWAP_CLUSTER_MAX * 4) - return 0; - - if (cls->pg_guar == CKRM_SHARE_DONTCARE) { - // no guarantee for this class. use implicit guarantee - guar = cls->impl_guar / cls->nr_dontcare; - } else { - guar = cls->pg_unused / cls->nr_dontcare; - } - zone_total = zone->nr_active + zone->nr_inactive + zone->free_pages; - temp = (u64) guar * zone_total; - do_div(temp, ckrm_tot_lru_pages); - zone_guar = (int) temp; - - ret = ((zone_usage - zone_guar) > SWAP_CLUSTER_MAX) ? - (zone_usage - zone_guar) : 0; - if (ret) { - cnt = czone->nr_active - (2 * zone_guar / 3); - if (cnt > 0) - czone->active_over = cnt; - cnt = czone->active_over + czone->nr_inactive - - zone_guar / 3; - if (cnt > 0) - czone->inactive_over = cnt; - } - return ret; -} - -/* insert an entry to the list and sort decendently*/ -static void -list_add_sort(struct list_head *entry, struct list_head *head) -{ - struct ckrm_zone *czone, *new = - list_entry(entry, struct ckrm_zone, victim_list); - struct list_head* pos = head->next; - - while (pos != head) { - czone = list_entry(pos, struct ckrm_zone, victim_list); - if (new->shrink_weight > czone->shrink_weight) { - __list_add(entry, pos->prev, pos); - return; - } - pos = pos->next; - } - list_add_tail(entry, head); - return; -} - -static void -shrink_choose_victims(struct list_head *victims, - unsigned long nr_active, unsigned long nr_inactive) -{ - unsigned long nr; - struct ckrm_zone* czone; - struct list_head *pos, *next; - - pos = victims->next; - while ((pos != victims) && (nr_active || nr_inactive)) { - czone = list_entry(pos, struct ckrm_zone, victim_list); - - if (nr_active && czone->active_over) { - nr = min(nr_active, czone->active_over); - czone->shrink_active += nr; - czone->active_over -= nr; - nr_active -= nr; - } - - if (nr_inactive && czone->inactive_over) { - nr = min(nr_inactive, czone->inactive_over); - czone->shrink_inactive += nr; - czone->inactive_over -= nr; - nr_inactive -= nr; - } - pos = pos->next; - } - - pos = victims->next; - while (pos != victims) { - czone = list_entry(pos, struct ckrm_zone, victim_list); - next = pos->next; - if (czone->shrink_active == 0 && czone->shrink_inactive == 0) { - list_del_init(pos); - ckrm_clear_shrink(czone); - } - pos = next; - } - return; -} - -void -shrink_get_victims(struct zone *zone, unsigned long nr_active, - unsigned long nr_inactive, struct list_head *victims) -{ - struct list_head *pos; - struct ckrm_mem_res *cls; - struct ckrm_zone *czone; - int zoneindex = zone_idx(zone); - - if (ckrm_nr_mem_classes <= 1) { - if (ckrm_mem_root_class) { 
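			/* Single-class case: no weighting needed; hand the
			 * whole reclaim request to the root class's zone. */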
- czone = ckrm_mem_root_class->ckrm_zone + zoneindex; - if (!ckrm_test_set_shrink(czone)) { - list_add(&czone->victim_list, victims); - czone->shrink_active = nr_active; - czone->shrink_inactive = nr_inactive; - } - } - return; - } - spin_lock(&ckrm_mem_lock); - list_for_each_entry(cls, &ckrm_memclass_list, mcls_list) { - czone = cls->ckrm_zone + zoneindex; - if (ckrm_test_set_shrink(czone)) - continue; - - czone->shrink_active = 0; - czone->shrink_inactive = 0; - czone->shrink_weight = shrink_weight(czone); - if (czone->shrink_weight) { - list_add_sort(&czone->victim_list, victims); - } else { - ckrm_clear_shrink(czone); - } - } - pos = victims->next; - while (pos != victims) { - czone = list_entry(pos, struct ckrm_zone, victim_list); - pos = pos->next; - } - shrink_choose_victims(victims, nr_active, nr_inactive); - spin_unlock(&ckrm_mem_lock); - pos = victims->next; - while (pos != victims) { - czone = list_entry(pos, struct ckrm_zone, victim_list); - pos = pos->next; - } -} - -LIST_HEAD(ckrm_shrink_list); -void -ckrm_shrink_atlimit(struct ckrm_mem_res *cls) -{ - struct zone *zone; - unsigned long now = jiffies; - int order; - - if (!cls || (cls->pg_limit == CKRM_SHARE_DONTCARE) || - ((cls->flags & CLS_AT_LIMIT) == CLS_AT_LIMIT)) { - return; - } - if ((cls->last_shrink > now) /* jiffies wrapped around */ || - (cls->last_shrink + (ckrm_mem_shrink_interval * HZ)) < now) { - cls->last_shrink = now; - cls->shrink_count = 0; - } - cls->shrink_count++; - if (cls->shrink_count > ckrm_mem_shrink_count) { - return; - } - spin_lock(&ckrm_mem_lock); - list_add(&cls->shrink_list, &ckrm_shrink_list); - spin_unlock(&ckrm_mem_lock); - cls->flags |= CLS_AT_LIMIT; - for_each_zone(zone) { - /* This is just a number to get to wakeup kswapd */ - order = atomic_read(&cls->pg_total) - - ((ckrm_mem_shrink_to * cls->pg_limit) / 100); - wakeup_kswapd(zone); - break; // only once is enough - } -} diff --git a/kernel/ckrm/ckrm_null_class.c b/kernel/ckrm/ckrm_null_class.c deleted file mode 100644 index 7ea79d11c..000000000 --- a/kernel/ckrm/ckrm_null_class.c +++ /dev/null @@ -1,308 +0,0 @@ -/* kernel/ckrm/ckrm_null_class.c - NULL TaskClass controller for CKRM - * - * Copyright (C) Haoqiang Zheng, IBM Corp. 2004 - * (C) Hubertus Franke, IBM Corp. 2004 - * - * Copyright (C) Marc E. Fiuczynski, Princeton University 2005 - * Adapted from ckrm_cpu_class.c. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define CKRM_NULL_CLASS_MAGIC 0xdeadbeef - -static struct ckrm_res_ctlr null_rcbs; - -/* - * manages the class status - * there should be only one instance of this object for each class in the whole system - */ -struct ckrm_null_class { - struct ckrm_core_class *core; - struct ckrm_core_class *parent; - struct ckrm_shares shares; - spinlock_t cnt_lock; // always grab parent's lock first and then child's - unsigned long magic; //for debugging -}; - -/* - * initialize a class object and its local queues - */ -static void init_null_class(struct ckrm_null_class *cls,ckrm_shares_t* shares) -{ - cls->shares = *shares; - cls->cnt_lock = SPIN_LOCK_UNLOCKED; - cls->magic = CKRM_NULL_CLASS_MAGIC; -} - -static inline void set_default_share(ckrm_shares_t *shares) -{ - shares->my_guarantee = 0; - shares->total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - shares->unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - shares->my_limit = CKRM_SHARE_DFLT_MAX_LIMIT; - shares->max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; - shares->cur_max_limit = 0; -} - -static inline int valid_null_class(struct ckrm_null_class * cls) -{ - return (cls && cls->magic == CKRM_NULL_CLASS_MAGIC); -} - - -static struct ckrm_null_class * ckrm_get_null_class(struct ckrm_core_class *core) -{ - struct ckrm_null_class * cls; - cls = ckrm_get_res_class(core, null_rcbs.resid, struct ckrm_null_class); - if (valid_null_class(cls)) - return cls; - else - return NULL; -} - - -static struct ckrm_null_class default_null_class_obj; - -static struct ckrm_null_class * get_default_null_class(void) { - return (&default_null_class_obj); -} - - -static void* ckrm_alloc_null_class(struct ckrm_core_class *core, struct ckrm_core_class *parent) -{ - struct ckrm_null_class *cls; - - if (! parent) /*root class*/ - cls = get_default_null_class(); - else - cls = (struct ckrm_null_class *) kmalloc(sizeof(struct ckrm_null_class),GFP_ATOMIC); - - if (cls) { - ckrm_shares_t shares; - if ((! 
parent) && (core)) { - /* - * the default class is already initialized - * so only update the core structure - */ - cls->core = core; - } else { - set_default_share(&shares); - init_null_class(cls,&shares); - cls->core = core; - cls->parent = parent; - } - } else - printk(KERN_ERR"alloc_null_class failed\n"); - - return cls; -} - -/* - * hzheng: this is not a stable implementation - * need to check race condition issue here - */ -static void ckrm_free_null_class(void *my_res) -{ - struct ckrm_null_class *cls = my_res, *parres, *childres; - ckrm_core_class_t *child = NULL; - int maxlimit; - - if (!cls) - return; - - /*the default class can't be freed*/ - if (cls == get_default_null_class()) - return; - - // Assuming there will be no children when this function is called - parres = ckrm_get_null_class(cls->parent); - - // return child's limit/guarantee to parent node - spin_lock(&parres->cnt_lock); - child_guarantee_changed(&parres->shares, cls->shares.my_guarantee, 0); - - // run thru parent's children and get the new max_limit of the parent - ckrm_lock_hier(parres->core); - maxlimit = 0; - while ((child = ckrm_get_next_child(parres->core, child)) != NULL) { - childres = ckrm_get_null_class(child); - if (maxlimit < childres->shares.my_limit) { - maxlimit = childres->shares.my_limit; - } - } - ckrm_unlock_hier(parres->core); - if (parres->shares.cur_max_limit < maxlimit) { - parres->shares.cur_max_limit = maxlimit; - } - - spin_unlock(&parres->cnt_lock); - kfree(cls); -} - -/* - * the system will adjust to the new share automatically - */ -static int ckrm_null_set_share(void *my_res, struct ckrm_shares *new_share) -{ - struct ckrm_null_class *parres, *cls = my_res; - struct ckrm_shares *cur = &cls->shares, *par; - int rc = -EINVAL; - - if (!cls) - return rc; - - if (cls->parent) { - parres = ckrm_get_null_class(cls->parent); - spin_lock(&parres->cnt_lock); - spin_lock(&cls->cnt_lock); - par = &parres->shares; - } else { - spin_lock(&cls->cnt_lock); - par = NULL; - parres = NULL; - } - - /* - * hzheng: CKRM_SHARE_DONTCARE should be handled - */ - if (new_share->my_guarantee == CKRM_SHARE_DONTCARE) - new_share->my_guarantee = 0; - - rc = set_shares(new_share, cur, par); - if (cur->my_limit == CKRM_SHARE_DONTCARE) - cur->my_limit = cur->max_limit; - - - spin_unlock(&cls->cnt_lock); - if (cls->parent) { - spin_unlock(&parres->cnt_lock); - } - - return rc; -} - -static int ckrm_null_get_share(void *my_res, - struct ckrm_shares *shares) -{ - struct ckrm_null_class *cls = my_res; - - if (!cls) - return -EINVAL; - *shares = cls->shares; - return 0; -} - -static int ckrm_null_get_stats(void *my_res, struct seq_file * sfile) -{ - struct ckrm_null_class *cls = my_res; - - if (!cls) - return -EINVAL; - - seq_printf(sfile, "-------- Null Class Status Start---------\n"); - seq_printf(sfile, "Share:\n\tgrt= %d limit= %d total_grt= %d max_limit= %d\n", - cls->shares.my_guarantee, - cls->shares.my_limit, - cls->shares.total_guarantee, - cls->shares.max_limit); - seq_printf(sfile, "\tunused_grt= %d cur_max_limit= %d\n", - cls->shares.unused_guarantee, - cls->shares.cur_max_limit); - - seq_printf(sfile, "-------- Null Class Status END ---------\n"); - - return 0; -} - -/* - * task will remain in the same null but on a different local runqueue - */ -static void ckrm_null_change_class(void *task, void *old, void *new) -{ - /*sanity checking*/ - if (!task || ! 
old || !new) - return; - - /* hook to controller */ -} - -/*dummy function, not used*/ -static int ckrm_null_show_config(void *my_res, struct seq_file *sfile) -{ - struct ckrm_null_class *cls = my_res; - - if (!cls) - return -EINVAL; - - seq_printf(sfile, "cls=%s,parameter=somevalue\n","ckrm_null class"); - return 0; -} - -/*dummy function, not used*/ -static int ckrm_null_set_config(void *my_res, const char *cfgstr) -{ - struct ckrm_nullclass *cls = my_res; - - if (!cls) - return -EINVAL; - printk(KERN_DEBUG "ckrm_null config='%s'\n",cfgstr); - return 0; -} - -static struct ckrm_res_ctlr null_rcbs = { - .res_name = "null", - .res_hdepth = 1, - .resid = -1, - .res_alloc = ckrm_alloc_null_class, - .res_free = ckrm_free_null_class, - .set_share_values = ckrm_null_set_share, - .get_share_values = ckrm_null_get_share, - .get_stats = ckrm_null_get_stats, - .show_config = ckrm_null_show_config, - .set_config = ckrm_null_set_config, - .change_resclass = ckrm_null_change_class, -}; - -int __init init_ckrm_null_res(void) -{ - struct ckrm_classtype *clstype; - int resid = null_rcbs.resid; - - clstype = ckrm_find_classtype_by_name("taskclass"); - if (clstype == NULL) { - printk(KERN_INFO" Unknown ckrm classtype"); - return -ENOENT; - } - - /* Initialize default class obj before registering with core */ - ckrm_alloc_null_class(NULL,NULL); - - if (resid == -1) { /*not registered */ - resid = ckrm_register_res_ctlr(clstype,&null_rcbs); - printk(KERN_DEBUG "........init_ckrm_null_res , resid= %d\n",resid); - } - return 0; -} - -void __exit exit_ckrm_null_res(void) -{ - ckrm_unregister_res_ctlr(&null_rcbs); - null_rcbs.resid = -1; -} - -module_init(init_ckrm_null_res) -module_exit(exit_ckrm_null_res) diff --git a/kernel/ckrm/ckrm_numtasks.c b/kernel/ckrm/ckrm_numtasks.c deleted file mode 100644 index f3c94d5f6..000000000 --- a/kernel/ckrm/ckrm_numtasks.c +++ /dev/null @@ -1,496 +0,0 @@ -/* ckrm_numtasks.c - "Number of tasks" resource controller for CKRM - * - * Copyright (C) Chandra Seetharaman, IBM Corp. 2003 - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -/* - * CKRM Resource controller for tracking number of tasks in a class. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define TOTAL_NUM_TASKS (131072) /* 128 K */ -#define NUMTASKS_DEBUG -#define NUMTASKS_NAME "numtasks" - -struct ckrm_numtasks { - struct ckrm_core_class *core; /* the core i am part of... */ - struct ckrm_core_class *parent; /* parent of the core above. */ - struct ckrm_shares shares; - spinlock_t cnt_lock; /* always grab parent's lock before child's */ - int cnt_guarantee; /* num_tasks guarantee in local units */ - int cnt_unused; /* has to borrow if more than this is needed */ - int cnt_limit; /* no tasks over this limit. 
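A value of CKRM_SHARE_DONTCARE here disables the limit check in numtasks_get_ref_local().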
*/ - atomic_t cnt_cur_alloc; /* current alloc from self */ - atomic_t cnt_borrowed; /* borrowed from the parent */ - - int over_guarantee; /* turn on/off when cur_alloc goes */ - /* over/under guarantee */ - - /* internally maintained statictics to compare with max numbers */ - int limit_failures; /* # failures as request was over the limit */ - int borrow_sucesses; /* # successful borrows */ - int borrow_failures; /* # borrow failures */ - - /* Maximum the specific statictics has reached. */ - int max_limit_failures; - int max_borrow_sucesses; - int max_borrow_failures; - - /* Total number of specific statistics */ - int tot_limit_failures; - int tot_borrow_sucesses; - int tot_borrow_failures; -}; - -struct ckrm_res_ctlr numtasks_rcbs; - -/* Initialize rescls values - * May be called on each rcfs unmount or as part of error recovery - * to make share values sane. - * Does not traverse hierarchy reinitializing children. - */ -static void numtasks_res_initcls_one(struct ckrm_numtasks * res) -{ - res->shares.my_guarantee = CKRM_SHARE_DONTCARE; - res->shares.my_limit = CKRM_SHARE_DONTCARE; - res->shares.total_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - res->shares.max_limit = CKRM_SHARE_DFLT_MAX_LIMIT; - res->shares.unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE; - res->shares.cur_max_limit = 0; - - res->cnt_guarantee = CKRM_SHARE_DONTCARE; - res->cnt_unused = CKRM_SHARE_DONTCARE; - res->cnt_limit = CKRM_SHARE_DONTCARE; - - res->over_guarantee = 0; - - res->limit_failures = 0; - res->borrow_sucesses = 0; - res->borrow_failures = 0; - - res->max_limit_failures = 0; - res->max_borrow_sucesses = 0; - res->max_borrow_failures = 0; - - res->tot_limit_failures = 0; - res->tot_borrow_sucesses = 0; - res->tot_borrow_failures = 0; - - atomic_set(&res->cnt_cur_alloc, 0); - atomic_set(&res->cnt_borrowed, 0); - return; -} - -static int numtasks_get_ref_local(struct ckrm_core_class *core, int force) -{ - int rc, resid = numtasks_rcbs.resid; - struct ckrm_numtasks *res; - - if ((resid < 0) || (core == NULL)) - return 1; - - res = ckrm_get_res_class(core, resid, struct ckrm_numtasks); - if (res == NULL) - return 1; - - atomic_inc(&res->cnt_cur_alloc); - - rc = 1; - if (((res->parent) && (res->cnt_unused == CKRM_SHARE_DONTCARE)) || - (atomic_read(&res->cnt_cur_alloc) > res->cnt_unused)) { - - rc = 0; - if (!force && (res->cnt_limit != CKRM_SHARE_DONTCARE) && - (atomic_read(&res->cnt_cur_alloc) > res->cnt_limit)) { - res->limit_failures++; - res->tot_limit_failures++; - } else if (res->parent != NULL) { - if ((rc = - numtasks_get_ref_local(res->parent, force)) == 1) { - atomic_inc(&res->cnt_borrowed); - res->borrow_sucesses++; - res->tot_borrow_sucesses++; - res->over_guarantee = 1; - } else { - res->borrow_failures++; - res->tot_borrow_failures++; - } - } else - rc = force; - } else if (res->over_guarantee) { - res->over_guarantee = 0; - - if (res->max_limit_failures < res->limit_failures) - res->max_limit_failures = res->limit_failures; - if (res->max_borrow_sucesses < res->borrow_sucesses) - res->max_borrow_sucesses = res->borrow_sucesses; - if (res->max_borrow_failures < res->borrow_failures) - res->max_borrow_failures = res->borrow_failures; - res->limit_failures = 0; - res->borrow_sucesses = 0; - res->borrow_failures = 0; - } - - if (!rc) - atomic_dec(&res->cnt_cur_alloc); - return rc; -} - -static void numtasks_put_ref_local(struct ckrm_core_class *core) -{ - int resid = numtasks_rcbs.resid; - struct ckrm_numtasks *res; - - if ((resid == -1) || (core == NULL)) - return; - - res = 
ckrm_get_res_class(core, resid, struct ckrm_numtasks); - if (res == NULL) - return; - - if (atomic_read(&res->cnt_cur_alloc)==0) - return; - - atomic_dec(&res->cnt_cur_alloc); - - if (atomic_read(&res->cnt_borrowed) > 0) { - atomic_dec(&res->cnt_borrowed); - numtasks_put_ref_local(res->parent); - } - return; -} - -static void *numtasks_res_alloc(struct ckrm_core_class *core, - struct ckrm_core_class *parent) -{ - struct ckrm_numtasks *res; - - res = kmalloc(sizeof(struct ckrm_numtasks), GFP_ATOMIC); - - if (res) { - memset(res, 0, sizeof(struct ckrm_numtasks)); - res->core = core; - res->parent = parent; - numtasks_res_initcls_one(res); - res->cnt_lock = SPIN_LOCK_UNLOCKED; - if (parent == NULL) { - /* - * I am part of root class. So set the max tasks - * to available default. - */ - res->cnt_guarantee = TOTAL_NUM_TASKS; - res->cnt_unused = TOTAL_NUM_TASKS; - res->cnt_limit = TOTAL_NUM_TASKS; - } - try_module_get(THIS_MODULE); - } else { - printk(KERN_ERR - "numtasks_res_alloc: failed GFP_ATOMIC alloc\n"); - } - return res; -} - -/* - * No locking of this resource class object necessary as we are not - * supposed to be assigned (or used) when/after this function is called. - */ -static void numtasks_res_free(void *my_res) -{ - struct ckrm_numtasks *res = my_res, *parres, *childres; - struct ckrm_core_class *child = NULL; - int i, borrowed, maxlimit, resid = numtasks_rcbs.resid; - - if (!res) - return; - - /* Assuming there will be no children when this function is called */ - - parres = ckrm_get_res_class(res->parent, resid, struct ckrm_numtasks); - - if ((borrowed = atomic_read(&res->cnt_borrowed)) > 0) - for (i = 0; i < borrowed; i++) - numtasks_put_ref_local(parres->core); - - /* return child's limit/guarantee to parent node */ - spin_lock(&parres->cnt_lock); - child_guarantee_changed(&parres->shares, res->shares.my_guarantee, 0); - - /* run thru parent's children and get the new max_limit of the parent */ - ckrm_lock_hier(parres->core); - maxlimit = 0; - while ((child = ckrm_get_next_child(parres->core, child)) != NULL) { - childres = ckrm_get_res_class(child, resid, struct ckrm_numtasks); - if (maxlimit < childres->shares.my_limit) - maxlimit = childres->shares.my_limit; - } - ckrm_unlock_hier(parres->core); - if (parres->shares.cur_max_limit < maxlimit) - parres->shares.cur_max_limit = maxlimit; - - spin_unlock(&parres->cnt_lock); - kfree(res); - module_put(THIS_MODULE); - return; -} - -/* - * Recalculate the guarantee and limit in real units... and propagate the - * same to children. 
- * Caller is responsible for protecting res and for the integrity of parres - */ -static void -recalc_and_propagate(struct ckrm_numtasks * res, struct ckrm_numtasks * parres) -{ - struct ckrm_core_class *child = NULL; - struct ckrm_numtasks *childres; - int resid = numtasks_rcbs.resid; - - if (parres) { - struct ckrm_shares *par = &parres->shares; - struct ckrm_shares *self = &res->shares; - - /* calculate cnt_guarantee and cnt_limit */ - if ((parres->cnt_guarantee == CKRM_SHARE_DONTCARE) || - (self->my_guarantee == CKRM_SHARE_DONTCARE)) - res->cnt_guarantee = CKRM_SHARE_DONTCARE; - else if (par->total_guarantee) { - u64 temp = (u64) self->my_guarantee * parres->cnt_guarantee; - do_div(temp, par->total_guarantee); - res->cnt_guarantee = (int) temp; - } else - res->cnt_guarantee = 0; - - if ((parres->cnt_limit == CKRM_SHARE_DONTCARE) || - (self->my_limit == CKRM_SHARE_DONTCARE)) - res->cnt_limit = CKRM_SHARE_DONTCARE; - else if (par->max_limit) { - u64 temp = (u64) self->my_limit * parres->cnt_limit; - do_div(temp, par->max_limit); - res->cnt_limit = (int) temp; - } else - res->cnt_limit = 0; - - /* Calculate unused units */ - if ((res->cnt_guarantee == CKRM_SHARE_DONTCARE) || - (self->my_guarantee == CKRM_SHARE_DONTCARE)) - res->cnt_unused = CKRM_SHARE_DONTCARE; - else if (self->total_guarantee) { - u64 temp = (u64) self->unused_guarantee * res->cnt_guarantee; - do_div(temp, self->total_guarantee); - res->cnt_unused = (int) temp; - } else - res->cnt_unused = 0; - } - - /* propagate to children */ - ckrm_lock_hier(res->core); - while ((child = ckrm_get_next_child(res->core, child)) != NULL) { - childres = ckrm_get_res_class(child, resid, struct ckrm_numtasks); - if (childres) { - spin_lock(&childres->cnt_lock); - recalc_and_propagate(childres, res); - spin_unlock(&childres->cnt_lock); - } else { - printk(KERN_ERR "%s: numtasks resclass missing\n",__FUNCTION__); - } - } - ckrm_unlock_hier(res->core); - return; -} - -static int numtasks_set_share_values(void *my_res, struct ckrm_shares *new) -{ - struct ckrm_numtasks *parres, *res = my_res; - struct ckrm_shares *cur = &res->shares, *par; - int rc = -EINVAL, resid = numtasks_rcbs.resid; - - if (!res) - return rc; - - if (res->parent) { - parres = - ckrm_get_res_class(res->parent, resid, struct ckrm_numtasks); - spin_lock(&parres->cnt_lock); - spin_lock(&res->cnt_lock); - par = &parres->shares; - } else { - spin_lock(&res->cnt_lock); - par = NULL; - parres = NULL; - } - - rc = set_shares(new, cur, par); - - if ((rc == 0) && parres) { - /* Calculate parent's unused units */ - if (parres->cnt_guarantee == CKRM_SHARE_DONTCARE) - parres->cnt_unused = CKRM_SHARE_DONTCARE; - else if (par->total_guarantee) { - u64 temp = (u64) par->unused_guarantee * parres->cnt_guarantee; - do_div(temp, par->total_guarantee); - parres->cnt_unused = (int) temp; - } else - parres->cnt_unused = 0; - recalc_and_propagate(res, parres); - } - spin_unlock(&res->cnt_lock); - if (res->parent) - spin_unlock(&parres->cnt_lock); - return rc; -} - -static int numtasks_get_share_values(void *my_res, struct ckrm_shares *shares) -{ - struct ckrm_numtasks *res = my_res; - - if (!res) - return -EINVAL; - *shares = res->shares; - return 0; -} - -static int numtasks_get_stats(void *my_res, struct seq_file *sfile) -{ - struct ckrm_numtasks *res = my_res; - - if (!res) - return -EINVAL; - - seq_printf(sfile, "---------Number of tasks stats start---------\n"); - seq_printf(sfile, "Total Over limit failures: %d\n", - res->tot_limit_failures); - seq_printf(sfile, "Total Over guarantee 
sucesses: %d\n", - res->tot_borrow_sucesses); - seq_printf(sfile, "Total Over guarantee failures: %d\n", - res->tot_borrow_failures); - - seq_printf(sfile, "Maximum Over limit failures: %d\n", - res->max_limit_failures); - seq_printf(sfile, "Maximum Over guarantee sucesses: %d\n", - res->max_borrow_sucesses); - seq_printf(sfile, "Maximum Over guarantee failures: %d\n", - res->max_borrow_failures); - seq_printf(sfile, "---------Number of tasks stats end---------\n"); -#ifdef NUMTASKS_DEBUG - seq_printf(sfile, - "cur_alloc %d; borrowed %d; cnt_guar %d; cnt_limit %d " - "cnt_unused %d, unused_guarantee %d, cur_max_limit %d\n", - atomic_read(&res->cnt_cur_alloc), - atomic_read(&res->cnt_borrowed), res->cnt_guarantee, - res->cnt_limit, res->cnt_unused, - res->shares.unused_guarantee, - res->shares.cur_max_limit); -#endif - - return 0; -} - -static int numtasks_show_config(void *my_res, struct seq_file *sfile) -{ - struct ckrm_numtasks *res = my_res; - - if (!res) - return -EINVAL; - - seq_printf(sfile, "res=%s,parameter=somevalue\n", NUMTASKS_NAME); - return 0; -} - -static int numtasks_set_config(void *my_res, const char *cfgstr) -{ - struct ckrm_numtasks *res = my_res; - - if (!res) - return -EINVAL; - printk("numtasks config='%s'\n", cfgstr); - return 0; -} - -static void numtasks_change_resclass(void *task, void *old, void *new) -{ - struct ckrm_numtasks *oldres = old; - struct ckrm_numtasks *newres = new; - - if (oldres != (void *)-1) { - struct task_struct *tsk = task; - if (!oldres) { - struct ckrm_core_class *old_core = - &(tsk->parent->taskclass->core); - oldres = - ckrm_get_res_class(old_core, numtasks_rcbs.resid, - struct ckrm_numtasks); - } - if (oldres) - numtasks_put_ref_local(oldres->core); - } - if (newres) - (void)numtasks_get_ref_local(newres->core, 1); -} - -struct ckrm_res_ctlr numtasks_rcbs = { - .res_name = NUMTASKS_NAME, - .res_hdepth = 1, - .resid = -1, - .res_alloc = numtasks_res_alloc, - .res_free = numtasks_res_free, - .set_share_values = numtasks_set_share_values, - .get_share_values = numtasks_get_share_values, - .get_stats = numtasks_get_stats, - .show_config = numtasks_show_config, - .set_config = numtasks_set_config, - .change_resclass = numtasks_change_resclass, -}; - -int __init init_ckrm_numtasks_res(void) -{ - struct ckrm_classtype *clstype; - int resid = numtasks_rcbs.resid; - - clstype = ckrm_find_classtype_by_name("taskclass"); - if (clstype == NULL) { - printk(KERN_INFO " Unknown ckrm classtype"); - return -ENOENT; - } - - if (resid == -1) { - resid = ckrm_register_res_ctlr(clstype, &numtasks_rcbs); - printk("........init_ckrm_numtasks_res -> %d\n", resid); - if (resid != -1) { - ckrm_numtasks_register(numtasks_get_ref_local, - numtasks_put_ref_local); - numtasks_rcbs.classtype = clstype; - } - } - return 0; -} - -void __exit exit_ckrm_numtasks_res(void) -{ - if (numtasks_rcbs.resid != -1) - ckrm_numtasks_register(NULL, NULL); - ckrm_unregister_res_ctlr(&numtasks_rcbs); - numtasks_rcbs.resid = -1; -} - -module_init(init_ckrm_numtasks_res) -module_exit(exit_ckrm_numtasks_res) - -MODULE_LICENSE("GPL"); diff --git a/kernel/ckrm/ckrm_numtasks_stub.c b/kernel/ckrm/ckrm_numtasks_stub.c deleted file mode 100644 index d9f15c98b..000000000 --- a/kernel/ckrm/ckrm_numtasks_stub.c +++ /dev/null @@ -1,53 +0,0 @@ -/* ckrm_tasks_stub.c - Stub file for ckrm_tasks modules - * - * Copyright (C) Chandra Seetharaman, IBM Corp. 
2004 - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -#include -#include -#include - -static spinlock_t stub_lock = SPIN_LOCK_UNLOCKED; - -static get_ref_t real_get_ref = NULL; -static put_ref_t real_put_ref = NULL; - -void ckrm_numtasks_register(get_ref_t gr, put_ref_t pr) -{ - spin_lock(&stub_lock); - real_get_ref = gr; - real_put_ref = pr; - spin_unlock(&stub_lock); -} - -int numtasks_get_ref(struct ckrm_core_class *arg, int force) -{ - int ret = 1; - spin_lock(&stub_lock); - if (real_get_ref) { - ret = (*real_get_ref) (arg, force); - } - spin_unlock(&stub_lock); - return ret; -} - -void numtasks_put_ref(struct ckrm_core_class *arg) -{ - spin_lock(&stub_lock); - if (real_put_ref) { - (*real_put_ref) (arg); - } - spin_unlock(&stub_lock); -} - -EXPORT_SYMBOL(ckrm_numtasks_register); -EXPORT_SYMBOL(numtasks_get_ref); -EXPORT_SYMBOL(numtasks_put_ref); diff --git a/kernel/ckrm/ckrm_sockc.c b/kernel/ckrm/ckrm_sockc.c deleted file mode 100644 index 8ccadfa39..000000000 --- a/kernel/ckrm/ckrm_sockc.c +++ /dev/null @@ -1,576 +0,0 @@ -/* ckrm_sock.c - Class-based Kernel Resource Management (CKRM) - * - * Copyright (C) Hubertus Franke, IBM Corp. 2003,2004 - * (C) Shailabh Nagar, IBM Corp. 2003 - * (C) Chandra Seetharaman, IBM Corp. 2003 - * (C) Vivek Kashyap, IBM Corp. 2004 - * - * - * Provides kernel API of CKRM for in-kernel,per-resource controllers - * (one each for cpu, memory, io, network) and callbacks for - * classification modules. - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -/* Changes - * - * 28 Aug 2003 - * Created. - * 06 Nov 2003 - * Made modifications to suit the new RBCE module. - * 10 Nov 2003 - * Fixed a bug in fork and exit callbacks. Added callbacks_active and - * surrounding logic. Added task paramter for all CE callbacks. 
- * 23 Mar 2004 - * moved to referenced counted class objects and correct locking - * 12 Apr 2004 - * introduced adopted to emerging classtype interface - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -struct ckrm_sock_class { - struct ckrm_core_class core; -}; - -static struct ckrm_sock_class sockclass_dflt_class = { -}; - -#define SOCKET_CLASS_TYPE_NAME "socketclass" - -const char *dflt_sockclass_name = SOCKET_CLASS_TYPE_NAME; - -static struct ckrm_core_class *sock_alloc_class(struct ckrm_core_class *parent, - const char *name); -static int sock_free_class(struct ckrm_core_class *core); - -static int sock_forced_reclassify(ckrm_core_class_t * target, - const char *resname); -static int sock_show_members(struct ckrm_core_class *core, - struct seq_file *seq); -static void sock_add_resctrl(struct ckrm_core_class *core, int resid); -static void sock_reclassify_class(struct ckrm_sock_class *cls); - -struct ckrm_classtype CT_sockclass = { - .mfidx = 1, - .name = SOCKET_CLASS_TYPE_NAME, - .typeID = CKRM_CLASSTYPE_SOCKET_CLASS, - .maxdepth = 3, - .resid_reserved = 0, - .max_res_ctlrs = CKRM_MAX_RES_CTLRS, - .max_resid = 0, - .bit_res_ctlrs = 0L, - .res_ctlrs_lock = SPIN_LOCK_UNLOCKED, - .classes = LIST_HEAD_INIT(CT_sockclass.classes), - - .default_class = &sockclass_dflt_class.core, - - // private version of functions - .alloc = &sock_alloc_class, - .free = &sock_free_class, - .show_members = &sock_show_members, - .forced_reclassify = &sock_forced_reclassify, - - // use of default functions - .show_shares = &ckrm_class_show_shares, - .show_stats = &ckrm_class_show_stats, - .show_config = &ckrm_class_show_config, - .set_config = &ckrm_class_set_config, - .set_shares = &ckrm_class_set_shares, - .reset_stats = &ckrm_class_reset_stats, - - // mandatory private version .. no dflt available - .add_resctrl = &sock_add_resctrl, -}; - -/* helper functions */ - -void ckrm_ns_hold(struct ckrm_net_struct *ns) -{ - atomic_inc(&ns->ns_refcnt); - return; -} - -void ckrm_ns_put(struct ckrm_net_struct *ns) -{ - if (atomic_dec_and_test(&ns->ns_refcnt)) - kfree(ns); - return; -} - -/* - * Change the class of a netstruct - * - * Change the task's task class to "newcls" if the task's current - * class (task->taskclass) is same as given "oldcls", if it is non-NULL. - * - */ - -static void -sock_set_class(struct ckrm_net_struct *ns, struct ckrm_sock_class *newcls, - struct ckrm_sock_class *oldcls, enum ckrm_event event) -{ - int i; - struct ckrm_res_ctlr *rcbs; - struct ckrm_classtype *clstype; - void *old_res_class, *new_res_class; - - if ((newcls == oldcls) || (newcls == NULL)) { - ns->core = (void *)oldcls; - return; - } - - class_lock(class_core(newcls)); - ns->core = newcls; - list_add(&ns->ckrm_link, &class_core(newcls)->objlist); - class_unlock(class_core(newcls)); - - clstype = class_isa(newcls); - for (i = 0; i < clstype->max_resid; i++) { - atomic_inc(&clstype->nr_resusers[i]); - old_res_class = - oldcls ? class_core(oldcls)->res_class[i] : NULL; - new_res_class = - newcls ? 
class_core(newcls)->res_class[i] : NULL; - rcbs = clstype->res_ctlrs[i]; - if (rcbs && rcbs->change_resclass - && (old_res_class != new_res_class)) - (*rcbs->change_resclass) (ns, old_res_class, - new_res_class); - atomic_dec(&clstype->nr_resusers[i]); - } - return; -} - -static void sock_add_resctrl(struct ckrm_core_class *core, int resid) -{ - struct ckrm_net_struct *ns; - struct ckrm_res_ctlr *rcbs; - - if ((resid < 0) || (resid >= CKRM_MAX_RES_CTLRS) - || ((rcbs = core->classtype->res_ctlrs[resid]) == NULL)) - return; - - class_lock(core); - list_for_each_entry(ns, &core->objlist, ckrm_link) { - if (rcbs->change_resclass) - (*rcbs->change_resclass) (ns, NULL, - core->res_class[resid]); - } - class_unlock(core); -} - -/************************************************************************** - * Functions called from classification points * - **************************************************************************/ - -static void cb_sockclass_listen_start(struct sock *sk) -{ - struct ckrm_net_struct *ns = NULL; - struct ckrm_sock_class *newcls = NULL; - struct ckrm_res_ctlr *rcbs; - struct ckrm_classtype *clstype; - int i = 0; - - // XXX - TBD ipv6 - if (sk->sk_family == AF_INET6) - return; - - // to store the socket address - ns = (struct ckrm_net_struct *) - kmalloc(sizeof(struct ckrm_net_struct), GFP_ATOMIC); - if (!ns) - return; - - memset(ns, 0, sizeof(*ns)); - INIT_LIST_HEAD(&ns->ckrm_link); - ckrm_ns_hold(ns); - - ns->ns_family = sk->sk_family; - if (ns->ns_family == AF_INET6) // IPv6 not supported yet. - return; - - ns->ns_daddrv4 = inet_sk(sk)->rcv_saddr; - ns->ns_dport = inet_sk(sk)->num; - - ns->ns_pid = current->pid; - ns->ns_tgid = current->tgid; - ns->ns_tsk = current; - ce_protect(&CT_sockclass); - CE_CLASSIFY_RET(newcls, &CT_sockclass, CKRM_EVENT_LISTEN_START, ns, - current); - ce_release(&CT_sockclass); - - if (newcls == NULL) { - newcls = &sockclass_dflt_class; - ckrm_core_grab(class_core(newcls)); - } - - class_lock(class_core(newcls)); - list_add(&ns->ckrm_link, &class_core(newcls)->objlist); - ns->core = newcls; - class_unlock(class_core(newcls)); - - // the socket is already locked - // take a reference on socket on our behalf - sock_hold(sk); - sk->sk_ns = (void *)ns; - ns->ns_sk = sk; - - // modify its shares - clstype = class_isa(newcls); - for (i = 0; i < clstype->max_resid; i++) { - atomic_inc(&clstype->nr_resusers[i]); - rcbs = clstype->res_ctlrs[i]; - if (rcbs && rcbs->change_resclass) { - (*rcbs->change_resclass) ((void *)ns, - NULL, - class_core(newcls)-> - res_class[i]); - } - atomic_dec(&clstype->nr_resusers[i]); - } - return; -} - -static void cb_sockclass_listen_stop(struct sock *sk) -{ - struct ckrm_net_struct *ns = NULL; - struct ckrm_sock_class *newcls = NULL; - - // XXX - TBD ipv6 - if (sk->sk_family == AF_INET6) - return; - - ns = (struct ckrm_net_struct *)sk->sk_ns; - if (!ns) // listen_start called before socket_aq was loaded - return; - - newcls = ns->core; - if (newcls) { - class_lock(class_core(newcls)); - list_del(&ns->ckrm_link); - INIT_LIST_HEAD(&ns->ckrm_link); - class_unlock(class_core(newcls)); - ckrm_core_drop(class_core(newcls)); - } - // the socket is already locked - sk->sk_ns = NULL; - sock_put(sk); - - // Should be the last count and free it - ckrm_ns_put(ns); - return; -} - -static struct ckrm_event_spec sock_events_callbacks[] = { - CKRM_EVENT_SPEC(LISTEN_START, cb_sockclass_listen_start), - CKRM_EVENT_SPEC(LISTEN_STOP, cb_sockclass_listen_stop), - {-1} -}; - 
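The sock_events_callbacks[] table above pairs each CKRM event of interest
with its handler; the trailing {-1} entry is a sentinel, so the consumer of
the table (ckrm_register_event_set() here) can walk it without a separate
length argument. The following is a minimal, self-contained sketch of that
sentinel-terminated dispatch-table pattern; every name in it is an
illustrative stand-in, not part of the CKRM API:

	#include <stdio.h>

	enum demo_event { EV_LISTEN_START, EV_LISTEN_STOP };

	struct demo_event_spec {
		int ev;			/* event id; -1 terminates the table */
		void (*callback)(void);	/* handler invoked for that event */
	};

	static void on_listen_start(void) { printf("listen start\n"); }
	static void on_listen_stop(void)  { printf("listen stop\n");  }

	/* sentinel-terminated table, shaped like sock_events_callbacks[] */
	static struct demo_event_spec demo_events[] = {
		{ EV_LISTEN_START, on_listen_start },
		{ EV_LISTEN_STOP,  on_listen_stop  },
		{ -1, NULL }
	};

	/* scan to the -1 sentinel rather than tracking an array length */
	static void demo_fire(int ev)
	{
		struct demo_event_spec *s;

		for (s = demo_events; s->ev != -1; s++)
			if (s->ev == ev)
				(*s->callback)();
	}

	int main(void)
	{
		demo_fire(EV_LISTEN_START);	/* prints "listen start" */
		demo_fire(EV_LISTEN_STOP);	/* prints "listen stop"  */
		return 0;
	}

The sentinel keeps registration declarative: supporting a new event means
adding one table row, with no element count to keep in sync.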
-/************************************************************************** - * Class Object Creation / Destruction - **************************************************************************/ - -static struct ckrm_core_class *sock_alloc_class(struct ckrm_core_class *parent, - const char *name) -{ - struct ckrm_sock_class *sockcls; - sockcls = kmalloc(sizeof(struct ckrm_sock_class), GFP_KERNEL); - if (sockcls == NULL) - return NULL; - memset(sockcls, 0, sizeof(struct ckrm_sock_class)); - - ckrm_init_core_class(&CT_sockclass, class_core(sockcls), parent, name); - - ce_protect(&CT_sockclass); - if (CT_sockclass.ce_cb_active && CT_sockclass.ce_callbacks.class_add) - (*CT_sockclass.ce_callbacks.class_add) (name, sockcls, - CT_sockclass.typeID); - ce_release(&CT_sockclass); - - return class_core(sockcls); -} - -static int sock_free_class(struct ckrm_core_class *core) -{ - struct ckrm_sock_class *sockcls; - - if (!ckrm_is_core_valid(core)) { - // Invalid core - return (-EINVAL); - } - if (core == core->classtype->default_class) { - // reset the name tag - core->name = dflt_sockclass_name; - return 0; - } - - sockcls = class_type(struct ckrm_sock_class, core); - - ce_protect(&CT_sockclass); - - if (CT_sockclass.ce_cb_active && CT_sockclass.ce_callbacks.class_delete) - (*CT_sockclass.ce_callbacks.class_delete) (core->name, sockcls, - CT_sockclass.typeID); - - sock_reclassify_class(sockcls); - - ce_release(&CT_sockclass); - - ckrm_release_core_class(core); - // Hubertus .... could just drop the class .. error message - - return 0; -} - -static int sock_show_members(struct ckrm_core_class *core, struct seq_file *seq) -{ - struct list_head *lh; - struct ckrm_net_struct *ns = NULL; - - class_lock(core); - list_for_each(lh, &core->objlist) { - ns = container_of(lh, struct ckrm_net_struct, ckrm_link); - seq_printf(seq, "%d.%d.%d.%d\\%d\n", - NIPQUAD(ns->ns_daddrv4), ns->ns_dport); - } - class_unlock(core); - - return 0; -} - -static int -sock_forced_reclassify_ns(struct ckrm_net_struct *tns, - struct ckrm_core_class *core) -{ - struct ckrm_net_struct *ns = NULL; - struct sock *sk = NULL; - struct ckrm_sock_class *oldcls, *newcls; - int rc = -EINVAL; - - if (!ckrm_is_core_valid(core)) { - return rc; - } - - newcls = class_type(struct ckrm_sock_class, core); - // lookup the listening sockets - // returns with a reference count set on socket - if (tns->ns_family == AF_INET6) - return -EOPNOTSUPP; - - sk = tcp_v4_lookup_listener(tns->ns_daddrv4, tns->ns_dport, 0); - if (!sk) { - printk(KERN_INFO "No such listener 0x%x:%d\n", - tns->ns_daddrv4, tns->ns_dport); - return rc; - } - lock_sock(sk); - if (!sk->sk_ns) { - goto out; - } - ns = sk->sk_ns; - ckrm_ns_hold(ns); - if (!capable(CAP_NET_ADMIN) && (ns->ns_tsk->user != current->user)) { - ckrm_ns_put(ns); - rc = -EPERM; - goto out; - } - - oldcls = ns->core; - if ((oldcls == NULL) || (oldcls == newcls)) { - ckrm_ns_put(ns); - goto out; - } - // remove the net_struct from the current class - class_lock(class_core(oldcls)); - list_del(&ns->ckrm_link); - INIT_LIST_HEAD(&ns->ckrm_link); - ns->core = NULL; - class_unlock(class_core(oldcls)); - - sock_set_class(ns, newcls, oldcls, CKRM_EVENT_MANUAL); - ckrm_ns_put(ns); - rc = 0; - out: - release_sock(sk); - sock_put(sk); - - return rc; - -} - -enum sock_target_token_t { - IPV4, IPV6, SOCKC_TARGET_ERR -}; - -static match_table_t sock_target_tokens = { - {IPV4, "ipv4=%s"}, - {IPV6, "ipv6=%s"}, - {SOCKC_TARGET_ERR, NULL}, -}; - -char *v4toi(char *s, char c, __u32 * v) -{ - unsigned int k = 0, n = 0; - - while (*s 
&& (*s != c)) { - if (*s == '.') { - n <<= 8; - n |= k; - k = 0; - } else - k = k * 10 + *s - '0'; - s++; - } - - n <<= 8; - *v = n | k; - - return s; -} - -static int -sock_forced_reclassify(struct ckrm_core_class *target, const char *options) -{ - char *p, *p2; - struct ckrm_net_struct ns; - __u32 v4addr, tmp; - - if (!options) - return -EINVAL; - - if (target == NULL) { - unsigned long id = simple_strtol(options,NULL,0); - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (id != 0) - return -EINVAL; - printk(KERN_DEBUG "sock_class: reclassify all not net implemented\n"); - return 0; - } - - while ((p = strsep((char **)&options, ",")) != NULL) { - substring_t args[MAX_OPT_ARGS]; - int token; - - if (!*p) - continue; - token = match_token(p, sock_target_tokens, args); - switch (token) { - - case IPV4: - - p2 = p; - while (*p2 && (*p2 != '=')) - ++p2; - p2++; - p2 = v4toi(p2, '\\', &(v4addr)); - ns.ns_daddrv4 = htonl(v4addr); - ns.ns_family = AF_INET; - p2 = v4toi(++p2, ':', &tmp); - ns.ns_dport = (__u16) tmp; - if (*p2) - p2 = v4toi(++p2, '\0', &ns.ns_pid); - sock_forced_reclassify_ns(&ns, target); - break; - - case IPV6: - printk(KERN_INFO "rcfs: IPV6 not supported yet\n"); - return -ENOSYS; - default: - return -EINVAL; - } - } - return -EINVAL; -} - -/* - * Listen_aq reclassification. - */ -static void sock_reclassify_class(struct ckrm_sock_class *cls) -{ - struct ckrm_net_struct *ns, *tns; - struct ckrm_core_class *core = class_core(cls); - LIST_HEAD(local_list); - - if (!cls) - return; - - if (!ckrm_validate_and_grab_core(core)) - return; - - class_lock(core); - // we have the core refcnt - if (list_empty(&core->objlist)) { - class_unlock(core); - ckrm_core_drop(core); - return; - } - - INIT_LIST_HEAD(&local_list); - list_splice_init(&core->objlist, &local_list); - class_unlock(core); - ckrm_core_drop(core); - - list_for_each_entry_safe(ns, tns, &local_list, ckrm_link) { - ckrm_ns_hold(ns); - list_del(&ns->ckrm_link); - if (ns->ns_sk) { - lock_sock(ns->ns_sk); - sock_set_class(ns, &sockclass_dflt_class, NULL, - CKRM_EVENT_MANUAL); - release_sock(ns->ns_sk); - } - ckrm_ns_put(ns); - } - return; -} - -void __init ckrm_meta_init_sockclass(void) -{ - printk(KERN_DEBUG "...... Initializing ClassType<%s> ........\n", - CT_sockclass.name); - // intialize the default class - ckrm_init_core_class(&CT_sockclass, class_core(&sockclass_dflt_class), - NULL, dflt_sockclass_name); - - // register classtype and initialize default task class - ckrm_register_classtype(&CT_sockclass); - ckrm_register_event_set(sock_events_callbacks); - - // note registeration of all resource controllers will be done - // later dynamically as these are specified as modules -} - -#if 1 - -/***************************************************************************** - * Debugging Network Classes: Utility functions - *****************************************************************************/ - -#endif diff --git a/kernel/ckrm/ckrm_tc.c b/kernel/ckrm/ckrm_tc.c deleted file mode 100644 index 77b565a16..000000000 --- a/kernel/ckrm/ckrm_tc.c +++ /dev/null @@ -1,802 +0,0 @@ -/* ckrm_tc.c - Class-based Kernel Resource Management (CKRM) - * - * Copyright (C) Hubertus Franke, IBM Corp. 2003,2004 - * (C) Shailabh Nagar, IBM Corp. 2003 - * (C) Chandra Seetharaman, IBM Corp. 2003 - * (C) Vivek Kashyap, IBM Corp. 2004 - * - * - * Provides kernel API of CKRM for in-kernel,per-resource controllers - * (one each for cpu, memory, io, network) and callbacks for - * classification modules. 
- * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -/* Changes - * - * 28 Aug 2003 - * Created. - * 06 Nov 2003 - * Made modifications to suit the new RBCE module. - * 10 Nov 2003 - * Fixed a bug in fork and exit callbacks. Added callbacks_active and - * surrounding logic. Added task paramter for all CE callbacks. - * 23 Mar 2004 - * moved to referenced counted class objects and correct locking - * 12 Apr 2004 - * introduced adopted to emerging classtype interface - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#warning MEF I cannot believe that vserver changes force the following include statement: FIX THIS! -#include - - -#define TC_DEBUG(fmt, args...) do { \ -/* printk("%s: " fmt, __FUNCTION__ , ## args); */ } while (0) - -static struct ckrm_task_class taskclass_dflt_class = { -}; - -const char *dflt_taskclass_name = TASK_CLASS_TYPE_NAME; - -static struct ckrm_core_class *ckrm_alloc_task_class(struct ckrm_core_class - *parent, const char *name); -static int ckrm_free_task_class(struct ckrm_core_class *core); - -static int tc_forced_reclassify(ckrm_core_class_t * target, - const char *resname); -static int tc_show_members(struct ckrm_core_class *core, struct seq_file *seq); -static void tc_add_resctrl(struct ckrm_core_class *core, int resid); - -struct ckrm_classtype CT_taskclass = { - .mfidx = TC_MF_IDX, - .name = TASK_CLASS_TYPE_NAME, - .typeID = CKRM_CLASSTYPE_TASK_CLASS, - .maxdepth = 3, // Hubertus .. just to start - .resid_reserved = 4, // Hubertus .. reservation - .max_res_ctlrs = CKRM_MAX_RES_CTLRS, - .max_resid = 0, - .bit_res_ctlrs = 0L, - .res_ctlrs_lock = SPIN_LOCK_UNLOCKED, - .classes = LIST_HEAD_INIT(CT_taskclass.classes), - - .default_class = &taskclass_dflt_class.core, - - // private version of functions - .alloc = &ckrm_alloc_task_class, - .free = &ckrm_free_task_class, - .show_members = &tc_show_members, - .forced_reclassify = &tc_forced_reclassify, - - // use of default functions - .show_shares = &ckrm_class_show_shares, - .show_stats = &ckrm_class_show_stats, - .show_config = &ckrm_class_show_config, - .set_config = &ckrm_class_set_config, - .set_shares = &ckrm_class_set_shares, - .reset_stats = &ckrm_class_reset_stats, - - // mandatory private version .. no dflt available - .add_resctrl = &tc_add_resctrl, -}; - -/************************************************************************** - * Helper Functions * - **************************************************************************/ - -static inline void ckrm_init_task_lock(struct task_struct *tsk) -{ - tsk->ckrm_tsklock = SPIN_LOCK_UNLOCKED; -} - -// Hubertus .. following functions should move to ckrm_rc.h - -static inline void ckrm_task_lock(struct task_struct *tsk) -{ - spin_lock(&tsk->ckrm_tsklock); -} - -static inline void ckrm_task_unlock(struct task_struct *tsk) -{ - spin_unlock(&tsk->ckrm_tsklock); -} - -/* - * Change the task class of the given task. - * - * Change the task's task class to "newcls" if the task's current - * class (task->taskclass) is same as given "oldcls", if it is non-NULL. - * - * Caller is responsible to make sure the task structure stays put through - * this function. 
- * - * This function should be called with the following locks NOT held - * - tsk->ckrm_task_lock - * - core->ckrm_lock, if core is NULL then ckrm_dflt_class.ckrm_lock - * - tsk->taskclass->ckrm_lock - * - * Function is also called with a ckrm_core_grab on the new core, hence - * it needs to be dropped if no assignment takes place. - */ -static void -ckrm_set_taskclass(struct task_struct *tsk, ckrm_task_class_t * newcls, - ckrm_task_class_t * oldcls, enum ckrm_event event) -{ - int i; - ckrm_classtype_t *clstype; - ckrm_res_ctlr_t *rcbs; - ckrm_task_class_t *curcls; - void *old_res_class, *new_res_class; - int drop_old_cls; - - ckrm_task_lock(tsk); - curcls = tsk->taskclass; - - if ((void *)-1 == curcls) { - // task is disassociated from ckrm... don't bother it. - ckrm_task_unlock(tsk); - ckrm_core_drop(class_core(newcls)); - return; - } - - if ((curcls == NULL) && (newcls == (void *)-1)) { - // task need to disassociated from ckrm and has no curcls - // just disassociate and return. - tsk->taskclass = newcls; - ckrm_task_unlock(tsk); - return; - } - // check whether compare_and_exchange should - if (oldcls && (oldcls != curcls)) { - ckrm_task_unlock(tsk); - if (newcls) { - /* compensate for previous grab */ - TC_DEBUG("(%s:%d): Race-condition caught <%s> %d\n", - tsk->comm, tsk->pid, class_core(newcls)->name, - event); - ckrm_core_drop(class_core(newcls)); - } - return; - } - // make sure we have a real destination core - if (!newcls) { - newcls = &taskclass_dflt_class; - ckrm_core_grab(class_core(newcls)); - } - // take out of old class - // remember that we need to drop the oldcore - if ((drop_old_cls = (curcls != NULL))) { - class_lock(class_core(curcls)); - if (newcls == curcls) { - // we are already in the destination class. - // we still need to drop oldcore - class_unlock(class_core(curcls)); - ckrm_task_unlock(tsk); - goto out; - } - list_del(&tsk->taskclass_link); - INIT_LIST_HEAD(&tsk->taskclass_link); - tsk->taskclass = NULL; - class_unlock(class_core(curcls)); - if (newcls == (void *)-1) { - tsk->taskclass = newcls; - ckrm_task_unlock(tsk); - // still need to get out of old class - newcls = NULL; - goto rc_handling; - } - } - // put into new class - class_lock(class_core(newcls)); - tsk->taskclass = newcls; - list_add(&tsk->taskclass_link, &class_core(newcls)->objlist); - class_unlock(class_core(newcls)); - - if (newcls == curcls) { - ckrm_task_unlock(tsk); - goto out; - } - - CE_NOTIFY(&CT_taskclass, event, newcls, tsk); - - ckrm_task_unlock(tsk); - - rc_handling: - clstype = &CT_taskclass; - if (clstype->bit_res_ctlrs) { - // avoid running through the entire list if non is registered - for (i = 0; i < clstype->max_resid; i++) { - if (clstype->res_ctlrs[i] == NULL) - continue; - atomic_inc(&clstype->nr_resusers[i]); - old_res_class = - curcls ? class_core(curcls)->res_class[i] : NULL; - new_res_class = - newcls ? 
class_core(newcls)->res_class[i] : NULL; - rcbs = clstype->res_ctlrs[i]; - if (rcbs && rcbs->change_resclass - && (old_res_class != new_res_class)) - (*rcbs->change_resclass) (tsk, old_res_class, - new_res_class); - atomic_dec(&clstype->nr_resusers[i]); - } - } - - out: - if (drop_old_cls) - ckrm_core_drop(class_core(curcls)); - return; -} - -// HF SUGGEST: we could macro-tize this for other types -// DEF_FUNC_ADD_RESCTRL(funcname,link) -// would DEF_FUNC_ADD_RESCTRL(tc_add_resctrl,taskclass_link) - -static void tc_add_resctrl(struct ckrm_core_class *core, int resid) -{ - struct task_struct *tsk; - struct ckrm_res_ctlr *rcbs; - - if ((resid < 0) || (resid >= CKRM_MAX_RES_CTLRS) - || ((rcbs = core->classtype->res_ctlrs[resid]) == NULL)) - return; - - class_lock(core); - list_for_each_entry(tsk, &core->objlist, taskclass_link) { - if (rcbs->change_resclass) - (*rcbs->change_resclass) (tsk, (void *)-1, - core->res_class[resid]); - } - class_unlock(core); -} - -/************************************************************************** - * Functions called from classification points * - **************************************************************************/ - -#define ECB_PRINTK(fmt, args...) \ -// do { if (CT_taskclass.ce_regd) -// printk("%s: " fmt, __FUNCTION__ , ## args); } while (0) - -#define CE_CLASSIFY_TASK(event, tsk) \ -do { \ - struct ckrm_task_class *newcls = NULL; \ - struct ckrm_task_class *oldcls = tsk->taskclass; \ - \ - CE_CLASSIFY_RET(newcls,&CT_taskclass,event,tsk); \ - if (newcls) { \ - /* called synchrously. no need to get task struct */ \ - ckrm_set_taskclass(tsk, newcls, oldcls, event); \ - } \ -} while (0) - - -#define CE_CLASSIFY_TASK_PROTECT(event, tsk) \ -do { \ - ce_protect(&CT_taskclass); \ - CE_CLASSIFY_TASK(event,tsk); \ - ce_release(&CT_taskclass); \ -} while (0) - -static void cb_taskclass_newtask(struct task_struct *tsk) -{ - tsk->taskclass = NULL; - INIT_LIST_HEAD(&tsk->taskclass_link); -} - -static void cb_taskclass_fork(struct task_struct *tsk) -{ - struct ckrm_task_class *cls = NULL; - - ECB_PRINTK("%p:%d:%s\n", tsk, tsk->pid, tsk->comm); - - ce_protect(&CT_taskclass); - CE_CLASSIFY_RET(cls, &CT_taskclass, CKRM_EVENT_FORK, tsk); - if (cls == NULL) { - ckrm_task_lock(tsk->parent); - cls = tsk->parent->taskclass; - ckrm_core_grab(class_core(cls)); - ckrm_task_unlock(tsk->parent); - } - if (!list_empty(&tsk->taskclass_link)) - printk(KERN_WARNING "BUG in cb_fork.. 
tsk (%s:%d> already linked\n", - tsk->comm, tsk->pid); - - ckrm_set_taskclass(tsk, cls, NULL, CKRM_EVENT_FORK); - ce_release(&CT_taskclass); -} - -static void cb_taskclass_exit(struct task_struct *tsk) -{ - CE_CLASSIFY_NORET(&CT_taskclass, CKRM_EVENT_EXIT, tsk); - ckrm_set_taskclass(tsk, (void *)-1, NULL, CKRM_EVENT_EXIT); -} - -static void cb_taskclass_exec(const char *filename) -{ - ECB_PRINTK("%p:%d:%s <%s>\n", current, current->pid, current->comm, - filename); - CE_CLASSIFY_TASK_PROTECT(CKRM_EVENT_EXEC, current); -} - -static void cb_taskclass_uid(void) -{ - ECB_PRINTK("%p:%d:%s\n", current, current->pid, current->comm); - CE_CLASSIFY_TASK_PROTECT(CKRM_EVENT_UID, current); -} - -static void cb_taskclass_gid(void) -{ - ECB_PRINTK("%p:%d:%s\n", current, current->pid, current->comm); - CE_CLASSIFY_TASK_PROTECT(CKRM_EVENT_GID, current); -} - -static void -cb_taskclass_xid(struct task_struct *tsk) -{ - ECB_PRINTK("%p:%d:%s\n",current,current->pid,current->comm); - CE_CLASSIFY_TASK_PROTECT(CKRM_EVENT_XID, tsk); -} - -static struct ckrm_event_spec taskclass_events_callbacks[] = { - CKRM_EVENT_SPEC(NEWTASK, cb_taskclass_newtask), - CKRM_EVENT_SPEC(EXEC, cb_taskclass_exec), - CKRM_EVENT_SPEC(FORK, cb_taskclass_fork), - CKRM_EVENT_SPEC(EXIT, cb_taskclass_exit), - CKRM_EVENT_SPEC(UID, cb_taskclass_uid), - CKRM_EVENT_SPEC(GID, cb_taskclass_gid), - CKRM_EVENT_SPEC(XID, cb_taskclass_xid), - {-1} -}; - -/*********************************************************************** - * - * Asynchronous callback functions (driven by RCFS) - * - * Async functions force a setting of the task structure - * synchronous callbacks are protected against race conditions - * by using a cmpxchg on the core before setting it. - * Async calls need to be serialized to ensure they can't - * race against each other - * - ***********************************************************************/ - -DECLARE_MUTEX(async_serializer); // serialize all async functions - -/* - * Go through the task list and reclassify all tasks according to the current - * classification rules. - * - * We have the problem that we can not hold any lock (including the - * tasklist_lock) while classifying. Two methods possible - * - * (a) go through entire pidrange (0..pidmax) and if a task exists at - * that pid then reclassify it - * (b) go several time through task list and build a bitmap for a particular - * subrange of pid otherwise the memory requirements ight be too much. 
- * - * We use a hybrid by comparing ratio nr_threads/pidmax - */ - -static int ckrm_reclassify_all_tasks(void) -{ - extern int pid_max; - - struct task_struct *proc, *thread; - int i; - int curpidmax = pid_max; - int ratio; - int use_bitmap; - - /* Check permissions */ - if ((!capable(CAP_SYS_NICE)) && (!capable(CAP_SYS_RESOURCE))) { - return -EPERM; - } - - ratio = curpidmax / nr_threads; - if (curpidmax <= PID_MAX_DEFAULT) { - use_bitmap = 1; - } else { - use_bitmap = (ratio >= 2); - } - - ce_protect(&CT_taskclass); - - retry: - - if (use_bitmap == 0) { - // go through it in one walk - read_lock(&tasklist_lock); - for (i = 0; i < curpidmax; i++) { - if ((thread = find_task_by_pid(i)) == NULL) - continue; - get_task_struct(thread); - read_unlock(&tasklist_lock); - CE_CLASSIFY_TASK(CKRM_EVENT_RECLASSIFY, thread); - put_task_struct(thread); - read_lock(&tasklist_lock); - } - read_unlock(&tasklist_lock); - } else { - unsigned long *bitmap; - int bitmapsize; - int order = 0; - int num_loops; - int pid, do_next; - - bitmap = (unsigned long *)__get_free_pages(GFP_KERNEL, order); - if (bitmap == NULL) { - use_bitmap = 0; - goto retry; - } - - bitmapsize = 8 * (1 << (order + PAGE_SHIFT)); - num_loops = (curpidmax + bitmapsize - 1) / bitmapsize; - - do_next = 1; - for (i = 0; i < num_loops && do_next; i++) { - int pid_start = i * bitmapsize; - int pid_end = pid_start + bitmapsize; - int num_found = 0; - int pos; - - memset(bitmap, 0, bitmapsize / 8); // start afresh - do_next = 0; - - read_lock(&tasklist_lock); - do_each_thread(proc, thread) { - pid = thread->pid; - if ((pid < pid_start) || (pid >= pid_end)) { - if (pid >= pid_end) { - do_next = 1; - } - continue; - } - pid -= pid_start; - set_bit(pid, bitmap); - num_found++; - } - while_each_thread(proc, thread); - read_unlock(&tasklist_lock); - - if (num_found == 0) - continue; - - pos = 0; - for (; num_found--;) { - pos = find_next_bit(bitmap, bitmapsize, pos); - pid = pos + pid_start; - - read_lock(&tasklist_lock); - if ((thread = find_task_by_pid(pid)) != NULL) { - get_task_struct(thread); - read_unlock(&tasklist_lock); - CE_CLASSIFY_TASK(CKRM_EVENT_RECLASSIFY, - thread); - put_task_struct(thread); - } else { - read_unlock(&tasklist_lock); - } - pos++; - } - } - - } - ce_release(&CT_taskclass); - return 0; -} - -/* - * Reclassify all tasks in the given core class. 
- */ - -static void ckrm_reclassify_class_tasks(struct ckrm_task_class *cls) -{ - int ce_regd; - struct ckrm_hnode *cnode; - struct ckrm_task_class *parcls; - int num = 0; - - if (!ckrm_validate_and_grab_core(&cls->core)) - return; - - down(&async_serializer); // protect again race condition - TC_DEBUG("start %p:%s:%d:%d\n", cls, cls->core.name, - atomic_read(&cls->core.refcnt), - atomic_read(&cls->core.hnode.parent->refcnt)); - // If no CE registered for this classtype, following will be needed - // repeatedly; - ce_regd = atomic_read(&class_core(cls)->classtype->ce_regd); - cnode = &(class_core(cls)->hnode); - parcls = class_type(ckrm_task_class_t, cnode->parent); - - next_task: - class_lock(class_core(cls)); - if (!list_empty(&class_core(cls)->objlist)) { - struct ckrm_task_class *newcls = NULL; - struct task_struct *tsk = - list_entry(class_core(cls)->objlist.next, - struct task_struct, taskclass_link); - - get_task_struct(tsk); - class_unlock(class_core(cls)); - - if (ce_regd) { - CE_CLASSIFY_RET(newcls, &CT_taskclass, - CKRM_EVENT_RECLASSIFY, tsk); - if (cls == newcls) { - // don't allow reclassifying to the same class - // as we are in the process of cleaning up - // this class - - // compensate CE's grab - ckrm_core_drop(class_core(newcls)); - newcls = NULL; - } - } - if (newcls == NULL) { - newcls = parcls; - ckrm_core_grab(class_core(newcls)); - } - ckrm_set_taskclass(tsk, newcls, cls, CKRM_EVENT_RECLASSIFY); - put_task_struct(tsk); - num++; - goto next_task; - } - TC_DEBUG("stop %p:%s:%d:%d %d\n", cls, cls->core.name, - atomic_read(&cls->core.refcnt), - atomic_read(&cls->core.hnode.parent->refcnt), num); - class_unlock(class_core(cls)); - ckrm_core_drop(class_core(cls)); - - up(&async_serializer); - - return; -} - -/* - * Change the core class of the given task - */ - -int ckrm_forced_reclassify_pid(pid_t pid, struct ckrm_task_class *cls) -{ - struct task_struct *tsk; - - if (cls && !ckrm_validate_and_grab_core(class_core(cls))) - return -EINVAL; - - read_lock(&tasklist_lock); - if ((tsk = find_task_by_pid(pid)) == NULL) { - read_unlock(&tasklist_lock); - if (cls) - ckrm_core_drop(class_core(cls)); - return -EINVAL; - } - get_task_struct(tsk); - read_unlock(&tasklist_lock); - - /* Check permissions */ - if ((!capable(CAP_SYS_NICE)) && - (!capable(CAP_SYS_RESOURCE)) && (current->user != tsk->user)) { - if (cls) - ckrm_core_drop(class_core(cls)); - put_task_struct(tsk); - return -EPERM; - } - - ce_protect(&CT_taskclass); - if (cls == NULL) - CE_CLASSIFY_TASK(CKRM_EVENT_RECLASSIFY,tsk); - else - ckrm_set_taskclass(tsk, cls, NULL, CKRM_EVENT_MANUAL); - - ce_release(&CT_taskclass); - put_task_struct(tsk); - - return 0; -} - -static struct ckrm_core_class *ckrm_alloc_task_class(struct ckrm_core_class - *parent, const char *name) -{ - struct ckrm_task_class *taskcls; - taskcls = kmalloc(sizeof(struct ckrm_task_class), GFP_KERNEL); - if (taskcls == NULL) - return NULL; - memset(taskcls, 0, sizeof(struct ckrm_task_class)); - - ckrm_init_core_class(&CT_taskclass, class_core(taskcls), parent, name); - - ce_protect(&CT_taskclass); - if (CT_taskclass.ce_cb_active && CT_taskclass.ce_callbacks.class_add) - (*CT_taskclass.ce_callbacks.class_add) (name, taskcls, - CT_taskclass.typeID); - ce_release(&CT_taskclass); - - return class_core(taskcls); -} - -static int ckrm_free_task_class(struct ckrm_core_class *core) -{ - struct ckrm_task_class *taskcls; - - if (!ckrm_is_core_valid(core)) { - // Invalid core - return (-EINVAL); - } - if (core == core->classtype->default_class) { - // reset the name 
tag - core->name = dflt_taskclass_name; - return 0; - } - - TC_DEBUG("%p:%s:%d\n", core, core->name, atomic_read(&core->refcnt)); - - taskcls = class_type(struct ckrm_task_class, core); - - ce_protect(&CT_taskclass); - - if (CT_taskclass.ce_cb_active && CT_taskclass.ce_callbacks.class_delete) - (*CT_taskclass.ce_callbacks.class_delete) (core->name, taskcls, - CT_taskclass.typeID); - ckrm_reclassify_class_tasks(taskcls); - - ce_release(&CT_taskclass); - - ckrm_release_core_class(core); - // Hubertus .... could just drop the class .. error message - return 0; -} - -void __init ckrm_meta_init_taskclass(void) -{ - printk(KERN_DEBUG "...... Initializing ClassType<%s> ........\n", - CT_taskclass.name); - // intialize the default class - ckrm_init_core_class(&CT_taskclass, class_core(&taskclass_dflt_class), - NULL, dflt_taskclass_name); - - // register classtype and initialize default task class - ckrm_register_classtype(&CT_taskclass); - ckrm_register_event_set(taskclass_events_callbacks); - - // note registeration of all resource controllers will be done - // later dynamically as these are specified as modules - - // prepare init_task and then rely on inheritance of properties - ckrm_set_taskclass(&init_task, NULL, NULL, CKRM_EVENT_NEWTASK); -} - -static int tc_show_members(struct ckrm_core_class *core, struct seq_file *seq) -{ - struct list_head *lh; - struct task_struct *tsk; - - class_lock(core); - list_for_each(lh, &core->objlist) { - tsk = container_of(lh, struct task_struct, taskclass_link); - seq_printf(seq, "%ld\n", (long)tsk->pid); - } - class_unlock(core); - - return 0; -} - -static int tc_forced_reclassify(struct ckrm_core_class *target, const char *obj) -{ - pid_t pid; - int rc = -EINVAL; - - pid = (pid_t) simple_strtol(obj, NULL, 0); - - down(&async_serializer); // protect again race condition with reclassify_class - if (pid < 0) { - // do we want to treat this as process group .. TBD - rc = -EINVAL; - } else if (pid == 0) { - rc = (target == NULL) ? ckrm_reclassify_all_tasks() : -EINVAL; - } else { - struct ckrm_task_class *cls = NULL; - if (target) - cls = class_type(ckrm_task_class_t,target); - rc = ckrm_forced_reclassify_pid(pid,cls); - } - up(&async_serializer); - return rc; -} - -#if 0 - -/****************************************************************************** - * Debugging Task Classes: Utility functions - ******************************************************************************/ - -void check_tasklist_sanity(struct ckrm_task_class *cls) -{ - struct ckrm_core_class *core = class_core(cls); - struct list_head *lh1, *lh2; - int count = 0; - - if (core) { - class_lock(core); - if (list_empty(&core->objlist)) { - class_lock(core); - printk(KERN_DEBUG "check_tasklist_sanity: class %s empty list\n", - core->name); - return; - } - list_for_each_safe(lh1, lh2, &core->objlist) { - struct task_struct *tsk = - container_of(lh1, struct task_struct, - taskclass_link); - if (count++ > 20000) { - printk(KERN_WARNING "list is CORRUPTED\n"); - break; - } - if (tsk->taskclass != cls) { - const char *tclsname; - tclsname = (tsk->taskclass) ? 
- class_core(tsk->taskclass)->name:"NULL"; - printk(KERN_WARNING "sanity: task %s:%d has ckrm_core " - "|%s| but in list |%s|\n", tsk->comm, - tsk->pid, tclsname, core->name); - } - } - class_unlock(core); - } -} - -void ckrm_debug_free_task_class(struct ckrm_task_class *tskcls) -{ - struct task_struct *proc, *thread; - int count = 0; - - printk(KERN_DEBUG "Analyze Error <%s> %d\n", - class_core(tskcls)->name, - atomic_read(&(class_core(tskcls)->refcnt))); - - read_lock(&tasklist_lock); - class_lock(class_core(tskcls)); - do_each_thread(proc, thread) { - count += (tskcls == thread->taskclass); - if ((thread->taskclass == tskcls) || (tskcls == NULL)) { - const char *tclsname; - tclsname = (thread->taskclass) ? - class_core(thread->taskclass)->name :"NULL"; - printk(KERN_DEBUG "%d thread=<%s:%d> -> <%s> <%lx>\n", count, - thread->comm, thread->pid, tclsname, - thread->flags & PF_EXITING); - } - } while_each_thread(proc, thread); - class_unlock(class_core(tskcls)); - read_unlock(&tasklist_lock); - - printk(KERN_DEBUG "End Analyze Error <%s> %d\n", - class_core(tskcls)->name, - atomic_read(&(class_core(tskcls)->refcnt))); -} - -#endif diff --git a/kernel/ckrm/ckrmutils.c b/kernel/ckrm/ckrmutils.c deleted file mode 100644 index 5e5bf29c8..000000000 --- a/kernel/ckrm/ckrmutils.c +++ /dev/null @@ -1,200 +0,0 @@ -/* - * ckrmutils.c - Utility functions for CKRM - * - * Copyright (C) Chandra Seetharaman, IBM Corp. 2003 - * (C) Hubertus Franke , IBM Corp. 2004 - * - * Provides simple utility functions for the core module, CE and resource - * controllers. - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -/* - * Changes - * - * 13 Nov 2003 - * Created - */ - -#include -#include -#include -#include -#include - -int get_exe_path_name(struct task_struct *tsk, char *buf, int buflen) -{ - struct vm_area_struct *vma; - struct vfsmount *mnt; - struct mm_struct *mm = get_task_mm(tsk); - struct dentry *dentry; - char *lname; - int rc = 0; - - *buf = '\0'; - if (!mm) { - return -EINVAL; - } - down_read(&mm->mmap_sem); - vma = mm->mmap; - while (vma) { - if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file) { - dentry = dget(vma->vm_file->f_dentry); - mnt = mntget(vma->vm_file->f_vfsmnt); - lname = d_path(dentry, mnt, buf, buflen); - if (!IS_ERR(lname)) { - strncpy(buf, lname, strlen(lname) + 1); - } else { - rc = (int)PTR_ERR(lname); - } - mntput(mnt); - dput(dentry); - break; - } - vma = vma->vm_next; - } - up_read(&mm->mmap_sem); - mmput(mm); - return rc; -} - -/* - * must be called with cnt_lock of parres held - * Caller is responsible for making sure that the new guarantee doesn't - * overflow parent's total guarantee. 
- */ -void child_guarantee_changed(struct ckrm_shares *parent, int cur, int new) -{ - if (new == cur || !parent) { - return; - } - if (new != CKRM_SHARE_DONTCARE) { - parent->unused_guarantee -= new; - } - if (cur != CKRM_SHARE_DONTCARE) { - parent->unused_guarantee += cur; - } - return; -} - -/* - * must be called with cnt_lock of parres held - * Caller is responsible for making sure that the new limit is not more - * than parent's max_limit - */ -void child_maxlimit_changed(struct ckrm_shares *parent, int new_limit) -{ - if (parent && parent->cur_max_limit < new_limit) { - parent->cur_max_limit = new_limit; - } - return; -} - -/* - * Caller is responsible for holding any lock to protect the data - * structures passed to this function - */ -int -set_shares(struct ckrm_shares *new, struct ckrm_shares *cur, - struct ckrm_shares *par) -{ - int rc = -EINVAL; - int cur_usage_guar = cur->total_guarantee - cur->unused_guarantee; - int increase_by; - - if (cur->my_guarantee < 0) // DONTCARE or UNCHANGED - increase_by = new->my_guarantee; - else - increase_by = new->my_guarantee - cur->my_guarantee; - - /* Check total_guarantee for correctness */ - if (new->total_guarantee <= CKRM_SHARE_DONTCARE) { - goto set_share_err; - } else if (new->total_guarantee == CKRM_SHARE_UNCHANGED) { - /* do nothing */; - } else if (cur_usage_guar > new->total_guarantee) { - goto set_share_err; - } - /* Check max_limit for correctness */ - if (new->max_limit <= CKRM_SHARE_DONTCARE) { - goto set_share_err; - } else if (new->max_limit == CKRM_SHARE_UNCHANGED) { - /* do nothing */; - } else if (cur->cur_max_limit > new->max_limit) { - goto set_share_err; - } - /* Check my_guarantee for correctness */ - if (new->my_guarantee == CKRM_SHARE_UNCHANGED) { - /* do nothing */; - } else if (new->my_guarantee == CKRM_SHARE_DONTCARE) { - /* do nothing */; - } else if (par && increase_by > par->unused_guarantee) { - goto set_share_err; - } - /* Check my_limit for correctness */ - if (new->my_limit == CKRM_SHARE_UNCHANGED) { - /* do nothing */; - } else if (new->my_limit == CKRM_SHARE_DONTCARE) { - /* do nothing */; - } else if (par && new->my_limit > par->max_limit) { - /* I can't get more limit than my parent's limit */ - goto set_share_err; - - } - /* make sure guarantee is lesser than limit */ - if (new->my_limit == CKRM_SHARE_DONTCARE) { - /* do nothing */; - } else if (new->my_limit == CKRM_SHARE_UNCHANGED) { - if (new->my_guarantee == CKRM_SHARE_DONTCARE) { - /* do nothing */; - } else if (new->my_guarantee == CKRM_SHARE_UNCHANGED) { - /* - * do nothing; earlier setting would have - * taken care of it - */; - } else if (new->my_guarantee > cur->my_limit) { - goto set_share_err; - } - } else { /* new->my_limit has a valid value */ - if (new->my_guarantee == CKRM_SHARE_DONTCARE) { - /* do nothing */; - } else if (new->my_guarantee == CKRM_SHARE_UNCHANGED) { - if (cur->my_guarantee > new->my_limit) { - goto set_share_err; - } - } else if (new->my_guarantee > new->my_limit) { - goto set_share_err; - } - } - if (new->my_guarantee != CKRM_SHARE_UNCHANGED) { - child_guarantee_changed(par, cur->my_guarantee, - new->my_guarantee); - cur->my_guarantee = new->my_guarantee; - } - if (new->my_limit != CKRM_SHARE_UNCHANGED) { - child_maxlimit_changed(par, new->my_limit); - cur->my_limit = new->my_limit; - } - if (new->total_guarantee != CKRM_SHARE_UNCHANGED) { - cur->unused_guarantee = new->total_guarantee - cur_usage_guar; - cur->total_guarantee = new->total_guarantee; - } - if (new->max_limit != CKRM_SHARE_UNCHANGED) { - cur->max_limit = 
new->max_limit; - } - rc = 0; -set_share_err: - return rc; -} - -EXPORT_SYMBOL_GPL(get_exe_path_name); -EXPORT_SYMBOL_GPL(child_guarantee_changed); -EXPORT_SYMBOL_GPL(child_maxlimit_changed); -EXPORT_SYMBOL_GPL(set_shares); diff --git a/kernel/ckrm/rbce/Makefile b/kernel/ckrm/rbce/Makefile deleted file mode 100644 index 6355d0b5a..000000000 --- a/kernel/ckrm/rbce/Makefile +++ /dev/null @@ -1,13 +0,0 @@ -# -# Makefile for CKRM -# - -obj-$(CONFIG_CKRM_RBCE) += rbce.o -rbce-objs := rbcemod.o rbce_fs.o - -obj-$(CONFIG_CKRM_CRBCE) += crbce.o -crbce-objs := crbcemod.o rbce_fs.o - -CFLAGS_crbcemod.o += -DRBCE_EXTENSION # compile rbcemod.c into crbce -CFLAGS_crbcemod.o += -DRBCE_DO_SAMPLE # disable if sampling not desired -CFLAGS_crbcemod.o += -DRBCE_DO_DELAY # disable if delay info not desired diff --git a/kernel/ckrm/rbce/bitvector.h b/kernel/ckrm/rbce/bitvector.h deleted file mode 100644 index 098cc2327..000000000 --- a/kernel/ckrm/rbce/bitvector.h +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (C) Hubertus Franke, IBM Corp. 2003 - * - * Bitvector package - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -/* Changes - * - * 15 Nov 2003 - * Created - */ - -#ifndef BITVECTOR_H -#define BITVECTOR_H - -typedef struct { - int size; // maxsize in longs - unsigned long bits[0]; // bit vector -} bitvector_t; - -#define BITS_2_LONGS(sz) (((sz)+BITS_PER_LONG-1)/BITS_PER_LONG) -#define BITS_2_BYTES(sz) (((sz)+7)/8) - -#if 0 -#define CHECK_VEC(vec) (vec) /* check against NULL */ -#else -#define CHECK_VEC(vec) (1) /* assume no problem */ -#endif - -#define CHECK_VEC_VOID(vec) do { if (!CHECK_VEC(vec)) return; } while(0) -#define CHECK_VEC_RC(vec, val) \ -do { if (!CHECK_VEC(vec)) return (val); } while(0) - -inline static void bitvector_zero(bitvector_t * bitvec) -{ - int sz; - - CHECK_VEC_VOID(bitvec); - sz = BITS_2_BYTES(bitvec->size); - memset(bitvec->bits, 0, sz); - return; -} - -inline static unsigned long bitvector_bytes(unsigned long size) -{ - return sizeof(bitvector_t) + BITS_2_BYTES(size); -} - -inline static void bitvector_init(bitvector_t * bitvec, unsigned long size) -{ - bitvec->size = size; - bitvector_zero(bitvec); - return; -} - -inline static bitvector_t *bitvector_alloc(unsigned long size) -{ - bitvector_t *vec = - (bitvector_t *) kmalloc(bitvector_bytes(size), GFP_KERNEL); - if (vec) { - vec->size = size; - bitvector_zero(vec); - } - return vec; -} - -inline static void bitvector_free(bitvector_t * bitvec) -{ - CHECK_VEC_VOID(bitvec); - kfree(bitvec); - return; -} - -#define def_bitvec_op(name,mod1,op,mod2) \ -inline static int name(bitvector_t *res, bitvector_t *op1, \ - bitvector_t *op2) \ -{ \ - unsigned int i, size; \ - \ - CHECK_VEC_RC(res, 0); \ - CHECK_VEC_RC(op1, 0); \ - CHECK_VEC_RC(op2, 0); \ - size = res->size; \ - if (((size != (op1)->size) || (size != (op2)->size))) { \ - return 0; \ - } \ - size = BITS_2_LONGS(size); \ - for (i = 0; i < size; i++) { \ - (res)->bits[i] = (mod1 (op1)->bits[i]) op \ - (mod2 (op2)->bits[i]); \ - } \ - return 1; \ -} - -def_bitvec_op(bitvector_or,, |,); -def_bitvec_op(bitvector_and,, &,); -def_bitvec_op(bitvector_xor,, ^,); -def_bitvec_op(bitvector_or_not,, |, ~); -def_bitvec_op(bitvector_not_or, ~, |,); -def_bitvec_op(bitvector_and_not,, &, ~); 
-def_bitvec_op(bitvector_not_and, ~, &,); - -inline static void bitvector_set(int idx, bitvector_t * vec) -{ - set_bit(idx, vec->bits); - return; -} - -inline static void bitvector_clear(int idx, bitvector_t * vec) -{ - clear_bit(idx, vec->bits); - return; -} - -inline static int bitvector_test(int idx, bitvector_t * vec) -{ - return test_bit(idx, vec->bits); -} - -#ifdef DEBUG -inline static void bitvector_print(int flag, bitvector_t * vec) -{ - unsigned int i; - int sz; - extern int rbcedebug; - - if ((rbcedebug & flag) == 0) { - return; - } - if (vec == NULL) { - printk(KERN_DEBUG "v<0>-NULL\n"); - return; - } - printk(KERN_DEBUG "v<%d>-", sz = vec->size); - for (i = 0; i < sz; i++) { - printk(KERN_DEBUG "%c", test_bit(i, vec->bits) ? '1' : '0'); - } - return; -} -#else -#define bitvector_print(x, y) -#endif - -#endif // BITVECTOR_H diff --git a/kernel/ckrm/rbce/crbce.h b/kernel/ckrm/rbce/crbce.h deleted file mode 100644 index c2967d18d..000000000 --- a/kernel/ckrm/rbce/crbce.h +++ /dev/null @@ -1,152 +0,0 @@ -/* - * crbce.h - * - * Copyright (C) Hubertus Franke, IBM Corp. 2003 - * - * This files contains the type definition of the record - * created by the CRBCE CKRM classification engine - * - * Changes - * - * 2003-11-11 Created by H.Franke - * 2003-12-01 Sanitized for Delivery by H.Franke - * - */ - -#ifndef CRBCE_RECORDS_H -#define CRBCE_RECORDS_H - -#include -#include -#include -#include - -#define CRBCE_UKCC_NAME "crbce_ukcc" -#define CRBCE_UKCC_PATH "/mnt/relayfs" - -#define CRBCE_UKCC_PATH_NAME CRBCE_UKCC_PATH"/"CRBCE_UKCC_NAME - -#define CRBCE_MAX_CLASS_NAME_LEN 256 - -/**************************************************************** - * - * CRBCE EVENT SET is and extension to the standard CKRM_EVENTS - * - ****************************************************************/ -enum { - - /* we use the standard CKRM_EVENT_<..> - * to identify reclassification cause actions - * and extend by additional ones we need - */ - - /* up event flow */ - - CRBCE_REC_EXIT = CKRM_NUM_EVENTS, - CRBCE_REC_DATA_DELIMITER, - CRBCE_REC_SAMPLE, - CRBCE_REC_TASKINFO, - CRBCE_REC_SYS_INFO, - CRBCE_REC_CLASS_INFO, - CRBCE_REC_KERNEL_CMD_DONE, - CRBCE_REC_UKCC_FULL, - - /* down command issueance */ - CRBCE_REC_KERNEL_CMD, - - CRBCE_NUM_EVENTS -}; - -struct task_sample_info { - uint32_t cpu_running; - uint32_t cpu_waiting; - uint32_t io_delayed; - uint32_t memio_delayed; -}; - -/********************************************* - * KERNEL -> USER records * - *********************************************/ - -/* we have records with either a time stamp or not */ -struct crbce_hdr { - int type; - pid_t pid; -}; - -struct crbce_hdr_ts { - int type; - pid_t pid; - uint32_t jiffies; - uint64_t cls; -}; - -/* individual records */ - -struct crbce_rec_fork { - struct crbce_hdr_ts hdr; - pid_t ppid; -}; - -struct crbce_rec_data_delim { - struct crbce_hdr_ts hdr; - int is_stop; /* 0 start, 1 stop */ -}; - -struct crbce_rec_task_data { - struct crbce_hdr_ts hdr; - struct task_sample_info sample; - struct task_delay_info delay; -}; - -struct crbce_ukcc_full { - struct crbce_hdr_ts hdr; -}; - -struct crbce_class_info { - struct crbce_hdr_ts hdr; - int action; - int namelen; - char name[CRBCE_MAX_CLASS_NAME_LEN]; -}; - -/********************************************* - * USER -> KERNEL records * - *********************************************/ - -enum crbce_kernel_cmd { - CRBCE_CMD_START, - CRBCE_CMD_STOP, - CRBCE_CMD_SET_TIMER, - CRBCE_CMD_SEND_DATA, -}; - -struct crbce_command { - int type; /* we need this for the K->U 
reflection */ - int cmd; - uint32_t len; /* added in the kernel for reflection */ -}; - -#define set_cmd_hdr(rec,tok) \ -((rec).hdr.type=CRBCE_REC_KERNEL_CMD,(rec).hdr.cmd=(tok)) - -struct crbce_cmd_done { - struct crbce_command hdr; - int rc; -}; - -struct crbce_cmd { - struct crbce_command hdr; -}; - -struct crbce_cmd_send_data { - struct crbce_command hdr; - int delta_mode; -}; - -struct crbce_cmd_settimer { - struct crbce_command hdr; - uint32_t interval; /* in msec .. 0 means stop */ -}; - -#endif diff --git a/kernel/ckrm/rbce/crbcemod.c b/kernel/ckrm/rbce/crbcemod.c deleted file mode 100644 index 3492049a3..000000000 --- a/kernel/ckrm/rbce/crbcemod.c +++ /dev/null @@ -1,2 +0,0 @@ -/* Easiest way to transmit a symbolic link as a patch */ -#include "rbcemod.c" diff --git a/kernel/ckrm/rbce/info.h b/kernel/ckrm/rbce/info.h deleted file mode 100644 index 3bc13b519..000000000 --- a/kernel/ckrm/rbce/info.h +++ /dev/null @@ -1,58 +0,0 @@ -static char *info = - "1. Magic files\n" - "\t|--rbce_info - read only file detailing how to set up and use RBCE.\n\n" - "\t|--rbce_reclassify - contains nothing. Writing a pid to it " - "reclassifies\n" - "\tthe given task according to the current set of rules.\n" - "\tWriting 0 to it reclassifies all tasks in the system according to the \n" - "\tcurrent set of rules. This is typically done by the user/sysadmin \n" - "\tafter changing/creating rules. \n\n" - "\t|--rbce_state - determines whether RBCE is currently active" - " or inactive.\n" - "\tWriting 1 (0) activates (deactivates) the CE. Reading the file\n" - "\treturns the current state.\n\n" - "\t|--rbce_tag - set tag of the given pid, syntax - \"pid tag\"\n\n" - "2. Rules subdirectory: Each rule of the RBCE is represented by a file in\n" - "/rcfs/ce/rules.\n\n" - "Following are the different attr/value pairs that can be specified.\n\n" - "Note: attr/value pairs must be separated by commas(,) with no space " - "between them\n\n" - "\t<*id><op>number where <op>={>,<,=,!}\n" - "\t<*id>={uid,euid,gid,egid}\n\n" - "\tcmd=\"string\" // basename of the command\n\n" - "\tpath=\"/path/to/string\" // full pathname of the command\n\n" - "\targs=\"string\" // argv[1] - argv[argc] of command\n\n" - "\ttag=\"string\" // application tag of the task\n\n" - "\t[+,-]depend=rule_filename\n" - "\t\t\t// used to chain a rule's terms with existing rules\n" - "\t\t\t// to avoid respecifying the latter's rule terms.\n" - "\t\t\t// A rule's dependent rules are evaluated before \n" - "\t\t\t// its rule terms get evaluated.\n" - "\t\t\t//\n" - "\t\t\t// An optional + or - can precede the depend keyword.\n" - "\t\t\t// +depend adds a dependent rule to the tail of the\n" - "\t\t\t// current chain, -depend removes an existing \n" - "\t\t\t// dependent rule\n\n" - "\torder=number // order in which this rule is executed relative to\n" - "\t\t\t// other independent rules.\n" - "\t\t\t// rule with order 1 is checked first and so on.\n" - "\t\t\t// As soon as a rule matches, the class of that rule\n" - "\t\t\t// is returned to Core. 
So, order really matters.\n" - "\t\t\t// If no order is specified by the user, the next\n" - "\t\t\t// highest available order number is assigned to\n" - "\t\t\t// the rule.\n\n" - "\tclass=\"/rcfs/.../classname\" // target class of this rule.\n" - "\t\t\t// /rcfs all by itself indicates the\n" - "\t\t\t// systemwide default class\n\n" - "\tstate=number // 1 or 0, provides the ability to deactivate a\n" - "\t\t\t// specific rule, if needed.\n\n" - "\tipv4=\"string\" // ipv4 address in dotted decimal and port\n" - "\t\t\t// e.g. \"127.0.0.1\\80\"\n" - "\t\t\t// e.g. \"*\\80\" for CE to match any address\n" - "\t\t\t// used in socket accept queue classes\n\n" - "\tipv6=\"string\" // ipv6 address in hex and port\n" - "\t\t\t// e.g. \"fe80::4567\\80\"\n" - "\t\t\t// e.g. \"*\\80\" for CE to match any address \n" - "\t\t\t// used in socket accept queue classes\n\n" - "\texample:\n" - "\techo \"uid=100,euid<200,class=/rcfs\" > /rcfs/ce/rules/rule1\n"; diff --git a/kernel/ckrm/rbce/rbce.h b/kernel/ckrm/rbce/rbce.h deleted file mode 100644 index a3af72fcd..000000000 --- a/kernel/ckrm/rbce/rbce.h +++ /dev/null @@ -1,122 +0,0 @@ -/* Rule-based Classification Engine (RBCE) module - * - * Copyright (C) Hubertus Franke, IBM Corp. 2003 - * (C) Chandra Seetharaman, IBM Corp. 2003 - * - * Module for loading of classification policies and providing - * a user API for Class-based Kernel Resource Management (CKRM) - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -/* Changes - * - * 25 Mar 2004 - * Integrate RBCE and CRBE into a single module - */ - -#ifndef RBCE_H -#define RBCE_H - -// data types defined in main rbcemod.c -struct rbce_private_data; -struct rbce_class; -struct ckrm_core_class; - -#ifndef RBCE_EXTENSION - -/**************************************************************************** - * - * RBCE STANDALONE VERSION, NO CHOICE FOR DATA COLLECTION - * - ****************************************************************************/ - -#ifdef RBCE_SHOW_INCL -#warning " ... RBCE .." -#endif - -#define RBCE_MOD_DESCR "Rule Based Classification Engine Module for CKRM" -#define RBCE_MOD_NAME "rbce" - -/* extension to private data: NONE */ -struct rbce_ext_private_data { - /* empty data */ -}; -static inline void init_ext_private_data(struct rbce_private_data *dst) -{ -} - -/* sending notification to user: NONE */ - -static void notify_class_action(struct rbce_class *cls, int action) -{ -} -static inline void send_fork_notification(struct task_struct *tsk, - struct ckrm_core_class *cls) -{ -} -static inline void send_exit_notification(struct task_struct *tsk) -{ -} -static inline void send_manual_notification(struct task_struct *tsk) -{ -} - -/* extension initialization and destruction at module init and exit */ -static inline int init_rbce_ext_pre(void) -{ - return 0; -} -static inline int init_rbce_ext_post(void) -{ - return 0; -} -static inline void exit_rbce_ext(void) -{ -} - -#else - -/*************************************************************************** - * - * RBCE with User Level Notification - * - ***************************************************************************/ - -#ifdef RBCE_SHOW_INCL -#warning " ... CRBCE .." -#ifdef RBCE_DO_SAMPLE -#warning " ... CRBCE doing sampling ..." 
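To make the grammar in rbce_info concrete, everything is driven through plain writes to the rcfs magic files. A hedged userspace sketch of a typical session; the /rcfs mount point and rule names follow the documentation above, while the taskclass paths (/rcfs/taskclass/A, /rcfs/taskclass/B) are illustrative and assume those classes already exist:

    #include <stdio.h>
    #include <stdlib.h>

    /* Write one string to an rcfs magic file. */
    static void put(const char *path, const char *s)
    {
        FILE *f = fopen(path, "w");
        if (!f) { perror(path); exit(1); }
        fputs(s, f);
        fclose(f);
    }

    int main(void)
    {
        /* Base rule: every task with uid 100 goes to class A. */
        put("/rcfs/ce/rules/webusers", "uid=100,class=/rcfs/taskclass/A");
        /* Chained rule: reuse webusers' terms, narrow by command name;
         * order=1 makes it win over later rules. */
        put("/rcfs/ce/rules/webcgi",
            "depend=webusers,cmd=\"cgi-bin\",order=1,class=/rcfs/taskclass/B");
        /* Activate the engine, then reclassify all existing tasks. */
        put("/rcfs/ce/rbce_state", "1");
        put("/rcfs/ce/rbce_reclassify", "0");
        return 0;
    }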
-#endif -#ifdef RBCE_DO_DELAY -#warning " ... CRBCE doing delay ..." -#endif -#endif - -#define RBCE_MOD_DESCR "Rule Based Classification Engine Module" \ - "with Data Sampling/Delivery for CKRM" -#define RBCE_MOD_NAME "crbce" - -#include "crbce.h" - -struct rbce_ext_private_data { - struct task_sample_info sample; -}; - -static void notify_class_action(struct rbce_class *cls, int action); -#if 0 -static void send_fork_notification(struct task_struct *tsk, - struct ckrm_core_class *cls); -static void send_exit_notification(struct task_struct *tsk); -static void send_manual_notification(struct task_struct *tsk); -#endif - -#endif - -#endif // RBCE_H diff --git a/kernel/ckrm/rbce/rbce_fs.c b/kernel/ckrm/rbce/rbce_fs.c deleted file mode 100644 index 8631bd18d..000000000 --- a/kernel/ckrm/rbce/rbce_fs.c +++ /dev/null @@ -1,490 +0,0 @@ -/* RCFS API for Rule-based Classification Engine (RBCE) and - * Consolidated RBCE module code (combined) - * - * Copyright (C) Hubertus Franke, IBM Corp. 2003 - * (C) Chandra Seetharaman, IBM Corp. 2003 - * (C) Vivek Kashyap, IBM Corp. 2004 - * - * Module for loading of classification policies and providing - * a user API for Class-based Kernel Resource Management (CKRM) - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -extern int rbce_enabled; -extern void get_rule(const char *, char *); -extern int rule_exists(const char *); -extern int change_rule(const char *, char *); -extern int delete_rule(const char *); -//extern int reclassify_pid(int); -extern int set_tasktag(int, char *); -extern int rename_rule(const char *, const char *); - -extern int rcfs_register_engine(rbce_eng_callback_t * rcbs); -extern int rcfs_unregister_engine(rbce_eng_callback_t * rcbs); -extern int rcfs_mkroot(struct rcfs_magf *, int, struct dentry **); -extern int rcfs_rmroot(struct dentry *); - -static int rbce_unlink(struct inode *, struct dentry *); - -#include "info.h" -static ssize_t -rbce_write(struct file *file, const char __user * buf, - size_t len, loff_t * ppos) -{ - char *line, *ptr; - int rc = 0, pid; - - line = (char *)kmalloc(len + 1, GFP_KERNEL); - if (!line) { - return -ENOMEM; - } - if (copy_from_user(line, buf, len)) { - kfree(line); - return -EFAULT; - } - line[len] = '\0'; - ptr = line + strlen(line) - 1; - if (*ptr == '\n') { - *ptr = '\0'; - } - if (!strcmp(file->f_dentry->d_name.name, "rbce_tag")) { - pid = simple_strtol(line, &ptr, 0); - rc = set_tasktag(pid, ptr + 1); // expected syntax "pid tag" - } else if (!strcmp(file->f_dentry->d_name.name, "rbce_state")) { - rbce_enabled = line[0] - '0'; - } else if (!strcmp(file->f_dentry->d_name.name, "rbce_info")) { - len = -EPERM; - } else { - rc = change_rule(file->f_dentry->d_name.name, line); - } - if (rc) { - len = rc; - } - // printk("kernel read |%s|\n", line); - // printk("kernel read-2 |%s|\n", line+1000); - // printk prints only 1024 bytes once :) - // - kfree(line); - return len; -} - -static 
int rbce_show(struct seq_file *seq, void *offset) -{ - struct file *file = (struct file *)seq->private; - char result[256]; - - memset(result, 0, 256); - if (!strcmp(file->f_dentry->d_name.name, "rbce_tag")) { - return -EPERM; - } - if (!strcmp(file->f_dentry->d_name.name, "rbce_state")) { - seq_printf(seq, "%d\n", rbce_enabled); - return 0; - } - if (!strcmp(file->f_dentry->d_name.name, "rbce_info")) { - seq_printf(seq, info); - return 0; - } - - get_rule(file->f_dentry->d_name.name, result); - seq_printf(seq, "%s\n", result); - return 0; -} - -static int rbce_open(struct inode *inode, struct file *file) -{ - //printk("mnt_mountpoint %s\n", - // file->f_vfsmnt->mnt_mountpoint->d_name.name); - //printk("mnt_root %s\n", file->f_vfsmnt->mnt_root->d_name.name); - return single_open(file, rbce_show, file); -} - -static int rbce_close(struct inode *ino, struct file *file) -{ - const char *name = file->f_dentry->d_name.name; - - if (strcmp(name, "rbce_state") && - strcmp(name, "rbce_tag") && strcmp(name, "rbce_info")) { - - if (!rule_exists(name)) { - // need more stuff to happen in the vfs layer - rbce_unlink(file->f_dentry->d_parent->d_inode, - file->f_dentry); - } - } - return single_release(ino, file); -} - -#define RCFS_MAGIC 0x4feedbac - -static struct file_operations rbce_file_operations; -static struct inode_operations rbce_file_inode_operations; -static struct inode_operations rbce_dir_inode_operations; - -static struct inode *rbce_get_inode(struct inode *dir, int mode, dev_t dev) -{ - struct inode *inode = new_inode(dir->i_sb); - - if (inode) { - inode->i_mode = mode; - inode->i_uid = current->fsuid; - inode->i_gid = current->fsgid; - inode->i_blksize = PAGE_CACHE_SIZE; - inode->i_blocks = 0; - inode->i_mapping->a_ops = dir->i_mapping->a_ops; - inode->i_mapping->backing_dev_info = - dir->i_mapping->backing_dev_info; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - switch (mode & S_IFMT) { - default: - init_special_inode(inode, mode, dev); - break; - case S_IFREG: - /* Treat as default assignment */ - inode->i_op = &rbce_file_inode_operations; - inode->i_fop = &rbce_file_operations; - break; - case S_IFDIR: - inode->i_op = &rbce_dir_inode_operations; - inode->i_fop = &simple_dir_operations; - - /* directory inodes start off with i_nlink == 2 - (for "." entry) */ - inode->i_nlink++; - break; - } - } - return inode; -} - -/* - * File creation. Allocate an inode, and we're done.. 
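rbce_write() above dispatches purely on the dentry name: writes to rbce_tag are parsed as "pid tag" and handed to set_tasktag(), writes to rbce_state flip rbce_enabled, and anything else is treated as a rule definition. A small sketch of tagging the current process so that a tag= rule term can match it; the tag string "goldserver" is an invented example:

    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        FILE *f = fopen("/rcfs/ce/rbce_tag", "w");
        if (!f) {
            perror("rbce_tag");
            return 1;
        }
        /* Expected syntax per rbce_write(): "pid tag" */
        fprintf(f, "%d goldserver", (int)getpid());
        fclose(f);
        /* A rule such as  tag="goldserver",class=/rcfs/taskclass/gold
         * written to /rcfs/ce/rules/ would now match this task. */
        return 0;
    }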
- */ -/* SMP-safe */ -static int -rbce_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) -{ - struct inode *inode = rbce_get_inode(dir, mode, dev); - int error = -ENOSPC; - - if (inode) { - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - inode->i_mode |= S_ISGID; - } - d_instantiate(dentry, inode); - dget(dentry); /* Extra count - pin the dentry in core */ - error = 0; - - } - return error; -} - -static int rbce_unlink(struct inode *dir, struct dentry *dentry) -{ - struct inode *inode = dentry->d_inode; - int rc; - - rc = delete_rule(dentry->d_name.name); - if (rc == 0) { - if (dir) { - dir->i_ctime = dir->i_mtime = CURRENT_TIME; - } - inode->i_ctime = CURRENT_TIME; - inode->i_nlink--; - dput(dentry); - } - return rc; -} - -static int -rbce_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) -{ - int rc; - struct inode *inode = old_dentry->d_inode; - struct dentry *old_d = list_entry(old_dir->i_dentry.next, - struct dentry, d_alias); - struct dentry *new_d = list_entry(new_dir->i_dentry.next, - struct dentry, d_alias); - - // cannot rename any directory - if (S_ISDIR(old_dentry->d_inode->i_mode)) { - return -EINVAL; - } - // cannot rename anything under /ce - if (!strcmp(old_d->d_name.name, "ce")) { - return -EINVAL; - } - // cannot move anything to /ce - if (!strcmp(new_d->d_name.name, "ce")) { - return -EINVAL; - } - - rc = rename_rule(old_dentry->d_name.name, new_dentry->d_name.name); - - if (!rc) { - old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime = - new_dir->i_mtime = inode->i_ctime = CURRENT_TIME; - } - return rc; -} - -// CE allows only the rules directory to be created -int rbce_mkdir(struct inode *dir, struct dentry *dentry, int mode) -{ - int retval = -EINVAL; - - struct dentry *pd = - list_entry(dir->i_dentry.next, struct dentry, d_alias); - - // Allow only /rcfs/ce and ce/rules - if ((!strcmp(pd->d_name.name, "ce") && - !strcmp(dentry->d_name.name, "rules")) || - (!strcmp(pd->d_name.name, "/") && - !strcmp(dentry->d_name.name, "ce"))) { - - if (!strcmp(dentry->d_name.name, "ce")) { - try_module_get(THIS_MODULE); - } - retval = rbce_mknod(dir, dentry, mode | S_IFDIR, 0); - if (!retval) { - dir->i_nlink++; - } - } - - return retval; -} - -// CE doesn't allow deletion of directory -int rbce_rmdir(struct inode *dir, struct dentry *dentry) -{ - int rc; - // printk("removal of directory %s prohibited\n", dentry->d_name.name); - rc = simple_rmdir(dir, dentry); - - if (!rc && !strcmp(dentry->d_name.name, "ce")) { - module_put(THIS_MODULE); - } - return rc; -} - -static int -rbce_create(struct inode *dir, struct dentry *dentry, - int mode, struct nameidata *nd) -{ - struct dentry *pd = - list_entry(dir->i_dentry.next, struct dentry, d_alias); - - // Under /ce only "rbce_state", "rbce_tag" and "rbce_info" are allowed - if (!strcmp(pd->d_name.name, "ce")) { - if (strcmp(dentry->d_name.name, "rbce_state") && - strcmp(dentry->d_name.name, "rbce_tag") && - strcmp(dentry->d_name.name, "rbce_info")) { - return -EINVAL; - } - } - - return rbce_mknod(dir, dentry, mode | S_IFREG, 0); -} - -static int rbce_link(struct dentry *old_d, struct inode *dir, struct dentry *d) -{ - return -EINVAL; -} - -static int -rbce_symlink(struct inode *dir, struct dentry *dentry, const char *symname) -{ - return -EINVAL; -} - -/******************************* Magic files ********************/ - -#define RBCE_NR_MAGF 5 -struct rcfs_magf rbce_magf_files[RBCE_NR_MAGF] = { - { - .name = "ce", - .mode = 
RCFS_DEFAULT_DIR_MODE, - .i_op = &rbce_dir_inode_operations, - }, - { - .name = "rbce_tag", - .mode = RCFS_DEFAULT_FILE_MODE, - .i_fop = &rbce_file_operations, - }, - { - .name = "rbce_info", - .mode = RCFS_DEFAULT_FILE_MODE, - .i_fop = &rbce_file_operations, - }, - { - .name = "rbce_state", - .mode = RCFS_DEFAULT_FILE_MODE, - .i_fop = &rbce_file_operations, - }, - { - .name = "rules", - .mode = (RCFS_DEFAULT_DIR_MODE | S_IWUSR), - .i_fop = &simple_dir_operations, - .i_op = &rbce_dir_inode_operations, - } -}; - -static struct dentry *ce_root_dentry; - -int rbce_create_magic(void) -{ - int rc; - - // Make root dentry - rc = rcfs_mkroot(rbce_magf_files, RBCE_NR_MAGF, &ce_root_dentry); - if ((!ce_root_dentry) || rc) - return rc; - - // Create magic files - if ((rc = rcfs_create_magic(ce_root_dentry, &rbce_magf_files[1], - RBCE_NR_MAGF - 1))) { - printk(KERN_ERR "Failed to create c/rbce magic files." - " Deleting c/rbce root\n"); - rcfs_rmroot(ce_root_dentry); - return rc; - } - - return rc; -} - -int rbce_clear_magic(void) -{ - int rc = 0; - if (ce_root_dentry) - rc = rcfs_rmroot(ce_root_dentry); - return rc; -} - -/******************************* File ops ********************/ - -static struct file_operations rbce_file_operations = { - .owner = THIS_MODULE, - .open = rbce_open, - .llseek = seq_lseek, - .read = seq_read, - .write = rbce_write, - .release = rbce_close, -}; - -static struct inode_operations rbce_file_inode_operations = { - .getattr = simple_getattr, -}; - -static struct inode_operations rbce_dir_inode_operations = { - .create = rbce_create, - .lookup = simple_lookup, - .link = rbce_link, - .unlink = rbce_unlink, - .symlink = rbce_symlink, - .mkdir = rbce_mkdir, - .rmdir = rbce_rmdir, - .mknod = rbce_mknod, - .rename = rbce_rename, - .getattr = simple_getattr, -}; - -#if 0 -static void rbce_put_super(struct super_block *sb) -{ - module_put(THIS_MODULE); - printk(KERN_DEBUG "rbce_put_super called\n"); -} - -static struct super_operations rbce_ops = { - .statfs = simple_statfs, - .drop_inode = generic_delete_inode, - .put_super = rbce_put_super, -}; - -static int rbce_fill_super(struct super_block *sb, void *data, int silent) -{ - struct inode *inode; - struct dentry *root; - - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; - sb->s_magic = RCFS_MAGIC; - sb->s_op = &rbce_ops; - inode = rbce_get_inode(sb, S_IFDIR | 0755, 0); - if (!inode) - return -ENOMEM; - - root = d_alloc_root(inode); - if (!root) { - iput(inode); - return -ENOMEM; - } - sb->s_root = root; - - return 0; -} - -static struct super_block *rbce_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) -{ - struct super_block *sb = - get_sb_nodev(fs_type, flags, data, rbce_fill_super); - if (sb) { - try_module_get(THIS_MODULE); - } - return sb; -} - -static struct file_system_type rbce_fs_type = { - .name = "rbce", - .get_sb = rbce_get_sb, - .kill_sb = kill_litter_super, -}; - -static int -__init init_rbce_fs(void) -{ - return register_filesystem(&rbce_fs_type); -} - -static void -__exit exit_rbce_fs(void) -{ - unregister_filesystem(&rbce_fs_type); -} - -module_init(init_rbce_fs) - module_exit(exit_rbce_fs) - MODULE_LICENSE("GPL"); -#endif diff --git a/kernel/ckrm/rbce/rbcemod.c b/kernel/ckrm/rbce/rbcemod.c deleted file mode 100644 index 98f624fdf..000000000 --- a/kernel/ckrm/rbce/rbcemod.c +++ /dev/null @@ -1,2611 +0,0 @@ -/* Rule-based Classification Engine (RBCE) and - * Consolidated RBCE module code (combined) - * - * Copyright (C) Hubertus Franke, IBM 
Corp. 2003 - * (C) Chandra Seetharaman, IBM Corp. 2003 - * (C) Vivek Kashyap, IBM Corp. 2004 - * - * Module for loading of classification policies and providing - * a user API for Class-based Kernel Resource Management (CKRM) - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - */ - -/* Changes - * - * 28 Aug 2003 - * Created. First cut with much scope for cleanup ! - * 07 Nov 2003 - * Made modifications to suit the new RBCE module. - * Made modifications to address sampling and delivery - * 16 Mar 2004 - * Integrated changes from original RBCE module - * 25 Mar 2004 - * Merged RBCE and CRBCE into common code base - * 29 Mar 2004 - * Incorporated listen call back and IPv4 match support - * 23 Apr 2004 - * Added Multi-Classtype Support - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include "bitvector.h" -#include - -#warning MEF I cannot believe that vserver changes force the following include statement: FIX THIS! -#include - - -#define DEBUG - -MODULE_DESCRIPTION(RBCE_MOD_DESCR); -MODULE_AUTHOR("Hubertus Franke, Chandra Seetharaman (IBM)"); -MODULE_LICENSE("GPL"); - -static char modname[] = RBCE_MOD_NAME; - -/* ==================== typedef, global variables etc., ==================== */ -struct named_obj_hdr { - struct list_head link; - int referenced; - char *name; -}; - -#define GET_REF(x) ((x)->obj.referenced) -#define INC_REF(x) (GET_REF(x)++) -#define DEC_REF(x) (--GET_REF(x)) -struct rbce_class { - struct named_obj_hdr obj; - int classtype; - void *classobj; -}; - -typedef enum { - RBCE_RULE_CMD_PATH = 1, // full qualified path - RBCE_RULE_CMD, // basename of the command - RBCE_RULE_ARGS, // arguments of the command - RBCE_RULE_REAL_UID, // task's real uid - RBCE_RULE_REAL_GID, // task's real gid - RBCE_RULE_EFFECTIVE_UID, // task's effective uid - RBCE_RULE_EFFECTIVE_GID, // task's effective gid - RBCE_RULE_APP_TAG, // task's application tag - RBCE_RULE_IPV4, // IP address of listen(), ipv4 format - RBCE_RULE_IPV6, // IP address of listen(), ipv6 format - RBCE_RULE_XID, // VSERVER - RBCE_RULE_DEP_RULE, // dependent rule; must be the first term - RBCE_RULE_INVALID, // invalid, for filler - RBCE_RULE_INVALID2, // invalid, for filler -} rbce_rule_op_t; - -typedef enum { - RBCE_EQUAL = 1, - RBCE_NOT, - RBCE_LESS_THAN, - RBCE_GREATER_THAN, -} rbce_operator_t; - -struct rbce_rule_term { - rbce_rule_op_t op; - rbce_operator_t operator; - union { - char *string; // path, cmd, arg, tag, ipv4 and ipv6 - long id; // uid, gid, euid, egid - struct rbce_rule *deprule; - } u; -}; - -struct rbce_rule { - struct named_obj_hdr obj; - struct rbce_class *target_class; - int classtype; - int num_terms; - int *terms; // vector of indices into the global term vector - int index; // index of this rule into the global term vector - int termflag; // which term ids would require a recalculation - int do_opt; // do we have to consider this rule during optimize - char *strtab; // string table to store the strings of all terms - int order; // order 
of execution of this rule - int state; // RBCE_RULE_ENABLED/RBCE_RULE_DISABLED -}; - -// rules states -#define RBCE_RULE_DISABLED 0 -#define RBCE_RULE_ENABLED 1 - -/// -// Data structures and macros used for optimization -#define RBCE_TERM_CMD (0) -#define RBCE_TERM_UID (1) -#define RBCE_TERM_GID (2) -#define RBCE_TERM_TAG (3) -#define RBCE_TERM_IPV4 (4) -#define RBCE_TERM_IPV6 (5) -#define RBCE_TERM_XID (6) - -#define NUM_TERM_MASK_VECTOR (7) // must be one more the last RBCE_TERM_... - -// Rule flags. 1 bit for each type of rule term -#define RBCE_TERMFLAG_CMD (1 << RBCE_TERM_CMD) -#define RBCE_TERMFLAG_UID (1 << RBCE_TERM_UID) -#define RBCE_TERMFLAG_GID (1 << RBCE_TERM_GID) -#define RBCE_TERMFLAG_TAG (1 << RBCE_TERM_TAG) -#define RBCE_TERMFLAG_IPV4 (1 << RBCE_TERM_IPV4) -#define RBCE_TERMFLAG_IPV6 (1 << RBCE_TERM_IPV6) -#define RBCE_TERMFLAG_XID (1 << RBCE_TERM_XID) -#define RBCE_TERMFLAG_ALL (RBCE_TERMFLAG_CMD | RBCE_TERMFLAG_UID | \ - RBCE_TERMFLAG_GID | RBCE_TERMFLAG_TAG | RBCE_TERMFLAG_XID | \ - RBCE_TERMFLAG_IPV4 | RBCE_TERMFLAG_IPV6) - -int termop_2_vecidx[RBCE_RULE_INVALID] = { - [RBCE_RULE_CMD_PATH] = RBCE_TERM_CMD, - [RBCE_RULE_CMD] = RBCE_TERM_CMD, - [RBCE_RULE_ARGS] = RBCE_TERM_CMD, - [RBCE_RULE_REAL_UID] = RBCE_TERM_UID, - [RBCE_RULE_REAL_GID] = RBCE_TERM_GID, - [RBCE_RULE_EFFECTIVE_UID] = RBCE_TERM_UID, - [RBCE_RULE_EFFECTIVE_GID] = RBCE_TERM_GID, - [RBCE_RULE_XID] = RBCE_TERM_XID, - [RBCE_RULE_APP_TAG] = RBCE_TERM_TAG, - [RBCE_RULE_IPV4] = RBCE_TERM_IPV4, - [RBCE_RULE_IPV6] = RBCE_TERM_IPV6, - [RBCE_RULE_DEP_RULE] = -1 -}; - -#define TERMOP_2_TERMFLAG(x) (1 << termop_2_vecidx[x]) -#define TERM_2_TERMFLAG(x) (1 << x) - -#define POLICY_INC_NUMTERMS (BITS_PER_LONG) // No. of terms added at a time -#define POLICY_ACTION_NEW_VERSION 0x01 // Force reallocation -#define POLICY_ACTION_REDO_ALL 0x02 // Recompute all rule flags -#define POLICY_ACTION_PACK_TERMS 0x04 // Time to pack the terms - -const int use_persistent_state = 1; - -struct ckrm_eng_callback ckrm_ecbs; - -// Term vector state -// -static int gl_bitmap_version, gl_action, gl_num_terms; -static int gl_allocated, gl_released; -struct rbce_rule_term *gl_terms; -bitvector_t *gl_mask_vecs[NUM_TERM_MASK_VECTOR]; - -extern int errno; -static void optimize_policy(void); - -#ifndef CKRM_MAX_CLASSTYPES -#define CKRM_MAX_CLASSTYPES 32 -#endif - -struct list_head rules_list[CKRM_MAX_CLASSTYPES]; -LIST_HEAD(class_list); // List of classes used - -static int gl_num_rules; -static int gl_rules_version; -int rbce_enabled = 1; -static rwlock_t global_rwlock = RW_LOCK_UNLOCKED; - /* - * One lock to protect them all !!! - * Additions, deletions to rules must - * happen with this lock being held in write mode. - * Access(read/write) to any of the data structures must happen - * with this lock held in read mode. - * Since, rule related changes do not happen very often it is ok to - * have single rwlock. - */ - -/* - * data structure rbce_private_data holds the bit vector 'eval' which - * specifies if rules and terms of rules are evaluated against the task - * and if they were evaluated, bit vector 'true' holds the result of that - * evaluation. - * - * This data structure is maintained in a task, and the bitvectors are - * updated only when needed. - * - * Each rule and each term of a rule has a corresponding bit in the vector. - * - */ -struct rbce_private_data { - struct rbce_ext_private_data ext_data; - int evaluate; // whether to evaluate rules or not ? - int rules_version; // whether to evaluate rules or not ? 
- char *app_tag; - unsigned long bitmap_version; - bitvector_t *eval; - bitvector_t *true; - char data[0]; // eval points to this variable size data array -}; - -#define RBCE_DATA(tsk) ((struct rbce_private_data*)((tsk)->ce_data)) -#define RBCE_DATAP(tsk) ((tsk)->ce_data) - -/* ======================= DEBUG Functions ========================= */ - -#ifdef DEBUG - -int rbcedebug = 0x00; - -#define DBG_CLASSIFY_RES ( 0x01 ) -#define DBG_CLASSIFY_DETAILS ( 0x02 ) -#define DBG_OPTIMIZATION ( 0x04 ) -#define DBG_SHOW_RESCTL ( 0x08 ) -#define DBG_CLASS ( 0x10 ) -#define DBG_RULE ( 0x20 ) -#define DBG_POLICY ( 0x40 ) - -#define DPRINTK(x, y...) if (rbcedebug & (x)) printk(KERN_DEBUG y) - // debugging selectively enabled through /proc/sys/debug/rbce - -static void print_context_vectors(void) -{ - int i; - - if ((rbcedebug & DBG_OPTIMIZATION) == 0) { - return; - } - for (i = 0; i < NUM_TERM_MASK_VECTOR; i++) { - printk(KERN_DEBUG "%d: ", i); - bitvector_print(DBG_OPTIMIZATION, gl_mask_vecs[i]); - printk(KERN_DEBUG "\n"); - } -} -#else - -#define DPRINTK(x, y...) -#define print_context_vectors(x) -#endif - -/* ====================== VSERVER support ========================== */ -#define CONFIG_VSERVER -#ifdef CONFIG_VSERVER -#include -#else -typedef unsigned int xid_t; -#define vx_task_xid(t) (0) -#endif - -/* ======================= Helper Functions ========================= */ - -#include "token.c" - -static struct ckrm_core_class *rbce_classify(struct task_struct *, - struct ckrm_net_struct *, - unsigned long, int classtype); - -static inline struct rbce_rule *find_rule_name(const char *name) -{ - struct named_obj_hdr *pos; - int i; - - for (i = 0; i < CKRM_MAX_CLASSTYPES; i++) { - list_for_each_entry(pos, &rules_list[i], link) { - if (!strcmp(pos->name, name)) { - return ((struct rbce_rule *)pos); - } - } - } - return NULL; -} - -static inline struct rbce_class *find_class_name(const char *name) -{ - struct named_obj_hdr *pos; - - list_for_each_entry(pos, &class_list, link) { - if (!strcmp(pos->name, name)) - return (struct rbce_class *)pos; - } - return NULL; -} - -/* - * Insert the given rule at the specified order - * order = -1 ==> insert at the tail. - * - * Caller must hold global_rwlock in write mode. - */ -static int insert_rule(struct rbce_rule *rule, int order) -{ -#define ORDER_COUNTER_INCR 10 - static int order_counter; - int old_counter; - struct list_head *head = &rules_list[rule->classtype]; - struct list_head *insert = head; - struct rbce_rule *tmp; - - if (gl_num_rules == 0) { - order_counter = 0; - } - - switch (order) { - case -1: - rule->order = order_counter; - // FIXME: order_counter overflow/wraparound!! - order_counter += ORDER_COUNTER_INCR; - break; - default: - old_counter = order_counter; - if (order_counter < order) { - order_counter = order; - } - rule->order = order; - order_counter += ORDER_COUNTER_INCR; - list_for_each_entry(tmp, head, obj.link) { - if (rule->order == tmp->order) { - order_counter = old_counter; - return -EEXIST; - } - if (rule->order < tmp->order) { - insert = &tmp->obj.link; - break; - } - } - } - list_add_tail(&rule->obj.link, insert); - // protect the module from removed when any rule is - // defined - try_module_get(THIS_MODULE); - gl_num_rules++; - gl_rules_version++; - return 0; -} - -/* - * Remove the rule and reinsert at the specified order. - * - * Caller must hold global_rwlock in write mode. 
- */ -static int reinsert_rule(struct rbce_rule *rule, int order) -{ - if (!list_empty(&rule->obj.link)) { - list_del_init(&rule->obj.link); - gl_num_rules--; - gl_rules_version++; - module_put(THIS_MODULE); - } - return insert_rule(rule, order); -} - -/* - * Get a reference to the class; create one if it doesn't exist. - * - * Caller needs to hold global_rwlock in write mode. - * __GFP_WAIT - */ - -static struct rbce_class *create_rbce_class(const char *classname, - int classtype, void *classobj) -{ - struct rbce_class *cls; - - if (classtype >= CKRM_MAX_CLASSTYPES) { - printk(KERN_ERR - "ckrm_classobj returned %d as classtype which cannot " - "be handled by RBCE\n", classtype); - return NULL; - } - - cls = kmalloc(sizeof(struct rbce_class), GFP_ATOMIC); - if (!cls) { - return NULL; - } - cls->obj.name = kmalloc(strlen(classname) + 1, GFP_ATOMIC); - if (cls->obj.name) { - GET_REF(cls) = 1; - cls->classobj = classobj; - strcpy(cls->obj.name, classname); - list_add_tail(&cls->obj.link, &class_list); - cls->classtype = classtype; - } else { - kfree(cls); - cls = NULL; - } - return cls; -} - -static struct rbce_class *get_class(const char *classname, int *classtype) -{ - struct rbce_class *cls; - void *classobj; - - if (!classname) { - return NULL; - } - cls = find_class_name(classname); - if (cls) { - if (cls->classobj) { - INC_REF(cls); - *classtype = cls->classtype; - return cls; - } - return NULL; - } - classobj = ckrm_classobj(classname, classtype); - if (!classobj) { - return NULL; - } - - return create_rbce_class(classname, *classtype, classobj); -} - -/* - * Drop a reference to the class; free it once the reference count - * reaches zero. - * - * Caller needs to hold global_rwlock in write mode. - */ -static void put_class(struct rbce_class *cls) -{ - if (cls) { - if (DEC_REF(cls) <= 0) { - list_del(&cls->obj.link); - kfree(cls->obj.name); - kfree(cls); - } - } - return; -} - -/* - * Callback from core when a class is added - */ - -#ifdef RBCE_EXTENSION -static void rbce_class_addcb(const char *classname, void *clsobj, int classtype) -{ - struct rbce_class *cls; - - write_lock(&global_rwlock); - cls = get_class(classname, &classtype); - if (cls) { - cls->classobj = clsobj; - notify_class_action(cls, 1); - } - write_unlock(&global_rwlock); - return; -} -#endif - -/* - * Callback from core when a class is deleted. - */ -static void -rbce_class_deletecb(const char *classname, void *classobj, int classtype) -{ - static struct rbce_class *cls; - struct named_obj_hdr *pos; - struct rbce_rule *rule; - - write_lock(&global_rwlock); - cls = find_class_name(classname); - if (cls) { -#ifdef RBCE_EXTENSION - put_class(cls); -#endif - if (cls->classobj != classobj) { - printk(KERN_ERR "rbce: class %s changed identity\n", - classname); - } - notify_class_action(cls, 0); - cls->classobj = NULL; - list_for_each_entry(pos, &rules_list[classtype], link) { - rule = (struct rbce_rule *)pos; - if (rule->target_class) { - if (!strcmp - (rule->target_class->obj.name, classname)) { - put_class(cls); - rule->target_class = NULL; - rule->classtype = -1; - } - } - } - if ((cls = find_class_name(classname)) != NULL) { - printk(KERN_ERR - "rbce ERROR: class %s exists in rbce after " - "removal in core\n", classname); - } - } - write_unlock(&global_rwlock); - return; -} - -/* - * Allocate an index in the global term vector - * On success, returns the index. On failure returns -errno. - * Caller must hold the global_rwlock in write mode as global data is - * written to. 
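The get_class()/put_class() pair above implements a small intrusive refcount: GET_REF lives in the shared named_obj_hdr, and the object frees itself when the count falls to zero. A standalone sketch of that pattern (userspace C; the kernel's list handling is reduced to nothing so it compiles without list.h):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct named_obj_hdr {
        int referenced;
        char *name;
    };

    #define GET_REF(x) ((x)->obj.referenced)
    #define INC_REF(x) (GET_REF(x)++)
    #define DEC_REF(x) (--GET_REF(x))

    struct class_sketch {
        struct named_obj_hdr obj;
        int classtype;
    };

    static struct class_sketch *get_class_sketch(const char *name)
    {
        struct class_sketch *cls = malloc(sizeof(*cls));
        if (!cls)
            return NULL;
        cls->obj.name = strdup(name);
        GET_REF(cls) = 1;        /* creator holds the first reference */
        return cls;
    }

    static void put_class_sketch(struct class_sketch *cls)
    {
        if (cls && DEC_REF(cls) <= 0) {
            free(cls->obj.name);
            free(cls);
        }
    }

    int main(void)
    {
        struct class_sketch *c = get_class_sketch("A");
        INC_REF(c);              /* second user, e.g. a rule's target */
        put_class_sketch(c);     /* refcount 1, object survives */
        put_class_sketch(c);     /* refcount 0, object freed */
        printf("done\n");
        return 0;
    }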
- */ -static int alloc_term_index(void) -{ - int size = gl_allocated; - - if (gl_num_terms >= size) { - int i; - struct rbce_rule_term *oldv, *newv; - int newsize = size + POLICY_INC_NUMTERMS; - - oldv = gl_terms; - newv = - kmalloc(newsize * sizeof(struct rbce_rule_term), - GFP_ATOMIC); - if (!newv) { - return -ENOMEM; - } - memcpy(newv, oldv, size * sizeof(struct rbce_rule_term)); - for (i = size; i < newsize; i++) { - newv[i].op = -1; - } - gl_terms = newv; - gl_allocated = newsize; - kfree(oldv); - - gl_action |= POLICY_ACTION_NEW_VERSION; - DPRINTK(DBG_OPTIMIZATION, - "alloc_term_index: Expanding size from %d to %d\n", - size, newsize); - } - return gl_num_terms++; -} - -/* - * Release an index in the global term vector - * - * Caller must hold the global_rwlock in write mode as the global data - * is written onto. - */ -static void release_term_index(int idx) -{ - if ((idx < 0) || (idx > gl_num_terms)) - return; - - gl_terms[idx].op = -1; - gl_released++; - if ((gl_released > POLICY_INC_NUMTERMS) && - (gl_allocated > - (gl_num_terms - gl_released + POLICY_INC_NUMTERMS))) { - gl_action |= POLICY_ACTION_PACK_TERMS; - } - return; -} - -/* - * Release the indices, string memory, and terms associated with the given - * rule. - * - * Caller should be holding global_rwlock - */ -static void __release_rule(struct rbce_rule *rule) -{ - int i, *terms = rule->terms; - - // remove memory and references from other rules - for (i = rule->num_terms; --i >= 0;) { - struct rbce_rule_term *term = &gl_terms[terms[i]]; - - if (term->op == RBCE_RULE_DEP_RULE) { - DEC_REF(term->u.deprule); - } - release_term_index(terms[i]); - } - rule->num_terms = 0; - if (rule->strtab) { - kfree(rule->strtab); - rule->strtab = NULL; - } - if (rule->terms) { - kfree(rule->terms); - rule->terms = NULL; - } - return; -} - -/* - * delete the given rule and all memory associated with it. - * - * Caller is responsible for protecting the global data - */ -static inline int __delete_rule(struct rbce_rule *rule) -{ - // make sure we are not referenced by other rules - if (list_empty(&rule->obj.link)) { - return 0; - } - if (GET_REF(rule)) { - return -EBUSY; - } - __release_rule(rule); - put_class(rule->target_class); - release_term_index(rule->index); - list_del_init(&rule->obj.link); - gl_num_rules--; - gl_rules_version++; - module_put(THIS_MODULE); - kfree(rule->obj.name); - kfree(rule); - return 0; -} - -/* - * Optimize the rule evaluation logic - * - * Caller must hold global_rwlock in write mode. - */ -static void optimize_policy(void) -{ - int i, ii; - struct rbce_rule *rule; - struct rbce_rule_term *terms; - int num_terms; - int bsize; - bitvector_t **mask_vecs; - int pack_terms = 0; - int redoall; - - /* - * Due to dynamic rule addition/deletion of rules the term - * vector can get sparse. As a result the bitvectors grow as we don't - * reuse returned indices. If it becomes sparse enough we pack them - * closer. 
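alloc_term_index()/release_term_index() above grow the global term vector in POLICY_INC_NUMTERMS steps and merely mark freed slots (op = -1), deferring compaction until optimize_policy() packs them. A minimal sketch of that grow-and-mark allocator (userspace; the malloc/memcpy/free sequence mirrors the kernel's kmalloc/memcpy/kfree):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define INC_NUMTERMS 64          /* stands in for POLICY_INC_NUMTERMS */

    struct term { int op; };

    static struct term *terms;
    static int num_terms, allocated, released;

    static int alloc_term_index(void)
    {
        if (num_terms >= allocated) {
            int newsize = allocated + INC_NUMTERMS;
            struct term *newv = malloc(newsize * sizeof(*newv));
            if (!newv)
                return -1;           /* the kernel code returns -ENOMEM */
            if (terms)
                memcpy(newv, terms, allocated * sizeof(*newv));
            for (int i = allocated; i < newsize; i++)
                newv[i].op = -1;     /* -1 marks an unused slot */
            free(terms);
            terms = newv;
            allocated = newsize;
        }
        return num_terms++;
    }

    static void release_term_index(int idx)
    {
        if (idx < 0 || idx >= num_terms)
            return;
        terms[idx].op = -1;          /* mark only; packing is deferred */
        released++;
    }

    int main(void)
    {
        int a = alloc_term_index(), b = alloc_term_index();
        release_term_index(a);
        printf("b=%d allocated=%d released=%d\n", b, allocated, released);
        return 0;
    }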
- */ - - pack_terms = (gl_action & POLICY_ACTION_PACK_TERMS); - DPRINTK(DBG_OPTIMIZATION, - "----- Optimize Policy ----- act=%x pt=%d (a=%d n=%d r=%d)\n", - gl_action, pack_terms, gl_allocated, gl_num_terms, gl_released); - - if (pack_terms) { - int nsz = ALIGN((gl_num_terms - gl_released), - POLICY_INC_NUMTERMS); - int newidx = 0; - struct rbce_rule_term *newterms; - - terms = gl_terms; - newterms = - kmalloc(nsz * sizeof(struct rbce_rule_term), GFP_ATOMIC); - if (newterms) { - for (ii = 0; ii < CKRM_MAX_CLASSTYPES; ii++) { - // FIXME: check only for task class types - list_for_each_entry_reverse(rule, - &rules_list[ii], - obj.link) { - rule->index = newidx++; - for (i = rule->num_terms; --i >= 0;) { - int idx = rule->terms[i]; - newterms[newidx] = terms[idx]; - rule->terms[i] = newidx++; - } - } - } - kfree(terms); - gl_allocated = nsz; - gl_released = 0; - gl_num_terms = newidx; - gl_terms = newterms; - - gl_action &= ~POLICY_ACTION_PACK_TERMS; - gl_action |= POLICY_ACTION_NEW_VERSION; - } - } - - num_terms = gl_num_terms; - bsize = gl_allocated / 8 + sizeof(bitvector_t); - mask_vecs = gl_mask_vecs; - terms = gl_terms; - - if (gl_action & POLICY_ACTION_NEW_VERSION) { - /* allocate new mask vectors */ - char *temp = kmalloc(NUM_TERM_MASK_VECTOR * bsize, GFP_ATOMIC); - - DPRINTK(DBG_OPTIMIZATION, - "------ allocmasks act=%x ------- ver=%d\n", gl_action, - gl_bitmap_version); - if (!temp) { - return; - } - if (mask_vecs[0]) {// index 0 has the alloc returned address - kfree(mask_vecs[0]); - } - for (i = 0; i < NUM_TERM_MASK_VECTOR; i++) { - mask_vecs[i] = (bitvector_t *) (temp + i * bsize); - bitvector_init(mask_vecs[i], gl_allocated); - } - gl_action &= ~POLICY_ACTION_NEW_VERSION; - gl_action |= POLICY_ACTION_REDO_ALL; - gl_bitmap_version++; - } - - /* We do two things here at once - * 1) recompute the rulemask for each required rule - * we guarantee proper dependency order during creation time and - * by reversely running through this list. - * 2) recompute the mask for each term and rule, if required - */ - - redoall = gl_action & POLICY_ACTION_REDO_ALL; - gl_action &= ~POLICY_ACTION_REDO_ALL; - - DPRINTK(DBG_OPTIMIZATION, "------- run act=%x -------- redoall=%d\n", - gl_action, redoall); - for (ii = 0; ii < CKRM_MAX_CLASSTYPES; ii++) { - // FIXME: check only for task class types - list_for_each_entry_reverse(rule, &rules_list[ii], obj.link) { - unsigned long termflag; - - if (!redoall && !rule->do_opt) - continue; - termflag = 0; - for (i = rule->num_terms; --i >= 0;) { - int j, idx = rule->terms[i]; - struct rbce_rule_term *term = &terms[idx]; - int vecidx = termop_2_vecidx[term->op]; - - if (vecidx == -1) { - termflag |= term->u.deprule->termflag; - /* mark this term belonging to all - contexts of deprule */ - for (j = 0; j < NUM_TERM_MASK_VECTOR; - j++) { - if (term->u.deprule->termflag - & (1 << j)) { - bitvector_set(idx, - mask_vecs - [j]); - } - } - } else { - termflag |= TERM_2_TERMFLAG(vecidx); - /* mark this term belonging to - a particular context */ - bitvector_set(idx, mask_vecs[vecidx]); - } - } - for (i = 0; i < NUM_TERM_MASK_VECTOR; i++) { - if (termflag & (1 << i)) { - bitvector_set(rule->index, - mask_vecs[i]); - } - } - rule->termflag = termflag; - rule->do_opt = 0; - DPRINTK(DBG_OPTIMIZATION, "r-%s: %x %d\n", - rule->obj.name, rule->termflag, rule->index); - } - } - print_context_vectors(); - return; -} - -/* ======================= Rule related Functions ========================= */ - -/* - * Caller need to hold global_rwlock in write mode. 
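optimize_policy() above builds one mask bitvector per attribute context (cmd, uid, gid, tag, ...). The payoff is selective invalidation: when a single attribute of a task changes, only the cached results under the matching mask have to be cleared and re-evaluated (see reset_evaluation() later in this file). The idea in miniature, with a plain unsigned long standing in for bitvector_t:

    #include <stdio.h>

    enum { TERM_CMD, TERM_UID, TERM_GID, TERM_TAG, NUM_MASKS };

    int main(void)
    {
        /* Which rule/term bits depend on which attribute. */
        unsigned long mask[NUM_MASKS] = {0};
        mask[TERM_UID] = 0x06;       /* terms 1,2 look at uids */
        mask[TERM_TAG] = 0x18;       /* terms 3,4 look at the app tag */

        unsigned long eval  = 0x1e;  /* terms 1-4 already evaluated */
        unsigned long truth = 0x0a;  /* of those, 1 and 3 were true */

        /* Task's uid changed: forget only the uid-dependent results,
         * exactly what bitvector_and_not() does in reset_evaluation(). */
        eval  &= ~mask[TERM_UID];
        truth &= ~mask[TERM_UID];

        printf("eval=%#lx truth=%#lx\n", eval, truth); /* 0x18 0x08 */
        return 0;
    }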
- */ -static int -fill_rule(struct rbce_rule *newrule, struct rbce_rule_term *terms, int nterms) -{ - char *class, *strtab; - int i, j, order, state, real_nterms, index; - int strtablen, rc = 0, counter; - struct rbce_rule_term *term = NULL; - struct rbce_class *targetcls = NULL; - struct rbce_rule *deprule; - - if (!newrule) { - return -EINVAL; - } - // Digest filled terms. - real_nterms = 0; - strtab = class = NULL; - strtablen = 0; - state = -1; - order = -1; - index = -1; - for (i = 0; i < nterms; i++) { - if (terms[i].op != RBCE_RULE_INVALID) { - real_nterms++; - - switch (terms[i].op) { - case RBCE_RULE_DEP_RULE: - // check if the depend rule is valid - // - deprule = find_rule_name(terms[i].u.string); - if (!deprule || deprule == newrule) { - rc = -EINVAL; - goto out; - } else { - // make sure _a_ depend rule - // appears in only one term. - for (j = 0; j < i; j++) { - if (terms[j].op == - RBCE_RULE_DEP_RULE - && terms[j].u.deprule == - deprule) { - rc = -EINVAL; - goto out; - } - } - terms[i].u.deprule = deprule; - } - - // +depend is acceptable and -depend is not - if (terms[i].operator != TOKEN_OP_DEP_DEL) { - terms[i].operator = RBCE_EQUAL; - } else { - rc = -EINVAL; - goto out; - } - break; - - case RBCE_RULE_CMD_PATH: - case RBCE_RULE_CMD: - case RBCE_RULE_ARGS: - case RBCE_RULE_APP_TAG: - case RBCE_RULE_IPV4: - case RBCE_RULE_IPV6: - // sum up the string length - strtablen += strlen(terms[i].u.string) + 1; - break; - default: - break; - - } - } else { - switch (terms[i].operator) { - case TOKEN_OP_ORDER: - order = terms[i].u.id; - if (order < 0) { - rc = -EINVAL; - goto out; - } - break; - case TOKEN_OP_STATE: - state = terms[i].u.id != 0; - break; - case TOKEN_OP_CLASS: - class = terms[i].u.string; - break; - default: - break; - } - } - } - - // Check if class was specified - if (class != NULL) { - int classtype; - if ((targetcls = get_class(class, &classtype)) == NULL) { - rc = -EINVAL; - goto out; - } - put_class(newrule->target_class); - - newrule->target_class = targetcls; - newrule->classtype = classtype; - } - if (!newrule->target_class) { - rc = -EINVAL; - goto out; - } - - if (state != -1) { - newrule->state = state; - } - if (order != -1) { - newrule->order = order; - } - newrule->terms = kmalloc(real_nterms * sizeof(int), GFP_ATOMIC); - if (!newrule->terms) { - rc = -ENOMEM; - goto out; - } - newrule->num_terms = real_nterms; - if (strtablen && ((strtab = kmalloc(strtablen, GFP_ATOMIC)) == NULL)) { - rc = -ENOMEM; - goto out; - } - - if (newrule->index == -1) { - index = alloc_term_index(); - if (index < 0) { - rc = -ENOMEM; - goto out; - } - newrule->index = index; - term = &gl_terms[newrule->index]; - term->op = RBCE_RULE_DEP_RULE; - term->u.deprule = newrule; - } - newrule->strtab = strtab; - newrule->termflag = 0; - - // Fill the term vector - strtablen = 0; - counter = 0; - for (i = 0; i < nterms; i++) { - if (terms[i].op == RBCE_RULE_INVALID) { - continue; - } - - newrule->terms[counter] = alloc_term_index(); - if (newrule->terms[counter] < 0) { - for (j = 0; j < counter; j++) { - release_term_index(newrule->terms[j]); - } - rc = -ENOMEM; - goto out; - } - term = &gl_terms[newrule->terms[counter]]; - term->op = terms[i].op; - term->operator = terms[i].operator; - switch (terms[i].op) { - case RBCE_RULE_CMD_PATH: - case RBCE_RULE_CMD: - case RBCE_RULE_ARGS: - case RBCE_RULE_APP_TAG: - case RBCE_RULE_IPV4: - case RBCE_RULE_IPV6: - term->u.string = &strtab[strtablen]; - strcpy(term->u.string, terms[i].u.string); - strtablen = strlen(term->u.string) + 1; - break; 
- - case RBCE_RULE_REAL_UID: - case RBCE_RULE_REAL_GID: - case RBCE_RULE_EFFECTIVE_UID: - case RBCE_RULE_EFFECTIVE_GID: - case RBCE_RULE_XID: - term->u.id = terms[i].u.id; - break; - - case RBCE_RULE_DEP_RULE: - term->u.deprule = terms[i].u.deprule; - INC_REF(term->u.deprule); - break; - default: - break; - } - counter++; - } - - out: - if (rc) { - if (targetcls) { - put_class(targetcls); - } - if (index >= 0) { - release_term_index(index); - } - kfree(newrule->terms); - kfree(strtab); - - } - return rc; -} - -int change_rule(const char *rname, char *rdefn) -{ - struct rbce_rule *rule = NULL, *deprule; - struct rbce_rule_term *new_terms = NULL, *term, *terms; - int nterms, new_term_mask = 0, oterms, tot_terms; - int i, j, k, rc, new_order = 0; - - if ((nterms = rules_parse(rdefn, &new_terms, &new_term_mask)) <= 0) { - return !nterms ? -EINVAL : nterms; - } - - write_lock(&global_rwlock); - rule = find_rule_name(rname); - if (rule == NULL) { - rule = kmalloc(sizeof(struct rbce_rule), GFP_ATOMIC); - if (rule) { - rule->obj.name = kmalloc(strlen(rname) + 1, GFP_ATOMIC); - if (rule->obj.name) { - strcpy(rule->obj.name, rname); - GET_REF(rule) = 0; - rule->order = -1; - rule->index = -1; - rule->state = RBCE_RULE_ENABLED; - rule->target_class = NULL; - rule->classtype = -1; - rule->terms = NULL; - rule->do_opt = 1; - INIT_LIST_HEAD(&rule->obj.link); - rc = fill_rule(rule, new_terms, nterms); - if (rc) { - kfree(rule); - } else { - if ((rc = - insert_rule(rule, - rule->order)) == 0) { - if (rbce_enabled) { - optimize_policy(); - } - } else { - __delete_rule(rule); - } - } - } else { - kfree(rule); - rc = -ENOMEM; - } - kfree(new_terms); - } else { - rc = -ENOMEM; - } - write_unlock(&global_rwlock); - return rc; - } - - oterms = rule->num_terms; - tot_terms = nterms + oterms; - - terms = kmalloc(tot_terms * sizeof(struct rbce_rule_term), GFP_ATOMIC); - - if (!terms) { - kfree(new_terms); - write_unlock(&global_rwlock); - return -ENOMEM; - } - - new_term_mask &= ~(1 << RBCE_RULE_DEP_RULE); - //ignore the new deprule terms for the first iteration. - // taken care of later. - for (i = 0; i < oterms; i++) { - term = &gl_terms[rule->terms[i]]; // old term - - if ((1 << term->op) & new_term_mask) { - // newrule has this attr/value - for (j = 0; j < nterms; j++) { - if (term->op == new_terms[j].op) { - terms[i].op = new_terms[j].op; - terms[i].operator = new_terms[j]. 
- operator; - terms[i].u.string = - new_terms[j].u.string; - new_terms[j].op = RBCE_RULE_INVALID2; - break; - } - } - } else { - terms[i].op = term->op; - terms[i].operator = term->operator; - terms[i].u.string = term->u.string; - } - } - - i = oterms; // for readability - - for (j = 0; j < nterms; j++) { - // handled in the previous iteration - if (new_terms[j].op == RBCE_RULE_INVALID2) { - continue; - } - - if (new_terms[j].op == RBCE_RULE_DEP_RULE) { - if (new_terms[j].operator == TOKEN_OP_DEP) { - // "depend=rule" deletes all depends in the - // original rule so, delete all depend rule - // terms in the original rule - for (k = 0; k < oterms; k++) { - if (terms[k].op == RBCE_RULE_DEP_RULE) { - terms[k].op = RBCE_RULE_INVALID; - } - } - // must copy the new deprule term - } else { - // delete the depend rule term if was defined - // in the original rule for both +depend - // and -depend - deprule = find_rule_name(new_terms[j].u.string); - if (deprule) { - for (k = 0; k < oterms; k++) { - if (terms[k].op == - RBCE_RULE_DEP_RULE - && terms[k].u.deprule == - deprule) { - terms[k].op = - RBCE_RULE_INVALID; - break; - } - } - } - if (new_terms[j].operator == TOKEN_OP_DEP_DEL) { - // No need to copy the new deprule term - continue; - } - } - } else { - if ((new_terms[j].op == RBCE_RULE_INVALID) && - (new_terms[j].operator == TOKEN_OP_ORDER)) { - new_order++; - } - } - terms[i].op = new_terms[j].op; - terms[i].operator = new_terms[j].operator; - terms[i].u.string = new_terms[j].u.string; - i++; - new_terms[j].op = RBCE_RULE_INVALID2; - } - - tot_terms = i; - - // convert old deprule pointers to name pointers. - for (i = 0; i < oterms; i++) { - if (terms[i].op != RBCE_RULE_DEP_RULE) - continue; - terms[i].u.string = terms[i].u.deprule->obj.name; - } - - // release the rule - __release_rule(rule); - - rule->do_opt = 1; - rc = fill_rule(rule, terms, tot_terms); - if (rc == 0 && new_order) { - rc = reinsert_rule(rule, rule->order); - } - if (rc != 0) { // rule creation/insertion failed - __delete_rule(rule); - } - if (rbce_enabled) { - optimize_policy(); - } - write_unlock(&global_rwlock); - kfree(new_terms); - kfree(terms); - return rc; -} - -/* - * Delete the specified rule. - * - */ -int delete_rule(const char *rname) -{ - int rc = 0; - struct rbce_rule *rule; - - write_lock(&global_rwlock); - - if ((rule = find_rule_name(rname)) == NULL) { - write_unlock(&global_rwlock); - goto out; - } - rc = __delete_rule(rule); - if (rbce_enabled && (gl_action & POLICY_ACTION_PACK_TERMS)) { - optimize_policy(); - } - write_unlock(&global_rwlock); - out: - DPRINTK(DBG_RULE, "delete rule %s\n", rname); - return rc; -} - -/* - * copy the rule specified by rname and to the given result string. 
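The merge logic in change_rule() above deserves a gloss: a term in the new definition overrides any old term naming the same attribute, untouched old terms survive, a bare depend= discards all existing dependencies, and +depend/-depend edit them individually. A toy sketch of the attribute-override step, with terms reduced to (op, value) pairs:

    #include <stdio.h>

    struct t { int op; int val; };

    int main(void)
    {
        struct t oldt[] = { {1, 100}, {2, 5} };   /* uid=100, gid=5 */
        struct t newt[] = { {1, 200} };           /* new defn: uid=200 */
        struct t merged[4];
        int n = 0;

        for (int i = 0; i < 2; i++) {             /* old terms first */
            struct t m = oldt[i];
            for (int j = 0; j < 1; j++)
                if (newt[j].op == m.op)
                    m = newt[j];                  /* new value wins */
            merged[n++] = m;
        }
        /* (attributes only present in the new definition are appended) */
        for (int i = 0; i < n; i++)
            printf("op=%d val=%d\n", merged[i].op, merged[i].val);
        return 0;                                 /* uid=200, gid=5 */
    }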
- * - */ -void get_rule(const char *rname, char *result) -{ - int i; - struct rbce_rule *rule; - struct rbce_rule_term *term; - char *cp = result, oper, idtype[3], str[5]; - - read_lock(&global_rwlock); - - rule = find_rule_name(rname); - if (rule != NULL) { - for (i = 0; i < rule->num_terms; i++) { - term = gl_terms + rule->terms[i]; - switch (term->op) { - case RBCE_RULE_REAL_UID: - strcpy(idtype, "u"); - goto handleid; - case RBCE_RULE_REAL_GID: - strcpy(idtype, "g"); - goto handleid; - case RBCE_RULE_EFFECTIVE_UID: - strcpy(idtype, "eu"); - goto handleid; - case RBCE_RULE_EFFECTIVE_GID: - strcpy(idtype, "eg"); - goto handleid; - case RBCE_RULE_XID: - strcpy(idtype, "x"); - handleid: - if (term->operator == RBCE_LESS_THAN) { - oper = '<'; - } else if (term->operator == RBCE_GREATER_THAN) { - oper = '>'; - } else if (term->operator == RBCE_NOT) { - oper = '!'; - } else { - oper = '='; - } - cp += - sprintf(cp, "%sid%c%ld,", idtype, oper, - term->u.id); - break; - case RBCE_RULE_CMD_PATH: - strcpy(str, "path"); - goto handle_str; - case RBCE_RULE_CMD: - strcpy(str, "cmd"); - goto handle_str; - case RBCE_RULE_ARGS: - strcpy(str, "args"); - goto handle_str; - case RBCE_RULE_APP_TAG: - strcpy(str, "tag"); - goto handle_str; - case RBCE_RULE_IPV4: - strcpy(str, "ipv4"); - goto handle_str; - case RBCE_RULE_IPV6: - strcpy(str, "ipv6"); - handle_str: - cp += - sprintf(cp, "%s=%s,", str, term->u.string); - break; - case RBCE_RULE_DEP_RULE: - cp += - sprintf(cp, "depend=%s,", - term->u.deprule->obj.name); - break; - default: - break; - } - } - if (!rule->num_terms) { - cp += sprintf(cp, "***** no terms defined ***** "); - } - - cp += - sprintf(cp, "order=%d,state=%d,", rule->order, rule->state); - cp += - sprintf(cp, "class=%s", - rule->target_class ? rule->target_class->obj. 
- name : "***** REMOVED *****"); - *cp = '\0'; - } else { - sprintf(result, "***** Rule %s doesn't exist *****", rname); - } - - read_unlock(&global_rwlock); - return; -} - -/* - * Change the name of the given rule "from_rname" to "to_rname" - * - */ -int rename_rule(const char *from_rname, const char *to_rname) -{ - struct rbce_rule *rule; - int nlen, rc = -EINVAL; - - if (!to_rname || !*to_rname) { - return rc; - } - write_lock(&global_rwlock); - - rule = find_rule_name(from_rname); - if (rule != NULL) { - if ((nlen = strlen(to_rname)) > strlen(rule->obj.name)) { - char *name = kmalloc(nlen + 1, GFP_ATOMIC); - if (!name) { - return -ENOMEM; - } - kfree(rule->obj.name); - rule->obj.name = name; - } - strcpy(rule->obj.name, to_rname); - rc = 0; - } - write_unlock(&global_rwlock); - return rc; -} - -/* - * Return TRUE if the given rule exists, FALSE otherwise - * - */ -int rule_exists(const char *rname) -{ - struct rbce_rule *rule; - - read_lock(&global_rwlock); - rule = find_rule_name(rname); - read_unlock(&global_rwlock); - return rule != NULL; -} - -/*====================== Magic file handling =======================*/ -/* - * Reclassify - */ -static struct rbce_private_data *create_private_data(struct rbce_private_data *, - int); - -static inline -void reset_evaluation(struct rbce_private_data *pdata,int termflag) -{ - /* reset TAG ruleterm evaluation results to pick up - * on next classification event - */ - if (use_persistent_state && gl_mask_vecs[termflag]) { - bitvector_and_not( pdata->eval, pdata->eval, - gl_mask_vecs[termflag] ); - bitvector_and_not( pdata->true, pdata->true, - gl_mask_vecs[termflag] ); - } -} - -int set_tasktag(int pid, char *tag) -{ - char *tp; - int rc = 0; - struct task_struct *tsk; - struct rbce_private_data *pdata; - int len; - - if (!tag) { - return -EINVAL; - } - len = strlen(tag) + 1; - tp = kmalloc(len, GFP_ATOMIC); - if (!tp) { - return -ENOMEM; - } - strncpy(tp,tag,len); - - read_lock(&tasklist_lock); - if ((tsk = find_task_by_pid(pid)) == NULL) { - rc = -EINVAL; - goto out; - } - - if (unlikely(!RBCE_DATA(tsk))) { - RBCE_DATAP(tsk) = create_private_data(NULL, 0); - if (!RBCE_DATA(tsk)) { - rc = -ENOMEM; - goto out; - } - } - pdata = RBCE_DATA(tsk); - if (pdata->app_tag) { - kfree(pdata->app_tag); - } - pdata->app_tag = tp; - reset_evaluation(pdata,RBCE_TERMFLAG_TAG); - - out: - read_unlock(&tasklist_lock); - if (rc != 0) - kfree(tp); - return rc; -} - -/*====================== Classification Functions =======================*/ - -/* - * Match the given full path name with the command expression. - * This function treats the folowing 2 charaters as special if seen in - * cmd_exp, all other chanracters are compared as is: - * ? - compares to any one single character - * * - compares to one or more single characters - * - * If fullpath is 1, tsk_comm is compared in full. otherwise only the command - * name (basename(tsk_comm)) is compared. - */ -static int match_cmd(const char *tsk_comm, const char *cmd_exp, int fullpath) -{ - const char *c, *t, *last_ast, *cmd = tsk_comm; - char next_c; - - // get the command name if we don't have to match the fullpath - if (!fullpath && ((c = strrchr(tsk_comm, '/')) != NULL)) { - cmd = c + 1; - } - - /* now faithfully assume the entire pathname is in cmd */ - - /* we now have to effectively implement a regular expression - * for now assume - * '?' any single character - * '*' one or more '?' 
- * rest must match - */ - - c = cmd_exp; - t = cmd; - if (t == NULL || c == NULL) { - return 0; - } - - last_ast = NULL; - next_c = '\0'; - - while (*c && *t) { - switch (*c) { - case '?': - if (*t == '/') { - return 0; - } - c++; - t++; - continue; - case '*': - if (*t == '/') { - return 0; - } - // eat up all '*' in c - while (*(c + 1) == '*') - c++; - next_c = '\0'; - last_ast = c; - //t++; // Add this for matching '*' with "one" - // or more chars. - while (*t && (*t != *(c + 1)) && *t != '/') - t++; - if (*t == *(c + 1)) { - c++; - if (*c != '/') { - if (*c == '?') { - if (*t == '/') { - return 0; - } - t++; - c++; - } - next_c = *c; - if (*c) { - if (*t == '/') { - return 0; - } - t++; - c++; - if (!*c && *t) - c = last_ast; - } - } else { - last_ast = NULL; - } - continue; - } - return 0; - case '/': - next_c = '\0'; - /*FALLTHRU*/ default: - if (*t == *c && next_c != *t) { - c++, t++; - continue; - } else { - /* reset to last asterix and - continue from there */ - if (last_ast) { - c = last_ast; - } else { - return 0; - } - } - } - } - - /* check for trailing "*" */ - while (*c == '*') - c++; - - return (!*c && !*t); -} - -static void reverse(char *str, int n) -{ - char s; - int i, j = n - 1; - - for (i = 0; i < j; i++, j--) { - s = str[i]; - str[i] = str[j]; - str[j] = s; - } -} - -static int itoa(int n, char *str) -{ - int i = 0, sz = 0; - - do { - str[i++] = n % 10 + '0'; - sz++; - n = n / 10; - } while (n > 0); - - (void)reverse(str, sz); - return sz; -} - -static int v4toa(__u32 y, char *a) -{ - int i; - int size = 0; - - for (i = 0; i < 4; i++) { - size += itoa(y & 0xff, &a[size]); - a[size++] = '.'; - y >>= 8; - } - return --size; -} - -int match_ipv4(struct ckrm_net_struct *ns, char **string) -{ - char *ptr = *string; - int size; - char a4[16]; - - size = v4toa(ns->ns_daddrv4, a4); - - *string += size; - return !strncmp(a4, ptr, size); -} - -int match_port(struct ckrm_net_struct *ns, char *ptr) -{ - char a[5]; - int size = itoa(ns->ns_dport, a); - - return !strncmp(a, ptr, size); -} - -static int __evaluate_rule(struct task_struct *tsk, struct ckrm_net_struct *ns, - struct rbce_rule *rule, bitvector_t * vec_eval, - bitvector_t * vec_true, char **filename); -/* - * evaluate the given task against the given rule with the vec_eval and - * vec_true in context. Return 1 if the task satisfies the given rule, 0 - * otherwise. - * - * If the bit corresponding to the rule is set in the vec_eval, then the - * corresponding bit in vec_true is the result. If it is not set, evaluate - * the rule and set the bits in both the vectors accordingly. - * - * On return, filename will have the pointer to the pathname of the task's - * executable, if the rule had any command related terms. - * - * Caller must hold the global_rwlock atleast in read mode. - */ -static inline int -evaluate_rule(struct task_struct *tsk, struct ckrm_net_struct *ns, - struct rbce_rule *rule, bitvector_t * vec_eval, - bitvector_t * vec_true, char **filename) -{ - int tidx = rule->index; - - if (!bitvector_test(tidx, vec_eval)) { - if (__evaluate_rule - (tsk, ns, rule, vec_eval, vec_true, filename)) { - bitvector_set(tidx, vec_true); - } - bitvector_set(tidx, vec_eval); - } - return bitvector_test(tidx, vec_true); -} - -/* - * evaluate the given task against every term in the given rule with - * vec_eval and vec_true in context. - * - * If the bit corresponding to a rule term is set in the vec_eval, then the - * corresponding bit in vec_true is the result for taht particular. 
If it is - * not set, evaluate the rule term and set the bits in both the vectors - * accordingly. - * - * This fucntions returns true only if all terms in the rule evaluate true. - * - * On return, filename will have the pointer to the pathname of the task's - * executable, if the rule had any command related terms. - * - * Caller must hold the global_rwlock atleast in read mode. - */ -static int -__evaluate_rule(struct task_struct *tsk, struct ckrm_net_struct *ns, - struct rbce_rule *rule, bitvector_t * vec_eval, - bitvector_t * vec_true, char **filename) -{ - int i; - int no_ip = 1; - - for (i = rule->num_terms; --i >= 0;) { - int rc = 1, tidx = rule->terms[i]; - - if (!bitvector_test(tidx, vec_eval)) { - struct rbce_rule_term *term = &gl_terms[tidx]; - - switch (term->op) { - - case RBCE_RULE_CMD_PATH: - case RBCE_RULE_CMD: -#if 0 - if (!*filename) { /* get this once */ - if (((*filename = - kmalloc(NAME_MAX, - GFP_ATOMIC)) == NULL) - || - (get_exe_path_name - (tsk, *filename, NAME_MAX) < 0)) { - rc = 0; - break; - } - } - rc = match_cmd(*filename, term->u.string, - (term->op == - RBCE_RULE_CMD_PATH)); -#else - rc = match_cmd(tsk->comm, term->u.string, - (term->op == - RBCE_RULE_CMD_PATH)); -#endif - break; - case RBCE_RULE_REAL_UID: - if (term->operator == RBCE_LESS_THAN) { - rc = (tsk->uid < term->u.id); - } else if (term->operator == RBCE_GREATER_THAN){ - rc = (tsk->uid > term->u.id); - } else if (term->operator == RBCE_NOT) { - rc = (tsk->uid != term->u.id); - } else { - rc = (tsk->uid == term->u.id); - } - break; - case RBCE_RULE_REAL_GID: - if (term->operator == RBCE_LESS_THAN) { - rc = (tsk->gid < term->u.id); - } else if (term->operator == RBCE_GREATER_THAN){ - rc = (tsk->gid > term->u.id); - } else if (term->operator == RBCE_NOT) { - rc = (tsk->gid != term->u.id); - } else { - rc = (tsk->gid == term->u.id); - } - break; - case RBCE_RULE_EFFECTIVE_UID: - if (term->operator == RBCE_LESS_THAN) { - rc = (tsk->euid < term->u.id); - } else if (term->operator == RBCE_GREATER_THAN){ - rc = (tsk->euid > term->u.id); - } else if (term->operator == RBCE_NOT) { - rc = (tsk->euid != term->u.id); - } else { - rc = (tsk->euid == term->u.id); - } - break; - case RBCE_RULE_EFFECTIVE_GID: - if (term->operator == RBCE_LESS_THAN) { - rc = (tsk->egid < term->u.id); - } else if (term->operator == RBCE_GREATER_THAN){ - rc = (tsk->egid > term->u.id); - } else if (term->operator == RBCE_NOT) { - rc = (tsk->egid != term->u.id); - } else { - rc = (tsk->egid == term->u.id); - } - break; - case RBCE_RULE_APP_TAG: - rc = (RBCE_DATA(tsk) - && RBCE_DATA(tsk)-> - app_tag) ? !strcmp(RBCE_DATA(tsk)-> - app_tag, - term->u.string) : 0; - break; - case RBCE_RULE_DEP_RULE: - rc = evaluate_rule(tsk, NULL, term->u.deprule, - vec_eval, vec_true, - filename); - break; - - case RBCE_RULE_IPV4: - // TBD: add NOT_EQUAL match. At present rbce - // recognises EQUAL matches only. 
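/* Editor's note -- annotation, not original code: judging from the
 * parsing below, the IPv4 term string has the form "addr\port" as
 * written by the "ipv4=" rule token, where either half may be the
 * wildcard '*', e.g. "10.0.0.1\80", "*\80" (any address, port 80),
 * or "10.0.0.1\*" (any port on that address).
 */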
- if (ns && term->operator == RBCE_EQUAL) { - int ma = 0; - int mp = 0; - char *ptr = term->u.string; - - if (term->u.string[0] == '*') - ma = 1; - else - ma = match_ipv4(ns, &ptr); - - if (*ptr != '\\') { // error - rc = 0; - break; - } else { - ++ptr; - if (*ptr == '*') - mp = 1; - else - mp = match_port(ns, - ptr); - } - rc = mp && ma; - } else - rc = 0; - no_ip = 0; - break; - - case RBCE_RULE_IPV6: // no support yet - rc = 0; - no_ip = 0; - break; - - case RBCE_RULE_XID: - { - xid_t xid = vx_task_xid(tsk); - - if (term->operator == RBCE_LESS_THAN) { - rc = (xid < term->u.id); - } else if (term->operator == RBCE_GREATER_THAN) { - rc = (xid > term->u.id); - } else if (term->operator == RBCE_NOT) { - rc = (xid != term->u.id); - } else { - rc = (xid == term->u.id); - } - break; - } - - default: - rc = 0; - printk(KERN_ERR "Error evaluate term op=%d\n", - term->op); - break; - } - if (!rc && no_ip) { - bitvector_clear(tidx, vec_true); - } else { - bitvector_set(tidx, vec_true); - } - bitvector_set(tidx, vec_eval); - } else { - rc = bitvector_test(tidx, vec_true); - } - if (!rc) { - return 0; - } - } - return 1; -} - -//#define PDATA_DEBUG -#ifdef PDATA_DEBUG - -#define MAX_PDATA 10000 -void *pdata_arr[MAX_PDATA]; -int pdata_count, pdata_next; -static spinlock_t pdata_lock = SPIN_LOCK_UNLOCKED; - -static inline int valid_pdata(struct rbce_private_data *pdata) -{ - int i; - - if (!pdata) { - return 1; - } - spin_lock(&pdata_lock); - for (i = 0; i < MAX_PDATA; i++) { - if (pdata_arr[i] == pdata) { - spin_unlock(&pdata_lock); - return 1; - } - } - spin_unlock(&pdata_lock); - printk(KERN_WARNING "INVALID/CORRUPT PDATA %p\n", pdata); - return 0; -} - -static inline void store_pdata(struct rbce_private_data *pdata) -{ - int i = 0; - - if (pdata) { - spin_lock(&pdata_lock); - - while (i < MAX_PDATA) { - if (pdata_arr[pdata_next] == NULL) { - printk(KERN_DEBUG "storing %p at %d, count %d\n", pdata, - pdata_next, pdata_count); - pdata_arr[pdata_next++] = pdata; - if (pdata_next == MAX_PDATA) { - pdata_next = 0; - } - pdata_count++; - break; - } - pdata_next++; - i++; - } - spin_unlock(&pdata_lock); - } - if (i == MAX_PDATA) { - printk(KERN_DEBUG "PDATA BUFFER FULL pdata_count %d pdata %p\n", - pdata_count, pdata); - } -} - -static inline void unstore_pdata(struct rbce_private_data *pdata) -{ - int i; - if (pdata) { - spin_lock(&pdata_lock); - for (i = 0; i < MAX_PDATA; i++) { - if (pdata_arr[i] == pdata) { - printk(KERN_DEBUG "unstoring %p at %d, count %d\n", pdata, - i, pdata_count); - pdata_arr[i] = NULL; - pdata_count--; - pdata_next = i; - break; - } - } - spin_unlock(&pdata_lock); - if (i == MAX_PDATA) { - printk(KERN_DEBUG "pdata %p not found in the stored array\n", - pdata); - } - } - return; -} - -#else // PDATA_DEBUG - -#define valid_pdata(pdata) (1) -#define store_pdata(pdata) -#define unstore_pdata(pdata) - -#endif // PDATA_DEBUG - -/* - * Allocate and initialize a rbce_private_data data structure. - * - * Caller must hold global_rwlock atleast in read mode. 
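 *
 * Editor's sketch (annotation, not in the original source): with
 * use_persistent_state set, one kmalloc block carries the structure
 * plus two appended bit vectors of gl_allocated bits each, roughly
 *
 *   +--------------------------+----------------+----------------+
 *   | struct rbce_private_data | eval bitvector | true bitvector |
 *   +--------------------------+----------------+----------------+
 *                               ^ data[0]        ^ data[bsize]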
- */ - -static inline void -copy_ext_private_data(struct rbce_private_data *src, - struct rbce_private_data *dst) -{ - if (src) - dst->ext_data = src->ext_data; - else - memset(&dst->ext_data, 0, sizeof(dst->ext_data)); -} - -static struct rbce_private_data *create_private_data(struct rbce_private_data - *src, int copy_sample) -{ - int vsize, psize, bsize; - struct rbce_private_data *pdata; - - if (use_persistent_state) { - vsize = gl_allocated; - bsize = vsize / 8 + sizeof(bitvector_t); - psize = sizeof(struct rbce_private_data) + 2 * bsize; - } else { - psize = sizeof(struct rbce_private_data); - } - - pdata = kmalloc(psize, GFP_ATOMIC); - if (pdata != NULL) { - if (use_persistent_state) { - pdata->bitmap_version = gl_bitmap_version; - pdata->eval = (bitvector_t *) & pdata->data[0]; - pdata->true = (bitvector_t *) & pdata->data[bsize]; - if (src && (src->bitmap_version == gl_bitmap_version)) { - memcpy(pdata->data, src->data, 2 * bsize); - } else { - bitvector_init(pdata->eval, vsize); - bitvector_init(pdata->true, vsize); - } - } - copy_ext_private_data(src, pdata); - //if (src) { // inherit evaluate and app_tag - // pdata->evaluate = src->evaluate; - // if(src->app_tag) { - // int len = strlen(src->app_tag)+1; - // printk(KERN_DEBUG "CREATE_PRIVATE: apptag %s len %d\n", - // src->app_tag,len); - // pdata->app_tag = kmalloc(len, GFP_ATOMIC); - // if (pdata->app_tag) { - // strcpy(pdata->app_tag, src->app_tag); - // } - // } - //} else { - pdata->evaluate = 1; - pdata->rules_version = src ? src->rules_version : 0; - pdata->app_tag = NULL; - //} - } - store_pdata(pdata); - return pdata; -} - -static inline void free_private_data(struct rbce_private_data *pdata) -{ - if (valid_pdata(pdata)) { - unstore_pdata(pdata); - kfree(pdata); - } -} - -static void free_all_private_data(void) -{ - struct task_struct *proc, *thread; - - read_lock(&tasklist_lock); - do_each_thread(proc, thread) { - struct rbce_private_data *pdata; - - pdata = RBCE_DATA(thread); - RBCE_DATAP(thread) = NULL; - free_private_data(pdata); - } while_each_thread(proc, thread); - read_unlock(&tasklist_lock); - return; -} - -/* - * reclassify function, which is called by all the callback functions. - * - * Takes that task to be reclassified and ruleflags that indicates the - * attributes that caused this reclassification request. - * - * On success, returns the core class pointer to which the given task should - * belong to. - */ -static struct ckrm_core_class *rbce_classify(struct task_struct *tsk, - struct ckrm_net_struct *ns, - unsigned long termflag, - int classtype) -{ - int i; - struct rbce_rule *rule; - bitvector_t *vec_true = NULL, *vec_eval = NULL; - struct rbce_class *tgt = NULL; - struct ckrm_core_class *cls = NULL; - char *filename = NULL; - - if (!valid_pdata(RBCE_DATA(tsk))) { - return NULL; - } - if (classtype >= CKRM_MAX_CLASSTYPES) { - // can't handle more than CKRM_MAX_CLASSTYPES - return NULL; - } - // fast path to avoid locking in case CE is not enabled or if no rules - // are defined or if the tasks states that no evaluation is needed. - if (!rbce_enabled || !gl_num_rules || - (RBCE_DATA(tsk) && !RBCE_DATA(tsk)->evaluate)) { - return NULL; - } - // FIXME: optimize_policy should be called from here if - // gl_action is non-zero. Also, it has to be called with the - // global_rwlock held in write mode. 
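/* Editor's sketch -- annotation, not original code: the invalidation
 * performed below boils down to, for each term group flagged in
 * termflag,
 *
 *     bitvector_and_not(vec_eval, vec_eval, gl_mask_vecs[group]);
 *     bitvector_and(vec_true, vec_true, vec_eval);
 *
 * i.e. only terms that depend on the changed attribute lose their
 * cached result and are re-evaluated on this classification pass.
 */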
- - read_lock(&global_rwlock); - - vec_eval = vec_true = NULL; - if (use_persistent_state) { - struct rbce_private_data *pdata = RBCE_DATA(tsk); - - if (!pdata - || (pdata - && (gl_bitmap_version != pdata->bitmap_version))) { - struct rbce_private_data *new_pdata = - create_private_data(pdata, 1); - - if (new_pdata) { - if (pdata) { - new_pdata->rules_version = - pdata->rules_version; - new_pdata->evaluate = pdata->evaluate; - new_pdata->app_tag = pdata->app_tag; - free_private_data(pdata); - } - pdata = RBCE_DATAP(tsk) = new_pdata; - termflag = RBCE_TERMFLAG_ALL; - // need to evaluate them all - } else { - // we shouldn't free the pdata as it has more - // details than the vectors. But, this - // reclassification should go thru - pdata = NULL; - } - } - if (!pdata) { - goto cls_determined; - } - vec_eval = pdata->eval; - vec_true = pdata->true; - } else { - int bsize = gl_allocated; - - vec_eval = bitvector_alloc(bsize); - vec_true = bitvector_alloc(bsize); - - if (vec_eval == NULL || vec_true == NULL) { - goto cls_determined; - } - termflag = RBCE_TERMFLAG_ALL; - // need to evaluate all of them now - } - - /* - * using bit ops invalidate all terms related to this termflag - * context (only in per task vec) - */ - DPRINTK(DBG_CLASSIFY_DETAILS, "\nClassify: termflag=%lx\n", termflag); - DPRINTK(DBG_CLASSIFY_DETAILS, " eval before: "); - bitvector_print(DBG_CLASSIFY_DETAILS, vec_eval); - DPRINTK(DBG_CLASSIFY_DETAILS, "\n true before: "); - bitvector_print(DBG_CLASSIFY_DETAILS, vec_true); - DPRINTK(DBG_CLASSIFY_DETAILS, "\n redo => "); - - if (termflag == RBCE_TERMFLAG_ALL) { - DPRINTK(DBG_CLASSIFY_DETAILS, " redoall "); - bitvector_zero(vec_eval); - } else { - for (i = 0; i < NUM_TERM_MASK_VECTOR; i++) { - if (test_bit(i, &termflag)) { - bitvector_t *maskvec = gl_mask_vecs[i]; - - DPRINTK(DBG_CLASSIFY_DETAILS, " mask(%d) ", i); - bitvector_print(DBG_CLASSIFY_DETAILS, maskvec); - bitvector_and_not(vec_eval, vec_eval, maskvec); - } - } - } - bitvector_and(vec_true, vec_true, vec_eval); - - DPRINTK(DBG_CLASSIFY_DETAILS, "\n eval now: "); - bitvector_print(DBG_CLASSIFY_DETAILS, vec_eval); - DPRINTK(DBG_CLASSIFY_DETAILS, "\n"); - - /* run through the rules in order and see what needs evaluation */ - list_for_each_entry(rule, &rules_list[classtype], obj.link) { - if (rule->state == RBCE_RULE_ENABLED && - rule->target_class && - rule->target_class->classobj && - evaluate_rule(tsk, ns, rule, vec_eval, vec_true, - &filename)) { - tgt = rule->target_class; - cls = rule->target_class->classobj; - break; - } - } - - cls_determined: - DPRINTK(DBG_CLASSIFY_RES, - "==> |%s|; pid %d; euid %d; egid %d; ruid %d; rgid %d;" - "tag |%s| ===> class |%s|\n", - filename ? filename : tsk->comm, - tsk->pid, - tsk->euid, - tsk->egid, - tsk->uid, - tsk->gid, - RBCE_DATA(tsk) ? RBCE_DATA(tsk)->app_tag : "", - tgt ? 
tgt->obj.name : ""); - DPRINTK(DBG_CLASSIFY_DETAILS, " eval after: "); - bitvector_print(DBG_CLASSIFY_DETAILS, vec_eval); - DPRINTK(DBG_CLASSIFY_DETAILS, "\n true after: "); - bitvector_print(DBG_CLASSIFY_DETAILS, vec_true); - DPRINTK(DBG_CLASSIFY_DETAILS, "\n"); - - if (!use_persistent_state) { - if (vec_eval) { - bitvector_free(vec_eval); - } - if (vec_true) { - bitvector_free(vec_true); - } - } - ckrm_core_grab(cls); - read_unlock(&global_rwlock); - if (filename) { - kfree(filename); - } - if (RBCE_DATA(tsk)) { - RBCE_DATA(tsk)->rules_version = gl_rules_version; - } - return cls; -} - -/***************************************************************************** - * - * Module specific utilization of core RBCE functionality - * - * Includes support for the various classtypes - * New classtypes will require extensions here - * - *****************************************************************************/ - -/* helper functions that are required in the extended version */ - -static inline void rbce_tc_manual(struct task_struct *tsk) -{ - read_lock(&global_rwlock); - - if (!RBCE_DATA(tsk)) { - RBCE_DATAP(tsk) = - (void *)create_private_data(RBCE_DATA(tsk->parent), 0); - } - if (RBCE_DATA(tsk)) { - RBCE_DATA(tsk)->evaluate = 0; - } - read_unlock(&global_rwlock); - return; -} - -/***************************************************************************** - * load any extensions - *****************************************************************************/ - -#ifdef RBCE_EXTENSION -#include "rbcemod_ext.c" -#endif - -/***************************************************************************** - * VARIOUS CLASSTYPES - *****************************************************************************/ - -// to enable type coercion of the function pointers - -/*============================================================================ - * TASKCLASS CLASSTYPE - *============================================================================*/ - -int tc_classtype = -1; - -/* - * fork callback to be registered with core module. - */ -inline static void *rbce_tc_forkcb(struct task_struct *tsk) -{ - int rule_version_changed = 1; - struct ckrm_core_class *cls; - read_lock(&global_rwlock); - // dup ce_data - RBCE_DATAP(tsk) = - (void *)create_private_data(RBCE_DATA(tsk->parent), 0); - read_unlock(&global_rwlock); - - if (RBCE_DATA(tsk->parent)) { - rule_version_changed = - (RBCE_DATA(tsk->parent)->rules_version != gl_rules_version); - } - cls = rule_version_changed ? - rbce_classify(tsk, NULL, RBCE_TERMFLAG_ALL, tc_classtype) : NULL; - - // note the fork notification to any user client will be sent through - // the guaranteed fork-reclassification - return cls; -} - -/* - * exit callback to be registered with core module. - */ -static void rbce_tc_exitcb(struct task_struct *tsk) -{ - struct rbce_private_data *pdata; - - send_exit_notification(tsk); - - pdata = RBCE_DATA(tsk); - RBCE_DATAP(tsk) = NULL; - if (pdata) { - if (pdata->app_tag) { - kfree(pdata->app_tag); - } - free_private_data(pdata); - } - return; -} - -#define AENT(x) [ CKRM_EVENT_##x] = #x -static const char *event_names[CKRM_NUM_EVENTS] = { - AENT(NEWTASK), - AENT(FORK), - AENT(EXIT), - AENT(EXEC), - AENT(UID), - AENT(GID), - AENT(XID), - AENT(LOGIN), - AENT(USERADD), - AENT(USERDEL), - AENT(LISTEN_START), - AENT(LISTEN_STOP), - AENT(APPTAG), - AENT(RECLASSIFY), - AENT(MANUAL), -}; - -void *rbce_tc_classify(enum ckrm_event event, ...) 
-{ - va_list args; - void *cls = NULL; - struct task_struct *tsk; - struct rbce_private_data *pdata; - - va_start(args, event); - tsk = va_arg(args, struct task_struct *); - va_end(args); - - /* we only have to deal with events between - * [ CKRM_LATCHABLE_EVENTS .. CKRM_NONLATCHABLE_EVENTS ) - */ - - // printk(KERN_DEBUG "tc_classify %p:%d:%s '%s'\n",tsk,tsk->pid, - // tsk->comm,event_names[event]); - - switch (event) { - - case CKRM_EVENT_FORK: - cls = rbce_tc_forkcb(tsk); - break; - - case CKRM_EVENT_EXIT: - rbce_tc_exitcb(tsk); - break; - - case CKRM_EVENT_EXEC: - cls = rbce_classify(tsk, NULL, RBCE_TERMFLAG_CMD | - RBCE_TERMFLAG_UID | RBCE_TERMFLAG_GID, - tc_classtype); - break; - - case CKRM_EVENT_UID: - cls = rbce_classify(tsk, NULL, RBCE_TERMFLAG_UID, tc_classtype); - break; - - case CKRM_EVENT_GID: - cls = rbce_classify(tsk, NULL, RBCE_TERMFLAG_GID, tc_classtype); - break; - - case CKRM_EVENT_XID: - cls = rbce_classify(tsk, NULL, RBCE_TERMFLAG_XID, tc_classtype); - break; - - case CKRM_EVENT_LOGIN: - case CKRM_EVENT_USERADD: - case CKRM_EVENT_USERDEL: - case CKRM_EVENT_LISTEN_START: - case CKRM_EVENT_LISTEN_STOP: - case CKRM_EVENT_APPTAG: - /* no interest in this events .. */ - break; - - default: - /* catch all */ - break; - - case CKRM_EVENT_RECLASSIFY: - if ((pdata = (RBCE_DATA(tsk)))) { - pdata->evaluate = 1; - } - cls = rbce_classify(tsk, NULL, RBCE_TERMFLAG_ALL, tc_classtype); - break; - - } - // printk(KERN_DEBUG "tc_classify %p:%d:%s '%s' ==> %p\n",tsk,tsk->pid, - // tsk->comm,event_names[event],cls); - - return cls; -} - -#ifndef RBCE_EXTENSION -static void rbce_tc_notify(int event, void *core, struct task_struct *tsk) -{ - printk(KERN_DEBUG "tc_manual %p:%d:%s '%s'\n", tsk, tsk->pid, tsk->comm, - event_names[event]); - if (event != CKRM_EVENT_MANUAL) - return; - rbce_tc_manual(tsk); -} -#endif - -static struct ckrm_eng_callback rbce_taskclass_ecbs = { - .c_interest = (unsigned long)(-1), // set whole bitmap - .classify = (ce_classify_fct_t) rbce_tc_classify, - .class_delete = rbce_class_deletecb, -#ifndef RBCE_EXTENSION - .n_interest = (1 << CKRM_EVENT_MANUAL), - .notify = (ce_notify_fct_t) rbce_tc_notify, - .always_callback = 0, -#else - .n_interest = (unsigned long)(-1), // set whole bitmap - .notify = (ce_notify_fct_t) rbce_tc_ext_notify, - .class_add = rbce_class_addcb, - .always_callback = 1, -#endif -}; - -/*============================================================================ - * ACCEPTQ CLASSTYPE - *============================================================================*/ - -int sc_classtype = -1; - -void *rbce_sc_classify(enum ckrm_event event, ...) -{ - // no special consideratation - void *result; - va_list args; - struct task_struct *tsk; - struct ckrm_net_struct *ns; - - va_start(args, event); - ns = va_arg(args, struct ckrm_net_struct *); - tsk = va_arg(args, struct task_struct *); - va_end(args); - - result = rbce_classify(tsk, ns, RBCE_TERMFLAG_ALL, sc_classtype); - - DPRINTK(DBG_CLASSIFY_RES, - "==> %d.%d.%d.%d\\%d , %p:%d:%s '%s' => %p\n", - NIPQUAD(ns->ns_daddrv4), ns->ns_dport, - tsk, tsk ? tsk->pid : 0, tsk ? tsk->comm : "-", - event_names[event], result); - return result; -} - -static struct ckrm_eng_callback rbce_acceptQclass_ecbs = { - .c_interest = (unsigned long)(-1), - .always_callback = 0, // enable during debugging only - .classify = (ce_classify_fct_t) & rbce_sc_classify, - .class_delete = rbce_class_deletecb, -}; - -/*============================================================================ - * Module Initialization ... 
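 *
 * Editor's note (annotation, not in the original source): each entry
 * of the NULL-terminated ce_regtable below pairs a classtype name
 * with its callback vector; ckrm_register_engine() returns the
 * classtype id on success, -ENOENT is tolerated (classtype absent),
 * and any other error rolls back all earlier registrations.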
- *============================================================================*/ - -#define TASKCLASS_NAME "taskclass" -#define SOCKCLASS_NAME "socket_class" - -struct ce_regtable_struct { - const char *name; - struct ckrm_eng_callback *cbs; - int *clsvar; -}; - -struct ce_regtable_struct ce_regtable[] = { - {TASKCLASS_NAME, &rbce_taskclass_ecbs, &tc_classtype}, - {SOCKCLASS_NAME, &rbce_acceptQclass_ecbs, &sc_classtype}, - {NULL} -}; - -static void unregister_classtype_engines(void) - { - int rc; - struct ce_regtable_struct *ceptr = ce_regtable; - - while (ceptr->name) { - if (*ceptr->clsvar >= 0) { - printk(KERN_DEBUG "ce unregister with <%s>\n",ceptr->name); - while ((rc = ckrm_unregister_engine(ceptr->name)) == -EAGAIN) - ; - printk(KERN_DEBUG "ce unregister with <%s> rc=%d\n",ceptr->name,rc); - *ceptr->clsvar = -1; - } - ceptr++; - } - } - -static int register_classtype_engines(void) -{ - int rc; - struct ce_regtable_struct *ceptr = ce_regtable; - - while (ceptr->name) { - rc = ckrm_register_engine(ceptr->name, ceptr->cbs); - printk(KERN_DEBUG "ce register with <%s> typeId=%d\n",ceptr->name,rc); - if ((rc < 0) && (rc != -ENOENT)) { - unregister_classtype_engines(); - return (rc); - } - if (rc != -ENOENT) - *ceptr->clsvar = rc; - ceptr++; - } - return 0; -} - -// =========== /proc/sysctl/debug/rbce debug stuff ============= - -#ifdef DEBUG -static struct ctl_table_header *rbce_sysctl_table_header; - -#define CTL_RBCE_DEBUG (201) // picked some number.. dont know algo to pick -static struct ctl_table rbce_entry_table[] = { - { - .ctl_name = CTL_RBCE_DEBUG, - .procname = "rbce", - .data = &rbcedebug, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - {0} -}; - -static struct ctl_table rbce_root_table[] = { - { - .ctl_name = CTL_DEBUG, - .procname = "debug", - .data = NULL, - .maxlen = 0, - .mode = 0555, - .child = rbce_entry_table}, - {0} -}; - -static inline void start_debug(void) -{ - rbce_sysctl_table_header = register_sysctl_table(rbce_root_table, 1); -} -static inline void stop_debug(void) -{ - if (rbce_sysctl_table_header) - unregister_sysctl_table(rbce_sysctl_table_header); -} - -#else - -static inline void start_debug(void) -{ -} -static inline void stop_debug(void) -{ -} - -#endif // DEBUG - -extern int rbce_mkdir(struct inode *, struct dentry *, int); -extern int rbce_rmdir(struct inode *, struct dentry *); -extern int rbce_create_magic(void); -extern int rbce_clear_magic(void); - -rbce_eng_callback_t rcfs_ecbs = { - rbce_mkdir, - rbce_rmdir, - rbce_create_magic, - rbce_clear_magic -}; - -/* ======================= Module definition Functions ====================== */ - -int init_rbce(void) -{ - int rc, i, line; - - printk(KERN_DEBUG "<1>\nInstalling \'%s\' module\n", modname); - - for (i = 0; i < CKRM_MAX_CLASSTYPES; i++) { - INIT_LIST_HEAD(&rules_list[i]); - } - - rc = init_rbce_ext_pre(); - line = __LINE__; - if (rc) - goto out; - - rc = register_classtype_engines(); - line = __LINE__; - if (rc) - goto out_unreg_ckrm; // need to remove anyone opened - - /* register any other class type engine here */ - - rc = rcfs_register_engine(&rcfs_ecbs); - line = __LINE__; - if (rc) - goto out_unreg_ckrm; - - if (rcfs_mounted) { - rc = rbce_create_magic(); - line = __LINE__; - if (rc) - goto out_unreg_rcfs; - } - - start_debug(); - - rc = init_rbce_ext_post(); - line = __LINE__; - if (rc) - goto out_debug; - - return 0; // SUCCESS - - out_debug: - stop_debug(); - - out_unreg_rcfs: - rcfs_unregister_engine(&rcfs_ecbs); - out_unreg_ckrm: - 
unregister_classtype_engines(); - exit_rbce_ext(); - out: - - printk(KERN_DEBUG "<1>%s: error installing rc=%d line=%d\n", __FUNCTION__, rc, - line); - return rc; -} - -void exit_rbce(void) -{ - int i; - - printk(KERN_DEBUG "<1>Removing \'%s\' module\n", modname); - - stop_debug(); - exit_rbce_ext(); - - // Print warnings if lists are not empty, which is a bug - if (!list_empty(&class_list)) { - printk(KERN_DEBUG "exit_rbce: Class list is not empty\n"); - } - - for (i = 0; i < CKRM_MAX_CLASSTYPES; i++) { - if (!list_empty(&rules_list[i])) { - printk(KERN_DEBUG "exit_rbce: Rules list for classtype %d" - " is not empty\n", i); - } - } - - if (rcfs_mounted) - rbce_clear_magic(); - - rcfs_unregister_engine(&rcfs_ecbs); - unregister_classtype_engines(); - free_all_private_data(); -} - -EXPORT_SYMBOL(get_rule); -EXPORT_SYMBOL(rule_exists); -EXPORT_SYMBOL(change_rule); -EXPORT_SYMBOL(delete_rule); -EXPORT_SYMBOL(rename_rule); -EXPORT_SYMBOL(set_tasktag); - -module_init(init_rbce); -module_exit(exit_rbce); - - diff --git a/kernel/ckrm/rbce/rbcemod_ext.c b/kernel/ckrm/rbce/rbcemod_ext.c deleted file mode 100644 index d0c97eae2..000000000 --- a/kernel/ckrm/rbce/rbcemod_ext.c +++ /dev/null @@ -1,622 +0,0 @@ -/* Data Collection Extension to Rule-based Classification Engine (RBCE) module - * - * Copyright (C) Hubertus Franke, IBM Corp. 2003 - * - * Extension to be included into RBCE to collect delay and sample information - * Requires user daemon e.g. crbcedmn to activate. - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - */ - - -/******************************************************************************* - * - * User-Kernel Communication Channel (UKCC) - * Protocol and communication handling - * - ******************************************************************************/ - -#include -#include - -#define PSAMPLE(pdata) (&((pdata)->ext_data.sample)) -#define UKCC_N_SUB_BUFFERS (4) -#define UKCC_SUB_BUFFER_SIZE (1<<15) -#define UKCC_TOTAL_BUFFER_SIZE (UKCC_N_SUB_BUFFERS * UKCC_SUB_BUFFER_SIZE) - -#define CHANNEL_AUTO_CONT 0 /* this is during debugging only. It allows - the module to continue sending data through - the UKCC if space frees up vs. going into - the recovery driven mode - */ - -enum ukcc_state { - UKCC_OK = 0, - UKCC_STANDBY = 1, - UKCC_FULL = 2 -}; - -int ukcc_channel = -1; -static enum ukcc_state chan_state = UKCC_STANDBY; - -inline static int ukcc_ok(void) -{ - return (chan_state == UKCC_OK); -} - -static void ukcc_cmd_deliver(int rchan_id, char *from, u32 len); -static void client_attached(void); -static void client_detached(void); - -static int ukcc_fileop_notify(int rchan_id, - struct file *filp, enum relay_fileop fileop) -{ - static int readers = 0; - if (fileop == RELAY_FILE_OPEN) { - // printk(KERN_DEBUG "got fileop_notify RELAY_FILE_OPEN for file %p\n", - // filp); - if (readers) { - printk(KERN_DEBUG "only one client allowed, backoff .... 
\n"); - return -EPERM; - } - if (!try_module_get(THIS_MODULE)) - return -EPERM; - readers++; - client_attached(); - - } else if (fileop == RELAY_FILE_CLOSE) { - // printk(KERN_DEBUG "got fileop_notify RELAY_FILE_CLOSE for file %p\n", - // filp); - client_detached(); - readers--; - module_put(THIS_MODULE); - } - return 0; -} - -static int create_ukcc_channel(void) -{ - static struct rchan_callbacks ukcc_callbacks = { - .buffer_start = NULL, - .buffer_end = NULL, - .deliver = NULL, - .user_deliver = ukcc_cmd_deliver, - .needs_resize = NULL, - .fileop_notify = ukcc_fileop_notify, - }; - - u32 channel_flags = - RELAY_USAGE_GLOBAL | RELAY_SCHEME_ANY | RELAY_TIMESTAMP_ANY; - - // notify on subbuffer full (through poll) - channel_flags |= RELAY_DELIVERY_BULK; - // channel_flags |= RELAY_DELIVERY_PACKET; - // avoid overwrite, otherwise recovery will be nasty... - channel_flags |= RELAY_MODE_NO_OVERWRITE; - - ukcc_channel = relay_open(CRBCE_UKCC_NAME, - UKCC_SUB_BUFFER_SIZE, - UKCC_N_SUB_BUFFERS, - channel_flags, - &ukcc_callbacks, 0, 0, 0, 0, 0, 0, NULL, 0); - if (ukcc_channel < 0) - printk(KERN_DEBUG "crbce: ukcc creation failed, errcode: %d\n", - ukcc_channel); - else - printk(KERN_DEBUG "crbce: ukcc created (%u KB)\n", - UKCC_TOTAL_BUFFER_SIZE >> 10); - return ukcc_channel; -} - -static inline void close_ukcc_channel(void) -{ - if (ukcc_channel >= 0) { - relay_close(ukcc_channel); - ukcc_channel = -1; - chan_state = UKCC_STANDBY; - } -} - -#define rec_set_hdr(r,t,p) ((r)->hdr.type = (t), (r)->hdr.pid = (p)) -#define rec_set_timehdr(r,t,p,c) (rec_set_hdr(r,t,p), \ -(r)->hdr.timestamp = jiffies_to_msecs(jiffies), (r)->hdr.cls=(unsigned long)(c) ) - - -#if CHANNEL_AUTO_CONT - -/* we only provide this for debugging.. it allows us to send records - * based on availability in the channel when the UKCC stalles rather - * going through the UKCC recovery protocol - */ - -#define rec_send_len(r,l) \ - do { \ - int chan_wasok = (chan_state == UKCC_OK); \ - int chan_isok = (relay_write(ukcc_channel, \ - (r),(l),-1,NULL) > 0); \ - chan_state = chan_isok ? UKCC_OK : UKCC_STANDBY; \ - if (chan_wasok && !chan_isok) { \ - printk(KERN_DEBUG "Channel stalled\n"); \ - } else if (!chan_wasok && chan_isok) { \ - printk(KERN_DEBUG "Channel continues\n"); \ - } \ - } while (0) - -#define rec_send(r) rec_send_len(r,sizeof(*(r))) - -#else - -/* Default UKCC channel protocol. - * Though a UKCC buffer overflow should not happen ever, it is possible iff - * the user daemon stops reading for some reason. Hence we provide a simple - * protocol based on 3 states - * UKCC_OK := channel is active and properly working. When a channel - * write fails we move to state CHAN_FULL. - * UKCC_FULL := channel is active, but the last send_rec has failed. As - * a result we will try to send an indication to the daemon - * that this has happened. When that succeeds, we move to - * state UKCC_STANDBY. - * UKCC_STANDBY := we are waiting to be restarted by the user daemon - * - */ - -static void ukcc_full(void) -{ - static spinlock_t ukcc_state_lock = SPIN_LOCK_UNLOCKED; - /* protect transition from OK -> FULL to ensure only one record is sent, - rest we do not need to protect, protocol implies that. 
we keep the - channel OK until - */ - int send = 0; - spin_lock(&ukcc_state_lock); - if ((send = (chan_state != UKCC_STANDBY))) - chan_state = UKCC_STANDBY; /* assume we can send */ - spin_unlock(&ukcc_state_lock); - - if (send) { - struct crbce_ukcc_full rec; - rec_set_timehdr(&rec, CRBCE_REC_UKCC_FULL, 0, 0); - if (relay_write(ukcc_channel, &rec, - sizeof(rec), -1, NULL) <= 0) { - /* channel is remains full .. try with next one */ - chan_state = UKCC_FULL; - } - } -} - -#define rec_send_len(r,l) \ - do { \ - switch (chan_state) { \ - case UKCC_OK: \ - if (relay_write(ukcc_channel,(r), \ - (l),-1,NULL) > 0) \ - break; \ - case UKCC_FULL: \ - ukcc_full(); \ - break; \ - default: \ - break; \ - } \ - } while (0) - -#define rec_send(r) rec_send_len(r,sizeof(*(r))) - -#endif - -/****************************************************************************** - * - * Callbacks for the CKRM engine. - * In each we do the necessary classification and event record generation - * We generate 3 kind of records in the callback - * (a) FORK send the pid, the class and the ppid - * (b) RECLASSIFICATION send the pid, the class and < sample data + - * delay data > - * (b) EXIT send the pid - * - ******************************************************************************/ - -int delta_mode = 0; - -static inline void copy_delay(struct task_delay_info *delay, - struct task_struct *tsk) -{ - *delay = tsk->delays; -} - -static inline void zero_delay(struct task_delay_info *delay) -{ - memset(delay, 0, sizeof(struct task_delay_info)); - /* we need to think about doing this 64-bit atomic */ -} - -static inline void zero_sample(struct task_sample_info *sample) -{ - memset(sample, 0, sizeof(struct task_sample_info)); - /* we need to think about doing this 64-bit atomic */ -} - -static inline int check_zero(void *ptr, int len) -{ - int iszero = 1; - int i; - unsigned long *uptr = (unsigned long *)ptr; - - for (i = len / sizeof(unsigned long); i-- && iszero; uptr++) - // assume its rounded - iszero &= (*uptr == 0); - return iszero; -} - -static inline int check_not_zero(void *ptr, int len) -{ - int i; - unsigned long *uptr = (unsigned long *)ptr; - - for (i = len / sizeof(unsigned long); i--; uptr++) - // assume its rounded - if (*uptr) - return 1; - return 0; -} - -static inline int sample_changed(struct task_sample_info *s) -{ - return check_not_zero(s, sizeof(*s)); -} -static inline int delay_changed(struct task_delay_info *d) -{ - return check_not_zero(d, sizeof(*d)); -} - -static inline int -send_task_record(struct task_struct *tsk, int event, - struct ckrm_core_class *core, int send_forced) -{ - struct crbce_rec_task_data rec; - struct rbce_private_data *pdata; - int send = 0; - - if (!ukcc_ok()) - return 0; - pdata = RBCE_DATA(tsk); - if (pdata == NULL) { - // printk(KERN_DEBUG "send [%d]<%s>: no pdata\n",tsk->pid,tsk->comm); - return 0; - } - if (send_forced || (delta_mode == 0) - || sample_changed(PSAMPLE(RBCE_DATA(tsk))) - || delay_changed(&tsk->delays)) { - rec_set_timehdr(&rec, event, tsk->pid, - core ? 
core : (struct ckrm_core_class *)tsk-> - taskclass); - rec.sample = *PSAMPLE(RBCE_DATA(tsk)); - copy_delay(&rec.delay, tsk); - rec_send(&rec); - if (delta_mode || send_forced) { - // on reclassify or delta mode reset the counters - zero_sample(PSAMPLE(RBCE_DATA(tsk))); - zero_delay(&tsk->delays); - } - send = 1; - } - return send; -} - -static inline void send_exit_notification(struct task_struct *tsk) -{ - send_task_record(tsk, CRBCE_REC_EXIT, NULL, 1); -} - -static inline void -rbce_tc_ext_notify(int event, void *core, struct task_struct *tsk) -{ - struct crbce_rec_fork rec; - - switch (event) { - case CKRM_EVENT_FORK: - if (ukcc_ok()) { - rec.ppid = tsk->parent->pid; - rec_set_timehdr(&rec, CKRM_EVENT_FORK, tsk->pid, core); - rec_send(&rec); - } - break; - case CKRM_EVENT_MANUAL: - rbce_tc_manual(tsk); - - default: - send_task_record(tsk, event, (struct ckrm_core_class *)core, 1); - break; - } -} - -/*====================== end classification engine =======================*/ - -static void sample_task_data(unsigned long unused); - -struct timer_list sample_timer = {.expires = 0,.function = sample_task_data }; -unsigned long timer_interval_length = (250 * HZ) / 1000; - -inline void stop_sample_timer(void) -{ - if (sample_timer.expires > 0) { - del_timer_sync(&sample_timer); - sample_timer.expires = 0; - } -} - -inline void start_sample_timer(void) -{ - if (timer_interval_length > 0) { - sample_timer.expires = - jiffies + (timer_interval_length * HZ) / 1000; - add_timer(&sample_timer); - } -} - -static void send_task_data(void) -{ - struct crbce_rec_data_delim limrec; - struct task_struct *proc, *thread; - int sendcnt = 0; - int taskcnt = 0; - limrec.is_stop = 0; - rec_set_timehdr(&limrec, CRBCE_REC_DATA_DELIMITER, 0, 0); - rec_send(&limrec); - - read_lock(&tasklist_lock); - do_each_thread(proc, thread) { - taskcnt++; - task_lock(thread); - sendcnt += send_task_record(thread, CRBCE_REC_SAMPLE, NULL, 0); - task_unlock(thread); - } while_each_thread(proc, thread); - read_unlock(&tasklist_lock); - - limrec.is_stop = 1; - rec_set_timehdr(&limrec, CRBCE_REC_DATA_DELIMITER, 0, 0); - rec_send(&limrec); - - // printk(KERN_DEBUG "send_task_data mode=%d t#=%d s#=%d\n", - // delta_mode,taskcnt,sendcnt); -} - -static void notify_class_action(struct rbce_class *cls, int action) -{ - struct crbce_class_info cinfo; - int len; - - rec_set_timehdr(&cinfo, CRBCE_REC_CLASS_INFO, 0, cls->classobj); - cinfo.action = action; - len = strnlen(cls->obj.name, CRBCE_MAX_CLASS_NAME_LEN - 1); - memcpy(&cinfo.name, cls->obj.name, len); - cinfo.name[len] = '\0'; - len++; - cinfo.namelen = len; - - len += sizeof(cinfo) - CRBCE_MAX_CLASS_NAME_LEN; - rec_send_len(&cinfo, len); -} - -static void send_classlist(void) -{ - struct rbce_class *cls; - - read_lock(&global_rwlock); - list_for_each_entry(cls, &class_list, obj.link) { - notify_class_action(cls, 1); - } - read_unlock(&global_rwlock); -} - -/* - * resend_task_info - * - * This function resends all essential task information to the client. 
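 *
 * Editor's note (annotation, not in the original source): the class
 * list is sent first, then the per-thread CRBCE_REC_TASKINFO records
 * are bracketed by CRBCE_REC_DATA_DELIMITER records with is_stop == 2
 * (start) and is_stop == 3 (end); with tasklist_lock held for writing
 * the daemon receives one consistent snapshot of the task table.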
- * - */ -static void resend_task_info(void) -{ - struct crbce_rec_data_delim limrec; - struct crbce_rec_fork rec; - struct task_struct *proc, *thread; - - send_classlist(); // first send available class information - - limrec.is_stop = 2; - rec_set_timehdr(&limrec, CRBCE_REC_DATA_DELIMITER, 0, 0); - rec_send(&limrec); - - write_lock(&tasklist_lock); // avoid any mods during this phase - do_each_thread(proc, thread) { - if (ukcc_ok()) { - rec.ppid = thread->parent->pid; - rec_set_timehdr(&rec, CRBCE_REC_TASKINFO, thread->pid, - thread->taskclass); - rec_send(&rec); - } - } - while_each_thread(proc, thread); - write_unlock(&tasklist_lock); - - limrec.is_stop = 3; - rec_set_timehdr(&limrec, CRBCE_REC_DATA_DELIMITER, 0, 0); - rec_send(&limrec); -} - -extern int task_running_sys(struct task_struct *); - -static void add_all_private_data(void) -{ - struct task_struct *proc, *thread; - - write_lock(&tasklist_lock); - do_each_thread(proc, thread) { - if (RBCE_DATA(thread) == NULL) - RBCE_DATAP(thread) = create_private_data(NULL, 0); - } - while_each_thread(proc, thread); - write_unlock(&tasklist_lock); -} - -static void sample_task_data(unsigned long unused) -{ - struct task_struct *proc, *thread; - - int run = 0; - int wait = 0; - read_lock(&tasklist_lock); - do_each_thread(proc, thread) { - struct rbce_private_data *pdata = RBCE_DATA(thread); - - if (pdata == NULL) { - // some wierdo race condition .. simply ignore - continue; - } - if (thread->state == TASK_RUNNING) { - if (task_running_sys(thread)) { - atomic_inc((atomic_t *) & - (PSAMPLE(pdata)->cpu_running)); - run++; - } else { - atomic_inc((atomic_t *) & - (PSAMPLE(pdata)->cpu_waiting)); - wait++; - } - } - /* update IO state */ - if (thread->flags & PF_IOWAIT) { - if (thread->flags & PF_MEMIO) - atomic_inc((atomic_t *) & - (PSAMPLE(pdata)->memio_delayed)); - else - atomic_inc((atomic_t *) & - (PSAMPLE(pdata)->io_delayed)); - } - } - while_each_thread(proc, thread); - read_unlock(&tasklist_lock); -// printk(KERN_DEBUG "sample_timer: run=%d wait=%d\n",run,wait); - start_sample_timer(); -} - -static void ukcc_cmd_deliver(int rchan_id, char *from, u32 len) -{ - struct crbce_command *cmdrec = (struct crbce_command *)from; - struct crbce_cmd_done cmdret; - int rc = 0; - -// printk(KERN_DEBUG "ukcc_cmd_deliver: %d %d len=%d:%d\n",cmdrec->type, -// cmdrec->cmd,cmdrec->len,len); - - cmdrec->len = len; // add this to reflection so the user doesn't - // accidently write the wrong length and the - // protocol is getting screwed up - - if (cmdrec->type != CRBCE_REC_KERNEL_CMD) { - rc = EINVAL; - goto out; - } - - switch (cmdrec->cmd) { - case CRBCE_CMD_SET_TIMER: - { - struct crbce_cmd_settimer *cptr = - (struct crbce_cmd_settimer *)cmdrec; - if (len != sizeof(*cptr)) { - rc = EINVAL; - break; - } - stop_sample_timer(); - timer_interval_length = cptr->interval; - if ((timer_interval_length > 0) - && (timer_interval_length < 10)) - timer_interval_length = 10; - // anything finer can create problems - printk(KERN_INFO "CRBCE set sample collect timer %lu\n", - timer_interval_length); - start_sample_timer(); - break; - } - case CRBCE_CMD_SEND_DATA: - { - struct crbce_cmd_send_data *cptr = - (struct crbce_cmd_send_data *)cmdrec; - if (len != sizeof(*cptr)) { - rc = EINVAL; - break; - } - delta_mode = cptr->delta_mode; - send_task_data(); - break; - } - case CRBCE_CMD_START: - add_all_private_data(); - chan_state = UKCC_OK; - resend_task_info(); - break; - - case CRBCE_CMD_STOP: - chan_state = UKCC_STANDBY; - free_all_private_data(); - break; - - default: 
- rc = EINVAL; - break; - } - - out: - cmdret.hdr.type = CRBCE_REC_KERNEL_CMD_DONE; - cmdret.hdr.cmd = cmdrec->cmd; - cmdret.rc = rc; - rec_send(&cmdret); -// printk(KERN_DEBUG "ukcc_cmd_deliver ACK: %d %d rc=%d %d\n",cmdret.hdr.type, -// cmdret.hdr.cmd,rc,sizeof(cmdret)); -} - -static void client_attached(void) -{ - printk(KERN_DEBUG "client [%d]<%s> attached to UKCC\n", current->pid, - current->comm); - relay_reset(ukcc_channel); -} - -static void client_detached(void) -{ - printk(KERN_DEBUG "client [%d]<%s> detached to UKCC\n", current->pid, - current->comm); - chan_state = UKCC_STANDBY; - stop_sample_timer(); - relay_reset(ukcc_channel); - free_all_private_data(); -} - -static int init_rbce_ext_pre(void) -{ - int rc; - - rc = create_ukcc_channel(); - return ((rc < 0) ? rc : 0); -} - -static int init_rbce_ext_post(void) -{ - init_timer(&sample_timer); - return 0; -} - -static void exit_rbce_ext(void) -{ - stop_sample_timer(); - close_ukcc_channel(); -} diff --git a/kernel/ckrm/rbce/token.c b/kernel/ckrm/rbce/token.c deleted file mode 100644 index 32446fb2b..000000000 --- a/kernel/ckrm/rbce/token.c +++ /dev/null @@ -1,301 +0,0 @@ -/* Tokens for Rule-based Classification Engine (RBCE) and - * Consolidated RBCE module code (combined) - * - * Copyright (C) Hubertus Franke, IBM Corp. 2003 - * (C) Chandra Seetharaman, IBM Corp. 2003 - * (C) Vivek Kashyap, IBM Corp. 2004 - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
- * - * - */ - -#include -#include - -enum rule_token_t { - TOKEN_PATH, - TOKEN_CMD, - TOKEN_ARGS, - TOKEN_RUID_EQ, - TOKEN_RUID_LT, - TOKEN_RUID_GT, - TOKEN_RUID_NOT, - TOKEN_RGID_EQ, - TOKEN_RGID_LT, - TOKEN_RGID_GT, - TOKEN_RGID_NOT, - TOKEN_EUID_EQ, - TOKEN_EUID_LT, - TOKEN_EUID_GT, - TOKEN_EUID_NOT, - TOKEN_EGID_EQ, - TOKEN_EGID_LT, - TOKEN_EGID_GT, - TOKEN_EGID_NOT, - TOKEN_XID_EQ, - TOKEN_XID_LT, - TOKEN_XID_GT, - TOKEN_XID_NOT, - TOKEN_TAG, - TOKEN_IPV4, - TOKEN_IPV6, - TOKEN_DEP, - TOKEN_DEP_ADD, - TOKEN_DEP_DEL, - TOKEN_ORDER, - TOKEN_CLASS, - TOKEN_STATE, - TOKEN_INVALID -}; - -int token_to_ruleop[TOKEN_INVALID + 1] = { - [TOKEN_PATH] = RBCE_RULE_CMD_PATH, - [TOKEN_CMD] = RBCE_RULE_CMD, - [TOKEN_ARGS] = RBCE_RULE_ARGS, - [TOKEN_RUID_EQ] = RBCE_RULE_REAL_UID, - [TOKEN_RUID_LT] = RBCE_RULE_REAL_UID, - [TOKEN_RUID_GT] = RBCE_RULE_REAL_UID, - [TOKEN_RUID_NOT] = RBCE_RULE_REAL_UID, - [TOKEN_RGID_EQ] = RBCE_RULE_REAL_GID, - [TOKEN_RGID_LT] = RBCE_RULE_REAL_GID, - [TOKEN_RGID_GT] = RBCE_RULE_REAL_GID, - [TOKEN_RGID_NOT] = RBCE_RULE_REAL_GID, - [TOKEN_EUID_EQ] = RBCE_RULE_EFFECTIVE_UID, - [TOKEN_EUID_LT] = RBCE_RULE_EFFECTIVE_UID, - [TOKEN_EUID_GT] = RBCE_RULE_EFFECTIVE_UID, - [TOKEN_EUID_NOT] = RBCE_RULE_EFFECTIVE_UID, - [TOKEN_EGID_EQ] = RBCE_RULE_EFFECTIVE_GID, - [TOKEN_EGID_LT] = RBCE_RULE_EFFECTIVE_GID, - [TOKEN_EGID_GT] = RBCE_RULE_EFFECTIVE_GID, - [TOKEN_EGID_NOT] = RBCE_RULE_EFFECTIVE_GID, - [TOKEN_XID_EQ] = RBCE_RULE_XID, - [TOKEN_XID_LT] = RBCE_RULE_XID, - [TOKEN_XID_GT] = RBCE_RULE_XID, - [TOKEN_XID_NOT] = RBCE_RULE_XID, - [TOKEN_TAG] = RBCE_RULE_APP_TAG, - [TOKEN_IPV4] = RBCE_RULE_IPV4, - [TOKEN_IPV6] = RBCE_RULE_IPV6, - [TOKEN_DEP] = RBCE_RULE_DEP_RULE, - [TOKEN_DEP_ADD] = RBCE_RULE_DEP_RULE, - [TOKEN_DEP_DEL] = RBCE_RULE_DEP_RULE, - [TOKEN_ORDER] = RBCE_RULE_INVALID, - [TOKEN_CLASS] = RBCE_RULE_INVALID, - [TOKEN_STATE] = RBCE_RULE_INVALID, -}; - -enum op_token { - TOKEN_OP_EQUAL = RBCE_EQUAL, - TOKEN_OP_NOT = RBCE_NOT, - TOKEN_OP_LESS_THAN = RBCE_LESS_THAN, - TOKEN_OP_GREATER_THAN = RBCE_GREATER_THAN, - TOKEN_OP_DEP, - TOKEN_OP_DEP_ADD, - TOKEN_OP_DEP_DEL, - TOKEN_OP_ORDER, - TOKEN_OP_CLASS, - TOKEN_OP_STATE, -}; - -enum op_token token_to_operator[TOKEN_INVALID + 1] = { - [TOKEN_PATH] = TOKEN_OP_EQUAL, - [TOKEN_CMD] = TOKEN_OP_EQUAL, - [TOKEN_ARGS] = TOKEN_OP_EQUAL, - [TOKEN_RUID_EQ] = TOKEN_OP_EQUAL, - [TOKEN_RUID_LT] = TOKEN_OP_LESS_THAN, - [TOKEN_RUID_GT] = TOKEN_OP_GREATER_THAN, - [TOKEN_RUID_NOT] = TOKEN_OP_NOT, - [TOKEN_RGID_EQ] = TOKEN_OP_EQUAL, - [TOKEN_RGID_LT] = TOKEN_OP_LESS_THAN, - [TOKEN_RGID_GT] = TOKEN_OP_GREATER_THAN, - [TOKEN_RGID_NOT] = TOKEN_OP_NOT, - [TOKEN_EUID_EQ] = TOKEN_OP_EQUAL, - [TOKEN_EUID_LT] = TOKEN_OP_LESS_THAN, - [TOKEN_EUID_GT] = TOKEN_OP_GREATER_THAN, - [TOKEN_EUID_NOT] = TOKEN_OP_NOT, - [TOKEN_EGID_EQ] = TOKEN_OP_EQUAL, - [TOKEN_EGID_LT] = TOKEN_OP_LESS_THAN, - [TOKEN_EGID_GT] = TOKEN_OP_GREATER_THAN, - [TOKEN_EGID_NOT] = TOKEN_OP_NOT, - [TOKEN_XID_EQ] = TOKEN_OP_EQUAL, - [TOKEN_XID_LT] = TOKEN_OP_LESS_THAN, - [TOKEN_XID_GT] = TOKEN_OP_GREATER_THAN, - [TOKEN_XID_NOT] = TOKEN_OP_NOT, - [TOKEN_TAG] = TOKEN_OP_EQUAL, - [TOKEN_IPV4] = TOKEN_OP_EQUAL, - [TOKEN_IPV6] = TOKEN_OP_EQUAL, - [TOKEN_DEP] = TOKEN_OP_DEP, - [TOKEN_DEP_ADD] = TOKEN_OP_DEP_ADD, - [TOKEN_DEP_DEL] = TOKEN_OP_DEP_DEL, - [TOKEN_ORDER] = TOKEN_OP_ORDER, - [TOKEN_CLASS] = TOKEN_OP_CLASS, - [TOKEN_STATE] = TOKEN_OP_STATE -}; - -static match_table_t tokens = { - {TOKEN_PATH, "path=%s"}, - {TOKEN_CMD, "cmd=%s"}, - {TOKEN_ARGS, "args=%s"}, - {TOKEN_RUID_EQ, "uid=%d"}, - 
{TOKEN_RUID_LT, "uid<%d"}, - {TOKEN_RUID_GT, "uid>%d"}, - {TOKEN_RUID_NOT, "uid!%d"}, - {TOKEN_RGID_EQ, "gid=%d"}, - {TOKEN_RGID_LT, "gid<%d"}, - {TOKEN_RGID_GT, "gid>%d"}, - {TOKEN_RGID_NOT, "gid!d"}, - {TOKEN_EUID_EQ, "euid=%d"}, - {TOKEN_EUID_LT, "euid<%d"}, - {TOKEN_EUID_GT, "euid>%d"}, - {TOKEN_EUID_NOT, "euid!%d"}, - {TOKEN_EGID_EQ, "egid=%d"}, - {TOKEN_EGID_LT, "egid<%d"}, - {TOKEN_EGID_GT, "egid>%d"}, - {TOKEN_EGID_NOT, "egid!%d"}, - {TOKEN_XID_EQ, "xid=%d"}, - {TOKEN_XID_LT, "xid<%d"}, - {TOKEN_XID_GT, "xid>%d"}, - {TOKEN_XID_NOT, "xid!%d"}, - {TOKEN_TAG, "tag=%s"}, - {TOKEN_IPV4, "ipv4=%s"}, - {TOKEN_IPV6, "ipv6=%s"}, - {TOKEN_DEP, "depend=%s"}, - {TOKEN_DEP_ADD, "+depend=%s"}, - {TOKEN_DEP_DEL, "-depend=%s"}, - {TOKEN_ORDER, "order=%d"}, - {TOKEN_CLASS, "class=%s"}, - {TOKEN_STATE, "state=%d"}, - {TOKEN_INVALID, NULL} -}; - -/* - * return -EINVAL in case of failures - * returns number of terms in terms on success. - * never returns 0. - */ - -static int -rules_parse(char *rule_defn, struct rbce_rule_term **rterms, int *term_mask) -{ - char *p, *rp = rule_defn; - int option, i = 0, nterms; - struct rbce_rule_term *terms; - - *rterms = NULL; - *term_mask = 0; - if (!rule_defn) - return -EINVAL; - - nterms = 0; - while (*rp++) { - if (*rp == '>' || *rp == '<' || *rp == '=' || *rp == '!') { - nterms++; - } - } - - if (!nterms) { - return -EINVAL; - } - - terms = kmalloc(nterms * sizeof(struct rbce_rule_term), GFP_KERNEL); - if (!terms) { - return -ENOMEM; - } - - while ((p = strsep(&rule_defn, ",")) != NULL) { - - substring_t args[MAX_OPT_ARGS]; - int token; - - while (*p && isspace(*p)) - p++; - if (!*p) - continue; - - token = match_token(p, tokens, args); - - terms[i].op = token_to_ruleop[token]; - terms[i].operator = token_to_operator[token]; - switch (token) { - - case TOKEN_PATH: - case TOKEN_CMD: - case TOKEN_ARGS: - case TOKEN_TAG: - case TOKEN_IPV4: - case TOKEN_IPV6: - // all these tokens can be specified only once - if (*term_mask & (1 << terms[i].op)) { - nterms = -EINVAL; - goto out; - } - /*FALLTHRU*/ case TOKEN_CLASS: - case TOKEN_DEP: - case TOKEN_DEP_ADD: - case TOKEN_DEP_DEL: - terms[i].u.string = args->from; - break; - - case TOKEN_RUID_EQ: - case TOKEN_RUID_LT: - case TOKEN_RUID_GT: - case TOKEN_RUID_NOT: - case TOKEN_RGID_EQ: - case TOKEN_RGID_LT: - case TOKEN_RGID_GT: - case TOKEN_RGID_NOT: - case TOKEN_EUID_EQ: - case TOKEN_EUID_LT: - case TOKEN_EUID_GT: - case TOKEN_EUID_NOT: - case TOKEN_EGID_EQ: - case TOKEN_EGID_LT: - case TOKEN_EGID_GT: - case TOKEN_EGID_NOT: - case TOKEN_XID_EQ: - case TOKEN_XID_LT: - case TOKEN_XID_GT: - case TOKEN_XID_NOT: - // all these tokens can be specified only once - if (*term_mask & (1 << terms[i].op)) { - nterms = -EINVAL; - goto out; - } - /*FALLTHRU*/ case TOKEN_ORDER: - case TOKEN_STATE: - if (match_int(args, &option)) { - nterms = -EINVAL; - goto out; - } - terms[i].u.id = option; - break; - default: - nterms = -EINVAL; - goto out; - } - *term_mask |= (1 << terms[i].op); - i++; - } - *rterms = terms; - - out: - if (nterms < 0) { - kfree(terms); - *term_mask = 0; - } /* else { - for (i = 0; i < nterms; i++) { - printk(KERN_DEBUG "token: i %d; op %d, operator %d, str %ld\n", - i, terms[i].op, terms[i].operator, terms[i].u.id); - } - } */ - return nterms; -} diff --git a/kernel/ckrm_classqueue.c b/kernel/ckrm_classqueue.c deleted file mode 100644 index 80d5d495c..000000000 --- a/kernel/ckrm_classqueue.c +++ /dev/null @@ -1,211 +0,0 @@ -/* kernel/ckrm_classqueue.c : implements the class queue - * - * Copyright (C) Haoqiang 
Zheng, IBM Corp. 2003 - * (C) Hubertus Franke, IBM Corp. 2003 - * - * Class queue functionality for CKRM cpu controller - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ - -/* Changes - * - * Aug 28, 2003 - * Created. - * July 08, 2004 - * classqueue now has a fixed size - * major clean up - * function/structure names are changed to more intuitive ones - */ -#include -#include - -#define cq_nr_member(cq) (cq->array.nr_active) - -/** - * get_index - translate the logical priority to the real index in the queue - * - * validate the position - * a valid prio is [cq->base,cq->base + size -1] - */ -static inline unsigned long get_index(struct classqueue_struct *cq, int *prio) -{ - unsigned long index; - int max_prio; - - if (!cq_nr_member(cq)) - return 0; - - max_prio = cq->base + (CLASSQUEUE_SIZE - 1); - if (*prio > max_prio) - *prio = max_prio; - if (*prio < cq->base) - *prio = cq->base; - - index = (cq->base_offset + (*prio - cq->base)) ; - if (index >= CLASSQUEUE_SIZE) - index -= CLASSQUEUE_SIZE; - - return index; -} - -/** - * initialize a class queue object - */ -int classqueue_init(struct classqueue_struct *cq) -{ - int i; - struct cq_prio_array *array; - - array = &cq->array; - for (i = 0; i < CLASSQUEUE_SIZE; i++) { - INIT_LIST_HEAD(array->queue + i); - __clear_bit(i, array->bitmap); - } - // delimiter for bitsearch - __set_bit(CLASSQUEUE_SIZE, array->bitmap); - array->nr_active = 0; - - cq->base = 0; - cq->base_offset = -1; //not valid yet - - return 0; -} - -/** - *classqueue_enqueue - add the class to classqueue based on its prio - */ -void classqueue_enqueue(struct classqueue_struct *cq, - cq_node_t * node, int prio) -{ - int index; - - //get real index - if (cq_nr_member(cq)) { - index = get_index(cq, &prio); - } else { //the first one - cq->base = prio; - cq->base_offset = 0; - index = 0; - } - - //add to the queue - list_add(&(node->list), &cq->array.queue[index]); - __set_bit(index, cq->array.bitmap); - cq->array.nr_active++; - - node->index = index; - node->prio = prio; -} - -void classqueue_dequeue(struct classqueue_struct *cq, cq_node_t * node) -{ - //delete from queue - list_del_init(&(node->list)); - cq->array.nr_active--; - - //check clear the bitmap - if (list_empty(&cq->array.queue[node->index])) - __clear_bit(node->index, cq->array.bitmap); -} - -void classqueue_update_prio(struct classqueue_struct *cq, - cq_node_t * node, int new_pos) -{ - int index; - - if (! 
cls_in_classqueue(node)) - return; - - index = get_index(cq, &new_pos); - node->prio = new_pos; - - //remove from the original position - list_del_init(&(node->list)); - if (list_empty(&cq->array.queue[node->index])) - __clear_bit(node->index, cq->array.bitmap); - - //add to new positon, round robin for classes with same priority - list_add_tail(&(node->list), &cq->array.queue[index]); - __set_bit(index, cq->array.bitmap); - node->index = index; -} - -/** - *classqueue_get_min_prio: return the priority of the last node in queue - * - * this function can be called without runqueue lock held - */ -static inline int classqueue_get_min_prio(struct classqueue_struct *cq) -{ - cq_node_t *result = NULL; - int pos; - - /* - * search over the bitmap to get the first class in the queue - */ - pos = find_next_bit(cq->array.bitmap, CLASSQUEUE_SIZE, cq->base_offset); - //do circular search from the beginning - if (pos >= CLASSQUEUE_SIZE) - pos = find_first_bit(cq->array.bitmap, CLASSQUEUE_SIZE); - - if (pos < CLASSQUEUE_SIZE) { - result = list_entry(cq->array.queue[pos].next, cq_node_t, list); - if (list_empty(&cq->array.queue[pos])) - result = NULL; - } - if (result) - return result->prio; - else - return 0; -} - -/** - * this function must be called with runqueue lock held - */ -cq_node_t *classqueue_get_head(struct classqueue_struct *cq) -{ - cq_node_t *result = NULL; - int pos; - - /* - * search over the bitmap to get the first class in the queue - */ - pos = find_next_bit(cq->array.bitmap, CLASSQUEUE_SIZE, cq->base_offset); - //do circular search from the beginning - if (pos >= CLASSQUEUE_SIZE) - pos = find_first_bit(cq->array.bitmap, CLASSQUEUE_SIZE); - - if (pos < CLASSQUEUE_SIZE) { - BUG_ON(list_empty(&cq->array.queue[pos])); - result = list_entry(cq->array.queue[pos].next, cq_node_t, list); - } - return result; -} - -/** - * Moving the end of queue forward - * the new_base here is logical, we need to translate to the abosule position - */ -void classqueue_update_base(struct classqueue_struct *cq) -{ - int new_base; - - if (! cq_nr_member(cq)) { - cq->base_offset = -1; //not defined - return; - } - - new_base = classqueue_get_min_prio(cq); - - if (new_base > cq->base) { - cq->base_offset = get_index(cq, &new_base); - cq->base = new_base; - } -} diff --git a/kernel/ckrm_sched.c b/kernel/ckrm_sched.c deleted file mode 100644 index 7ed70d042..000000000 --- a/kernel/ckrm_sched.c +++ /dev/null @@ -1,217 +0,0 @@ -/* kernel/ckrm_sched.c - Supporting functions for ckrm scheduling - * - * Copyright (C) Haoqiang Zheng, IBM Corp. 2004 - * (C) Hubertus Franke, IBM Corp. 2004 - * - * Latest version, more details at http://ckrm.sf.net - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ -#include -#include -#include - -rwlock_t class_list_lock = RW_LOCK_UNLOCKED; -LIST_HEAD(active_cpu_classes); // list of active cpu classes; anchor - -struct ckrm_cpu_class default_cpu_class_obj; - -struct ckrm_cpu_class * get_default_cpu_class(void) { - return (&default_cpu_class_obj); -} - -/*******************************************************/ -/* CVT Management */ -/*******************************************************/ - -static inline void check_inactive_class(ckrm_lrq_t * lrq,CVT_t cur_cvt) -{ - CVT_t min_cvt; - CVT_t bonus; - - //just a safty measure - if (unlikely(! 
cur_cvt)) - return; - -#ifndef INTERACTIVE_BONUS_SUPPORT -#warning "ACB taking out interactive bonus calculation" - bonus = 0; -#else - /* - * Always leaving a small bonus for inactive classes - * allows them to compete for cycles immediately when the become - * active. This should improve interactive behavior - */ - bonus = INTERACTIVE_BONUS(lrq); -#endif - - //cvt can't be negative - if (cur_cvt > bonus) - min_cvt = cur_cvt - bonus; - else - min_cvt = 0; - - if (lrq->local_cvt < min_cvt) { - CVT_t lost_cvt; - - lost_cvt = scale_cvt(min_cvt - lrq->local_cvt,lrq); - lrq->local_cvt = min_cvt; - - /* add what the class lost to its savings*/ - lrq->savings += lost_cvt; - if (lrq->savings > MAX_SAVINGS) - lrq->savings = MAX_SAVINGS; - } else if (lrq->savings) { - /* - *if a class saving and falling behind - * then start to use it saving in a leaking bucket way - */ - CVT_t savings_used; - - savings_used = scale_cvt((lrq->local_cvt - min_cvt),lrq); - if (savings_used > lrq->savings) - savings_used = lrq->savings; - - if (savings_used > SAVINGS_LEAK_SPEED) - savings_used = SAVINGS_LEAK_SPEED; - - BUG_ON(lrq->savings < savings_used); - lrq->savings -= savings_used; - unscale_cvt(savings_used,lrq); - BUG_ON(lrq->local_cvt < savings_used); -#ifndef CVT_SAVINGS_SUPPORT -#warning "ACB taking out cvt saving" -#else - lrq->local_cvt -= savings_used; -#endif - } -} - -/* - * return the max_cvt of all the classes - */ -static inline CVT_t get_max_cvt(int this_cpu) -{ - struct ckrm_cpu_class *clsptr; - ckrm_lrq_t * lrq; - CVT_t max_cvt; - - max_cvt = 0; - - /*update class time, at the same time get max_cvt */ - list_for_each_entry(clsptr, &active_cpu_classes, links) { - lrq = get_ckrm_lrq(clsptr, this_cpu); - if (lrq->local_cvt > max_cvt) - max_cvt = lrq->local_cvt; - } - - return max_cvt; -} - -/** - * update_class_cputime - updates cvt of inactive classes - * -- an inactive class shouldn't starve others when it comes back - * -- the cpu time it lost when it's inactive should be accumulated - * -- its accumulated saving should be compensated (in a leaky bucket fashion) - * - * class_list_lock must have been acquired - */ -void update_class_cputime(int this_cpu) -{ - struct ckrm_cpu_class *clsptr; - ckrm_lrq_t * lrq; - CVT_t cur_cvt; - - /* - * a class's local_cvt must not be significantly smaller than min_cvt - * of active classes otherwise, it will starve other classes when it - * is reactivated. - * - * Hence we keep all local_cvt's within a range of the min_cvt off - * all active classes (approximated by the local_cvt of the currently - * running class) and account for how many cycles where thus taken - * from an inactive class building a savings (not to exceed a few seconds) - * for a class to gradually make up upon reactivation, without - * starvation of other classes. - * - */ - cur_cvt = get_local_cur_cvt(this_cpu); - - /* - * cur_cvt == 0 means the system is now idle - * in this case, we use max_cvt as cur_cvt - * max_cvt roughly represents the cvt of the class - * that has just finished running - * - * fairness wouldn't be a problem since we account for whatever lost in savings - * if the system is not busy, the system responsiveness is not a problem. - * still fine if the sytem is busy, but happened to be idle at this certain point - * since bias toward interactive classes (class priority) is a more important way to improve system responsiveness - */ - if (unlikely(! 
cur_cvt)) { - cur_cvt = get_max_cvt(this_cpu); - //return; - } - - /* - * - check the local cvt of all the classes - * - update total_ns received by the class - * - do a usage sampling for the whole class - */ - list_for_each_entry(clsptr, &active_cpu_classes, links) { - lrq = get_ckrm_lrq(clsptr, this_cpu); - - spin_lock(&clsptr->stat.stat_lock); - clsptr->stat.total_ns += lrq->uncounted_ns; - ckrm_sample_usage(clsptr); - spin_unlock(&clsptr->stat.stat_lock); - lrq->uncounted_ns = 0; - - check_inactive_class(lrq,cur_cvt); - } -} - -/*******************************************************/ -/* PID load balancing stuff */ -/*******************************************************/ -#define PID_SAMPLE_T 32 -#define PID_KP 20 -#define PID_KI 60 -#define PID_KD 20 - -/** - * sample pid load periodically - */ - -void ckrm_load_sample(ckrm_load_t* pid,int cpu) -{ - long load; - long err; - - if (jiffies % PID_SAMPLE_T) - return; - - adjust_local_weight(); - - load = ckrm_cpu_load(cpu); - err = load - pid->load_p; - pid->load_d = err; - pid->load_p = load; - pid->load_i *= 9; - pid->load_i += load; - pid->load_i /= 10; -} - -long pid_get_pressure(ckrm_load_t* ckrm_load, int local_group) -{ - long pressure; - pressure = ckrm_load->load_p * PID_KP; - pressure += ckrm_load->load_i * PID_KI; - pressure += ckrm_load->load_d * PID_KD; - pressure /= 100; - return pressure; -} diff --git a/kernel/exit.c b/kernel/exit.c index 8ca3c1711..a5ab322f4 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -25,9 +25,6 @@ #include #include #include -#include -#include -#include #include #include @@ -514,7 +511,6 @@ static inline void __exit_mm(struct task_struct * tsk) task_lock(tsk); tsk->mm = NULL; up_read(&mm->mmap_sem); - ckrm_task_clear_mm(tsk, mm); enter_lazy_tlb(mm, current); task_unlock(tsk); mmput(mm); @@ -671,8 +667,6 @@ static void exit_notify(struct task_struct *tsk) struct task_struct *t; struct list_head ptrace_dead, *_p, *_n; - ckrm_cb_exit(tsk); - if (signal_pending(tsk) && !tsk->signal->group_exit && !thread_group_empty(tsk)) { /* diff --git a/kernel/fork.c b/kernel/fork.c index 1902e9d2e..a8fc22411 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -39,9 +39,6 @@ #include #include #include -#include -#include -#include #include #include #include @@ -160,8 +157,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk->thread_info = ti; ti->task = tsk; - ckrm_cb_newtask(tsk); - ckrm_task_mm_init(tsk); /* One for us, one for whoever does the "release_task()" (usually parent) */ atomic_set(&tsk->usage,2); return tsk; @@ -309,7 +304,6 @@ static struct mm_struct * mm_init(struct mm_struct * mm) mm->ioctx_list = NULL; mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm); mm->free_area_cache = TASK_UNMAPPED_BASE; - ckrm_mm_init(mm); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -331,7 +325,6 @@ struct mm_struct * mm_alloc(void) if (mm) { memset(mm, 0, sizeof(*mm)); mm = mm_init(mm); - ckrm_mm_setclass(mm, ckrm_get_mem_class(current)); } return mm; } @@ -346,7 +339,6 @@ void fastcall __mmdrop(struct mm_struct *mm) BUG_ON(mm == &init_mm); mm_free_pgd(mm); destroy_context(mm); - ckrm_mm_clearclass(mm); clr_vx_info(&mm->mm_vx_info); free_mm(mm); } @@ -486,10 +478,8 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) goto free_pt; good_mm: - ckrm_mm_setclass(mm, oldmm->memclass); tsk->mm = mm; tsk->active_mm = mm; - ckrm_init_mm_to_task(mm, tsk); return 0; free_pt: @@ -902,7 +892,6 @@ static task_t *copy_process(unsigned long clone_flags, if 
(p->binfmt && !try_module_get(p->binfmt->module)) goto bad_fork_cleanup_put_domain; - init_delays(p); p->did_exec = 0; copy_flags(clone_flags, p); p->pid = pid; @@ -1195,9 +1184,6 @@ long do_fork(unsigned long clone_flags, clone_flags |= CLONE_PTRACE; } - if (numtasks_get_ref(current->taskclass, 0) == 0) { - return -ENOMEM; - } p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid); /* @@ -1207,8 +1193,6 @@ long do_fork(unsigned long clone_flags, if (!IS_ERR(p)) { struct completion vfork; - ckrm_cb_fork(p); - if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; init_completion(&vfork); @@ -1239,7 +1223,6 @@ long do_fork(unsigned long clone_flags, ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); } } else { - numtasks_put_ref(current->taskclass); free_pidmap(pid); pid = PTR_ERR(p); } diff --git a/kernel/sched.c b/kernel/sched.c index 10548388c..ff272c8cb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -17,6 +17,7 @@ * 2003-09-03 Interactivity tuning by Con Kolivas. * 2004-04-02 Scheduler domains code by Nick Piggin */ + #include #include #include @@ -43,10 +44,6 @@ #include #include #include -#include -#include -#include -#include #include #include @@ -60,10 +57,6 @@ #define cpu_to_node_mask(cpu) (cpu_online_map) #endif -/* used to soft spin in sched while dump is in progress */ -unsigned long dump_oncpu; -EXPORT_SYMBOL(dump_oncpu); - /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -169,20 +162,8 @@ EXPORT_SYMBOL(dump_oncpu); #define LOW_CREDIT(p) \ ((p)->interactive_credit < -CREDIT_LIMIT) -#ifdef CONFIG_CKRM_CPU_SCHEDULE -/* - * if belong to different class, compare class priority - * otherwise compare task priority - */ -#define TASK_PREEMPTS_CURR(p, rq) \ - ( ((p)->cpu_class != (rq)->curr->cpu_class) \ - && ((rq)->curr != (rq)->idle) && ((p) != (rq)->idle )) \ - ? class_preempts_curr((p),(rq)->curr) \ - : ((p)->prio < (rq)->curr->prio) -#else #define TASK_PREEMPTS_CURR(p, rq) \ ((p)->prio < (rq)->curr->prio) -#endif /* * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] @@ -196,7 +177,7 @@ EXPORT_SYMBOL(dump_oncpu); #define SCALE_PRIO(x, prio) \ max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) -unsigned int task_timeslice(task_t *p) +static unsigned int task_timeslice(task_t *p) { if (p->static_prio < NICE_TO_PRIO(0)) return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); @@ -210,9 +191,15 @@ unsigned int task_timeslice(task_t *p) * These are the runqueue data structures: */ +#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) + typedef struct runqueue runqueue_t; -#include -#include + +struct prio_array { + unsigned int nr_active; + unsigned long bitmap[BITMAP_SIZE]; + struct list_head queue[MAX_PRIO]; +}; /* * This is the main, per-CPU runqueue data structure. 
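The hunk above reinstates the stock O(1) scheduler's prio_array in place of CKRM's class queues: an active-task count, a priority bitmap, and one list head per priority level, so picking the next runnable task is a fixed-cost bitmap scan. The following is a minimal userspace sketch of that idea, not kernel code: the struct and helpers are re-declared locally, the per-priority task lists are reduced to counters, and __builtin_ctzl (a GCC/Clang builtin) stands in for the kernel's sched_find_first_bit().

#include <limits.h>
#include <stdio.h>

#define MAX_PRIO 140
#define BITS_PER_LONG (sizeof(long) * CHAR_BIT)
#define BITMAP_LONGS ((MAX_PRIO + BITS_PER_LONG - 1) / BITS_PER_LONG)

struct prio_array {
        unsigned int nr_active;
        unsigned long bitmap[BITMAP_LONGS];
        int nr_tasks[MAX_PRIO];  /* stands in for the per-priority task lists */
};

static void enqueue(struct prio_array *a, int prio)
{
        a->bitmap[prio / BITS_PER_LONG] |= 1UL << (prio % BITS_PER_LONG);
        a->nr_tasks[prio]++;
        a->nr_active++;
}

/* stand-in for sched_find_first_bit(): first set bit == best priority */
static int first_set_bit(const unsigned long *map)
{
        for (int i = 0; i < BITMAP_LONGS; i++)
                if (map[i])
                        return i * BITS_PER_LONG + __builtin_ctzl(map[i]);
        return MAX_PRIO;
}

int main(void)
{
        struct prio_array a = { 0 };

        enqueue(&a, 120);
        enqueue(&a, 100);
        /* pick-next scans a fixed-size bitmap, independent of task count */
        printf("next prio: %d\n", first_set_bit(a.bitmap));  /* 100 */
        return 0;
}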
@@ -246,12 +233,7 @@ struct runqueue { unsigned long long timestamp_last_tick; task_t *curr, *idle; struct mm_struct *prev_mm; -#ifdef CONFIG_CKRM_CPU_SCHEDULE - struct classqueue_struct classqueue; - ckrm_load_t ckrm_load; -#else - prio_array_t *active, *expired, arrays[2]; -#endif + prio_array_t *active, *expired, arrays[2]; int best_expired_prio; atomic_t nr_iowait; @@ -585,101 +567,6 @@ static inline void sched_info_switch(task_t *prev, task_t *next) #define sched_info_switch(t, next) do { } while (0) #endif /* CONFIG_SCHEDSTATS */ -#ifdef CONFIG_CKRM_CPU_SCHEDULE -static inline ckrm_lrq_t *rq_get_next_class(struct runqueue *rq) -{ - cq_node_t *node = classqueue_get_head(&rq->classqueue); - return ((node) ? class_list_entry(node) : NULL); -} - -/* - * return the cvt of the current running class - * if no current running class, return 0 - * assume cpu is valid (cpu_online(cpu) == 1) - */ -CVT_t get_local_cur_cvt(int cpu) -{ - ckrm_lrq_t * lrq = rq_get_next_class(cpu_rq(cpu)); - - if (lrq) - return lrq->local_cvt; - else - return 0; -} - -static inline struct task_struct * rq_get_next_task(struct runqueue* rq) -{ - prio_array_t *array; - struct task_struct *next; - ckrm_lrq_t *queue; - int idx; - int cpu = smp_processor_id(); - - // it is guaranteed be the ( rq->nr_running > 0 ) check in - // schedule that a task will be found. - - retry_next_class: - queue = rq_get_next_class(rq); - // BUG_ON( !queue ); - - array = queue->active; - if (unlikely(!array->nr_active)) { - queue->active = queue->expired; - queue->expired = array; - queue->expired_timestamp = 0; - - schedstat_inc(rq, sched_switch); - if (queue->active->nr_active) - set_top_priority(queue, - find_first_bit(queue->active->bitmap, MAX_PRIO)); - else { - classqueue_dequeue(queue->classqueue, - &queue->classqueue_linkobj); - cpu_demand_event(get_rq_local_stat(queue,cpu),CPU_DEMAND_DEQUEUE,0); - } - goto retry_next_class; - } else - schedstat_inc(rq, sched_noswitch); - // BUG_ON(!array->nr_active); - - idx = queue->top_priority; - // BUG_ON (idx == MAX_PRIO); - next = task_list_entry(array->queue[idx].next); - return next; -} -#else /*! CONFIG_CKRM_CPU_SCHEDULE*/ -static inline struct task_struct * rq_get_next_task(struct runqueue* rq) -{ - prio_array_t *array; - struct list_head *queue; - int idx; - - array = rq->active; - if (unlikely(!array->nr_active)) { - /* - * Switch the active and expired arrays. 
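rq_get_next_class() above relies on the circular bitmap scan implemented in the deleted kernel/ckrm_classqueue.c: search forward from base_offset, and wrap around to the start of the bitmap if nothing is found there. Below is a self-contained model of that wrap-around scan, with a byte array and a linear loop standing in for the kernel's bitmap and find_next_bit()/find_first_bit().

#include <stdio.h>

#define CLASSQUEUE_SIZE 128

/* toy stand-in for the kernel's find_next_bit()/find_first_bit() */
static int find_next(const unsigned char *bits, int size, int from)
{
        for (int i = from; i < size; i++)
                if (bits[i])
                        return i;
        return size;
}

/* circular variant used by classqueue_get_head()/classqueue_get_min_prio() */
static int circular_find(const unsigned char *bits, int base_offset)
{
        int pos = find_next(bits, CLASSQUEUE_SIZE, base_offset);
        if (pos >= CLASSQUEUE_SIZE)      /* wrap to the beginning */
                pos = find_next(bits, CLASSQUEUE_SIZE, 0);
        return pos;                      /* CLASSQUEUE_SIZE means empty */
}

int main(void)
{
        unsigned char occupied[CLASSQUEUE_SIZE] = { 0 };
        occupied[3] = 1;                 /* a class queued "behind" the base */
        printf("head slot: %d\n", circular_find(occupied, 100));  /* 3 */
        return 0;
}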
- */ - schedstat_inc(rq, sched_switch); - rq->active = rq->expired; - rq->expired = array; - array = rq->active; - rq->expired_timestamp = 0; - rq->best_expired_prio = MAX_PRIO; - } else - schedstat_inc(rq, sched_noswitch); - - idx = sched_find_first_bit(array->bitmap); - queue = array->queue + idx; - return list_entry(queue->next, task_t, run_list); -} - -static inline void class_enqueue_task(struct task_struct* p, prio_array_t *array) { } -static inline void class_dequeue_task(struct task_struct* p, prio_array_t *array) { } -static inline void init_cpu_classes(void) { } -#define rq_ckrm_load(rq) NULL -static inline void ckrm_sched_tick(int j,int this_cpu,void* name) {} -#endif /* CONFIG_CKRM_CPU_SCHEDULE */ - /* * Adding/removing a task to/from a priority array: */ @@ -689,7 +576,6 @@ static void dequeue_task(struct task_struct *p, prio_array_t *array) list_del(&p->run_list); if (list_empty(array->queue + p->prio)) __clear_bit(p->prio, array->bitmap); - class_dequeue_task(p,array); } static void enqueue_task(struct task_struct *p, prio_array_t *array) @@ -699,7 +585,6 @@ static void enqueue_task(struct task_struct *p, prio_array_t *array) __set_bit(p->prio, array->bitmap); array->nr_active++; p->array = array; - class_enqueue_task(p,array); } /* @@ -713,7 +598,6 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) __set_bit(p->prio, array->bitmap); array->nr_active++; p->array = array; - class_enqueue_task(p,array); } /* @@ -740,10 +624,9 @@ static int effective_prio(task_t *p) bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; prio = p->static_prio - bonus; -#ifdef CONFIG_VSERVER_HARDCPU if (task_vx_flags(p, VXF_SCHED_PRIO, 0)) prio += effective_vavavoom(p, MAX_USER_PRIO); -#endif + if (prio < MAX_RT_PRIO) prio = MAX_RT_PRIO; if (prio > MAX_PRIO-1) @@ -756,7 +639,7 @@ static int effective_prio(task_t *p) */ static inline void __activate_task(task_t *p, runqueue_t *rq) { - enqueue_task(p, rq_active(p,rq)); + enqueue_task(p, rq->active); rq->nr_running++; } @@ -765,7 +648,7 @@ static inline void __activate_task(task_t *p, runqueue_t *rq) */ static inline void __activate_idle_task(task_t *p, runqueue_t *rq) { - enqueue_task_head(p, rq_active(p,rq)); + enqueue_task_head(p, rq->active); rq->nr_running++; } @@ -900,7 +783,6 @@ static void __deactivate_task(struct task_struct *p, runqueue_t *rq) { rq->nr_running--; dequeue_task(p, p->array); - p->array = NULL; } @@ -1299,9 +1181,6 @@ void fastcall sched_fork(task_t *p) #ifdef CONFIG_SCHEDSTATS memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif -#ifdef CONFIG_CKRM_CPU_SCHEDULE - cpu_demand_event(&p->demand_stat,CPU_DEMAND_INIT,0); -#endif #ifdef CONFIG_PREEMPT /* * During context-switch we hold precisely one spinlock, which @@ -1389,7 +1268,6 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) p->array = current->array; p->array->nr_active++; rq->nr_running++; - class_enqueue_task(p,p->array); } set_need_resched(); } else @@ -1821,449 +1699,6 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, return 1; } -#ifdef CONFIG_CKRM_CPU_SCHEDULE -static inline int ckrm_preferred_task(task_t *tmp,long min, long max, - int phase, enum idle_type idle) -{ - long pressure = task_load(tmp); - - if (pressure > max) - return 0; - - if ((idle == NOT_IDLE) && ! 
phase && (pressure <= min)) - return 0; - return 1; -} - -/* - * move tasks for a specic local class - * return number of tasks pulled - */ -static inline int ckrm_cls_move_tasks(ckrm_lrq_t* src_lrq,ckrm_lrq_t*dst_lrq, - runqueue_t *this_rq, - runqueue_t *busiest, - struct sched_domain *sd, - int this_cpu, - enum idle_type idle, - long* pressure_imbalance) -{ - prio_array_t *array, *dst_array; - struct list_head *head, *curr; - task_t *tmp; - int idx; - int pulled = 0; - int phase = -1; - long pressure_min, pressure_max; - /*hzheng: magic : 90% balance is enough*/ - long balance_min = *pressure_imbalance / 10; -/* - * we don't want to migrate tasks that will reverse the balance - * or the tasks that make too small difference - */ -#define CKRM_BALANCE_MAX_RATIO 100 -#define CKRM_BALANCE_MIN_RATIO 1 - start: - phase ++; - /* - * We first consider expired tasks. Those will likely not be - * executed in the near future, and they are most likely to - * be cache-cold, thus switching CPUs has the least effect - * on them. - */ - if (src_lrq->expired->nr_active) { - array = src_lrq->expired; - dst_array = dst_lrq->expired; - } else { - array = src_lrq->active; - dst_array = dst_lrq->active; - } - - new_array: - /* Start searching at priority 0: */ - idx = 0; - skip_bitmap: - if (!idx) - idx = sched_find_first_bit(array->bitmap); - else - idx = find_next_bit(array->bitmap, MAX_PRIO, idx); - if (idx >= MAX_PRIO) { - if (array == src_lrq->expired && src_lrq->active->nr_active) { - array = src_lrq->active; - dst_array = dst_lrq->active; - goto new_array; - } - if ((! phase) && (! pulled) && (idle != IDLE)) - goto start; //try again - else - goto out; //finished search for this lrq - } - - head = array->queue + idx; - curr = head->prev; - skip_queue: - tmp = list_entry(curr, task_t, run_list); - - curr = curr->prev; - - if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; - } - - pressure_min = *pressure_imbalance * CKRM_BALANCE_MIN_RATIO/100; - pressure_max = *pressure_imbalance * CKRM_BALANCE_MAX_RATIO/100; - /* - * skip the tasks that will reverse the balance too much - */ - if (ckrm_preferred_task(tmp,pressure_min,pressure_max,phase,idle)) { - *pressure_imbalance -= task_load(tmp); - pull_task(busiest, array, tmp, - this_rq, dst_array, this_cpu); - pulled++; - - if (*pressure_imbalance <= balance_min) - goto out; - } - - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; - out: - return pulled; -} - -static inline long ckrm_rq_imbalance(runqueue_t *this_rq,runqueue_t *dst_rq) -{ - long imbalance; - /* - * make sure after balance, imbalance' > - imbalance/2 - * we don't want the imbalance be reversed too much - */ - imbalance = pid_get_pressure(rq_ckrm_load(dst_rq),0) - - pid_get_pressure(rq_ckrm_load(this_rq),1); - imbalance /= 2; - return imbalance; -} - -/* - * try to balance the two runqueues - * - * Called with both runqueues locked. 
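ckrm_cls_move_tasks() above filters migration candidates through ckrm_preferred_task(), which applies a pressure window: a task is rejected when its load would overshoot the remaining imbalance, and, on the first pass of a non-idle balance, when its load is too small to make a difference. A compilable restatement of that predicate, with task_load() reduced to a plain number:

#include <stdbool.h>
#include <stdio.h>

/* min/max are CKRM_BALANCE_MIN/MAX_RATIO percent of the remaining
 * imbalance (1% and 100% in the removed code) */
static bool preferred_task(long load, long min, long max, int phase, bool idle)
{
        if (load > max)                  /* would reverse the balance */
                return false;
        if (!idle && phase == 0 && load <= min)
                return false;            /* too small to matter, first pass */
        return true;
}

int main(void)
{
        long imbalance = 1000;
        long min = imbalance * 1 / 100, max = imbalance;

        printf("%d\n", preferred_task(5, min, max, 0, false));    /* 0 */
        printf("%d\n", preferred_task(400, min, max, 0, false));  /* 1 */
        printf("%d\n", preferred_task(1500, min, max, 1, false)); /* 0 */
        return 0;
}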
- * if move_tasks is called, it will try to move at least one task over - */ -static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, - unsigned long max_nr_move, struct sched_domain *sd, - enum idle_type idle) -{ - struct ckrm_cpu_class *clsptr,*vip_cls = NULL; - ckrm_lrq_t* src_lrq,*dst_lrq; - long pressure_imbalance, pressure_imbalance_old; - int src_cpu = task_cpu(busiest->curr); - struct list_head *list; - int pulled = 0; - long imbalance; - - imbalance = ckrm_rq_imbalance(this_rq,busiest); - - if ((idle == NOT_IDLE && imbalance <= 0) || busiest->nr_running <= 1) - goto out; - - //try to find the vip class - list_for_each_entry(clsptr,&active_cpu_classes,links) { - src_lrq = get_ckrm_lrq(clsptr,src_cpu); - - if (! lrq_nr_running(src_lrq)) - continue; - - if (! vip_cls || cpu_class_weight(vip_cls) < cpu_class_weight(clsptr) ) - { - vip_cls = clsptr; - } - } - - /* - * do search from the most significant class - * hopefully, less tasks will be migrated this way - */ - clsptr = vip_cls; - - move_class: - if (! clsptr) - goto out; - - - src_lrq = get_ckrm_lrq(clsptr,src_cpu); - if (! lrq_nr_running(src_lrq)) - goto other_class; - - dst_lrq = get_ckrm_lrq(clsptr,this_cpu); - - //how much pressure for this class should be transferred - pressure_imbalance = src_lrq->lrq_load * imbalance/src_lrq->local_weight; - if (pulled && ! pressure_imbalance) - goto other_class; - - pressure_imbalance_old = pressure_imbalance; - - //move tasks - pulled += - ckrm_cls_move_tasks(src_lrq,dst_lrq, - this_rq, - busiest, - sd,this_cpu,idle, - &pressure_imbalance); - - /* - * hzheng: 2 is another magic number - * stop balancing if the imbalance is less than 25% of the orig - */ - if (pressure_imbalance <= (pressure_imbalance_old >> 2)) - goto out; - - //update imbalance - imbalance *= pressure_imbalance / pressure_imbalance_old; - other_class: - //who is next? - list = clsptr->links.next; - if (list == &active_cpu_classes) - list = list->next; - clsptr = list_entry(list, typeof(*clsptr), links); - if (clsptr != vip_cls) - goto move_class; - out: - return pulled; -} - -/** - * ckrm_check_balance - is load balancing necessary? 
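move_tasks() above starts from the highest-weight ("vip") class and converts the runqueue imbalance into a per-class pressure target (lrq_load * imbalance / local_weight), rescales the remaining imbalance after each class, and stops once the residual pressure falls to roughly a quarter of the pre-move value. A toy walk through that apportioning under assumed loads and weights:

#include <stdio.h>

struct lrq { long load, weight; };

int main(void)
{
        /* hypothetical per-class local runqueues on the source CPU */
        struct lrq cls[3] = { { 600, 300 }, { 300, 200 }, { 100, 100 } };
        long imbalance = 200;

        for (int i = 0; i < 3 && imbalance > 0; i++) {
                /* per-class share of the imbalance, as in move_tasks() */
                long target = cls[i].load * imbalance / cls[i].weight;
                long residual = target / 3;   /* pretend 2/3 was migrated */

                printf("class %d: target %ld, residual %ld\n",
                       i, target, residual);

                /* stop once the residual is about 25% of the target */
                if (residual <= target / 4)
                        break;
                imbalance = imbalance * residual / target;
        }
        return 0;
}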
- * return 0 if load balancing is not necessary - * otherwise return the average load of the system - * also, update nr_group - * - * heuristics: - * no load balancing if it's load is over average - * no load balancing if it's load is far more than the min - * task: - * read the status of all the runqueues - */ -static unsigned long ckrm_check_balance(struct sched_domain *sd, int this_cpu, - enum idle_type idle, int* nr_group) -{ - struct sched_group *group = sd->groups; - unsigned long min_load, max_load, avg_load; - unsigned long total_load, this_load, total_pwr; - - max_load = this_load = total_load = total_pwr = 0; - min_load = 0xFFFFFFFF; - *nr_group = 0; - - do { - cpumask_t tmp; - unsigned long load; - int local_group; - int i, nr_cpus = 0; - - /* Tally up the load of all CPUs in the group */ - cpus_and(tmp, group->cpumask, cpu_online_map); - if (unlikely(cpus_empty(tmp))) - goto nextgroup; - - avg_load = 0; - local_group = cpu_isset(this_cpu, group->cpumask); - - for_each_cpu_mask(i, tmp) { - load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),local_group); - nr_cpus++; - avg_load += load; - } - - if (!nr_cpus) - goto nextgroup; - - total_load += avg_load; - total_pwr += group->cpu_power; - - /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; - - if (local_group) { - this_load = avg_load; - goto nextgroup; - } else if (avg_load > max_load) { - max_load = avg_load; - } - if (avg_load < min_load) { - min_load = avg_load; - } -nextgroup: - group = group->next; - *nr_group = *nr_group + 1; - } while (group != sd->groups); - - if (!max_load || this_load >= max_load) - goto out_balanced; - - avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; - - /* hzheng: debugging: 105 is a magic number - * 100*max_load <= sd->imbalance_pct*this_load) - * should use imbalance_pct instead - */ - if (this_load > avg_load - || 100*max_load < 105*this_load - || 100*min_load < 70*this_load - ) - goto out_balanced; - - return avg_load; - out_balanced: - return 0; -} - -/** - * any group that has above average load is considered busy - * find the busiest queue from any of busy group - */ -static runqueue_t * -ckrm_find_busy_queue(struct sched_domain *sd, int this_cpu, - unsigned long avg_load, enum idle_type idle, - int nr_group) -{ - struct sched_group *group; - runqueue_t * busiest=NULL; - unsigned long rand; - - group = sd->groups; - rand = get_ckrm_rand(nr_group); - nr_group = 0; - - do { - unsigned long load,total_load,max_load; - cpumask_t tmp; - int i; - runqueue_t * grp_busiest; - - cpus_and(tmp, group->cpumask, cpu_online_map); - if (unlikely(cpus_empty(tmp))) - goto find_nextgroup; - - total_load = 0; - max_load = 0; - grp_busiest = NULL; - for_each_cpu_mask(i, tmp) { - load = pid_get_pressure(rq_ckrm_load(cpu_rq(i)),0); - total_load += load; - if (load > max_load) { - max_load = load; - grp_busiest = cpu_rq(i); - } - } - - total_load = (total_load * SCHED_LOAD_SCALE) / group->cpu_power; - if (total_load > avg_load) { - busiest = grp_busiest; - if (nr_group >= rand) - break; - } - find_nextgroup: - group = group->next; - nr_group ++; - } while (group != sd->groups); - - return busiest; -} - -/** - * load_balance - pressure based load balancing algorithm used by ckrm - */ -static int ckrm_load_balance(int this_cpu, runqueue_t *this_rq, - struct sched_domain *sd, enum idle_type idle) -{ - runqueue_t *busiest; - unsigned long avg_load; - int nr_moved,nr_group; - - avg_load = ckrm_check_balance(sd, this_cpu, idle, &nr_group); - if (! 
avg_load) - goto out_balanced; - - busiest = ckrm_find_busy_queue(sd,this_cpu,avg_load,idle,nr_group); - if (! busiest) - goto out_balanced; - /* - * This should be "impossible", but since load - * balancing is inherently racy and statistical, - * it could happen in theory. - */ - if (unlikely(busiest == this_rq)) { - WARN_ON(1); - goto out_balanced; - } - - nr_moved = 0; - if (busiest->nr_running > 1) { - /* - * Attempt to move tasks. If find_busiest_group has found - * an imbalance but busiest->nr_running <= 1, the group is - * still unbalanced. nr_moved simply stays zero, so it is - * correctly treated as an imbalance. - */ - double_lock_balance(this_rq, busiest); - nr_moved = move_tasks(this_rq, this_cpu, busiest, - 0,sd, idle); - spin_unlock(&busiest->lock); - if (nr_moved) { - adjust_local_weight(); - } - } - - if (!nr_moved) - sd->nr_balance_failed ++; - else - sd->nr_balance_failed = 0; - - /* We were unbalanced, so reset the balancing interval */ - sd->balance_interval = sd->min_interval; - - return nr_moved; - -out_balanced: - /* tune up the balancing interval */ - if (sd->balance_interval < sd->max_interval) - sd->balance_interval *= 2; - - return 0; -} - -/* - * this_rq->lock is already held - */ -static inline int load_balance_newidle(int this_cpu, runqueue_t *this_rq, - struct sched_domain *sd) -{ - int ret; - read_lock(&class_list_lock); - ret = ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE); - read_unlock(&class_list_lock); - return ret; -} - -static inline int load_balance(int this_cpu, runqueue_t *this_rq, - struct sched_domain *sd, enum idle_type idle) -{ - int ret; - - spin_lock(&this_rq->lock); - read_lock(&class_list_lock); - ret= ckrm_load_balance(this_cpu,this_rq,sd,NEWLY_IDLE); - read_unlock(&class_list_lock); - spin_unlock(&this_rq->lock); - return ret; -} -#else /*! CONFIG_CKRM_CPU_SCHEDULE */ /* * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, * as part of a balancing operation within "domain". Returns the number of @@ -2656,8 +2091,6 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, out: return nr_moved; } -#endif /* CONFIG_CKRM_CPU_SCHEDULE*/ - /* * idle_balance is called by schedule() if this_cpu is about to become @@ -2813,7 +2246,7 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, } } } -#else /* SMP*/ +#else /* * on UP we do not need to balance between CPUs: */ @@ -2844,6 +2277,7 @@ static inline int wake_priority_sleeper(runqueue_t *rq) } DEFINE_PER_CPU(struct kernel_stat, kstat); + EXPORT_PER_CPU_SYMBOL(kstat); /* @@ -2856,19 +2290,11 @@ EXPORT_PER_CPU_SYMBOL(kstat); * increasing number of running tasks. We also ignore the interactivity * if a better static_prio task has expired: */ - -#ifndef CONFIG_CKRM_CPU_SCHEDULE #define EXPIRED_STARVING(rq) \ ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ (jiffies - (rq)->expired_timestamp >= \ STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ ((rq)->curr->static_prio > (rq)->best_expired_prio)) -#else -#define EXPIRED_STARVING(rq) \ - (STARVATION_LIMIT && ((rq)->expired_timestamp && \ - (jiffies - (rq)->expired_timestamp >= \ - STARVATION_LIMIT * (lrq_nr_running(rq)) + 1))) -#endif /* * This function gets called by the timer code, with HZ frequency. 
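The ckrm_sched_tick() calls removed further down drove CKRM's pressure estimator in kernel/ckrm_sched.c: every 32 ticks, ckrm_load_sample() records a proportional term (the latest load), a difference term, and an exponentially smoothed integral term, which pid_get_pressure() blends with weights 20/60/20. A standalone restatement of that filter on synthetic samples:

#include <stdio.h>

#define PID_KP 20
#define PID_KI 60
#define PID_KD 20

struct ckrm_load { long load_p, load_i, load_d; };

static void load_sample(struct ckrm_load *pid, long load)
{
        pid->load_d = load - pid->load_p;             /* difference term */
        pid->load_p = load;                           /* latest sample   */
        pid->load_i = (pid->load_i * 9 + load) / 10;  /* smoothed average */
}

static long get_pressure(const struct ckrm_load *pid)
{
        return (pid->load_p * PID_KP +
                pid->load_i * PID_KI +
                pid->load_d * PID_KD) / 100;
}

int main(void)
{
        struct ckrm_load pid = { 0, 0, 0 };
        long samples[] = { 100, 100, 400, 400, 100 };

        for (int i = 0; i < 5; i++) {
                load_sample(&pid, samples[i]);
                printf("sample %ld -> pressure %ld\n",
                       samples[i], get_pressure(&pid));
        }
        return 0;
}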
@@ -2915,8 +2341,6 @@ void scheduler_tick(int user_ticks, int sys_ticks) if (wake_priority_sleeper(rq)) goto out; - ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq)); - #ifdef CONFIG_VSERVER_HARDCPU_IDLE if (!--rq->idle_tokens && !list_empty(&rq->hold_queue)) set_need_resched(); @@ -2931,7 +2355,7 @@ void scheduler_tick(int user_ticks, int sys_ticks) cpustat->system += sys_ticks; /* Task might have expired already, but not scheduled off yet */ - if (p->array != rq_active(p,rq)) { + if (p->array != rq->active) { set_tsk_need_resched(p); goto out; } @@ -2954,17 +2378,12 @@ void scheduler_tick(int user_ticks, int sys_ticks) set_tsk_need_resched(p); /* put it at the end of the queue: */ - dequeue_task(p, rq_active(p,rq)); - enqueue_task(p, rq_active(p,rq)); + dequeue_task(p, rq->active); + enqueue_task(p, rq->active); } goto out_unlock; } -#warning MEF: vx_need_resched incorpates standard kernel code, which it should not. if (vx_need_resched(p)) { -#ifdef CONFIG_CKRM_CPU_SCHEDULE - /* Hubertus ... we can abstract this out */ - ckrm_lrq_t* rq = get_task_lrq(p); -#endif dequeue_task(p, rq->active); set_tsk_need_resched(p); p->prio = effective_prio(p); @@ -2975,8 +2394,8 @@ void scheduler_tick(int user_ticks, int sys_ticks) rq->expired_timestamp = jiffies; if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { enqueue_task(p, rq->expired); - if (p->static_prio < this_rq()->best_expired_prio) - this_rq()->best_expired_prio = p->static_prio; + if (p->static_prio < rq->best_expired_prio) + rq->best_expired_prio = p->static_prio; } else enqueue_task(p, rq->active); } else { @@ -2999,18 +2418,17 @@ void scheduler_tick(int user_ticks, int sys_ticks) if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - p->time_slice) % TIMESLICE_GRANULARITY(p)) && (p->time_slice >= TIMESLICE_GRANULARITY(p)) && - (p->array == rq_active(p,rq))) { + (p->array == rq->active)) { - dequeue_task(p, rq_active(p,rq)); + dequeue_task(p, rq->active); set_tsk_need_resched(p); p->prio = effective_prio(p); - enqueue_task(p, rq_active(p,rq)); + enqueue_task(p, rq->active); } } out_unlock: spin_unlock(&rq->lock); out: - ckrm_sched_tick(jiffies,cpu,rq_ckrm_load(rq)); rebalance_tick(cpu, rq, NOT_IDLE); } @@ -3024,19 +2442,6 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) if (!(sd->flags & SD_SHARE_CPUPOWER)) return; -#ifdef CONFIG_CKRM_CPU_SCHEDULE - if (prev != rq->idle) { - unsigned long long run = now - prev->timestamp; - ckrm_lrq_t * lrq = get_task_lrq(prev); - - lrq->lrq_load -= task_load(prev); - cpu_demand_event(&prev->demand_stat,CPU_DEMAND_DESCHEDULE,run); - lrq->lrq_load += task_load(prev); - - cpu_demand_event(get_task_lrq_stat(prev),CPU_DEMAND_DESCHEDULE,run); - update_local_cvt(prev, run); - } -#endif /* * Unlock the current runqueue because we have to lock in * CPU order to avoid deadlocks. Caller knows that we might @@ -3161,22 +2566,14 @@ asmlinkage void __sched schedule(void) task_t *prev, *next; runqueue_t *rq; prio_array_t *array; + struct list_head *queue; unsigned long long now; unsigned long run_time; #ifdef CONFIG_VSERVER_HARDCPU struct vx_info *vxi; int maxidle = -HZ; #endif - int cpu; - - /* - * If crash dump is in progress, this other cpu's - * need to wait until it completes. - * NB: this code is optimized away for kernels without - * dumping enabled. - */ - if (unlikely(dump_oncpu)) - goto dump_scheduling_disabled; + int cpu, idx; /* * Test if we are atomic. 
Since do_exit() needs to call into @@ -3226,20 +2623,6 @@ need_resched_nonpreemptible: spin_lock_irq(&rq->lock); -#ifdef CONFIG_CKRM_CPU_SCHEDULE - if (prev != rq->idle) { - unsigned long long run = now - prev->timestamp; - ckrm_lrq_t * lrq = get_task_lrq(prev); - - lrq->lrq_load -= task_load(prev); - cpu_demand_event(&prev->demand_stat,CPU_DEMAND_DESCHEDULE,run); - lrq->lrq_load += task_load(prev); - - cpu_demand_event(get_task_lrq_stat(prev),CPU_DEMAND_DESCHEDULE,run); - update_local_cvt(prev, run); - } -#endif - if (unlikely(current->flags & PF_DEAD)) current->state = EXIT_DEAD; /* @@ -3288,7 +2671,7 @@ need_resched_nonpreemptible: if (next->static_prio < rq->best_expired_prio) rq->best_expired_prio = next->static_prio; - // printk("··· %8lu unhold %p [%d]\n", jiffies, next, next->prio); + // printk("··· %8lu unhold %p [%d]\n", jiffies, next, next->prio); break; } if ((ret < 0) && (maxidle < ret)) @@ -3330,11 +2713,23 @@ go_idle: goto go_idle; } - /* MEF: CKRM refactored code into rq_get_next_task(); make - * sure that when upgrading changes are reflected into both - * versions of the code. - */ - next = rq_get_next_task(rq); + array = rq->active; + if (unlikely(!array->nr_active)) { + /* + * Switch the active and expired arrays. + */ + schedstat_inc(rq, sched_switch); + rq->active = rq->expired; + rq->expired = array; + array = rq->active; + rq->expired_timestamp = 0; + rq->best_expired_prio = MAX_PRIO; + } else + schedstat_inc(rq, sched_noswitch); + + idx = sched_find_first_bit(array->bitmap); + queue = array->queue + idx; + next = list_entry(queue->next, task_t, run_list); #ifdef CONFIG_VSERVER_HARDCPU vxi = next->vx_info; @@ -3350,7 +2745,7 @@ go_idle: vx_onhold_inc(vxi); next->state |= TASK_ONHOLD; list_add_tail(&next->run_list, &rq->hold_queue); - //printk("··· %8lu hold %p [%d]\n", jiffies, next, next->prio); + //printk("··· %8lu hold %p [%d]\n", jiffies, next, next->prio); goto pick_next; } } @@ -3404,19 +2799,10 @@ switch_tasks: preempt_enable_no_resched(); if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) goto need_resched; - - return; - - dump_scheduling_disabled: - /* allow scheduling only if this is the dumping cpu */ - if (dump_oncpu != smp_processor_id()+1) { - while (dump_oncpu) - cpu_relax(); - } - return; } EXPORT_SYMBOL(schedule); + #ifdef CONFIG_PREEMPT /* * this is is the entry point to schedule() from in-kernel preemption @@ -4135,7 +3521,7 @@ asmlinkage long sys_sched_yield(void) { runqueue_t *rq = this_rq_lock(); prio_array_t *array = current->array; - prio_array_t *target = rq_expired(current,rq); + prio_array_t *target = rq->expired; schedstat_inc(rq, yld_cnt); /* @@ -4146,17 +3532,14 @@ asmlinkage long sys_sched_yield(void) * array.) 
*/ if (rt_task(current)) - target = rq_active(current,rq); + target = rq->active; -#warning MEF need to fix up SCHEDSTATS code, but I hope this is fixed by the 2.6.10 CKRM patch -#ifdef CONFIG_SCHEDSTATS if (current->array->nr_active == 1) { schedstat_inc(rq, yld_act_empty); if (!rq->expired->nr_active) schedstat_inc(rq, yld_both_empty); } else if (!rq->expired->nr_active) schedstat_inc(rq, yld_exp_empty); -#endif dequeue_task(current, array); enqueue_task(current, target); @@ -4421,12 +3804,6 @@ void __devinit init_idle(task_t *idle, int cpu) idle->state = TASK_RUNNING; set_task_cpu(idle, cpu); -#ifdef CONFIG_CKRM_CPU_SCHEDULE - cpu_demand_event(&(idle->demand_stat),CPU_DEMAND_INIT,0); - idle->cpu_class = get_default_cpu_class(); - idle->array = NULL; -#endif - spin_lock_irqsave(&rq->lock, flags); rq->curr = rq->idle = idle; set_tsk_need_resched(idle); @@ -4535,6 +3912,7 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (!cpu_isset(dest_cpu, p->cpus_allowed)) goto out; + set_task_cpu(p, dest_cpu); if (p->array) { /* * Sync timestamp with rq_dest's before activating. @@ -4545,12 +3923,10 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) p->timestamp = p->timestamp - rq_src->timestamp_last_tick + rq_dest->timestamp_last_tick; deactivate_task(p, rq_src); - set_task_cpu(p, dest_cpu); activate_task(p, rq_dest, 0); if (TASK_PREEMPTS_CURR(p, rq_dest)) resched_task(rq_dest->curr); - } else - set_task_cpu(p, dest_cpu); + } out: double_rq_unlock(rq_src, rq_dest); @@ -5093,14 +4469,12 @@ static void __devinit arch_init_sched_domains(void) &cpu_to_phys_group); } - #ifdef CONFIG_NUMA /* Set up node groups */ init_sched_build_groups(sched_group_nodes, cpu_default_map, &cpu_to_node_group); #endif - /* Calculate CPU power for physical packages and nodes */ for_each_cpu_mask(i, cpu_default_map) { int power; @@ -5116,7 +4490,6 @@ static void __devinit arch_init_sched_domains(void) (cpus_weight(sd->groups->cpumask)-1) / 10; sd->groups->cpu_power = power; - #ifdef CONFIG_NUMA if (i == first_cpu(sd->groups->cpumask)) { /* Only add "power" once for each physical package. 
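The arch_init_sched_domains() hunks above only drop blank lines, but the cpu_power formula they surround is easy to misread: each CPU beyond the first in a package adds one tenth of SCHED_LOAD_SCALE of capacity. A quick check of that arithmetic; the value 128 for SCHED_LOAD_SCALE is assumed from this kernel generation:

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL   /* assumed value for this kernel era */

int main(void)
{
        for (unsigned int cpus = 1; cpus <= 4; cpus++) {
                /* each SMT sibling beyond the first adds ~10% capacity */
                unsigned long power = SCHED_LOAD_SCALE +
                        SCHED_LOAD_SCALE * (cpus - 1) / 10;
                printf("%u cpus in package -> cpu_power %lu\n", cpus, power);
        }
        return 0;
}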
*/ @@ -5136,7 +4509,6 @@ static void __devinit arch_init_sched_domains(void) #endif cpu_attach_domain(sd, i); } - last->next = first; } #ifdef CONFIG_HOTPLUG_CPU @@ -5313,43 +4685,20 @@ int in_sched_functions(unsigned long addr) void __init sched_init(void) { runqueue_t *rq; - int i; - - init_cpu_classes(); + int i, j, k; for (i = 0; i < NR_CPUS; i++) { -#ifndef CONFIG_CKRM_CPU_SCHEDULE - int j, k; prio_array_t *array; rq = cpu_rq(i); spin_lock_init(&rq->lock); - - for (j = 0; j < 2; j++) { - array = rq->arrays + j; - for (k = 0; k < MAX_PRIO; k++) { - INIT_LIST_HEAD(array->queue + k); - __clear_bit(k, array->bitmap); - } - // delimiter for bitsearch - __set_bit(MAX_PRIO, array->bitmap); - } - rq->active = rq->arrays; rq->expired = rq->arrays + 1; rq->best_expired_prio = MAX_PRIO; -#else - rq = cpu_rq(i); - spin_lock_init(&rq->lock); -#endif - #ifdef CONFIG_SMP rq->sd = &sched_domain_dummy; rq->cpu_load = 0; -#ifdef CONFIG_CKRM_CPU_SCHEDULE - ckrm_load_init(rq_ckrm_load(rq)); -#endif rq->active_balance = 0; rq->push_cpu = 0; rq->migration_thread = NULL; @@ -5360,6 +4709,15 @@ void __init sched_init(void) #endif atomic_set(&rq->nr_iowait, 0); + for (j = 0; j < 2; j++) { + array = rq->arrays + j; + for (k = 0; k < MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(MAX_PRIO, array->bitmap); + } } /* @@ -5399,37 +4757,6 @@ void __might_sleep(char *file, int line) EXPORT_SYMBOL(__might_sleep); #endif -#ifdef CONFIG_CKRM_CPU_SCHEDULE -/** - * return the classqueue object of a certain processor - */ -struct classqueue_struct * get_cpu_classqueue(int cpu) -{ - return (& (cpu_rq(cpu)->classqueue) ); -} - -/** - * _ckrm_cpu_change_class - change the class of a task - */ -void _ckrm_cpu_change_class(task_t *tsk, struct ckrm_cpu_class *newcls) -{ - prio_array_t *array; - struct runqueue *rq; - unsigned long flags; - - rq = task_rq_lock(tsk,&flags); - array = tsk->array; - if (array) { - dequeue_task(tsk,array); - tsk->cpu_class = newcls; - enqueue_task(tsk,rq_active(tsk,rq)); - } else - tsk->cpu_class = newcls; - - task_rq_unlock(rq,&flags); -} -#endif - #ifdef CONFIG_MAGIC_SYSRQ void normalize_rt_tasks(void) { diff --git a/kernel/sys.c b/kernel/sys.c index 85a448959..167e9f904 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -568,7 +567,6 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) current->gid = new_rgid; key_fsgid_changed(current); - ckrm_cb_gid(); return 0; } @@ -608,7 +606,6 @@ asmlinkage long sys_setgid(gid_t gid) return -EPERM; key_fsgid_changed(current); - ckrm_cb_gid(); return 0; } @@ -699,7 +696,6 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) current->fsuid = current->euid; key_fsuid_changed(current); - ckrm_cb_uid(); return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE); } @@ -747,7 +743,6 @@ asmlinkage long sys_setuid(uid_t uid) current->suid = new_suid; key_fsuid_changed(current); - ckrm_cb_uid(); return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID); } @@ -796,7 +791,6 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) current->suid = suid; key_fsuid_changed(current); - ckrm_cb_uid(); return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES); } @@ -849,7 +843,6 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) current->sgid = sgid; key_fsgid_changed(current); - ckrm_cb_gid(); return 0; } diff --git 
a/kernel/vserver/context.c b/kernel/vserver/context.c index 6b1c9bec0..5cb2f9636 100644 --- a/kernel/vserver/context.c +++ b/kernel/vserver/context.c @@ -30,7 +30,6 @@ #include #include #include -#include /* needed for ckrm_cb_xid() */ #include @@ -565,9 +564,6 @@ int vx_migrate_task(struct task_struct *p, struct vx_info *vxi) } out: - - ckrm_cb_xid(p); - put_vx_info(old_vxi); return ret; } diff --git a/mm/memory.c b/mm/memory.c index 3a911dda5..53da76461 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1728,20 +1728,15 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, * We need the page table lock to synchronize with kswapd * and the SMP-safe atomic PTE updates. */ - set_delay_flag(current,PF_MEMIO); spin_lock(&mm->page_table_lock); pmd = pmd_alloc(mm, pgd, address); if (pmd) { pte_t * pte = pte_alloc_map(mm, pmd, address); - if (pte) { - int rc = handle_pte_fault(mm, vma, address, write_access, pte, pmd); - clear_delay_flag(current,PF_MEMIO); - return rc; - } + if (pte) + return handle_pte_fault(mm, vma, address, write_access, pte, pmd); } spin_unlock(&mm->page_table_lock); - clear_delay_flag(current,PF_MEMIO); return VM_FAULT_OOM; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8c206e407..0456bc956 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include @@ -275,7 +274,6 @@ free_pages_bulk(struct zone *zone, int count, /* have to delete it as __free_pages_bulk list manipulates */ list_del(&page->lru); __free_pages_bulk(page, base, zone, area, order); - ckrm_clear_page_class(page); ret++; } spin_unlock_irqrestore(&zone->lock, flags); @@ -366,14 +364,8 @@ static void prep_new_page(struct page *page, int order) page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_referenced | 1 << PG_arch_1 | -#ifdef CONFIG_CKRM_RES_MEM - 1 << PG_ckrm_account | -#endif 1 << PG_checked | 1 << PG_mappedtodisk); page->private = 0; -#ifdef CONFIG_CKRM_RES_MEM - page->ckrm_zone = NULL; -#endif set_page_refs(page, order); } @@ -636,10 +628,6 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, */ can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait; - if (!ckrm_class_limit_ok((ckrm_get_mem_class(current)))) { - return NULL; - } - zones = zonelist->zones; /* the list of zones suitable for gfp_mask */ if (unlikely(zones[0] == NULL)) { @@ -1573,10 +1561,8 @@ static void __init free_area_init_core(struct pglist_data *pgdat, } printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", zone_names[j], realsize, batch); -#ifndef CONFIG_CKRM_RES_MEM INIT_LIST_HEAD(&zone->active_list); INIT_LIST_HEAD(&zone->inactive_list); -#endif zone->nr_scan_active = 0; zone->nr_scan_inactive = 0; zone->nr_active = 0; diff --git a/mm/swap.c b/mm/swap.c index a7eb64921..ff0c7e695 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -30,7 +30,6 @@ #include #include #include -#include /* How many pages do we try to swap or page in/out together? 
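rotate_reclaimable_page() in the swap.c hunk below loses its per-class (ckrm_zone) variant and reverts to the per-zone inactive list: when writeback completes on a page at the wrong end of the list, the page is moved to the tail so reclaim finds it first. A userspace model of that rotation; the list helper is a simplified stand-in for the kernel's list_del()/list_add_tail() pair:

#include <stdio.h>

struct node { struct node *prev, *next; int id; };

static void list_move_tail(struct node *n, struct node *head)
{
        n->prev->next = n->next;         /* list_del()      */
        n->next->prev = n->prev;
        n->prev = head->prev;            /* list_add_tail() */
        n->next = head;
        head->prev->next = n;
        head->prev = n;
}

int main(void)
{
        struct node head = { &head, &head, -1 }, pg[3];

        for (int i = 0; i < 3; i++) {    /* inactive list: 0, 1, 2 */
                pg[i] = (struct node){ &pg[i], &pg[i], i };
                list_move_tail(&pg[i], &head);
        }
        list_move_tail(&pg[0], &head);   /* writeback done: reclaim 0 first */

        for (struct node *n = head.next; n != &head; n = n->next)
                printf("%d ", n->id);    /* prints: 1 2 0 */
        printf("\n");
        return 0;
}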
*/ int page_cluster; @@ -72,12 +71,7 @@ EXPORT_SYMBOL(put_page); */ int rotate_reclaimable_page(struct page *page) { -#ifdef CONFIG_CKRM_RES_MEM - struct ckrm_zone *ckrm_zone = page_ckrmzone(page); - struct zone *zone = ckrm_zone->zone; -#else struct zone *zone = page_zone(page); -#endif unsigned long flags; if (PageLocked(page)) @@ -92,11 +86,7 @@ int rotate_reclaimable_page(struct page *page) spin_lock_irqsave(&zone->lru_lock, flags); if (PageLRU(page) && !PageActive(page)) { list_del(&page->lru); -#ifdef CONFIG_CKRM_RES_MEM - list_add_tail(&page->lru, &ckrm_zone->inactive_list); -#else list_add_tail(&page->lru, &zone->inactive_list); -#endif inc_page_state(pgrotated); } if (!test_clear_page_writeback(page)) diff --git a/mm/vmscan.c b/mm/vmscan.c index 6f7fba513..ec85f5f6c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -38,7 +38,6 @@ #include #include -#include #include @@ -538,23 +537,13 @@ keep: * For pagecache intensive workloads, the first loop here is the hottest spot * in the kernel (apart from the copy_*_user functions). */ -#ifdef CONFIG_CKRM_RES_MEM -static void shrink_cache(struct ckrm_zone *ckrm_zone, struct scan_control *sc) -#else static void shrink_cache(struct zone *zone, struct scan_control *sc) -#endif { LIST_HEAD(page_list); struct pagevec pvec; int max_scan = sc->nr_to_scan; -#ifdef CONFIG_CKRM_RES_MEM - struct zone *zone = ckrm_zone->zone; - struct list_head *inactive_list = &ckrm_zone->inactive_list; - struct list_head *active_list = &ckrm_zone->active_list; -#else struct list_head *inactive_list = &zone->inactive_list; struct list_head *active_list = &zone->active_list; -#endif pagevec_init(&pvec, 1); @@ -589,7 +578,6 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) nr_taken++; } zone->nr_inactive -= nr_taken; - ckrm_zone_dec_inactive(ckrm_zone, nr_taken); spin_unlock_irq(&zone->lru_lock); if (nr_taken == 0) @@ -616,11 +604,9 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) BUG(); list_del(&page->lru); if (PageActive(page)) { - ckrm_zone_inc_active(ckrm_zone, 1); zone->nr_active++; list_add(&page->lru, active_list); } else { - ckrm_zone_inc_inactive(ckrm_zone, 1); zone->nr_inactive++; list_add(&page->lru, inactive_list); } @@ -654,11 +640,7 @@ done: * But we had to alter page->flags anyway. 
*/ static void -#ifdef CONFIG_CKRM_RES_MEM -refill_inactive_zone(struct ckrm_zone *ckrm_zone, struct scan_control *sc) -#else refill_inactive_zone(struct zone *zone, struct scan_control *sc) -#endif { int pgmoved; int pgdeactivate = 0; @@ -673,14 +655,8 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) long mapped_ratio; long distress; long swap_tendency; -#ifdef CONFIG_CKRM_RES_MEM - struct zone *zone = ckrm_zone->zone; - struct list_head *active_list = &ckrm_zone->active_list; - struct list_head *inactive_list = &ckrm_zone->inactive_list; -#else struct list_head *active_list = &zone->active_list; struct list_head *inactive_list = &zone->inactive_list; -#endif lru_add_drain(); pgmoved = 0; @@ -709,7 +685,6 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) } zone->pages_scanned += pgscanned; zone->nr_active -= pgmoved; - ckrm_zone_dec_active(ckrm_zone, pgmoved); spin_unlock_irq(&zone->lru_lock); /* @@ -770,7 +745,6 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) list_move(&page->lru, inactive_list); pgmoved++; if (!pagevec_add(&pvec, page)) { - ckrm_zone_inc_inactive(ckrm_zone, pgmoved); zone->nr_inactive += pgmoved; spin_unlock_irq(&zone->lru_lock); pgdeactivate += pgmoved; @@ -781,7 +755,6 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) spin_lock_irq(&zone->lru_lock); } } - ckrm_zone_inc_inactive(ckrm_zone, pgmoved); zone->nr_inactive += pgmoved; pgdeactivate += pgmoved; if (buffer_heads_over_limit) { @@ -800,7 +773,6 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) list_move(&page->lru, active_list); pgmoved++; if (!pagevec_add(&pvec, page)) { - ckrm_zone_inc_active(ckrm_zone, pgmoved); zone->nr_active += pgmoved; pgmoved = 0; spin_unlock_irq(&zone->lru_lock); @@ -808,7 +780,6 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) spin_lock_irq(&zone->lru_lock); } } - ckrm_zone_inc_active(ckrm_zone, pgmoved); zone->nr_active += pgmoved; spin_unlock_irq(&zone->lru_lock); pagevec_release(&pvec); @@ -817,183 +788,6 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) mod_page_state(pgdeactivate, pgdeactivate); } -#ifdef CONFIG_CKRM_RES_MEM -static int -shrink_weight(struct ckrm_zone *czone) -{ - u64 temp; - struct zone *zone = czone->zone; - struct ckrm_mem_res *cls = czone->memcls; - int zone_usage, zone_guar, zone_total, guar, ret, cnt; - - zone_usage = czone->nr_active + czone->nr_inactive; - czone->active_over = czone->inactive_over = 0; - - if (zone_usage < SWAP_CLUSTER_MAX * 4) - return 0; - - if (cls->pg_guar == CKRM_SHARE_DONTCARE) { - // no guarantee for this class. use implicit guarantee - guar = cls->impl_guar / cls->nr_dontcare; - } else { - guar = cls->pg_unused / cls->nr_dontcare; - } - zone_total = zone->nr_active + zone->nr_inactive + zone->free_pages; - temp = (u64) guar * zone_total; - do_div(temp, ckrm_tot_lru_pages); - zone_guar = (int) temp; - - ret = ((zone_usage - zone_guar) > SWAP_CLUSTER_MAX) ? 
- (zone_usage - zone_guar) : 0; - if (ret) { - cnt = czone->nr_active - (2 * zone_guar / 3); - if (cnt > 0) - czone->active_over = cnt; - cnt = czone->active_over + czone->nr_inactive - - zone_guar / 3; - if (cnt > 0) - czone->inactive_over = cnt; - } - return ret; -} - -static void -shrink_ckrmzone(struct ckrm_zone *czone, struct scan_control *sc) -{ - while (czone->shrink_active || czone->shrink_inactive) { - if (czone->shrink_active) { - sc->nr_to_scan = min(czone->shrink_active, - (unsigned long)SWAP_CLUSTER_MAX); - czone->shrink_active -= sc->nr_to_scan; - refill_inactive_zone(czone, sc); - } - if (czone->shrink_inactive) { - sc->nr_to_scan = min(czone->shrink_inactive, - (unsigned long)SWAP_CLUSTER_MAX); - czone->shrink_inactive -= sc->nr_to_scan; - shrink_cache(czone, sc); - if (sc->nr_to_reclaim <= 0) { - czone->shrink_active = 0; - czone->shrink_inactive = 0; - break; - } - } - - throttle_vm_writeout(); - } -} - -/* insert an entry to the list and sort decendently*/ -static void -list_add_sort(struct list_head *entry, struct list_head *head) -{ - struct ckrm_zone *czone, *new = - list_entry(entry, struct ckrm_zone, victim_list); - struct list_head* pos = head->next; - - while (pos != head) { - czone = list_entry(pos, struct ckrm_zone, victim_list); - if (new->shrink_weight > czone->shrink_weight) { - __list_add(entry, pos->prev, pos); - return; - } - pos = pos->next; - } - list_add_tail(entry, head); - return; -} - -static void -shrink_choose_victims(struct list_head *victims, - unsigned long nr_active, unsigned long nr_inactive) -{ - unsigned long nr; - struct ckrm_zone* czone; - struct list_head *pos, *next; - - pos = victims->next; - while ((pos != victims) && (nr_active || nr_inactive)) { - czone = list_entry(pos, struct ckrm_zone, victim_list); - - if (nr_active && czone->active_over) { - nr = min(nr_active, czone->active_over); - czone->shrink_active += nr; - czone->active_over -= nr; - nr_active -= nr; - } - - if (nr_inactive && czone->inactive_over) { - nr = min(nr_inactive, czone->inactive_over); - czone->shrink_inactive += nr; - czone->inactive_over -= nr; - nr_inactive -= nr; - } - pos = pos->next; - } - - pos = victims->next; - while (pos != victims) { - czone = list_entry(pos, struct ckrm_zone, victim_list); - next = pos->next; - if (czone->shrink_active == 0 && czone->shrink_inactive == 0) { - list_del_init(pos); - ckrm_clear_shrink(czone); - } - pos = next; - } - return; -} - -static void -shrink_get_victims(struct zone *zone, unsigned long nr_active, - unsigned long nr_inactive, struct list_head *victims) -{ - struct list_head *pos; - struct ckrm_mem_res *cls; - struct ckrm_zone *czone; - int zoneindex = zone_idx(zone); - - if (ckrm_nr_mem_classes <= 1) { - if (ckrm_mem_root_class) { - czone = ckrm_mem_root_class->ckrm_zone + zoneindex; - if (!ckrm_test_set_shrink(czone)) { - list_add(&czone->victim_list, victims); - czone->shrink_active = nr_active; - czone->shrink_inactive = nr_inactive; - } - } - return; - } - spin_lock_irq(&ckrm_mem_lock); - list_for_each_entry(cls, &ckrm_memclass_list, mcls_list) { - czone = cls->ckrm_zone + zoneindex; - if (ckrm_test_set_shrink(czone)) - continue; - - czone->shrink_active = 0; - czone->shrink_inactive = 0; - czone->shrink_weight = shrink_weight(czone); - if (czone->shrink_weight) { - list_add_sort(&czone->victim_list, victims); - } else { - ckrm_clear_shrink(czone); - } - } - pos = victims->next; - while (pos != victims) { - czone = list_entry(pos, struct ckrm_zone, victim_list); - pos = pos->next; - } - 
shrink_choose_victims(victims, nr_active, nr_inactive); - spin_unlock_irq(&ckrm_mem_lock); - pos = victims->next; - while (pos != victims) { - czone = list_entry(pos, struct ckrm_zone, victim_list); - pos = pos->next; - } -} -#endif /* CONFIG_CKRM_RES_MEM */ - /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ @@ -1002,9 +796,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc) { unsigned long nr_active; unsigned long nr_inactive; -#ifdef CONFIG_CKRM_RES_MEM - struct ckrm_zone *czone; -#endif /* * Add one to `nr_to_scan' just to make sure that the kernel will @@ -1026,24 +817,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc) sc->nr_to_reclaim = SWAP_CLUSTER_MAX; -#ifdef CONFIG_CKRM_RES_MEM - if (nr_active || nr_inactive) { - struct list_head *pos, *next; - LIST_HEAD(victims); - - shrink_get_victims(zone, nr_active, nr_inactive, &victims); - pos = victims.next; - while (pos != &victims) { - czone = list_entry(pos, struct ckrm_zone, victim_list); - next = pos->next; - list_del_init(pos); - ckrm_clear_shrink(czone); - sc->nr_to_reclaim = czone->shrink_inactive; - shrink_ckrmzone(czone, sc); - pos = next; - } - } -#else while (nr_active || nr_inactive) { if (nr_active) { sc->nr_to_scan = min(nr_active, @@ -1061,100 +834,8 @@ shrink_zone(struct zone *zone, struct scan_control *sc) break; } } -#endif -} - -#ifdef CONFIG_CKRM_RES_MEM -// This function needs to be given more thought. -// Shrink the class to be at shrink_to%" of its limit -static void -ckrm_shrink_class(struct ckrm_mem_res *cls) -{ - struct scan_control sc; - struct zone *zone; - int zindex = 0, cnt, act_credit = 0, inact_credit = 0; - int shrink_to = ckrm_mem_get_shrink_to(); - - sc.nr_mapped = read_page_state(nr_mapped); - sc.nr_scanned = 0; - sc.nr_reclaimed = 0; - sc.priority = 0; // always very high priority - - check_memclass(cls, "bef_shnk_cls"); - for_each_zone(zone) { - int zone_total, zone_limit, active_limit, - inactive_limit, clszone_limit; - struct ckrm_zone *czone; - u64 temp; - - czone = &cls->ckrm_zone[zindex]; - if (ckrm_test_set_shrink(czone)) - continue; - - zone->temp_priority = zone->prev_priority; - zone->prev_priority = sc.priority; - - zone_total = zone->nr_active + zone->nr_inactive - + zone->free_pages; - - temp = (u64) cls->pg_limit * zone_total; - do_div(temp, ckrm_tot_lru_pages); - zone_limit = (int) temp; - clszone_limit = (shrink_to * zone_limit) / 100; - active_limit = (2 * clszone_limit) / 3; // 2/3rd in active list - inactive_limit = clszone_limit / 3; // 1/3rd in inactive list - - czone->shrink_active = 0; - cnt = czone->nr_active + act_credit - active_limit; - if (cnt > 0) { - czone->shrink_active = (unsigned long) cnt; - } else { - act_credit += cnt; - } - - czone->shrink_inactive = 0; - cnt = czone->shrink_active + inact_credit + - (czone->nr_inactive - inactive_limit); - if (cnt > 0) { - czone->shrink_inactive = (unsigned long) cnt; - } else { - inact_credit += cnt; - } - - - if (czone->shrink_active || czone->shrink_inactive) { - sc.nr_to_reclaim = czone->shrink_inactive; - shrink_ckrmzone(czone, &sc); - } - zone->prev_priority = zone->temp_priority; - zindex++; - ckrm_clear_shrink(czone); - } - check_memclass(cls, "aft_shnk_cls"); -} - -static void -ckrm_shrink_classes(void) -{ - struct ckrm_mem_res *cls; - - spin_lock_irq(&ckrm_mem_lock); - while (!ckrm_shrink_list_empty()) { - cls = list_entry(ckrm_shrink_list.next, struct ckrm_mem_res, - shrink_list); - list_del(&cls->shrink_list); - cls->flags &= ~MEM_AT_LIMIT; - 
spin_unlock_irq(&ckrm_mem_lock); - ckrm_shrink_class(cls); - spin_lock_irq(&ckrm_mem_lock); - } - spin_unlock_irq(&ckrm_mem_lock); } -#else -#define ckrm_shrink_classes() do { } while(0) -#endif - /* * This is the direct reclaim path, for page-allocating processes. We only * try to reclaim pages from zones which will satisfy the caller's allocation @@ -1489,11 +1170,7 @@ static int kswapd(void *p) prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); schedule(); finish_wait(&pgdat->kswapd_wait, &wait); - - if (!ckrm_shrink_list_empty()) - ckrm_shrink_classes(); - else - balance_pgdat(pgdat, 0); + balance_pgdat(pgdat, 0); } return 0; } @@ -1505,7 +1182,7 @@ void wakeup_kswapd(struct zone *zone) { if (zone->present_pages == 0) return; - if ((zone->free_pages > zone->pages_low) && ckrm_shrink_list_empty()) + if (zone->free_pages > zone->pages_low) return; if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait)) return; @@ -1570,7 +1247,7 @@ static int __init kswapd_init(void) swap_setup(); for_each_pgdat(pgdat) pgdat->kswapd - = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL)); + = find_task_by_real_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL)); total_memory = nr_free_pagecache_pages(); hotcpu_notifier(cpu_callback, 0); return 0; diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index bd15e1a34..4bb9390d9 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -349,29 +349,6 @@ config INET_TUNNEL If unsure, say Y. -config ACCEPT_QUEUES - bool "IP: TCP Multiple accept queues support" - depends on INET && NETFILTER - ---help--- - Support multiple accept queues per listening socket. If you say Y - here, multiple accept queues will be configured per listening - socket. - - Each queue is mapped to a priority class. Incoming connection - requests can be classified (see iptables(8), MARK target), depending - on the packet's src/dest address or other parameters, into one of - the priority classes. The requests are then queued to the relevant - accept queue. - - Each of the queues can be assigned a weight. The accept()ance - of packets is then scheduled in accordance with the weight - assigned to the priority class. - - Be sure to enable "Network packet filtering" if you wish - to use this feature. - - If unsure, say N. 
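The deleted ACCEPT_QUEUES help text above promises accept()s scheduled in proportion to per-class weights; the mechanism removed from net/ipv4/tcp.c (visible in the tcp_accept() hunk further down) keeps serving one class until it has been picked aq_ratio times in a row, then rotates to the next non-empty queue. A compact model of that rotation over hypothetical weights, with the queue bookkeeping reduced to counters and at least one pending request assumed:

#include <stdio.h>

#define NUM_ACCEPT_QUEUES 8

struct aq { int ratio; int cnt; int backlog; };

/* serve the current class; rotate once it has been picked 'ratio' times */
static int accept_next(struct aq *q, int *class_index)
{
        int first = *class_index;

        while (!q[first].backlog)        /* skip empty queues */
                first = (first + 1) % NUM_ACCEPT_QUEUES;

        q[first].backlog--;
        if (++q[first].cnt >= q[first].ratio) {
                q[first].cnt = 0;
                *class_index = (first + 1) % NUM_ACCEPT_QUEUES;
        }
        return first;
}

int main(void)
{
        struct aq q[NUM_ACCEPT_QUEUES] = {
                { 2, 0, 10 },            /* class 0: weight 2 */
                { 1, 0, 10 },            /* class 1: weight 1 */
        };
        int class_index = 0;

        for (int i = 0; i < 6; i++)      /* expect: 0 0 1 0 0 1 */
                printf("%d ", accept_next(q, &class_index));
        printf("\n");
        return 0;
}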
- config IP_TCPDIAG tristate "IP: TCP socket monitoring interface" depends on INET diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 281099ffa..0e37fd911 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -256,11 +256,6 @@ #include #include #include - -#ifdef CONFIG_CKRM -#include -#endif - #include #include #include @@ -464,20 +459,13 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) int tcp_listen_start(struct sock *sk) { -#ifdef CONFIG_ACCEPT_QUEUES - int i = 0; -#endif struct inet_opt *inet = inet_sk(sk); struct tcp_opt *tp = tcp_sk(sk); struct tcp_listen_opt *lopt; sk->sk_max_ack_backlog = 0; sk->sk_ack_backlog = 0; -#ifdef CONFIG_ACCEPT_QUEUES - tp->accept_queue = NULL; -#else tp->accept_queue = tp->accept_queue_tail = NULL; -#endif rwlock_init(&tp->syn_wait_lock); tcp_delack_init(tp); @@ -491,23 +479,6 @@ int tcp_listen_start(struct sock *sk) break; get_random_bytes(&lopt->hash_rnd, 4); -#ifdef CONFIG_ACCEPT_QUEUES - tp->class_index = 0; - for (i=0; i < NUM_ACCEPT_QUEUES; i++) { - tp->acceptq[i].aq_tail = NULL; - tp->acceptq[i].aq_head = NULL; - tp->acceptq[i].aq_wait_time = 0; - tp->acceptq[i].aq_qcount = 0; - tp->acceptq[i].aq_count = 0; - if (i == 0) { - tp->acceptq[i].aq_ratio = 1; - } - else { - tp->acceptq[i].aq_ratio = 0; - } - } -#endif - write_lock_bh(&tp->syn_wait_lock); tp->listen_opt = lopt; write_unlock_bh(&tp->syn_wait_lock); @@ -523,11 +494,6 @@ int tcp_listen_start(struct sock *sk) sk_dst_reset(sk); sk->sk_prot->hash(sk); - -#ifdef CONFIG_CKRM - ckrm_cb_listen_start(sk); -#endif - return 0; } @@ -559,16 +525,7 @@ static void tcp_listen_stop (struct sock *sk) tp->listen_opt = NULL; write_unlock_bh(&tp->syn_wait_lock); -#ifdef CONFIG_CKRM - ckrm_cb_listen_stop(sk); -#endif - -#ifdef CONFIG_ACCEPT_QUEUES - for (i = 0; i < NUM_ACCEPT_QUEUES; i++) - tp->acceptq[i].aq_head = tp->acceptq[i].aq_tail = NULL; -#else tp->accept_queue_tail = NULL; -#endif tp->accept_queue = NULL; if (lopt->qlen) { @@ -615,11 +572,7 @@ static void tcp_listen_stop (struct sock *sk) local_bh_enable(); sock_put(child); -#ifdef CONFIG_ACCEPT_QUEUES - sk_acceptq_removed(sk, req->acceptq_class); -#else sk_acceptq_removed(sk); -#endif tcp_openreq_fastfree(req); } BUG_TRAP(!sk->sk_ack_backlog); @@ -1934,10 +1887,6 @@ struct sock *tcp_accept(struct sock *sk, int flags, int *err) struct open_request *req; struct sock *newsk; int error; -#ifdef CONFIG_ACCEPT_QUEUES - int prev_class = 0; - int first; -#endif lock_sock(sk); @@ -1961,46 +1910,11 @@ struct sock *tcp_accept(struct sock *sk, int flags, int *err) goto out; } -#ifndef CONFIG_ACCEPT_QUEUES req = tp->accept_queue; if ((tp->accept_queue = req->dl_next) == NULL) tp->accept_queue_tail = NULL; newsk = req->sk; sk_acceptq_removed(sk); -#else - first = tp->class_index; - /* We should always have request queued here. The accept_queue - * is already checked for NULL above. 
- */ - while(!tp->acceptq[first].aq_head) { - tp->acceptq[first].aq_cnt = 0; - first = (first+1) & ~NUM_ACCEPT_QUEUES; - } - req = tp->acceptq[first].aq_head; - tp->acceptq[first].aq_qcount--; - tp->acceptq[first].aq_count++; - tp->acceptq[first].aq_wait_time+=(jiffies - req->acceptq_time_stamp); - - for (prev_class= first-1 ; prev_class >=0; prev_class--) - if (tp->acceptq[prev_class].aq_tail) - break; - if (prev_class>=0) - tp->acceptq[prev_class].aq_tail->dl_next = req->dl_next; - else - tp->accept_queue = req->dl_next; - - if (req == tp->acceptq[first].aq_tail) - tp->acceptq[first].aq_head = tp->acceptq[first].aq_tail = NULL; - else - tp->acceptq[first].aq_head = req->dl_next; - - if((++(tp->acceptq[first].aq_cnt)) >= tp->acceptq[first].aq_ratio){ - tp->acceptq[first].aq_cnt = 0; - tp->class_index = ++first & (NUM_ACCEPT_QUEUES-1); - } - newsk = req->sk; - sk_acceptq_removed(sk, req->acceptq_class); -#endif tcp_openreq_fastfree(req); BUG_TRAP(newsk->sk_state != TCP_SYN_RECV); release_sock(sk); @@ -2172,53 +2086,6 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, } break; -#ifdef CONFIG_ACCEPT_QUEUES - case TCP_ACCEPTQ_SHARE: -#ifdef CONFIG_CKRM - // If CKRM is set then the shares are set through rcfs. - // Get shares will still succeed. - err = -EOPNOTSUPP; - break; -#else - { - char share_wt[NUM_ACCEPT_QUEUES]; - int i,j; - - if (sk->sk_state != TCP_LISTEN) - return -EOPNOTSUPP; - - if (copy_from_user(share_wt,optval, optlen)) { - err = -EFAULT; - break; - } - j = 0; - for (i = 0; i < NUM_ACCEPT_QUEUES; i++) { - if (share_wt[i]) { - if (!j) - j = share_wt[i]; - else if (share_wt[i] < j) { - j = share_wt[i]; - } - } - else - tp->acceptq[i].aq_ratio = 0; - - } - if (j == 0) { - /* Class 0 is always valid. If nothing is - * specified set class 0 as 1. 
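The TCP_ACCEPTQ_SHARE setter above normalizes the user-supplied byte weights by the smallest nonzero one, so each aq_ratio ends up a small integer, and defaults class 0 to weight 1 when no class has shares at all. A sketch of that normalization; the array layout and names are local stand-ins:

#include <stdio.h>

#define NUM_ACCEPT_QUEUES 8

static void set_shares(const char *wt, int *ratio)
{
        int min = 0;
        char w0 = wt[0];

        for (int i = 0; i < NUM_ACCEPT_QUEUES; i++)
                if (wt[i] && (!min || wt[i] < min))
                        min = wt[i];

        if (!min) {              /* class 0 is always valid: default to 1 */
                w0 = 1;
                min = 1;
        }

        for (int i = 0; i < NUM_ACCEPT_QUEUES; i++)
                ratio[i] = (i == 0 ? w0 : wt[i]) / min;
}

int main(void)
{
        char weights[NUM_ACCEPT_QUEUES] = { 4, 2, 6 };  /* rest are 0 */
        int ratio[NUM_ACCEPT_QUEUES];

        set_shares(weights, ratio);
        for (int i = 0; i < 3; i++)
                printf("class %d: aq_ratio %d\n", i, ratio[i]);
        /* prints ratios 2, 1, 3 */
        return 0;
}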
-                 */
-                share_wt[0] = 1;
-                j = 1;
-            }
-            for (i=0; i < NUM_ACCEPT_QUEUES; i++) {
-                tp->acceptq[i].aq_ratio = share_wt[i]/j;
-                tp->acceptq[i].aq_cnt = 0;
-            }
-        }
-        break;
-#endif
-#endif
     default:
         err = -ENOPROTOOPT;
         break;
@@ -2359,39 +2226,6 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
         val = !tp->ack.pingpong;
         break;
 
-#ifdef CONFIG_ACCEPT_QUEUES
-    case TCP_ACCEPTQ_SHARE:
-    {
-        struct tcp_acceptq_info tinfo[NUM_ACCEPT_QUEUES];
-        int i;
-
-        if (sk->sk_state != TCP_LISTEN)
-            return -EOPNOTSUPP;
-
-        if (get_user(len, optlen))
-            return -EFAULT;
-
-        memset(tinfo, 0, sizeof(tinfo));
-
-        for(i=0; i < NUM_ACCEPT_QUEUES; i++) {
-            tinfo[i].acceptq_wait_time =
-                jiffies_to_msecs(tp->acceptq[i].aq_wait_time);
-            tinfo[i].acceptq_qcount = tp->acceptq[i].aq_qcount;
-            tinfo[i].acceptq_count = tp->acceptq[i].aq_count;
-            tinfo[i].acceptq_shares=tp->acceptq[i].aq_ratio;
-        }
-
-        len = min_t(unsigned int, len, sizeof(tinfo));
-        if (put_user(len, optlen))
-            return -EFAULT;
-
-        if (copy_to_user(optval, (char *)tinfo, len))
-            return -EFAULT;
-
-        return 0;
-    }
-    break;
-#endif
     default:
         return -ENOPROTOOPT;
     };
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2b2fab395..8034ab504 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -919,11 +919,7 @@ static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
     lopt->syn_table[h] = req;
     write_unlock(&tp->syn_wait_lock);
 
-#ifdef CONFIG_ACCEPT_QUEUES
-    tcp_synq_added(sk, req);
-#else
     tcp_synq_added(sk);
-#endif
 }
 
 
@@ -1416,9 +1412,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
     __u32 daddr = skb->nh.iph->daddr;
     __u32 isn = TCP_SKB_CB(skb)->when;
     struct dst_entry *dst = NULL;
-#ifdef CONFIG_ACCEPT_QUEUES
-    int class = 0;
-#endif
 #ifdef CONFIG_SYN_COOKIES
     int want_cookie = 0;
 #else
@@ -1443,31 +1436,12 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
         goto drop;
     }
 
-#ifdef CONFIG_ACCEPT_QUEUES
-    class = (skb->nfmark <= 0) ? 0 :
-        ((skb->nfmark >= NUM_ACCEPT_QUEUES) ? 0: skb->nfmark);
-    /*
-     * Accept only if the class has shares set or if the default class
-     * i.e. class 0 has shares
-     */
-    if (!(tcp_sk(sk)->acceptq[class].aq_ratio)) {
-        if (tcp_sk(sk)->acceptq[0].aq_ratio)
-            class = 0;
-        else
-            goto drop;
-    }
-#endif
-
     /* Accept backlog is full. If we have already queued enough
      * of warm entries in syn queue, drop request. It is better than
      * clogging syn queue with openreqs with exponentially increasing
      * timeout.
      */
-#ifdef CONFIG_ACCEPT_QUEUES
-    if (sk_acceptq_is_full(sk, class) && tcp_synq_young(sk, class) > 1)
-#else
     if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
-#endif
         goto drop;
 
     req = tcp_openreq_alloc();
@@ -1497,10 +1471,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
     tp.tstamp_ok = tp.saw_tstamp;
     tcp_openreq_init(req, &tp, skb);
-#ifdef CONFIG_ACCEPT_QUEUES
-    req->acceptq_class = class;
-    req->acceptq_time_stamp = jiffies;
-#endif
     req->af.v4_req.loc_addr = daddr;
     req->af.v4_req.rmt_addr = saddr;
     req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
@@ -1595,11 +1565,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
     struct tcp_opt *newtp;
     struct sock *newsk;
 
-#ifdef CONFIG_ACCEPT_QUEUES
-    if (sk_acceptq_is_full(sk, req->acceptq_class))
-#else
     if (sk_acceptq_is_full(sk))
-#endif
         goto exit_overflow;
 
     if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index fb92b07b2..86cfb4c85 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -799,14 +799,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
         newtp->num_sacks = 0;
         newtp->urg_data = 0;
         newtp->listen_opt = NULL;
-#ifdef CONFIG_ACCEPT_QUEUES
-        newtp->accept_queue = NULL;
-        memset(newtp->acceptq, 0,sizeof(newtp->acceptq));
-        newtp->class_index = 0;
-
-#else
         newtp->accept_queue = newtp->accept_queue_tail = NULL;
-#endif
         /* Deinitialize syn_wait_lock to trap illegal accesses. */
         memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock));
 
@@ -1046,7 +1039,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
 
     tcp_synq_unlink(tp, req, prev);
     tcp_synq_removed(sk, req);
-    tcp_acceptq_queue(sk, req, child);
+    tcp_acceptq_queue(sk, req, child);
     return child;
 
 listen_overflow:
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index c7cd2e74f..cad366fc2 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -491,16 +491,7 @@ static void tcp_synack_timer(struct sock *sk)
      * ones are about to clog our table.
      */
     if (lopt->qlen>>(lopt->max_qlen_log-1)) {
-#ifdef CONFIG_ACCEPT_QUEUES
-        int young = 0;
-
-        for(i=0; i < NUM_ACCEPT_QUEUES; i++)
-            young += lopt->qlen_young[i];
-
-        young <<= 1;
-#else
         int young = (lopt->qlen_young<<1);
-#endif
 
         while (thresh > 2) {
             if (lopt->qlen < young)
@@ -526,11 +517,7 @@ static void tcp_synack_timer(struct sock *sk)
                 unsigned long timeo;
 
                 if (req->retrans++ == 0)
-#ifdef CONFIG_ACCEPT_QUEUES
-                    lopt->qlen_young[req->acceptq_class]--;
-#else
                     lopt->qlen_young--;
-#endif
                 timeo = min((TCP_TIMEOUT_INIT << req->retrans),
                         TCP_RTO_MAX);
                 req->expires = now + timeo;
                 reqp = &req->dl_next;
@@ -543,11 +530,7 @@ static void tcp_synack_timer(struct sock *sk)
             write_unlock(&tp->syn_wait_lock);
             lopt->qlen--;
             if (req->retrans == 0)
-#ifdef CONFIG_ACCEPT_QUEUES
-                lopt->qlen_young[req->acceptq_class]--;
-#else
                 lopt->qlen_young--;
-#endif
             tcp_openreq_free(req);
             continue;
         }
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 3a5fd6254..eba10f3d1 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1192,11 +1192,7 @@ static void tcp_v6_synq_add(struct sock *sk, struct open_request *req)
     lopt->syn_table[h] = req;
     write_unlock(&tp->syn_wait_lock);
 
-#ifdef CONFIG_ACCEPT_QUEUES
-    tcp_synq_added(sk, req);
-#else
     tcp_synq_added(sk);
-#endif
 }
 
 
@@ -1209,9 +1205,6 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
     struct tcp_opt tmptp, *tp = tcp_sk(sk);
     struct open_request *req = NULL;
     __u32 isn = TCP_SKB_CB(skb)->when;
-#ifdef CONFIG_ACCEPT_QUEUES
-    int class = 0;
-#endif
 
     if (skb->protocol == htons(ETH_P_IP))
         return tcp_v4_conn_request(sk, skb);
@@ -1229,24 +1222,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
         goto drop;
     }
 
-#ifdef CONFIG_ACCEPT_QUEUES
-    class = (skb->nfmark <= 0) ? 0 :
-        ((skb->nfmark >= NUM_ACCEPT_QUEUES) ? 0: skb->nfmark);
-    /*
-     * Accept only if the class has shares set or if the default class
-     * i.e. class 0 has shares
-     */
-    if (!(tcp_sk(sk)->acceptq[class].aq_ratio)) {
-        if (tcp_sk(sk)->acceptq[0].aq_ratio)
-            class = 0;
-        else
-            goto drop;
-    }
-
-    if (sk_acceptq_is_full(sk, class) && tcp_synq_young(sk, class) > 1)
-#else
     if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
-#endif
         goto drop;
 
 
@@ -1262,10 +1238,6 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
     tmptp.tstamp_ok = tmptp.saw_tstamp;
     tcp_openreq_init(req, &tmptp, skb);
-#ifdef CONFIG_ACCEPT_QUEUES
-    req->acceptq_class = class;
-    req->acceptq_time_stamp = jiffies;
-#endif
     req->class = &or_ipv6;
     ipv6_addr_copy(&req->af.v6_req.rmt_addr, &skb->nh.ipv6h->saddr);
     ipv6_addr_copy(&req->af.v6_req.loc_addr, &skb->nh.ipv6h->daddr);
@@ -1367,11 +1339,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
     opt = np->opt;
 
-#ifdef CONFIG_ACCEPT_QUEUES
-    if (sk_acceptq_is_full(sk, req->acceptq_class))
-#else
     if (sk_acceptq_is_full(sk))
-#endif
         goto out_overflow;
 
     if (np->rxopt.bits.srcrt == 2 &&
-- 
2.43.0
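
Notes on the removed accept-queue scheduling
--------------------------------------------

The CONFIG_ACCEPT_QUEUES code deleted above implemented a weighted
round-robin over NUM_ACCEPT_QUEUES per-class accept queues in
tcp_accept(). The sketch below is an editorial, user-space model of
that dequeue loop, not code from the patch: the struct and function
names are invented, NUM_QUEUES stands in for NUM_ACCEPT_QUEUES, and it
assumes (as the removed kernel code did, per its own comment) that at
least one queue is non-empty when it runs.

    #include <stdio.h>

    #define NUM_QUEUES 8                /* stand-in for NUM_ACCEPT_QUEUES */

    struct queue {
        int len;                        /* like aq_qcount: requests waiting */
        int ratio;                      /* like aq_ratio: share weight */
        int cnt;                        /* like aq_cnt: served this round */
    };

    /* Serve one request: skip empty classes, and rotate to the next
     * class once the current one has consumed its ratio. */
    static int pick_queue(struct queue q[], int *idx)
    {
        int first = *idx;

        while (q[first].len == 0) {     /* empty class: reset and move on */
            q[first].cnt = 0;
            first = (first + 1) % NUM_QUEUES;
        }
        q[first].len--;
        if (++q[first].cnt >= q[first].ratio) {
            q[first].cnt = 0;           /* share used up: rotate */
            *idx = (first + 1) % NUM_QUEUES;
        } else
            *idx = first;
        return first;
    }

    int main(void)
    {
        struct queue q[NUM_QUEUES] = { [0] = { 5, 1, 0 }, [1] = { 5, 2, 0 } };
        int idx = 0;

        for (int i = 0; i < 10; i++)
            printf("%d ", pick_queue(q, &idx));  /* 0 1 1 0 1 1 ... */
        putchar('\n');
        return 0;
    }

With weights 1 and 2, class 1 is served twice for every request taken
from class 0, which is the behaviour the removed aq_cnt/aq_ratio
bookkeeping produced.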
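
The removed TCP_ACCEPTQ_SHARE setsockopt handler normalised the
user-supplied share weights by dividing each one by the smallest
non-zero weight. The following is an editorial re-statement of just
that arithmetic; the function name and fixed-size arrays are
illustrative, and the all-zero fallback mirrors the kernel's "class 0
is always valid" rule:

    #include <stdio.h>

    #define NUM_QUEUES 8

    static void normalise_shares(char wt[NUM_QUEUES], int ratio[NUM_QUEUES])
    {
        int i, min = 0;

        /* find the smallest non-zero weight */
        for (i = 0; i < NUM_QUEUES; i++)
            if (wt[i] && (!min || wt[i] < min))
                min = wt[i];

        /* nothing set: keep the default class usable, as the removed
         * code did with share_wt[0] = 1 */
        if (!min)
            wt[0] = min = 1;

        for (i = 0; i < NUM_QUEUES; i++)
            ratio[i] = wt[i] / min;
    }

    int main(void)
    {
        char wt[NUM_QUEUES] = { 2, 6, 4 };      /* classes 3..7 unset */
        int ratio[NUM_QUEUES], i;

        normalise_shares(wt, ratio);
        for (i = 0; i < NUM_QUEUES; i++)
            printf("class %d: ratio %d\n", i, ratio[i]);
        return 0;
    }

So weights 2:6:4 become ratios 1:3:2, i.e. the integer proportions that
the round-robin above consumes per class.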
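
Finally, the conn_request paths (both IPv4 and IPv6 hunks above)
classified an incoming SYN by its netfilter mark, falling back to
class 0 and dropping the request when no shares were configured. An
editorial condensation of that test, with an invented function name:

    #define NUM_QUEUES 8

    /* Map skb->nfmark to an accept-queue class; -1 means drop the SYN. */
    static int classify_syn(long nfmark, const int ratio[NUM_QUEUES])
    {
        int class = (nfmark <= 0 || nfmark >= NUM_QUEUES) ? 0 : (int)nfmark;

        if (!ratio[class]) {
            if (!ratio[0])
                return -1;      /* not even class 0 has shares */
            class = 0;
        }
        return class;
    }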